def setup_tree(cls, update=False):
    if update:
        LOGGER.info("Settings modified, performing a check on the "
                    "directory tree structure, updating it if necessary")
    if not SettingsManager.is_loaded():
        error_msg = ("Attempting to set up the directory tree structure "
                     "before the SettingsManager has been loaded")
        LOGGER.error(error_msg)
        raise RuntimeError(error_msg)

    def setup_repertory(dir_path):
        if not os.path.isdir(dir_path):
            try:
                os.makedirs(dir_path, exist_ok=True)
                LOGGER.debug("Directory %s created recursively.", dir_path)
            except Exception:
                LOGGER.exception("Couldn't create directory %s.", dir_path)
        return dir_path

    if DEBUG:
        # clear the cached value so the next gettempdir() call recomputes it
        tempfile.gettempdir()
        tempfile.tempdir = None
        harnais_dir = os.path.join(tempfile.gettempdir(), "harnais")
    else:
        harnais_dir = SettingsManager.get("harnaisDir")

    # directories of temporary dissRequest JSON files
    cls._repertories["temp_dissRequest_A"] = setup_repertory(
        join(harnais_dir, REPERTORY_TREE.temp_dissrequest_a))
    cls._repertories["temp_dissRequest_B"] = setup_repertory(
        join(harnais_dir, REPERTORY_TREE.temp_dissrequest_b))
    cls._repertories["temp_dissRequest_C"] = setup_repertory(
        join(harnais_dir, REPERTORY_TREE.temp_dissrequest_c))
    cls._repertories["temp_dissRequest_D"] = setup_repertory(
        join(harnais_dir, REPERTORY_TREE.temp_dissrequest_d))

    cls._repertories["dir_ack"] = dir_ack = SettingsManager.get("harnaisAckDir")
    if not os.path.isdir(dir_ack):
        LOGGER.error("Ack directory %s does not exist", dir_ack)

    # store the settings file signature
    cls._checksum = SettingsManager.get_checksum()

def rename(self):
    """
    Rename files using the user-specified regexes.

    Each regex is a dictionary with two keys, pattern_in and
    pattern_out, used for the substitution. The program reads up to
    MAX_REGEX of them from settings.yaml, laid out like this:

        fileRegex1:
            pattern_in:
            pattern_out:
        fileRegex2:
            pattern_in:
            pattern_out:
        ...
        fileRegex<MAX_REGEX>:
            pattern_in:
            pattern_out:

    The regexes are applied one after the other, except for tmpregex,
    which has a single value corresponding to pattern_out.
    """
    regex_settings = [SettingsManager.get("fileRegex%i" % i, {})
                      for i in range(1, MAX_REGEX + 1)]

    new_filename = self.original_filename

    # a file is considered zipped if its name matches r"^tmp\.zip\..*"
    zip_detector = r"^tmp\.zip\..*"
    if re.match(zip_detector, new_filename) is not None:
        zip_regex = SettingsManager.get("tmpregex")
        if zip_regex is None:
            LOGGER.error("No regex defined in tmpregex settings!")
        else:
            new_filename = self._rename_by_regex(new_filename,
                                                 zip_detector, zip_regex)
    else:
        for idx, regex_instruction in enumerate(regex_settings):
            reg = regex_instruction.get("pattern_in", None)
            repl = regex_instruction.get("pattern_out", None)
            if None not in (reg, repl):
                new_filename = self._rename_by_regex(new_filename, reg, repl)
            elif idx == 0:
                LOGGER.error("No regex defined in fileRegex1 settings!")

    # check that the renamed file fits difmet standards
    filename_ok = self.check_filename(new_filename)
    if filename_ok:
        # update the record with the new filename
        Database.update_field_by_query("final_file", new_filename,
                                       **dict(original_file=self.original_filename))
        self.new_filename = new_filename

    # return whether the process produced a fitting new filename
    return filename_ok

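# Illustrative sketch (assumption): _rename_by_regex is not shown here, but
# given its (filename, pattern_in, pattern_out) call signature it plausibly
# wraps re.sub. Under that assumption, the cascade of fileRegex settings
# behaves like this (filename and patterns hypothetical):
#
#   import re
#   name = "tmp.zip.20200101.dat"
#   rules = [(r"^tmp\.zip\.", "archive_"),
#            (r"\.dat$", ".bin")]
#   for pattern_in, pattern_out in rules:
#       name = re.sub(pattern_in, pattern_out, name)
#   # name == "archive_20200101.bin"
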
def __init__(self, req_id, hostname):
    # path to the file on the staging post
    self.staging_path = SettingsManager.get("openwisStagingPath") or ""
    self.hostname = SettingsManager.get("openwisHost") or hostname
    self.user = SettingsManager.get("openwisSftpUser") or None
    self.password = SettingsManager.get("openwisSftpPassword") or None
    self.port = SettingsManager.get("openwisSftpPort") or None
    self.req_id = req_id + hostname

def process(cls, max_loops=0):
    cls.nb_workers = SettingsManager.get("sendFTPlimitConn")
    # in debug mode, get_pool_method can return a debug-friendly pool
    pool_method = cls.get_pool_method()
    cls.pool = pool_method(processes=cls.nb_workers)
    counter = 0
    cls.setup_process()
    while cls._running:
        counter += 1
        cls.signal_loop(counter)
        cls.load_settings()
        cls.update_workers()

        # idle time
        idle_time = SettingsManager.get("sendFTPIdle")
        sleep(idle_time)

        # get settings
        cls.dir_c = dir_c = HarnessTree.get("temp_dissRequest_C")
        cls.dir_d = dir_d = HarnessTree.get("temp_dissRequest_D")

        # move back any remaining file from D to C
        cls.move_back_files()

        # get files in C
        max_files = cls.nb_workers
        list_files_c = cls.get_file_list(dir_c, max_files)
        files_to_ftp = cls.move_files(list_files_c, dir_d)

        for file_ in files_to_ftp:
            file_expired = cls.check_file_age(file_)
            if file_expired:
                # TODO we need to find a way to update the info to the database
                # would require looking at the file compressed though
                Tools.remove_file(file_, "difmet archive", LOGGER)
                continue
            size = os.stat(file_).st_size
            timeout = cls.compute_timeout(size, file_)
            # rename the file to prevent any other operation on it
            cls.lock_file(file_)
            # start the upload
            res = cls.pool.apply_async(cls.abortable_ftp,
                                       (cls.upload_file, file_, dir_c, dir_d),
                                       dict(timeout=timeout))

        # for testing and debugging purpose only
        cls.check_end_loop(counter, max_loops)

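# Note (assumption): abortable_ftp itself is not shown here, but the
# apply_async call above implies a signature compatible with
#     abortable_ftp(func, file_, dir_c, dir_d, timeout=...)
# i.e. it receives the upload callable plus the source and destination
# directories, and enforces `timeout` seconds on the transfer so that a
# stuck upload cannot block a pool worker forever.
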
def check_file_age(filename):
    # discard files older than keepFileTimeSender (in seconds)
    time_limit = SettingsManager.get("keepFileTimeSender") or None
    if time_limit is not None:
        check = (time() - os.stat(filename).st_mtime) > time_limit
    else:
        check = False
    return check

def _get_priority(self):
    donot_compute_priority = SettingsManager.get("sla")
    default_priority = SettingsManager.get("defaultPriority", PRIORITIES.default)
    if donot_compute_priority:
        highest_priority = default_priority
    else:
        priority_list = []
        for req_id in self.instructions.keys():
            priority = self._get_instr(req_id, "diffpriority")
            priority_list.append(priority)
        # a lower value means a higher priority
        highest_priority = min(priority_list + [default_priority])
    return highest_priority

def update_workers(cls):
    # update the number of workers if necessary
    nbw = SettingsManager.get("sendFTPlimitConn")
    if nbw != cls.nb_workers:
        # wait for every upload to be finished
        cls.pool.close()
        cls.pool.join()
        # update workers, recreating the pool as in process()
        cls.nb_workers = nbw
        pool_method = cls.get_pool_method()
        cls.pool = pool_method(processes=nbw)

def check_file_age(filename):
    """
    Discard files that are too old.
    """
    time_limit = SettingsManager.get("keepFileTime") or None
    if time_limit is not None:
        check = (time() - os.stat(filename).st_mtime) > time_limit
    else:
        check = False
    return check

def check_transfer(cls, filename, ftp):
    # check if the file exists on the remote server
    ftpdir = SettingsManager.get("dissFtpDir")
    ftp.cwd(ftpdir)
    upload_ok = filename in [name for name, facts in ftp.mlsd()]
    ftp.quit()
    return upload_ok

def compute_priority(priority, sla):
    # priority is scaled from 1 (highest) to 4 (lowest)
    # sla is 0 (BRONZE), 1 (SILVER), 2 (GOLD)
    priority_activated = SettingsManager.get("sla")
    if isinstance(priority_activated, str):
        priority_activated = strtobool(priority_activated)
    if not priority_activated:
        result = SettingsManager.get("defaultPriority") or PRIORITIES.default
    elif priority == 1:
        result = PRIORITIES.maximum
    else:
        # priority >= 2
        default_priority = SettingsManager.get("defaultPriority") or PRIORITIES.default
        result = default_priority + priority - 2 * sla
        # clamp the result between the maximum and minimum priorities
        result = max(PRIORITIES.maximum, result)
        result = min(PRIORITIES.minimum, result)
    return result

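# Worked example (assuming PRIORITIES.maximum == 1 and
# PRIORITIES.minimum == 4, consistent with the 1..4 scale above): with
# defaultPriority = 3, a request at priority 2 with a GOLD sla (2) gets
#     3 + 2 - 2*2 = 1,
# which the clamping keeps at 1. The same request with a BRONZE sla (0)
# gets 3 + 2 - 0 = 5, clamped down to 4.
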
def _get_end_date():
    limit = SettingsManager.get("fileEndLive") or 0
    if limit == 0:
        end_date = None
    else:
        # fileEndLive is in minutes; convert it to seconds
        end_date = datetime.utcnow() + timedelta(seconds=limit * 60)
        end_date = end_date.strftime("%Y-%m-%dT%H:%M:%SZ")
    return end_date

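# Example: with fileEndLive = 30 and a current UTC time of
# 2020-01-01 12:00:00, the returned end date is "2020-01-01T12:30:00Z".
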
def upload_file(cls, file_, ftp):
    upload_ok = False
    start = time()
    file_locked = file_ + ".lock"
    with open(file_locked, 'rb') as file_transfered:
        ftpdir = SettingsManager.get("dissFtpDir")
        ftp.cwd(ftpdir)
        # upload under a .tmp name, then rename, so the remote side
        # never sees a partially transferred file
        file_renamed = basename(file_) + ".tmp"
        ftp.storbinary('STOR ' + file_renamed, file_transfered)
        ftp.rename(file_renamed, basename(file_))
        upload_ok = True
    return upload_ok, time() - start

def get_file_list(cls, dirname, maxfiles):
    overflow = SettingsManager.get("ManagerOverflow")
    list_entries = os.listdir(dirname)
    list_entries = [os.path.join(dirname, entry) for entry in list_entries]
    list_files = [entry for entry in list_entries if not os.path.isdir(entry)]
    # sort by modification time, oldest first
    list_files.sort(key=lambda x: os.stat(x).st_mtime)
    if overflow is not None and len(list_files) > overflow:
        LOGGER.warning("%s directory is overflowing. "
                       "Number of files %i over the limit %i",
                       cls.dir_a, len(list_files), overflow)
    list_files = list_files[-maxfiles:]
    return list_files

def compute_timeout(required_bandwith):
    # compute timeout as timeout = size / bandwidth + buffer
    bandwidth = SettingsManager.get("bandwidth")
    if bandwidth in [None, 0]:
        LOGGER.warning("Incorrect value for harness settings bandwidth. "
                       "Sftp timeout set to default TIMEOUT %i s.", TIMEOUT)
        timeout = TIMEOUT
    elif DEBUG:
        timeout = DEBUG_TIMEOUT
        LOGGER.debug("Sftp debug timeout set to %s s", timeout)
    else:
        # convert the size in bytes to Mbits with the shift operator
        # (1 << 17 == 131072 bytes per Mbit, binary convention)
        timeout = (required_bandwith / (1 << 17)) / bandwidth + TIMEOUT_BUFFER
        LOGGER.debug("Sftp timeout computed to %s s", timeout)
    return timeout

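# Worked example: for a 64 MiB file (67108864 bytes) and bandwidth = 8,
# the computed timeout is (67108864 / 131072) / 8 + TIMEOUT_BUFFER
# = 512 / 8 + TIMEOUT_BUFFER, i.e. 64 s plus the buffer.
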
def _get_logdir():
    if DEBUG:
        # clear the cached value so the next gettempdir() call recomputes it
        gettempdir()
        tempfile.tempdir = None
        log_dir = join(gettempdir(), "harnais")
        dir_error_msg = ("Incorrect logdir value {v}. "
                         "It should be the path to a valid "
                         "directory.".format(v=log_dir))
        try:
            os.mkdir(log_dir)
        except FileExistsError:
            pass
    else:
        log_dir = SettingsManager.get("harnaisLogdir")
        dir_error_msg = ("Incorrect logdir value {v} in settings_harnais.yaml. "
                         "It should be the path to a valid "
                         "directory.".format(v=log_dir))
    return log_dir, dir_error_msg

def compute_timeout(required_bandwith, file_):
    # compute timeout as timeout = size / bandwidth + buffer
    bandwidth = SettingsManager.get("bandwidth")
    if bandwidth in [None, 0]:
        LOGGER.warning("Incorrect value for harness settings bandwidth. "
                       "Ftp timeout set to default TIMEOUT %i s for file %s.",
                       TIMEOUT, file_)
        timeout = TIMEOUT
    elif DEBUG:
        timeout = DEBUG_TIMEOUT
        LOGGER.debug("Ftp debug timeout set to %s s for file %s.",
                     timeout, file_)
    else:
        # convert the size in bytes to Mbits with the shift operator
        timeout = (required_bandwith / (1 << 17)) / bandwidth + TIMEOUT_BUFFER
        LOGGER.debug("Ftp timeout computed to %s s for file %s.",
                     timeout, file_)
    return timeout

def get_file_list(cls, dirname, maxfiles):
    overflow = SettingsManager.get("SenderOverflow")
    list_entries = os.listdir(dirname)
    # ignore files whose names end in .tmp
    list_entries = [item for item in list_entries
                    if match(r".*\.tmp$", item) is None]
    list_entries = [os.path.join(dirname, entry) for entry in list_entries]
    # sort by modification time, oldest first
    list_files = [e for e in list_entries if not os.path.isdir(e)]
    list_files.sort(key=lambda x: os.stat(x).st_mtime)
    if overflow is not None and len(list_files) > overflow:
        LOGGER.warning("%s directory is overflowing. "
                       "Number of files %i over the limit %i",
                       cls.dir_c, len(list_files), overflow)
    list_files = list_files[:maxfiles]
    return list_files

def connect_ftp():
    hostname = SettingsManager.get("dissHost")
    user = SettingsManager.get("dissFtpUser")
    password = SettingsManager.get("dissFtpPasswd")
    port = SettingsManager.get("dissFtpPort")
    try:
        ftp = FTP()
        ftp.connect(hostname, port)
        ftp.login(user, password)
        if SettingsManager.get("dissFtpMode") == "active":
            ftp.set_pasv(False)
            LOGGER.debug("FTP mode set to active")
        elif SettingsManager.get("dissFtpMode") == "passive":
            ftp.set_pasv(True)
            LOGGER.debug("FTP mode set to passive")
        ftp_connected = True
    except Exception:
        LOGGER.exception("Couldn't connect to %s", hostname)
        ftp_connected = False
        ftp = None
    return ftp_connected, ftp

def _create_diffmet_instr(self):
    """
    Create the difmet instruction file according to the information
    contained in self.instructions for all the request ids linked
    to the file.
    """
    def get_prefix(diff):
        if diff["DiffusionType"] == "FTP":
            prefix = "standby_ftp_"
        else:
            prefix = "standby_email_"
        return prefix

    date_str = strftime("%Y%m%d%H%M%S")
    # difmet instruction file name
    path_to_file = ",".join((SettingsManager.get("diffFileName"),
                             self.incr,
                             "",
                             date_str,
                             ))
    path_to_file += ".diffusions.xml"
    path_to_file = os.path.join(self.dir_b, path_to_file)

    # create the xml structure
    root = etree.Element("product_diffusion")
    product = etree.SubElement(root, "product")
    etree.SubElement(product, "file_name").text = Tools.ack_str(self.new_filename)
    etree.SubElement(product, "file_size").text = Tools.ack_str(self._get_file_size())
    etree.SubElement(product, "priority").text = Tools.ack_str(self._get_priority())
    etree.SubElement(product, "archive").text = "0"
    end_date = self._get_end_date()
    # add an end date only if the user specified one
    if end_date is not None:
        etree.SubElement(product, "end_to_live_date").text = Tools.ack_str(end_date)

    # loop on all the requests linked to the file to send to difmet
    for req_id in self.id_list:
        diffusion = etree.SubElement(product, "diffusion")
        # fetch the information stored from the json instruction file
        instr = self.instructions[req_id]
        diff = instr["diffusion"]
        etree.SubElement(diffusion, "diffusion_externalid").text = (
            Database.get_external_id(req_id, self.new_filename))
        etree.SubElement(diffusion, "archive").text = "0"
        self.diff_info_to_xml(diffusion, diff)
        # if an alternative diffusion is requested, it is added
        if "alternativeDiffusion" in instr.keys():
            altdiff = instr["alternativeDiffusion"]
            prefix = get_prefix(altdiff)
            self.diff_info_to_xml(diffusion, altdiff, prefix=prefix)
            etree.SubElement(diffusion, "standby_media").text = altdiff["DiffusionType"]
            if altdiff["DiffusionType"] == "FTP" and diff["DiffusionType"] == "FTP":
                etree.SubElement(diffusion, "switch_method_medias_ftp").text = "NTRY"
                etree.SubElement(diffusion, "standby_switch_try_number").text = "3"

    etree.SubElement(product, "diffusionnumber").text = str(len(self.id_list))
    etree.SubElement(root, "productnumber").text = "1"

    # dump the xml tree into a file
    etree.ElementTree(root).write(path_to_file,
                                  pretty_print=True,
                                  encoding='UTF-8',
                                  xml_declaration=True)
    return path_to_file

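# Illustration (values hypothetical, and self.incr assumed to be a string
# as required by str.join): with diffFileName = "fr-meteo", incr = "42"
# and date_str = "20200101120000", the instruction file is named
# "fr-meteo,42,,20200101120000.diffusions.xml" and has the shape:
#
#   <?xml version='1.0' encoding='UTF-8'?>
#   <product_diffusion>
#     <product>
#       <file_name>...</file_name>
#       <file_size>...</file_size>
#       <priority>...</priority>
#       <archive>0</archive>
#       <diffusion>
#         <diffusion_externalid>...</diffusion_externalid>
#         <archive>0</archive>
#         ...
#       </diffusion>
#       <diffusionnumber>1</diffusionnumber>
#     </product>
#     <productnumber>1</productnumber>
#   </product_diffusion>
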
def mail_to_xml(element,diff_info,prefix=""): etree.SubElement(element, prefix + "media").text = "EMAIL" etree.SubElement(element, prefix + "email_adress").text = Tools.ack_str(diff_info["address"]) etree.SubElement(element, prefix + "email_to_cc").text = Tools.ack_str(diff_info["dispatchMode"]) etree.SubElement(element, prefix + "email_subject").text = Tools.ack_str(diff_info["subject"]) etree.SubElement(element, prefix + "email_text_in_body").text = "0" if diff_info["fileName"] not in [None, ""]: etree.SubElement(element, prefix + "email_attached_file_name").text = Tools.ack_str(diff_info["fileName"]) else: etree.SubElement(element, prefix + "email_attached_file_name").text = Tools.ack_str(SettingsManager.get("attachmentName"))
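# Illustrative output (hypothetical diff_info values, and assuming
# Tools.ack_str returns the string unchanged): calling
#   mail_to_xml(element, {"address": "ops@example.org", "dispatchMode": "TO",
#                         "subject": "bulletin", "fileName": "data.txt"})
# appends to element:
#
#   <media>EMAIL</media>
#   <email_adress>ops@example.org</email_adress>
#   <email_to_cc>TO</email_to_cc>
#   <email_subject>bulletin</email_subject>
#   <email_text_in_body>0</email_text_in_body>
#   <email_attached_file_name>data.txt</email_attached_file_name>
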
def process(cls, max_loops=0):
    counter = 0
    instr_to_process = False
    # initialization
    cls.setup_process()
    loop_time = 0
    while cls._running:
        counter += 1
        cls.signal_loop(counter)
        cls.load_settings()
        cls.dir_a = dir_a = HarnessTree.get("temp_dissRequest_A")
        cls.dir_b = dir_b = HarnessTree.get("temp_dissRequest_B")
        cls.dir_c = HarnessTree.get("temp_dissRequest_C")
        start_time = time()

        # idle time; if a loop lasted longer than the idle time,
        # the idle time is bypassed
        idle_time = SettingsManager.get("processFileIdle")
        if not loop_time > idle_time:
            sleep(idle_time)

        # get the first processFileDPmax files
        max_direc_files = SettingsManager.get("processFileDPmax")
        list_files_a = cls.get_file_list(dir_a, maxfiles=max_direc_files)
        instruction_files = cls.move_files(list_files_a, dir_b)
        if instruction_files == []:
            if instr_to_process:
                LOGGER.debug("No instruction file to process, moving on.")
                instr_to_process = False
            loop_time = time() - start_time
            cls.check_end_loop(counter, max_loops)
            continue
        else:
            LOGGER.debug("Fetched %i instruction files from %s",
                         len(instruction_files), dir_a)
            instr_to_process = True

        # process instruction files
        diss_instructions = dict()
        all_files_fetched = []
        for file_to_process in instruction_files:
            # empty the list, one item at a time
            process_ok, instructions, files_fetched = (
                cls.process_instruction_file(file_to_process))
            # if the fetching went ok, we store the rest of the
            # instructions contained in the instruction files and update
            # the list of files fetched from the staging post
            if process_ok:
                req_id = instructions["req_id"]
                hostname = instructions["hostname"]
                diss_instructions[req_id + hostname] = instructions
                all_files_fetched += [item for item in files_fetched
                                      if item not in all_files_fetched]

        # the downloaded files are packaged according to the
        # instructions stored in diss_instructions
        cls.package_data(all_files_fetched, diss_instructions)
        # remove the instruction files processed
        cls.clear_instruction_files(instruction_files)
        # as files should have been packaged and instruction files removed,
        # any remaining file is an orphan
        cls.clear_orphan_files(dir_b)

        # for testing and debugging purpose only
        cls.check_end_loop(counter, max_loops)
        loop_time = time() - start_time

def sftp_dir(self, dir_path, destination_dir):
    """
    Connect to a remote directory by sftp and list the files to
    download. The file sizes are used to compute a global timeout,
    and all files are then downloaded. Dissemination is a failure if
    any file is over the size limit or if the staging post doesn't
    exist. Otherwise, if the timeout is hit, the function returns a
    download failure, but there can be another attempt when the
    instruction file gets reprocessed.
    """
    files_to_sftp = []
    try:
        transport = paramiko.Transport((self.hostname, self.port))
        transport.connect(username=self.user, password=self.password)
        sftp = paramiko.SFTPClient.from_transport(transport)
        sftp.chdir(dir_path)
        required_bandwith = 0
        # For each file we update the files_to_sftp list. This list
        # contains tuples of 4 elements:
        # (staging post path, file_to_download_path, file_downloaded_path, skip)
        # with skip = False if the file has not been downloaded yet and
        # True otherwise. The reason for skip is that we still want to
        # keep track of files that should have been retrieved for an
        # instruction file, even if there was no need to download them a
        # second time. Keeping track of those files allows linking them
        # to their request id in the database.
        for item in sftp.listdir(dir_path):
            file_path = os.path.join(dir_path, item)
            destination_path = os.path.join(destination_dir, item)
            # if the file has already been fetched by a previous
            # instruction file, we don't do it again
            if os.path.isfile(destination_path):
                files_to_sftp.append((dir_path, file_path, destination_path, True))
                LOGGER.debug("File %s already downloaded, moving on", file_path)
                continue
            mode = sftp.stat(file_path).st_mode
            # ignore directories
            if S_ISDIR(mode):
                continue
            # check the file size; if size > max_size, diffusion failed
            # (conversion in Mbytes with the shift operator)
            size = sftp.stat(file_path).st_size
            max_size = SettingsManager.get("processFilesize")
            if max_size is not None and size > max_size * (1 << 20):
                raise FileOverSizeLimit
            required_bandwith += size
            # if it is a file, get it
            LOGGER.debug('file %s found on openwis staging post', file_path)
            destination_path = self.check_zip(item, destination_dir)
            files_to_sftp.append((dir_path, file_path, destination_path, False))
        sftp.close()
        transport.close()

        # initialize the multiprocessing pool; this creates a pool of
        # workers that automatically dispatches the downloads between them
        nb_workers = SettingsManager.get("getSFTPlimitConn")
        if DEBUG:
            pool = DebugSettingsManager.sftp_pool(processes=nb_workers)
        else:
            pool = multiprocessing.Pool(processes=nb_workers)
        # launch the pool of jobs
        results = pool.starmap_async(self._sftp_file, files_to_sftp)
        pool.close()
        timeout = self.compute_timeout(required_bandwith)
        nb_downloads = sum([not i[3] for i in files_to_sftp])
        if nb_downloads == 0:
            LOGGER.debug("No files to download or required files already "
                         "downloaded once.")
            sftp_success = True
        else:
            try:
                LOGGER.debug("Attempting download of %i files, for a total "
                             "size of %f. Timeout is fixed at %s s.",
                             nb_downloads, required_bandwith, timeout)
                start = time()
                # activate the timeout; if the jobs are not finished by
                # then, results.get will raise multiprocessing.TimeoutError
                results.get(timeout=timeout)
                delta_t = time() - start
                LOGGER.debug("Files downloaded in %f seconds", delta_t)
                sftp_success = True
            except multiprocessing.TimeoutError:
                LOGGER.error("Timeout exceeded for fetching files on "
                             "staging post.")
                sftp_success = False
        # check download success and update the database
        sftp_success = self.check_download_success(files_to_sftp, sftp_success)
        sftp.close()
    # case where the sftp interface (paramiko) raises an exception
    except (paramiko.SSHException,
            paramiko.ssh_exception.NoValidConnectionsError):
        LOGGER.exception("Couldn't connect to %s", self.hostname)
        sftp_success = False
    # case where a file is too big
    except FileOverSizeLimit:
        msg = ('file %s found on openwis staging post '
               'is over the size limit %f. Dissemination '
               'failed' % (file_path, max_size))
        LOGGER.exception(msg)
        Database.update_field_by_query("requestStatus", REQ_STATUS.failed,
                                       **dict(fullrequestId=self.req_id))
        Database.update_field_by_query("message", msg,
                                       **dict(fullrequestId=self.req_id))
        sftp_success = False
    # case where the staging post path is incorrect
    except FileNotFoundError:
        msg = ('Incorrect path %s for openwis staging post. '
               'Dissemination failed' % dir_path)
        LOGGER.exception(msg)
        Database.update_field_by_query("requestStatus", REQ_STATUS.failed,
                                       **dict(fullrequestId=self.req_id))
        Database.update_field_by_query("message", msg,
                                       **dict(fullrequestId=self.req_id))
        sftp_success = False

    # update the database with as many new entries as there were
    # files downloaded
    files_downloaded = self.update(sftp_success, files_to_sftp)
    return sftp_success, files_downloaded
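
# Note (assumption): _sftp_file is not shown here, but since it is invoked
# through pool.starmap_async over files_to_sftp, its signature must match
# the 4-tuples built above, i.e. something like
#     def _sftp_file(self, dir_path, file_path, destination_path, skip): ...
# with skip=True meaning the file was already downloaded and only the
# association with the request id needs to be recorded.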