def send_yagmail(emails_to, subject, message, email_from=None,
                 attachment_path=None, smtp_server=None, smtp_server_port=None):
    root_dir = cm.get_project_root()
    cnf_path = str(root_dir.joinpath(gc.MAIN_CONFIG_FILE))
    m_cfg = ConfigData(cnf_path)

    # fall back to the main config for any connection detail not passed in
    if not email_from:
        email_from = m_cfg.get_value('Email/default_from_email')
    if not smtp_server:
        smtp_server = m_cfg.get_value('Email/smtp_server')
    if not smtp_server_port:
        smtp_server_port = m_cfg.get_value('Email/smtp_server_port')

    # receiver = emails_to  # '[email protected], [email protected], [email protected]'
    body = message
    filename = attachment_path  # 'test.png'

    yag = yagmail.SMTP(email_from, host=smtp_server, smtp_skip_login=True,
                       smtp_ssl=False, soft_email_validation=False,
                       port=smtp_server_port)
    yag.send(
        to=emails_to,
        subject=subject,
        contents=body,
        attachments=filename,
    )
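
# A minimal usage sketch (recipient and text are hypothetical; assumes the
# main config file provides Email/default_from_email, Email/smtp_server and
# Email/smtp_server_port, so the connection keyword arguments can be omitted):
#
# send_yagmail(
#     emails_to=['[email protected]'],
#     subject='Submission package ready',
#     message='The package was prepared successfully.',
#     attachment_path=None,  # or a path to a log file
# )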
def process(data_type, study_id, center_id, center_ids, dataset_type_id,
            out_file, output_format, server_url):
    # get the URL of the API server from the main config file
    from utils import ConfigData
    main_cfg = ConfigData('configs/main_config.yaml')
    api_server_url = main_cfg.get_value('SAMPLEINFO_CLI_URL')

    if server_url:
        click.echo('server_url: {}'.format(api_server_url))

    if check_data_type_value(data_type):
        api_url, err_msg = identify_api_url(api_server_url, data_type, study_id,
                                            center_id, center_ids, dataset_type_id)
    else:
        api_url = ''
        err_msg = 'Unexpected data_type value ({}) was provided. ' \
                  'Run --help for the list of expected values.'.format(data_type)

    if len(err_msg) == 0:
        if len(api_url) > 0:
            # access the API, retrieve the data and output it in the requested format
            response = requests.get(api_url)
            output_data(response.json(), out_file, output_format)
        else:
            print('Error: Cannot identify the database call for the given parameters.')
    else:
        # report an error
        print('Error: {}'.format(err_msg))
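
# Hypothetical command-line invocation handled by this function (option names
# are assumed from the parameter names; the actual names live in the click
# decorators, which are not shown in this excerpt):
#
#   python sampleinfo_cli.py --data_type study --study_id 57 --out_file out.json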
def convert_sub_aliq_to_aliquot(sa, assay):
    aliquot = sa
    fl_cfg_dict = ConfigData(gc.CONFIG_FILE_DICTIONARY)
    # list of sub-aliquot postfixes configured for the given assay
    assay_postfixes = fl_cfg_dict.get_value('assay_sub_aliquot_postfix/' + assay)  # get_item_by_key
    if assay_postfixes is not None:
        for assay_postfix in assay_postfixes:
            apf_len = len(assay_postfix)
            if apf_len > 0 and sa.endswith(assay_postfix):
                # strip the matched postfix to get the aliquot id
                aliquot = sa[:-apf_len]
                break  # exit loop if a match was found
    return aliquot
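
# Standalone illustration of the postfix-stripping rule above (the postfixes
# here are hypothetical; the real ones come from the dictionary config file):
#
# assay_postfixes = ['_S1', '_S2']
# sa = 'AB1234_S1'
# aliquot = next((sa[:-len(p)] for p in assay_postfixes if sa.endswith(p)), sa)
# print(aliquot)  # -> 'AB1234'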
def load_assay_conf(self, assay, project):
    assay_cfg_path = gc.CONFIG_FILE_ASSAY.replace('{project}', project)
    cfg_assay = ConfigData(assay_cfg_path)
    assay_config = cfg_assay.get_value(assay.upper())
    if assay_config:
        self.logger.info(
            'Configuration for the {} assay was loaded from the assay config file: {}. '
            .format(assay.upper(), assay_cfg_path))
    else:
        _str = 'Configuration for the {} assay CANNOT be loaded from the assay config file: {}. ' \
               'Aborting execution.'.format(assay.upper(), assay_cfg_path)
        self.logger.error(_str)
        self.error.add_error(_str)
    return assay_config
class Request(File):
    def __init__(self, filepath, main_cfg, file_type=2, sheet_name=''):
        # load global and local configurations
        File.__init__(self, filepath, file_type)
        if main_cfg:
            self.conf_main = main_cfg
        else:
            self.conf_main = ConfigData(gc.CONFIG_FILE_MAIN)

        self.error = RequestError(self)
        self.log_handler = None
        self.logger = self.setup_logger(self.wrkdir, self.filename)
        self.logger.info('Start working with Submission request file {}'.format(filepath))

        self.columnlist = []
        self.samples = []
        self.sub_aliquots = []
        self.disqualified_sub_aliquots = {}
        self.aliquots_to_subaliquots_map = {}  # holds the map of aliquots to sub-aliquots for interpreting DB responses
        self.disqualified_request_path = ''  # will store the path to a request file with disqualified sub-aliquots
        self.project = ''
        self.bulk_location = ''
        self.assay = ''
        self.center = ''
        self.center_id = None
        self.center_code = None
        self.experiment_id = ''
        self.data_source_objects = {}  # dictionary to store all collected data sources for the request
        self.aliquots = None
        self.qualified_aliquots = None
        self.raw_data = None
        self.assay_data = None
        self.attachments = None
        self.submission_forms = None
        self.submission_package = None
        self.data_source_names = None
        # will hold a value corresponding to the type of data source being used;
        # possible values are 'db' and 'file'; set based on the first non-attachment data source being used
        self.data_source_forms_assignment = None

        self.sheet_name = sheet_name.strip()
        if not self.sheet_name or len(self.sheet_name) == 0:
            # if a sheet name was not passed as a parameter, try to get it from the config file
            self.sheet_name = gc.REQUEST_EXCEL_WK_SHEET_NAME  # 'wk_sheet_name'
        self.logger.info('Data will be loaded from worksheet: "{}"'.format(self.sheet_name))

        self.conf_assay = None
        self.get_file_content()

    def get_file_content(self):
        if not self.columnlist:
            if cm.file_exists(self.filepath):
                self.logger.debug('Loading file content of "{}"'.format(self.filepath))
                with xlrd.open_workbook(self.filepath) as wb:
                    if not self.sheet_name or len(self.sheet_name) == 0:
                        # by default retrieve the first sheet in the excel file
                        sheet = wb.sheet_by_index(0)
                    else:
                        # if a sheet name was provided
                        sheets = wb.sheet_names()  # get list of all sheets
                        if self.sheet_name in sheets:
                            # if the given sheet name is in the list of available sheets, load the sheet
                            sheet = wb.sheet_by_name(self.sheet_name)
                        else:
                            # report an error if the given sheet name is not in the list of available sheets
                            _str = ('Given worksheet name "{}" was not found in the file "{}". '
                                    'Verify that the worksheet name exists in the file.'
                                    ).format(self.sheet_name, self.filepath)
                            self.error.add_error(_str)
                            self.logger.error(_str)
                            self.lineList = None
                            self.loaded = False
                            return self.lineList

                    sheet.cell_value(0, 0)
                    lines = []  # will hold content of the request file as an array of arrays (rows)
                    for i in range(sheet.ncols):
                        column = []
                        for j in range(sheet.nrows):
                            if i == 0:
                                lines.append([])  # adds an array for each new row in the request file
                            cell = sheet.cell(j, i)
                            cell_value = cell.value
                            # take care of numbers and dates received from Excel and converted to float by default
                            if cell.ctype == 2 and int(cell_value) == cell_value:
                                # the value is an integer
                                cell_value = str(int(cell_value))
                            elif cell.ctype == 2:
                                # the value is a float
                                cell_value = str(cell_value)
                            # convert a date back to a human readable date format
                            if cell.ctype == 3:
                                cell_value_date = xlrd.xldate_as_datetime(cell_value, wb.datemode)
                                cell_value = cell_value_date.strftime("%Y-%m-%d")
                            column.append(cell_value)  # adds the value to the current column array
                            lines[j].append('"' + str(cell_value) + '"')  # adds the value in "csv" format for the current row
                        self.columnlist.append(column)  # adds a column to the list of columns

                    # populate the lineList property
                    self.lineList = []
                    for ln in lines:
                        self.lineList.append(','.join(ln))
                    wb.unload_sheet(sheet.name)

                # load passed request parameters (by columns)
                self.get_request_parameters()

                # validate the provided information
                self.logger.info(
                    'Validating provided request parameters. project: "{}", bulk location: "{}", '
                    'assay: "{}", db_center_code_or_id: "{}", Sub-Aliquots: "{}"'
                    .format(self.project, self.bulk_location, self.assay, self.center, self.sub_aliquots))
                self.validate_request_params()

                if self.error.exist():
                    # report that errors exist
                    self.loaded = False
                    _str = 'Errors ({}) were identified during validation of the request. \nError(s): {}'\
                        .format(self.error.count, self.error.get_errors_to_str())
                else:
                    self.loaded = True
                    _str = 'Request parameters were successfully validated - no errors found.'
                self.logger.info(_str)

                # combine experiment_id out of the request parameters
                if self.center_code and len(self.center_code.strip()) > 0:
                    # use the center code if available
                    self.experiment_id = "_".join([self.project, self.center_code, self.assay])
                else:
                    # use the provided value of the center column from the request, if center_code is not available
                    self.experiment_id = "_".join([self.project, self.center, self.assay])
            else:
                _str = 'Loading content of the file "{}" failed since the file does not appear to exist.'\
                    .format(self.filepath)
                self.error.add_error(_str)
                self.logger.error(_str)
                self.columnlist = None
                self.lineList = None
                self.loaded = False
        return self.lineList

    # get all values provided in the request file
    def get_request_parameters(self):
        self.project = self.columnlist[0][1]
        self.bulk_location = self.columnlist[1][1]
        self.assay = self.columnlist[2][1].lower()
        self.center = self.columnlist[3][1]  # center code (if alphanumeric) or center id (if numeric)
        self.sub_aliquots = self.columnlist[4]
        if self.sub_aliquots and len(self.sub_aliquots) > 0:
            self.sub_aliquots.pop(0)  # get rid of the column header
        # self.samples = self.columnlist[5]
        # if self.samples and len(self.samples) > 0:
        #     self.samples.pop(0)  # get rid of the column header

    # validates provided parameters (loaded from the submission request file)
    def validate_request_params(self):
        _str_err = ''
        _str_warn = ''
        if len(self.sub_aliquots) == 0:
            _str_err = '\n'.join([
                _str_err, 'List of provided sub-samples is empty. '
                          'Aborting processing of the submission request.'])

        # check if empty sub-samples were provided
        if '' in self.sub_aliquots:
            i = 0
            cleaned_cnt = 0
            for s in self.sub_aliquots:
                # check for any empty sub-aliquot values and remove them
                if len(s.strip()) == 0:
                    self.sub_aliquots.pop(i)
                    cleaned_cnt += 1
                else:
                    i += 1
            if cleaned_cnt > 0:
                _str_warn = '\n'.join([
                    _str_warn,
                    'Empty sub-aliquots (count = {}) were removed from the list. '
                    'Here is the list of sub-aliquots after cleaning (count = {}): "{}" '
                    .format(cleaned_cnt, len(self.sub_aliquots), self.sub_aliquots)])

        # check for empty values
        if len(self.project) == 0:
            _str_err = '\n'.join([
                _str_err, 'No Program name was provided. Aborting processing of the submission request.'])
        if len(self.bulk_location) == 0:
            _str_err = '\n'.join([
                _str_err, 'No Bulk Location was provided. Aborting processing of the submission request.'])
        if len(self.assay) == 0:
            _str_err = '\n'.join([
                _str_err, 'No Assay was provided. Aborting processing of the submission request.'])
        if len(self.center) == 0:
            _str_err = '\n'.join([
                _str_err, 'No DB Center information was provided. Aborting processing of the submission request.'])

        # check for values that should match predefined values from a dictionary
        # check the assay value
        if not cm2.key_exists_in_dict(self.assay, 'assay'):
            _str_err = '\n'.join([
                _str_err,
                'Provided Assay name "{}" is not matching a list of expected assay names '
                '(as stored in "{}" dictionary file). '
                'Aborting processing of the submission request.'.format(self.assay, gc.CONFIG_FILE_DICTIONARY)])
        else:
            # if the provided assay name is expected, convert it to the name expected by the Submission logic
            self.assay = cm2.get_dict_value(self.assay, 'assay')

        # check the project value
        if not cm2.key_exists_in_dict(self.project.lower(), 'project'):
            _str_err = '\n'.join([
                _str_err,
                'Provided Program name "{}" is not matching a list of expected names '
                '(as stored in "{}" dictionary file). '
                'Aborting processing of the submission request.'.format(self.project, gc.CONFIG_FILE_DICTIONARY)])
        else:
            # if the provided program name is expected, convert it to the name expected by the Submission logic
            self.project = cm2.get_dict_value(self.project.lower(), 'project')

        # validate the center_code or center_id value
        self.logger.info('Start validation of center value "{}" provided in the request'.format(self.center))
        db = DBAccess(self.logger, self.error, self.conf_main)  # create a DBAccess object
        db.open_connection()
        # test the center value assuming a center code was provided
        dataset = db.validate_center_code(self.center, self.project, 'code', 'code')
        _str_err_out1, center_id_out1 = self.check_validation_dataset_outcome(
            dataset, 'center_id', 'center_code')
        if center_id_out1:
            # a center id was returned, meaning the center was validated fine
            self.center_id = center_id_out1
            # get the center code value from the current DB dataset
            _str_err_out3, center_code = self.get_field_value_from_dataset(dataset, 'center_code')
            if center_code:
                # center code retrieved OK
                self.center_code = center_code
            else:
                # report an error during retrieving center_code
                _str_err = '\n'.join([_str_err, _str_err_out3])
        else:
            # if the center code was not validated at the first attempt, validate it assuming a center id was given
            dataset = db.validate_center_code(self.center, self.project, 'id', 'code')
            _str_err_out2, center_id_out2 = self.check_validation_dataset_outcome(
                dataset, 'center_id', 'center_id')
            if center_id_out2:
                # the center id was validated at the 2nd attempt; ignore the 1st failed center code validation
                self.center_id = center_id_out2
                # get the center code value from the current DB dataset
                _str_err_out3, center_code = self.get_field_value_from_dataset(dataset, 'center_code')
                if center_code:
                    # center code retrieved OK
                    self.center_code = center_code
                else:
                    # report an error during retrieving center_code
                    _str_err = '\n'.join([_str_err, _str_err_out3])
            else:
                # both center validation attempts failed, report both failures
                _str_err = '\n'.join([_str_err, _str_err_out1, _str_err_out2])

        # get the list of aliquots from the list of sub-aliquots
        self.aliquots = [cm2.convert_sub_aliq_to_aliquot(al, self.assay) for al in self.sub_aliquots]

        # create a map to convert an aliquot value to a sub-aliquot value (for processing DB responses given for aliquots)
        for sa, a in zip(self.sub_aliquots, self.aliquots):
            self.aliquots_to_subaliquots_map[a] = sa

        if self.center_id:
            self.logger.info('Start validation of aliquot ids vs DB')
            # if the center id was validated in the above code, validate the received aliquots vs the manifest dataset in DB
            dataset = db.validate_aliquot_ids(self.center_id, self.aliquots)
            if dataset:
                # create a dictionary of received aliquot/sample ids
                aliquots_to_samples_map = {}
                for row in dataset:
                    if '_aliquot_id' in row and '_sample_id' in row:
                        aliquots_to_samples_map[row['_aliquot_id']] = row['_sample_id']
                # check if each aliquot id was returned from the database and get the sample id from the dataset
                for sa, a in zip(self.sub_aliquots, self.aliquots):
                    if a in aliquots_to_samples_map:
                        if len(str(aliquots_to_samples_map[a]).strip()) > 0:
                            self.samples.append(aliquots_to_samples_map[a])
                        else:
                            _str = 'Blank Sample Id value was returned from DB for the sub-aliquot id "{}". ' \
                                   'The sub-aliquot was disqualified'.format(sa)
                            self.disqualify_sub_aliquot(sa, _str)
                            _str_warn = '\n'.join([_str_warn, _str])
                    else:
                        _str = 'Sub-aliquot id "{}" was not found in the database and was disqualified'.format(sa)
                        self.disqualify_sub_aliquot(sa, _str)
                        _str_warn = '\n'.join([_str_warn, _str])
            else:
                _str_err = '\n'.join([
                    _str_err,
                    'Aliquot ids cannot be validated since no data was returned from DB for '
                    'center_id = "{}" and aliquot ids as following: {} '.format(self.center_id, self.aliquots)])
        db = None

        # report any collected errors
        if len(_str_err) > 0:
            _str_err = 'Validation of request parameters:' + _str_err
            self.error.add_error(_str_err)
            self.logger.error(_str_err)
        # report any collected warnings
        if len(_str_warn) > 0:
            _str_warn = 'Validation of request parameters:' + _str_warn
            self.logger.warning(_str_warn)

    def check_validation_dataset_outcome(self, dataset, validation_id_column, validation_id_name):
        _str_err = ''
        row_num = 1
        validation_id_out = None
        if dataset:
            if len(dataset) >= row_num:
                row = dataset[row_num - 1]  # get the first row of the dataset
                if 'status' in row:
                    status = row['status']
                if 'description' in row:
                    description = row['description']
                if validation_id_column in row:  # i.e. center_id
                    validation_id = row[validation_id_column]
                if status == 'OK':
                    # validation was successful
                    validation_id_out = validation_id
                elif status == 'Failed':
                    # validation has failed
                    _str_err = '\n'.join([
                        _str_err,
                        'Validation of the provided {} value vs DB has Failed, description: {}'
                        .format(validation_id_name, description)])
                else:
                    # an unexpected status value was returned
                    _str_err = '\n'.join([
                        _str_err,
                        'Validation of the provided {} value vs DB returned unexpected status {}'
                        .format(validation_id_name, status)])
        else:
            _str_err = '\n'.join([
                _str_err,
                'Unexpected error was reported during validating {} in the DB. '
                'Check earlier entries in the log file.'.format(validation_id_name)])
        return _str_err, validation_id_out

    def get_field_value_from_dataset(self, dataset, field_name, row_num=None):
        # set default values
        if row_num is None:
            row_num = 1  # default row is #1
        _str_err = ''
        value_out = None
        if dataset:
            if len(dataset) >= row_num:
                row = dataset[row_num - 1]
                if field_name in row:
                    value_out = row[field_name]
        else:
            _str_err = '\n'.join([
                _str_err,
                'Unexpected error was reported during retrieving value of "{}" (row #{}) from the dataset.'
                .format(field_name, row_num)])
        return _str_err, value_out

    def setup_logger(self, wrkdir, filename):
        log_folder_name = gc.REQ_LOG_DIR
        logger_name = gc.REQUEST_LOG_NAME
        logging_level = self.conf_main.get_value('Logging/request_log_level')

        # if a relative path was provided, convert it to an absolute path based on the application working dir
        if not os.path.isabs(log_folder_name):
            log_folder_path = Path(wrkdir) / log_folder_name
        else:
            log_folder_path = Path(log_folder_name)

        lg = setup_logger_common(
            logger_name, logging_level,
            log_folder_path,
            str(filename) + '_' + time.strftime("%Y%m%d_%H%M%S", time.localtime()) + '.log')
        self.log_handler = lg['handler']
        return lg['logger']

    def load_request_configuration(self):
        # update the main config file with the project/environment specific details from additional config files
        self.load_project_config_into_main(self.project)  # loads the project specific config and merges it into main config
        # load the project specific assay config file
        self.conf_assay = self.load_assay_conf(self.assay, self.project)
        if self.conf_assay:
            # update the loaded assay config with the project/environment specific assay_location config
            self.conf_assay = self.update_cfg_dictionary_with_location_details(
                gc.CONFIG_FILE_ASSAY_LOCATION, self.project, self.conf_assay)

    def process_request(self):
        self.data_source_names = cm.get_value_from_dictionary('data_sources', self.conf_assay)

        # path to the folder where created submission packages will be located;
        # since this location can be provided in the project config file, this assignment happens
        # after loading the project config
        gc.OUTPUT_PACKAGES_DIR = self.conf_main.get_value('Submission_location/output_packages')

        for data_source_name in self.data_source_names:
            if isinstance(data_source_name, str):
                if data_source_name == 'attachment':
                    self.attachments = Attachment(self)
                elif data_source_name[-3:] == "_db":
                    self.data_source_objects[data_source_name] = DataSourceDB(
                        self, data_source_name, data_source_name)
                    if not self.data_source_forms_assignment:
                        self.data_source_forms_assignment = 'db'
                else:
                    self.data_source_objects[data_source_name] = DataSource(
                        self, data_source_name, data_source_name)
                    if not self.data_source_forms_assignment:
                        self.data_source_forms_assignment = 'file'
            elif isinstance(data_source_name, tuple):
                if data_source_name[0][-3:] == "_db":
                    self.data_source_objects[data_source_name[0]] = DataSourceDB(
                        self, data_source_name[0], data_source_name[1])
                else:
                    self.data_source_objects[data_source_name[0]] = DataSource(
                        self, data_source_name[0], data_source_name[1])
            else:
                self.logger.error(
                    'Provided data source name ({}) is of unexpected format and cannot be processed.'
                    .format(data_source_name))

        # if data_source_forms_assignment was not assigned any value in the code before, assign a default to it;
        # this is the case when an assay submits only attachments and does not use any assay or QC data
        if not self.data_source_forms_assignment:
            self.data_source_forms_assignment = gc.DEFAULT_DATA_SOURCE_FORMS_ASSIGNMENT

        self.submission_package = SubmissionPackage(self)
        self.create_request_for_disqualified_sub_aliquots()
        self.create_trasfer_script_file()

        # check for errors and write the final log entry for the request
        if self.error.exist():
            _str = 'Processing of the current request was finished with the following errors: {}\n'\
                .format(self.error.get_errors_to_str())
            self.logger.error(_str)
        else:
            _str = 'Processing of the current request was finished successfully.\n'
            self.logger.info(_str)

    def load_assay_conf(self, assay, project):
        assay_cfg_path = gc.CONFIG_FILE_ASSAY.replace('{project}', project)
        cfg_assay = ConfigData(assay_cfg_path)
        assay_config = cfg_assay.get_value(assay.upper())
        if assay_config:
            self.logger.info(
                'Configuration for the {} assay was loaded from the assay config file: {}. '
                .format(assay.upper(), assay_cfg_path))
        else:
            _str = 'Configuration for the {} assay CANNOT be loaded from the assay config file: {}. ' \
                   'Aborting execution.'.format(assay.upper(), assay_cfg_path)
            self.logger.error(_str)
            self.error.add_error(_str)
        return assay_config

    # def update_cfg_assay_with_location_details(self, project, cfg_assay):
    #     cfg_assay_location = ConfigData(gc.CONFIG_FILE_ASSAY_LOCATION.replace('{project}', project))
    #     if cfg_assay_location.loaded:
    #         self.logger.info('Local config file "{}" was loaded and being used.'.format(cfg_assay_location.cfg_path))
    #         cfg_assay = cm.update_dictionary_matching_keys(cfg_assay, cfg_assay_location.get_whole_dictionary())
    #     else:
    #         _str = 'Local config file "{}" was NOT loaded. Aborting processing of the current request file.'\
    #             .format(cfg_assay_location.cfg_path)
    #         self.logger.error(_str)
    #         self.error.add_error(_str)
    #     return cfg_assay

    def update_cfg_dictionary_with_location_details(self, location_path, project, cfg_to_update):
        cfg_location = ConfigData(location_path.replace('{project}', project))
        if cfg_location.loaded:
            self.logger.info('Local config file "{}" was loaded and being used.'.format(cfg_location.cfg_path))
            cfg_to_update = cm.update_dictionary_matching_keys(
                cfg_to_update, cfg_location.get_whole_dictionary())
        else:
            _str = 'Local config file "{}" was NOT loaded. Aborting processing of the current request file.'\
                .format(cfg_location.cfg_path)
            self.logger.error(_str)
            self.error.add_error(_str)
        return cfg_to_update

    def load_project_config_into_main(self, project):
        # load the project specific "project_config" config file
        cfg_project = ConfigData(gc.CONFIG_FILE_PROJECT.replace('{project}', project))
        if cfg_project.loaded:
            # if cfg_project was loaded, update it with the environment specific settings (from the project_location config)
            cfg_project_updated = self.update_cfg_dictionary_with_location_details(
                gc.CONFIG_FILE_PROJECT_LOCATION, self.project, cfg_project.get_whole_dictionary())
            # update the main config with the outcome of the previous updates
            self.conf_main.update(cfg_project_updated)

    def create_trasfer_script_file(self):
        self.logger.info("Start preparing transfer_script.sh file.")
        # path for the script file being created
        sf_path = Path(self.submission_package.submission_dir + "/transfer_script.sh")

        # get the script file template
        with open('scripts/' + self.project + '/transfer_script.sh', 'r') as ft:
            scr_tmpl = ft.read()

        # update placeholders in the script with the actual values
        smtp_server = cm.get_environment_variable(
            self.conf_main.get_item_by_key('Email/smtp_server_env_name'))
        smtp_port = cm.get_environment_variable(
            self.conf_main.get_item_by_key('Email/smtp_server_port_env_name'))
        scr_tmpl = cm.replace_value_in_string(
            scr_tmpl, "{!smtp!}", smtp_server + ":" + str(smtp_port))
        scr_tmpl = cm.replace_value_in_string(
            scr_tmpl, "{!to_email!}", ','.join(self.conf_main.get_value("Email/sent_to_emails")))
        scr_tmpl = cm.replace_value_in_string(
            scr_tmpl, "{!from_email!}", self.conf_main.get_value("Email/default_from_email"))
        scr_tmpl = cm.replace_value_in_string(
            scr_tmpl, "{!send_email_flag!}", str(self.conf_main.get_value("Email/send_emails")))
        scr_tmpl = cm.replace_value_in_string(
            scr_tmpl, "{!cmd!}", self.conf_main.get_value("DataTransfer/transfer_command"))
        # the following are utilized if a mount point is being used by the transfer script (i.e. for Peerless)
        scr_tmpl = cm.replace_value_in_string(
            scr_tmpl, "{!mp_cmd!}", self.conf_main.get_value("DataTransfer/mount_point_command"))
        scr_tmpl = cm.replace_value_in_string(
            scr_tmpl, "{!mount_local_dir!}", self.conf_main.get_value("DataTransfer/mount_local_dir"))
        scr_tmpl = cm.replace_value_in_string(
            scr_tmpl, "{!mount_remote_dir!}", self.conf_main.get_value("DataTransfer/mount_remote_dir"))
        scr_tmpl = cm.replace_value_in_string(
            scr_tmpl, "{!source_dir!}", self.submission_package.submission_dir)
        scr_tmpl = cm.replace_value_in_string(
            scr_tmpl, "{!target_dir!}", self.conf_main.get_value("DataTransfer/remote_target_dir"))
        ssh_server = cm.get_environment_variable(
            self.conf_main.get_item_by_key('DataTransfer/ssh_server_env_name'))
        scr_tmpl = cm.replace_value_in_string(scr_tmpl, "{!ssh_server!}", str(ssh_server))
        # apply the user name as the very last replacement, since it can be used as part of previous replacements
        ssh_user = cm.get_environment_variable(
            self.conf_main.get_item_by_key('DataTransfer/ssh_user_env_name'))
        scr_tmpl = cm.replace_value_in_string(scr_tmpl, "{!ssh_user!}", str(ssh_user))

        set_permissions = False
        set_perm_value = self.conf_main.get_value("DataTransfer/exec_permis")
        if set_perm_value:
            try:
                exec_permission = eval(set_perm_value.strip())
                set_permissions = True
            except Exception as ex:
                _str = 'Unexpected error "{}" occurred during evaluating the "DataTransfer/exec_permis" value ' \
                       '"{}" retrieved from the main config file. Permission setup operation will be skipped. \n{} '\
                    .format(ex, set_perm_value, traceback.format_exc())
                self.logger.warning(_str)
                set_permissions = False

        with open(sf_path, "w") as sf:
            sf.write(scr_tmpl)

        if set_permissions:
            try:
                # if permissions to be set were retrieved from the config file, set them here
                st = os.stat(sf_path)
                os.chmod(sf_path, st.st_mode | exec_permission)  # i.e. stat.S_IXUSR
            except Exception as ex:
                _str = 'Unexpected error "{}" occurred during setting up permissions "{}" for the script file ' \
                       '"{}". \n{} '\
                    .format(ex, set_perm_value, sf_path, traceback.format_exc())
                self.logger.warning(_str)
                self.error.add_error(_str)
        else:
            _str = 'Permission setup was skipped for the transfer script file. ' \
                   'Note: value of "DataTransfer/exec_permis" from the main config was set to "{}".'\
                .format(set_perm_value)
            self.logger.warning(_str)
        self.logger.info("Finish preparing '{}' file.".format(sf_path))

    def disqualify_sub_aliquot(self, sa, details):
        # adds a sub-aliquot to the dictionary of disqualified sub_aliquots;
        # key = sub-aliquot, value = array of details for disqualification; one entry can have multiple detail reasons
        if sa in self.disqualified_sub_aliquots.keys():
            self.disqualified_sub_aliquots[sa].append(details)
        else:
            self.disqualified_sub_aliquots[sa] = [details]
        self.logger.warning(
            'Sub-aliquot "{}" was disqualified with the following details: "{}"'.format(sa, details))

    def populate_qualified_aliquots(self):
        # reset the qualified_aliquots array
        self.qualified_aliquots = []
        # select only aliquots that were not disqualified
        for sa, a in zip(self.sub_aliquots, self.aliquots):
            if sa not in self.disqualified_sub_aliquots.keys():
                self.qualified_aliquots.append(a)

    def create_request_for_disqualified_sub_aliquots(self):
        # proceed only if some disqualified sub-aliquots are present
        if self.disqualified_sub_aliquots:
            self.logger.info(
                "Start preparing a request file for disqualified sub-aliquots '{}'."
                .format([val for val in self.disqualified_sub_aliquots.keys()]))
            wb = xlwt.Workbook()  # create an empty workbook object
            sh = wb.add_sheet('Submission_Request')  # sheet name cannot be longer than 32 characters
            cur_row = 0  # first row for the 0-based array
            cur_col = 0  # first col for the 0-based array

            # write headers to the file
            headers = self.get_headers()
            for val in headers:
                sh.write(cur_row, cur_col, val)
                cur_col += 1
            cur_row += 1

            for sa in self.sub_aliquots:
                if sa in self.disqualified_sub_aliquots.keys():
                    sh.write(cur_row, 0, self.project)
                    sh.write(cur_row, 1, self.bulk_location)
                    sh.write(cur_row, 2, self.assay)
                    sh.write(cur_row, 3, self.center)
                    sh.write(cur_row, 4, sa)
                    cur_row += 1

            self.disqualified_request_path = Path(
                gc.DISQUALIFIED_REQUESTS + '/' +
                time.strftime("%Y%m%d_%H%M%S", time.localtime()) +
                '_reprocess_disqualified_' + Path(self.filename).stem + '.xls')
            # if the DISQUALIFIED_REQUESTS folder does not exist, it will be created
            os.makedirs(gc.DISQUALIFIED_REQUESTS, exist_ok=True)
            wb.save(str(self.disqualified_request_path))
            self.logger.info(
                "Successfully prepared the request file for disqualified sub-aliquots and saved in '{}'."
                .format(str(self.disqualified_request_path)))
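
# Hypothetical driver, mirroring how process_submission() below uses the class;
# assumes the repo's Request class is importable and m_cfg is a loaded main
# config object (path is illustrative only):
#
# req = Request('requests/2021_submission.xlsx', m_cfg)
# if req.loaded:
#     req.load_request_configuration()
#     if not req.error.exist():
#         req.process_request()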
import os
import getpass
import traceback
from pathlib import Path

from utils import Monitor
from utils import ConfigData, common as cm, common2 as cm2, global_const as gc, send_yagmail  # , send_email as email

# if executed by itself, do the following
if __name__ == '__main__':
    gc.CURRENT_PROCCESS_LOG_ID = 'monitor_file'
    # load the main config file and get required values
    m_cfg = ConfigData(gc.MAIN_CONFIG_FILE)

    # setup the application level logger
    cur_dir = Path(os.path.dirname(os.path.abspath(__file__)))
    mlog, log_handler = cm.setup_logger(m_cfg, cur_dir, gc.CURRENT_PROCCESS_LOG_ID)

    monitor_path = m_cfg.get_value('Location/monitor_configs')

    # verify that the target directory (monitor_path) is accessible for the current user (under which the app is running);
    # identify the user under which the app is running if monitor_path is not accessible
    if not os.path.exists(monitor_path):
        _str = 'Directory "{}" does not exist or is not accessible for the current user. Aborting execution. ' \
               'Expected user login: "******", Effective user: "******"'\
            .format(monitor_path, os.getlogin(), getpass.getuser())
        mlog.error(_str)
        # send a notification email alerting about the error case
        email_subject = 'Error occurred during running file_monitoring tool.'
        email_body = 'The following error caused interruption of execution of the application<br/>' \
                     + str(Path(os.path.abspath(__file__))) \
                     + '<br/><br/><font color="red">' \
                     + _str + '</font>'
        try:
class MappingFileText(File):
    def __init__(self, filepath, conf_source, log_obj, file_type=None, file_delim=None):
        # setup default parameters
        if file_type is None:
            file_type = 1
        if file_delim is None:
            file_delim = ','  # '\t'
        File.__init__(self, filepath, file_type, file_delim)
        self.conf_src = ConfigData('', conf_source)
        self.logger = log_obj
        self.map = {}  # dict where key is an aliquot id and value is the list of matched file paths

        # set file properties before loading it
        self.file_delim = self.conf_src.get_value('file_delim') \
            if self.conf_src.get_value('file_delim') else self.file_delim
        self.header_row_num = self.conf_src.get_value('header_row_num') \
            if self.conf_src.get_value('header_row_num') else self.header_row_num

        # load the file
        self.get_file_content()

    def load_map(self, data_loc):
        disqualify = None
        aliquot_id_col_num = self.conf_src.get_value('aliquot_id_column_num')
        template_fields_col_num = self.conf_src.get_value('template_fields_col_num')
        file_path = self.conf_src.get_value('file_path_template')

        if aliquot_id_col_num is None:
            disqualify = ('' if disqualify is None else disqualify + '| ')
            disqualify = disqualify + 'Expected map file\'s configuration parameter "aliquot_id_col_num" was not provided.'
        if template_fields_col_num is None:
            disqualify = ('' if disqualify is None else disqualify + '| ')
            disqualify = disqualify + 'Expected map file\'s configuration parameter "template_fields_col_num" was not provided.'
        if file_path is None:
            disqualify = ('' if disqualify is None else disqualify + '| ')
            disqualify = disqualify + 'Expected map file\'s configuration parameter "file_path_template" was not provided.'
        # type checks are guarded with "is not None" so that a missing parameter is reported only once above
        if aliquot_id_col_num is not None and not isinstance(aliquot_id_col_num, int):
            disqualify = ('' if disqualify is None else disqualify + '| ')
            disqualify = disqualify + 'Non-integer value was provided for the map file\'s "aliquot_id_col_num" parameter.'
        if template_fields_col_num is not None:
            for entry in template_fields_col_num:
                if not isinstance(template_fields_col_num[entry], int):
                    disqualify = ('' if disqualify is None else disqualify + '| ')
                    disqualify = disqualify + 'Non-integer value was provided for the map file\'s {} parameter.'.format(entry)

        if disqualify is None:
            row_num = 0
            for row in self.lineList:
                row_num += 1
                if row_num <= self.header_row_num:
                    continue
                cur_aliquot_id = row[aliquot_id_col_num - 1]
                cur_fields = copy.deepcopy(template_fields_col_num)
                cur_raw_file_path = file_path
                # combine the path of the data file for the current row of the mapping file
                for fld_name in cur_fields:
                    fld_val = row[cur_fields[fld_name] - 1]
                    cur_raw_file_path = cur_raw_file_path.replace('{' + fld_name + '}', fld_val)
                files = glob.glob(str(Path(data_loc) / cur_raw_file_path))
                if files:
                    for file in files:
                        if cur_aliquot_id not in self.map:
                            self.map[cur_aliquot_id] = []
                        self.map[cur_aliquot_id].append(file)
        return disqualify
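
# Shape of MappingFileText.map after a successful load_map() call (ids and
# paths below are hypothetical); one aliquot id can map to several data files:
#
# {
#     'AB1234': ['raw/AB1234_run1.csv', 'raw/AB1234_run2.csv'],
#     'AB1235': ['raw/AB1235_run1.csv'],
# }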
def process_submission():
    # load the main config file
    m_cfg = ConfigData(gc.CONFIG_FILE_MAIN)
    if not m_cfg.loaded:
        print('Specified main config file ({}) was not loaded. Aborting execution.'
              .format(gc.CONFIG_FILE_MAIN))
        return 1
    # load the location config file (with local values specific for the location)
    cfg_location = ConfigData(gc.CONFIG_FILE_LOCATION)
    if not cfg_location.loaded:
        print('Specified location config file ({}) was not loaded. Aborting execution.'
              .format(gc.CONFIG_FILE_LOCATION))
        return 1
    # if both configs were loaded, update the main config with the location config
    m_cfg.update(cfg_location.get_whole_dictionary())

    # assign values
    common_logger_name = gc.MAIN_LOG_NAME  # m_cfg.get_value('Logging/main_log_name')

    # get path configuration values
    logging_level = m_cfg.get_value('Logging/main_log_level')

    # path to the folder where all new request files will be posted
    requests_loc = m_cfg.get_value('Location/requests')

    gc.DISQUALIFIED_REQUESTS = m_cfg.get_value('Location/requests_disqualified')

    # get path configuration values and save them to the global_const module
    # path to the folder where all application level log files will be stored (one file per run)
    gc.APP_LOG_DIR = m_cfg.get_value('Location/app_logs')
    # path to the folder where all log files for processing request files will be stored (one file per request)
    gc.REQ_LOG_DIR = m_cfg.get_value('Location/request_logs')
    # path to the folder where all processed (and renamed) requests will be stored
    gc.REQ_PROCESSED_DIR = m_cfg.get_value('Location/requests_processed')
    # path to the folder where created submission packages will be located. One package sub_folder per request.
    # gc.OUTPUT_PACKAGES_DIR = m_cfg.get_value('Location/output_packages')
    # tarball approach to be used for the current deployment
    gc.TARBALL_APPROACH = m_cfg.get_value('Tar_ball/approach')
    # flag to save the calculated md5sum to a physical file
    gc.TARBALL_SAVE_MD5SUM_FILE = m_cfg.get_value('Tar_ball/save_md5sum_file')
    # tarball ignore directories
    ignore_dirs = m_cfg.get_value('Tar_ball/ignore_dirs')
    if ignore_dirs:
        # update the default ignore_dirs value with the value from the config file
        gc.TARBALL_IGNORE_DIRS = ignore_dirs

    log_folder_name = gc.APP_LOG_DIR
    processed_folder_name = gc.REQ_PROCESSED_DIR

    prj_wrkdir = os.path.dirname(os.path.abspath(__file__))

    email_msgs = []
    email_attchms = []
    transfers = []

    requests_path = Path(requests_loc)

    # get the current location of the script and create the Log folder;
    # if a relative path was provided, convert it to an absolute path based on the application working dir
    if not os.path.isabs(log_folder_name):
        logdir = Path(prj_wrkdir) / log_folder_name
    else:
        logdir = Path(log_folder_name)
    lg_filename = time.strftime("%Y%m%d_%H%M%S", time.localtime()) + '.log'

    lg = setup_logger_common(common_logger_name, logging_level, logdir, lg_filename)
    mlog = lg['logger']
    log_warnings = False

    mlog.info('Start processing submission requests in "{}"'.format(requests_path))

    try:
        (_, _, requests) = next(walk(requests_path))
        mlog.info('Submission requests to be processed (count = {}): {}'
                  .format(len(requests), requests))

        req_proc_cnt = 0
        errors_present = 'OK'
        req_path = ''

        # '~$' filters out the temp file created when Excel has a file open
        requests = [file for file in requests if not file.startswith('~$')]

        for req_file in requests:
            if req_file.endswith(('xlsx', 'xls')):
                req_path = Path(requests_path) / req_file
                # set default transfer details
                transfer_details = {
                    'transfer_path': '',
                    'request_file': req_file,
                    'process_handler': None,
                    'return_code': None,
                    'return_status': None
                }
                try:
                    mlog.info('Request file {} was selected for processing.'.format(req_path))
                    # save the timestamp of the beginning of the file processing
                    ts = time.strftime("%Y%m%d_%H%M%S", time.localtime())

                    req_obj = Request(req_path, m_cfg)

                    if req_obj and req_obj.loaded:
                        # proceed with processing the request
                        mlog.info('Submission request loading status: Success. Submission request file: "{}".'
                                  .format(req_path))
                        mlog.info('Loading local and project related configs for processing the request.')
                        req_obj.load_request_configuration()
                        if not req_obj.error.exist():
                            mlog.info('Local config files were loaded with no errors, proceeding to process '
                                      'the request file.')
                            req_obj.process_request()
                        else:
                            mlog.info('Errors were reported during loading local config files. Aborting processing '
                                      'this request.')

                    mlog.info('Processing of Submission request was finished for {}'.format(req_path))
                    req_proc_cnt += 1

                    if hasattr(req_obj.logger, '_cache'):  # verify that the _cache attribute is present
                        # check if any warnings were recorded to the log file and set the log_warnings flag
                        if 30 in req_obj.logger._cache and req_obj.logger._cache[30]:
                            log_warnings = True
                    else:
                        mlog.warning('The current logger object has no "_cache" attribute - thus cannot determine '
                                     'if any Warnings were reported during the process.')

                    # identify if any errors were reported and set the status variable accordingly
                    if not req_obj.error.exist():
                        if not req_obj.disqualified_sub_aliquots:
                            # no disqualified sub-aliquots present
                            if not log_warnings:
                                fl_status = 'OK'
                            else:
                                fl_status = 'OK with Warnings'
                            _str = 'Processing status: "{}". Submission Request: {}'.format(fl_status, req_path)
                        else:
                            # some disqualified sub-aliquots are present
                            fl_status = 'OK with Disqualifications'
                            _str = 'Processing status: "{}". Submission Request: {}'.format(fl_status, req_path)
                            if not errors_present == 'ERROR':
                                errors_present = 'DISQUALIFY'
                    else:
                        fl_status = 'ERROR'
                        _str = 'Processing status: "{}". Check processing log file for this request: {}' \
                            .format(fl_status, req_obj.logger.handlers[0])
                        errors_present = 'ERROR'

                    if fl_status == "OK":
                        mlog.info(_str)
                        # if transfer on completion was requested through the command line argument
                        if gc.TRANSFER_ON_COMPLETION:
                            # update the transfer details dictionary with the path to the transfer file
                            transfer_details['transfer_path'] = \
                                Path(req_obj.submission_package.submission_dir) / 'transfer_script.sh'
                            transfers.append(transfer_details)  # add transfer details to the transfers list
                            mlog.info('Since the last request was processed with "{}" status and transfer on '
                                      'completion was requested ("--execute_transfer" argument was set to "yes"), '
                                      'the following path was put in queue for execution: {}'
                                      .format(fl_status, transfer_details['transfer_path']))
                    else:
                        mlog.warning(_str)
                        # if transfer on completion was requested through the command line argument
                        if gc.TRANSFER_ON_COMPLETION:
                            mlog.info('The transfer on completion request ("--execute_transfer" argument was set to '
                                      '"yes") will be ignored since the last request was processed with "{}" status.'
                                      .format(fl_status))

                    processed_dir = Path(processed_folder_name)
                    req_processed_name = ts + '_' + fl_status + '_' + req_file
                    file_name_new_path = cm.move_file_to_processed(
                        req_path, req_processed_name, processed_dir, req_obj.logger, req_obj.error)
                    if file_name_new_path:
                        mlog.info('Processed Submission request "{}" was moved and renamed as: "{}"'
                                  .format(req_path, processed_dir / req_processed_name))
                    else:
                        mlog.warning('Moving the processed request "{}" was not successful due to some errors '
                                     'reported in the request\'s log file {}.'
                                     .format(req_path, req_obj.log_handler.baseFilename))

                    # deactivate the current Request logger
                    deactivate_logger_common(req_obj.logger, req_obj.log_handler)

                    if req_obj.submission_package and req_obj.submission_package.submission_dir:
                        # save the transfer path to a local variable
                        transfer_path = Path(req_obj.submission_package.submission_dir) / 'transfer_script.sh'
                    else:
                        transfer_path = None

                    # preps for email notification
                    email_msgs.append((
                        '-------------------------------------<br/>'
                        'Requested project: {}'.format(req_obj.project)
                        + '<br/>Requested Experiment: {}.'.format(req_obj.experiment_id)
                        + ('<br/>Request file <br/>{} <br/> was processed and moved/renamed to <br/> {}.'
                           .format(req_path, processed_dir / req_processed_name)
                           if file_name_new_path
                           else '<br/> Request file <br/>{} <br/> was processed but <font color="red">NOT moved due '
                                'to some errors</font> reported in the request\'s log file.'.format(req_path))
                        + '<br/> <b>Errors summary:</b> {}'
                          '<br/> <b>Warning(s) reported:</b> {}'
                          '<br/> <i>Log file location: <br/>{}</i>'
                          '<br/> Submission package location:<br/>{}'
                          '<br/> Data source location:<br/>{}'
                          '<br/> Processed Aliquots:<br/>{}'
                          '<br/> Disqualified Aliquots (if present, see the log file for more details):<br/>{}'
                          '<br/> A request file for re-processing Disqualified Aliquots was prepared in:<br/>{}'
                          '<br/> Automatic data transferring: {}'
                          '<br/> Command line to run data transferring manually: <br/> {}'
                          ''.format(
                              '<font color="red">Check Errors in the log file.</font>'
                              if req_obj.error.exist() else '<font color="green">No Errors</font> ',
                              '<font color="red">Yes - check the log file.</font>' if log_warnings else 'No',
                              req_obj.log_handler.baseFilename,
                              req_obj.submission_package.submission_dir if req_obj.submission_package else 'N/A',
                              req_obj.attachments.data_loc if req_obj.attachments else 'N/A',
                              req_obj.qualified_aliquots if req_obj.qualified_aliquots else 'None',
                              [val for val in req_obj.disqualified_sub_aliquots.keys()]
                              if req_obj.disqualified_sub_aliquots else 'None',
                              req_obj.disqualified_request_path,
                              '<font color="green">Performed.</font> '
                              'Additional email should be sent upon data transfer completion.'
                              if len(str(transfer_details['transfer_path']).strip()) > 0 else 'Not performed.',
                              str(Path(req_obj.submission_package.submission_dir) / 'transfer_script.sh')
                              if req_obj.submission_package else 'N/A')))
                    email_attchms.append(req_obj.log_handler.baseFilename)

                    req_obj = None
                except Exception as ex:
                    # report an error to the log file and proceed to the next file
                    mlog.error('Error "{}" occurred during processing file: {}\n{} '
                               .format(ex, req_path, traceback.format_exc()))
                    raise

        mlog.info('Number of processed Submission requests = {}'.format(req_proc_cnt))

        if req_proc_cnt > 0:
            # collect final details and send an email about the results
            email_subject = 'processing of Submission Requests '
            if errors_present == 'OK':
                if not log_warnings:
                    email_subject = 'SUCCESSFUL ' + email_subject
                else:
                    email_subject = 'SUCCESSFUL (with Warnings) ' + email_subject
            elif errors_present == 'DISQUALIFY':
                email_subject = 'SUCCESSFUL (with disqualifications) ' + email_subject
            else:
                email_subject = 'ERROR(s) present during ' + email_subject

            email_body = ('Number of requests processed: {}.'.format(req_proc_cnt)
                          + '<br/><br/>' + '<br/><br/>'.join(email_msgs))

            try:
                if m_cfg.get_value('Email/send_emails'):
                    email.send_yagmail(
                        emails_to=m_cfg.get_value('Email/sent_to_emails'),
                        subject=email_subject,
                        message=email_body,
                        main_conf=m_cfg
                        # adding attachments is commented out, since some log files go over the 25GB limit
                        # and fail email sending
                        # , attachment_path=email_attchms
                    )
            except Exception as ex:
                # report an unexpected error during sending emails to the log file and continue
                _str = 'Unexpected Error "{}" occurred during an attempt to send email upon ' \
                       'finishing processing "{}" study: {}\n{} ' \
                    .format(ex, req_path, os.path.abspath(__file__), traceback.format_exc())
                mlog.critical(_str)

        # perform transfers, if anything qualifies for it
        if transfers and len(transfers) > 0:
            transfer_status_checking_delay = m_cfg.get_value('General/transfer_status_checking_delay')
            if transfer_status_checking_delay and str(transfer_status_checking_delay).isnumeric():
                if transfer_status_checking_delay > 0:
                    pass
                else:
                    transfer_status_checking_delay = None
            else:
                transfer_status_checking_delay = None

            mlog.info('Starting processing requested transfers. Total count: {} transfers.'
                      .format(len(transfers)))
            # process all collected transfer requests
            cm.process_transfers(transfers, mlog, transfer_status_checking_delay)

            # assess the results of the transfer processing
            transfer_ok = 0
            transfer_err = 0
            transfer_nd = 0
            for transfer in transfers:
                if transfer['return_status']:
                    if transfer['return_status'][:2] == 'OK':
                        transfer_ok += 1
                    elif transfer['return_status'][:5] == 'ERROR':
                        transfer_err += 1
                    else:
                        transfer_nd += 1
                else:
                    transfer_nd += 1
            _str = 'Finish processing transfers with the following statuses: "OK" - {} transfer(s), "ERROR" - {} ' \
                   'transfer(s)'.format(transfer_ok, transfer_err)
            if transfer_nd > 0:
                _str = _str + ', "ND" - {}'.format(transfer_nd)
            mlog.info(_str)

            # send an email with the status of the transfers
            if transfer_err > 0:
                email_subject = 'Errors produced during automated transfer(s) of prepared Submission Request(s)'
            else:
                email_subject = 'Completion of automated transfer(s) of prepared Submission Request(s)'

            email_transfer_msgs = []
            for transfer in transfers:
                email_transfer_msgs.append(
                    ('Transfer process for the request file: "{}" '
                     '<br/>Transfer script file:<br/>{}'
                     '<br/>Completion status:<br/>{}'
                     .format(transfer['request_file'], transfer['transfer_path'],
                             transfer['return_status'])))

            email_body = ('Summary of transfer of prepared submissions:'
                          '<br/>Total count of completed transfers: {}. '
                          '<br/>Status "OK": {} transfer(s)'
                          '<br/>Status "ERROR": {} transfer(s)'
                          '<br/>Status "Not Defined": {} transfer(s)'
                          '<br/><br/>The following are details for each performed transfer:'
                          '<br/><br/>'.format(
                              len(transfers),
                              '<font color="green">' + str(transfer_ok) + '</font>'
                              if transfer_ok > 0 else transfer_ok,
                              '<font color="red">' + str(transfer_err) + '</font>'
                              if transfer_err > 0 else transfer_err,
                              transfer_nd)
                          + '<br/><br/>'.join(email_transfer_msgs))
            try:
                if m_cfg.get_value('Email/send_emails'):
                    email.send_yagmail(
                        emails_to=m_cfg.get_value('Email/sent_to_emails'),
                        subject=email_subject,
                        message=email_body,
                        main_conf=m_cfg)
            except Exception as ex:
                # report an unexpected error during sending emails to the log file and continue
                _str = 'Unexpected Error "{}" occurred during an attempt to send email upon ' \
                       'finishing automated transfers. \n{} '\
                    .format(ex, traceback.format_exc())
                mlog.critical(_str)

    except Exception as ex:
        # report an unexpected error to the log file
        _str = 'Unexpected Error "{}" occurred during processing file: {}\n{} ' \
            .format(ex, os.path.abspath(__file__), traceback.format_exc())
        mlog.critical(_str)
        raise

    sys.exit()
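
# Shape of one entry in the `transfers` list above; 'process_handler',
# 'return_code' and 'return_status' start as None and are expected to be
# filled in by cm.process_transfers() (values below are hypothetical):
#
# {
#     'transfer_path': '/packages/req1/transfer_script.sh',
#     'request_file': 'req1.xlsx',
#     'process_handler': None,
#     'return_code': 0,
#     'return_status': 'OK: transfer completed',
# }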
class SubmissionForm: def __init__(self, form_name, request, sub_aliquot, aliquot, sample, form_file_name_id=None): self.form_name = form_name if not form_file_name_id: form_file_name_id = form_name self.form_file_name_id = form_file_name_id self.req_obj = request # reference to the current request object self.sub_aliquot = sub_aliquot self.aliquot = aliquot self.sample = sample self.error = self.req_obj.error self.logger = self.req_obj.logger self.conf_assay = request.conf_assay self.fl_json = None self.fl_json_schema = None self.fl_cfg_common = None self.fl_cfg_assay = None # self.fl_cfg_dict = None self.prepare_form(form_name) def prepare_form(self, form_name): forms_location = Path(gc.SUBMISSION_FORMS_DIR + '/' + form_name + '/' + self.req_obj.project) # identify paths for json and config (yaml) files fl_path_json_common = forms_location / (form_name + '.json') fl_path_json_assay = forms_location / ( form_name + '_' + str(self.req_obj.assay).lower() + '.json') fl_path_json_schema = forms_location / (form_name + '_schema.json') fl_path_cfg_common = forms_location / (form_name + '.yaml') # fl_path_json_common = Path(gc.SUBMISSION_FORMS_DIR + '/' + form_name + '/' + form_name + '.json') # fl_path_json_assay = Path(gc.SUBMISSION_FORMS_DIR + '/' + form_name + '/' + form_name + '_' + # str(self.req_obj.assay).lower() + '.json') # fl_path_json_schema = Path(gc.SUBMISSION_FORMS_DIR + '/' + form_name + '/' + form_name + '_schema.json') # fl_path_cfg_common = Path(gc.SUBMISSION_FORMS_DIR + '/' + form_name + '/' + form_name + '.yaml') # check the value assigned to the current request's data_source_forms_assignment # and select assay config file accordingly if self.req_obj.data_source_forms_assignment == 'file': fl_path_cfg_assay = forms_location / ( form_name + '_' + str(self.req_obj.assay).lower() + '.yaml') elif self.req_obj.data_source_forms_assignment == 'db': fl_path_cfg_assay = forms_location / ( form_name + '_' + str(self.req_obj.assay).lower() + '_db.yaml') else: # data_source_forms_assignment = 'db' will be treated as a default assignment fl_path_cfg_assay = forms_location / ( form_name + '_' + str(self.req_obj.assay).lower() + '.yaml') # check if assay specific json exists; if yes - use it, if not - use common one if cm.file_exists(fl_path_json_assay): fl_path_json = fl_path_json_assay else: fl_path_json = fl_path_json_common # load json and config files self.fl_json = FileJson(fl_path_json, self.req_obj.error, self.req_obj.logger) self.fl_json_schema = FileJson(fl_path_json_schema, self.req_obj.error, self.req_obj.logger) self.fl_cfg_common = ConfigData(fl_path_cfg_common) self.fl_cfg_assay = ConfigData(fl_path_cfg_assay) # self.fl_cfg_dict = ConfigData(gc.CONFIG_FILE_DICTIONARY) # print(self.fl_json.json_data) # loop through all json keys and fill those with associated data self.get_json_keys(self.fl_json.json_data) # print(self.fl_json.json_data) # validate final json file against json schema (if present) self.validate_json(self.fl_json, self.fl_json_schema) def get_json_keys(self, json_node, parent_keys=''): for key, val in json_node.items(): # TODO: add functionality to handle JSON arrays (if those are needed) if isinstance(val, dict): if parent_keys: cur_parents = '/'.join([parent_keys, key]) else: cur_parents = key self.get_json_keys(val, cur_parents) else: if parent_keys: full_key_name = '/'.join([parent_keys, key]) else: full_key_name = key # json_node[key] = 'print("{}")'.format(full_key_name) # json_node[key] = eval(json_node[key]) # print("JSON file - {} : 
{}".format(full_key_name, val)) # val # json_node[key] # print("Config Common - {} = {}".format(key, self.fl_cfg_common.get_value(key))) # print("Config Assay - {} = {}".format(key, self.fl_cfg_assay.get_value(key))) val = self.eval_cfg_value( full_key_name, self.fl_cfg_assay.get_value(full_key_name), self.fl_cfg_common.get_value(full_key_name)) if str(val).strip() == '': # if returned value is blank, create a warning in the log file self.logger.warning( 'Blank value was reported for field "{}" '.format( full_key_name)) # check if the assigned value is a special expected blank value that don't need to be reported in log if str(val).strip( ) == gc.SUBMISSION_FORM_EXPECTED_BLANK_VALUE: # '!!blank!!' json_node[key] = '' self.logger.info( 'Field "{}" was assigned with the expected blank ("") value' .format(key)) else: # assign retrieved key back to associated json key json_node[key] = val self.logger.info( 'Field "{}" was assigned with "{}" value'.format( key, val)) # print(key, '==>', json_node[key]) pass def eval_cfg_value(self, key, assay_cfg_val, common_cfg_val): # if assay config key is not provided, use common assay val if assay_cfg_val: cfg_val = assay_cfg_val else: cfg_val = common_cfg_val eval_flag = gc.SUBMISSION_YAML_EVAL_FLAG # 'eval!' # check if some configuration instruction/key was retrieved for the given "key" if cfg_val: if eval_flag in str(cfg_val): cfg_val = cfg_val.replace(eval_flag, '') # replace 'eval!' flag key try: out_val = eval(cfg_val) except Exception as ex: _str = 'Error "{}" occurred during preparing submission form "{}" for sub-aliquot "{}" ' \ 'while attempting to interpret configuration key "{}" provided for the form\'s key ' \ '"{}". \n{} ' \ .format(ex, self.form_name, self.sub_aliquot, cfg_val, key, traceback.format_exc()) self.logger.error(_str) self.error.add_error(_str) out_val = '' else: out_val = cfg_val else: # requested "key" does not exist neither in assay or common config files _str = 'No value was assigned to "{}" key during preparing submission form "{}" for sub-aliquot "{}".' 
\ .format(key, self.form_name, self.sub_aliquot) self.logger.warning(_str) out_val = '' return out_val def get_tarball_property(self, sa, val_type): value = '' if self.req_obj.attachments: tar_obj = self.req_obj.attachments.aliquots_tarball_dict[sa] if tar_obj: if val_type == 'name': value = os.path.basename(tar_obj['path']) elif val_type == 'md5': value = tar_obj['md5'] return value # it will retrieve any existing property_val from the request object def get_request_value(self, property_name, check_dict=False): return self.get_property_value_from_object(self.req_obj, property_name, check_dict) # it will retrieve any existing property_val from the submission_form object def get_submission_form_value(self, property_name, check_dict=False): return self.get_property_value_from_object(self, property_name, check_dict) # it will retrieve any existing property_val from rawdata object def get_rawdata_value(self, property_name, check_dict=False): # return self.get_property_value_from_object(self.req_obj.raw_data.aliquots_data_dict[self.sub_aliquot], # property_name, check_dict, 'dict') return self.get_sourcedata_value('rawdata', property_name, check_dict) # it will retrieve any existing property_val from assay data object def get_assaydata_value_by_col_number(self, col_num, check_dict=False): # obj = list(self.req_obj.assay_data.aliquots_data_dict[self.sub_aliquot].items()) # val = self.get_property_value_from_object(obj, col_num - 1, check_dict, 'dict', 'number') # if isinstance(val, tuple): # return val[1] # else: # return val return self.get_sourcedata_value_by_col_number('assaydata', col_num, check_dict) # it will retrieve any existing property_val from assay data object def get_assaydata_value(self, property_name, check_dict=False): # return self.get_property_value_from_object(self.req_obj.assay_data.aliquots_data_dict[self.sub_aliquot], # property_name, check_dict, 'dict') return self.get_sourcedata_value('assaydata', property_name, check_dict) # it will retrieve any existing property_val (specified by the name) from the data source object # specified by the data_source_name def get_sourcedata_value(self, data_source_name, property_name, check_dict=False): if data_source_name in self.req_obj.data_source_names: return self.get_property_value_from_object( self.req_obj.data_source_objects[data_source_name]. aliquots_data_dict[self.sub_aliquot], property_name, check_dict, 'dict') else: _str = 'Data source name ({}) requested during populating json submission form "{}" for aliquot id ' \ '"{}" does not exists for the current assay.'.format(data_source_name, self.form_name, self.aliquot) self.logger.error(_str) self.error.add_error(_str) return '#ERROR#' # it will retrieve any existing property_val (specified by the column number) from the data source object # specified by the data_source_name def get_sourcedata_value_by_col_number(self, data_source_name, col_num, check_dict=False): if data_source_name in self.req_obj.data_source_names: obj = list(self.req_obj.data_source_objects[data_source_name]. 
aliquots_data_dict[self.sub_aliquot].items()) val = self.get_property_value_from_object(obj, col_num - 1, check_dict, 'dict', 'number') if isinstance(val, tuple): return val[1] else: return val else: _str = 'Data source name ({}) requested while populating json submission form "{}" for aliquot id ' \ '"{}" does not exist for the current assay.'.format(data_source_name, self.form_name, self.aliquot) self.logger.error(_str) self.error.add_error(_str) return '#ERROR#'
# retrieves the value of the property named in the "property_val" parameter # from the object passed as a reference in the "obj" parameter # obj_type possible values: "class" (type of "obj" is class), # "dict" (type of "obj" is dictionary) # property_type possible values: "name" ("property_val" is the name of the property), # "number" ("property_val" is the index of an item in the dictionary's items list) # noinspection PyUnusedLocal def get_property_value_from_object(self, obj, property_val, check_dict=False, obj_type='class', property_type='name'): property_val = str(property_val) if property_type == 'name': # if a property name is given, proceed here if obj_type == 'class': get_item = 'obj.' + property_val + ' if hasattr(obj, "' + property_val + '") else ""' elif obj_type == 'dict': get_item = 'obj["' + property_val + '"] if "' + property_val + '" in obj else ""' else: get_item = None else: # if a column number is given, proceed here get_item = 'obj[' + property_val + ']' try: out = eval(get_item) if check_dict: out = cm2.get_dict_value(out, property_val) except Exception as ex: _str = 'Error "{}" occurred while preparing submission form "{}" for sub-aliquot "{}" ' \ 'during an attempt to evaluate property_val: "{}". \n{} ' \ .format(ex, self.form_name, self.sub_aliquot, get_item, traceback.format_exc()) self.logger.error(_str) self.error.add_error(_str) out = '' return out
# converts an array of values (i.e. a list of aliquots) into a list of dictionaries with a given key name # For example: [1, 2, 3] => [{name: 1}, {name: 2}, {name: 3}] @staticmethod def convert_simple_list_to_list_of_dict(sm_arr, key_name): out = [] for a in sm_arr: dict_ob = {key_name: a} out.append(dict_ob) return out
def validate_json(self, json_file, schema_file): try: validate(json_file.json_data, schema_file.json_data) _str = 'Validation of "{}" against "{}" was successful.'.format( json_file.filepath, schema_file.filepath) self.logger.info(_str) except jsonschema.exceptions.ValidationError as ve: _str = 'Validation of "{}" file against schema "{}" failed with the following error: \n{}' \ .format(json_file.filepath, schema_file.filepath, ve) self.logger.error(_str) self.error.add_error(_str)
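# Illustrative sketch (not part of the original module): get_property_value_from_object
# above builds a small expression string and eval()s it. An eval-free equivalent of its
# "name" lookup mode, shown only to document the lookup semantics; the helper name
# below is hypothetical:
def _example_property_lookup(obj, property_val, obj_type='class'):
    # attribute lookup for class instances, key lookup for dictionaries;
    # mirrors the '' fallback used by get_property_value_from_object above
    if obj_type == 'class':
        return getattr(obj, property_val, '')
    if obj_type == 'dict':
        return obj.get(property_val, '')
    return ''
# For reference, convert_simple_list_to_list_of_dict([1, 2, 3], 'name')
# returns [{'name': 1}, {'name': 2}, {'name': 3}].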
class Inquiry(File): def __init__(self, filepath, conf_main=None, file_type=2, sheet_name=''): # load_configuration (main_cfg_obj) # load global and local configurations File.__init__(self, filepath, file_type) self.sheet_name = sheet_name # .strip() if conf_main: self.conf_main = conf_main else: self.conf_main = ConfigData(gc.CONFIG_FILE_MAIN) self.error = InquiryError(self) self.log_handler = None self.logger = self.setup_logger(self.wrkdir, self.filename) self.logger.info( 'Start working with Download Inquiry file {}'.format(filepath)) self.inq_match_arr = [] self.columns_arr = [] self.inq_sources = {} self.inq_line_sources = {} # load the dictionary config that is common for all programs self.conf_dict = DictConfigData(gc.CONFIG_FILE_DICTIONARY) if not self.conf_dict.loaded: # disqualify the current inquiry file _str = 'Aborting processing of the inquiry file - the following common dictionary config file cannot ' \ 'be loaded: {}.'.format(gc.CONFIG_FILE_DICTIONARY) self.error.add_error(_str) self.logger.error(_str) return # save the inquiry file structure into dedicated variables self.file_structure_by_col_num = self.conf_dict.get_inqury_file_structure( 'by_col_num') self.file_structure_by_col_name = self.conf_dict.get_inqury_file_structure( 'by_col_name') self.processed_folder = gc.INQUIRY_PROCESSED_DIR # if a relative path is provided, convert it to the absolute address based on the application working dir if not os.path.isabs(self.processed_folder): self.processed_folder = Path(self.wrkdir) / self.processed_folder else: self.processed_folder = Path(self.processed_folder) self.download_request_path = None self.disqualified_items = {} self.disqualified_inquiry_path = '' # will store the path to an inquiry file with disqualified sub-aliquots if not self.sheet_name or len(self.sheet_name) == 0: # if sheet name was not passed as a parameter, try to get it from config file self.sheet_name = gc.INQUIRY_EXCEL_WK_SHEET_NAME # 'wk_sheet_name' # print (self.sheet_name) self.logger.info('Data will be loaded from worksheet: "{}"'.format( self.sheet_name)) self.conf_process_entity = None self.db_access = DBAccess(self.logger, self.conf_main, self.error) self.get_file_content()
def get_file_content(self): if not self.columns_arr or not self.lines_arr: self.columns_arr = [] self.lines_arr = [] if cm.file_exists(self.filepath): self.logger.debug('Loading file content of "{}"'.format( self.filepath)) with xlrd.open_workbook(self.filepath) as wb: if not self.sheet_name or len(self.sheet_name) == 0: # by default retrieve the first sheet in the excel file sheet = wb.sheet_by_index(0) else: # if sheet name was provided sheets = wb.sheet_names() # get list of all sheets if self.sheet_name in sheets: # if the given sheet name is in the list of available sheets, load that sheet sheet = wb.sheet_by_name(self.sheet_name) else: # report an error if the given sheet name is not in the list of available sheets _str = ( 'Given worksheet name "{}" was not found in the file "{}". ' 'Verify that the worksheet name exists in the file.'
).format(self.sheet_name, self.filepath) self.error.add_error(_str) self.logger.error(_str) self.lines_arr = None self.loaded = False return self.lines_arr sheet.cell_value(0, 0) lines = [] # will hold the content of the inquiry file as an array of arrays (rows) columns = [] for i in range(sheet.ncols): column = [] for j in range(sheet.nrows): if i == 0: lines.append( [] ) # adds an array for each new row in the inquiry file # print(sheet.cell_value(i, j)) cell = sheet.cell(j, i) cell_value = cell.value # take care of numbers and dates received from Excel and converted to float by default if cell.ctype == 2 and int(cell_value) == cell_value: # the value is a whole number cell_value = str(int(cell_value)) elif cell.ctype == 2: # the value is a float cell_value = str(cell_value) # convert the date back to a human-readable date format # print ('cell_value = {}'.format(cell_value)) if cell.ctype == 3: cell_value_date = xlrd.xldate_as_datetime( cell_value, wb.datemode) cell_value = cell_value_date.strftime( "%Y-%m-%d") column.append( cell_value ) # adds value to the current column array # lines[j].append('"' + cell_value + '"') # adds value in "csv" format for a current row lines[j].append(cell_value) # self.columns_arr.append(','.join(column)) columns.append( column) # adds a column to a list of columns # populate lines_arr and columns_arr properties self.lines_arr = lines self.columns_arr = columns # populate lineList value as required for the base class self.lineList = [] for ln in lines: self.lineList.append(','.join(str(item) for item in ln)) wb.unload_sheet(sheet.name) # perform validation of the current inquiry file self.validate_inquiry_file() if self.error.exist(): # report that errors exist self.loaded = False # print(self.error.count) # print(self.error.get_errors_to_str()) _str = 'Errors ({}) were identified while validating the inquiry.
\nError(s): {}'.format( self.error.count, self.error.get_errors_to_str()) else: self.loaded = True else: _str = 'Loading content of the file "{}" failed since the file does not appear to exist".'.format( self.filepath) self.error.add_error(_str) self.logger.error(_str) self.columns_arr = None self.lines_arr = None self.loaded = False return self.lineList def validate_inquiry_file(self): self.logger.info( 'Start validating the current inquiry file "{}".'.format( self.filepath)) row_count = 1 failed_cnt = 0 valid_aliquot_flag = self.conf_main.get_value( 'Validate/aliquot_id_vs_manifest') valid_inquiry_values_flag = self.conf_main.get_value( 'Validate/inquiry_values_vs_dictionary') inquiry_min_number_columns = self.conf_main.get_value( 'Validate/inquiry_min_number_columns') inquiry_validate_number_columns = self.conf_main.get_value( 'Validate/inquiry_validate_number_columns') if not inquiry_min_number_columns or not isinstance( inquiry_min_number_columns, int): inquiry_min_number_columns = 6 # set a default value if it is not provided in the config file if not inquiry_validate_number_columns or not isinstance( inquiry_validate_number_columns, int): inquiry_validate_number_columns = 6 # set a default value if it is not provided in the config file for row in self.lines_arr: if row_count == self.header_row_num: # 1 # skip the first column as it is a header row_count += 1 continue sub_al = 'ND' # set blank value as default assay = '' # set blank value as default valid_aliquot_performed = False skip_final_check = False # check if inquiry file contain min number of columns if len(row) < inquiry_min_number_columns: # disqualify the current inquiry file _str = 'The current inquiry file has {} columns while {} are expected and will be disqualified.' \ .format(len(row), inquiry_min_number_columns) self.error.add_error(_str) self.logger.error(_str) return # create a local DictConfigData object and copy there a dictionary object conf_dict = DictConfigData(None, self.conf_dict.get_dictionary_copy()) # get sub-aliquot value before looping through all fields, so it can be used for reporting errors # also get program_code assigned to the row program_code = self.get_inquiry_value_by_field_name( 'program_code', row) sub_al = self.get_inquiry_value_by_field_name( 'sub-aliquot', row, False) # validate program_code value if conf_dict.key_exists_in_dict( str(program_code).lower(), 'program_code'): # update dictionary config (conf_dict) with the program specific settings loaded into conf_dict_program conf_dict_program_path = gc.CONFIG_FILE_DICTIONARY_PROGRAM\ .replace('{program}', conf_dict.get_dict_value(str(program_code).lower(), 'program_code')) conf_dict_program = ConfigData(conf_dict_program_path) conf_dict.update(conf_dict_program.get_whole_dictionary()) else: _str = 'Unexpected value "{}" was provided for "program_code" (line #{})' \ .format(program_code, row_count) self.logger.critical(_str) # disqualify an inquiry file row, if unexpected value was provided self.disqualify_inquiry_item(sub_al, _str, row) failed_cnt += 1 skip_final_check = True if not skip_final_check: # go through fields and validate the provided values for i in range(len(row)): if i + 1 > inquiry_validate_number_columns: # if number of columns in the inquiry file > expected maximum, exit the loop break col_category = conf_dict.get_dict_value( str(i + 1), 'inquiry_file_structure') if col_category in ('program_code', 'sub-aliquot'): # no checking is needed for the listed field, proceed further continue elif col_category == 'db_center_id': # 
get center id value and validate it db_center_id = row[i] # validate center_code or center_id value self.logger.info( 'Start validation of center value "{}" provided for the current row' .format(db_center_id)) db = DBAccess(self.logger, self.conf_main, self.error) # create DBAccess object db.open_connection() # test center value assuming center code was provided dataset = db.validate_center_code( db_center_id, program_code, 'code', 'code') _str_err_out1, center_id_out1 = self.check_validation_dataset_outcome( dataset, 'center_id', 'center_code') if center_id_out1: # center id was returned, meaning center was validated fine db_center_id = center_id_out1 else: # if center code was not validated at first attempt, validate it assuming the center id was given dataset = db.validate_center_code( db_center_id, program_code, 'id', 'code') _str_err_out2, center_id_out2 = self.check_validation_dataset_outcome( dataset, 'center_id', 'center_id') if center_id_out2: # center id was validated at the 2nd attempt, ignore the 1st validation attempt db_center_id = center_id_out2 else: # center validation attempts failed, report both failures _str = 'Provided center value cannot be interpreted neither as code nor id; ' \ 'here are both validation outcomes: ' + \ ' | '.join([_str_err_out1, _str_err_out2]) self.logger.warning(_str) self.disqualify_inquiry_item(sub_al, _str, row) failed_cnt += 1 skip_final_check = True break # if the aliquot validation is required, validate the sub-aliquot value using the db_center_id value if valid_aliquot_flag: # aliquot id validation is required valid_aliquot_performed = True # flag that aliquot validation was done if isinstance(db_center_id, int): # db_center_id.isnumeric(): # since center is numeric, proceed here # get aliquot id based on the verified earlier assay value and given sub_aliquot id aliquot = conf_dict.convert_sub_aliq_to_aliquot( sub_al, assay) valid_status, valid_desc = self.db_access.validate_aliquot_id( aliquot, db_center_id) if valid_status != 'OK': # disqualify an inquiry file row, if returned status is not OK _str = 'No match was found for the aliquot id "{}" (row #{}) in the manifest dataset ' \ 'of the database. DB response => Status: "{}"; Description: "{}".'\ .format(aliquot, row_count, valid_status, valid_desc) self.logger.warning(_str) self.disqualify_inquiry_item( sub_al, _str, row) failed_cnt += 1 skip_final_check = True break else: # report unexpected center id value _str = 'Unexpected value "{}" was provided for "db_center_id" (line #{}, column #{}). This is a ' \ 'critical error because this value is required (based on the configuration setting ' \ '"Validate/aliquot_id_vs_manifest") to validate the provided aliquot id "{}"' \ .format(db_center_id, row_count, i + 1, sub_al) self.logger.warning(_str) # disqualify an inquiry file row, if unexpected value was provided self.disqualify_inquiry_item(sub_al, _str, row) failed_cnt += 1 skip_final_check = True # break else: self.logger.info( 'Validating of the provided aliquot_id "{}" is not required based on the ' 'value of the config parameter "Validate/aliquot_id_vs_manifest": "{}".' 
.format(sub_al, valid_aliquot_flag)) else: if col_category == 'assay': assay = row[i].strip().lower( ) # save assay value to a dedicated variable if valid_inquiry_values_flag: # if validation of the inquiry values vs dictionary is required validate_values = [] validate_categories = [] if col_category == 'bulk_location': # get inquiry_file_structure_bulk_location value bulk_value_delim = conf_dict.get_dict_value( 'inquiry_file_structure_bulk_location_delim', '') validate_values = str( row[i]).split(bulk_value_delim) validate_categories = conf_dict.get_dict_object( 'inquiry_file_structure_bulk_location', '') else: validate_values.append(str(row[i]).lower()) validate_categories.append(col_category) for vv, vc in zip(validate_values, validate_categories): if not conf_dict.key_exists_in_dict( vv.lower(), vc): if col_category == 'bulk_location': _str = 'Unexpected value "{}" was provided for "{}" as a part of ' \ 'the "bulk_location" value (line #{}, column #{})' \ .format(vv, vc, row_count, i + 1) else: _str = 'Unexpected value "{}" was provided for "{}" (line #{}, column #{})'\ .format(vv, vc, row_count, i+1) self.logger.critical(_str) # disqualify an inquiry file row, if unexpected value was provided self.disqualify_inquiry_item( sub_al, _str, row) failed_cnt += 1 skip_final_check = True break if skip_final_check: break # check that if aliquot validation is required it was actually performed if not skip_final_check: if valid_aliquot_flag and not valid_aliquot_performed: _str = 'Required aliquot validation vs. database manifest was not performed for the current row ' \ '(#{}) and it is considered a disqualification reason (most likely the db_center_id column ' \ 'was not provided). ' \ .format(row_count) self.logger.critical(_str) # disqualify an inquiry file row, if unexpected value was provided self.disqualify_inquiry_item(sub_al, _str, row) failed_cnt += 1 row_count += 1 self.logger.info('Finish validating the inquiry file with{}.'.format( ' no errors' if failed_cnt == 0 else ' errors; {} records were disqualified - see earlier log entries for details' .format(failed_cnt))) def check_validation_dataset_outcome(self, dataset, validation_id_column, validation_id_name): _str_err = '' validation_id_out = None if dataset: for row in dataset: if 'status' in row: status = row['status'] if 'description' in row: description = row['description'] if validation_id_column in row: # center_id validation_id = row[validation_id_column] break # read only first row of the dataset if status == 'OK': # validation was successful validation_id_out = validation_id elif status == 'Failed': # validation has failed _str_err = 'Validation of the provided {} value vs DB has Failed, description: {}'\ .format(validation_id_name, description) else: # unexpected status value was returned _str_err = 'Validation of the provided {} value vs DB returned unexpected status {}'\ .format(validation_id_name, status) else: _str_err = 'Unexpected error was reported during validating {} in the DB. 
' \ 'Check earlier entries in the log file.'\ .format(validation_id_name) return _str_err, validation_id_out def setup_logger(self, wrkdir, filename): # m_cfg = ConfigData(gc.CONFIG_FILE_MAIN) log_folder_name = gc.INQUIRY_LOG_DIR # gc.LOG_FOLDER_NAME # m_logger_name = gc.MAIN_LOG_NAME # m_logger = logging.getLogger(m_logger_name) logger_name = gc.INQUIRY_LOG_NAME logging_level = self.conf_main.get_value('Logging/inquiry_log_level') # if a relative path provided, convert it to the absolute address based on the application working dir if not os.path.isabs(log_folder_name): log_folder_path = Path(wrkdir) / log_folder_name else: log_folder_path = Path(log_folder_name) lg = setup_logger_common( logger_name, logging_level, log_folder_path, # Path(wrkdir) / log_folder_name, str(filename) + '_' + time.strftime("%Y%m%d_%H%M%S", time.localtime()) + '.log') self.log_handler = lg['handler'] return lg['logger'] # function will combine the datasource_id for the current inquiry line # it is possible that different lines will have the same datasource_id and thus can share the datasource def get_inquiry_line_datasource_id(self, inq_line): datasource_id = '' for col in self.file_structure_by_col_name: if col in ['program_code', 'assay', 'source_id']: datasource_id += '|' + self.get_inquiry_value_by_field_name( col, inq_line) elif 'source_' in col: datasource_id += '|' + self.get_inquiry_value_by_field_name( col, inq_line, False) return datasource_id def get_inquiry_value_by_field_name(self, field_name, inq_line, validate_by_dictionary=None): if validate_by_dictionary is None: validate_by_dictionary = True # set default value to True if field_name in self.file_structure_by_col_name: col_num = self.file_structure_by_col_name[field_name] value = inq_line[col_num - 1].strip() else: value = '' # validate the provided program code through the dictionary if validate_by_dictionary: value = self.conf_dict.get_dict_value( str(value).lower(), field_name) return value def process_inquiry_sources(self): cur_row = 0 for inq_line in self.lines_arr: if cur_row == self.header_row_num - 1: # skip the header row cur_row += 1 continue # get program code assigned to the current row program_code = self.get_inquiry_value_by_field_name( 'program_code', inq_line) # get assay assigned to the current row assay = self.get_inquiry_value_by_field_name('assay', inq_line) # get source id assigned to the current row source_id = self.get_inquiry_value_by_field_name( 'source_id', inq_line) # get source config file # 2 values are saved in tuple: program name specific path and default one. 
# if program name specific path does not exist, the default will be used cfg_source_path = ( # configuration path for the current program by name gc.CONFIG_FILE_SOURCE_PATH\ .replace('{program}', program_code)\ .replace('{assay}', assay)\ .replace('{source_id}', source_id), # configuration path for the default program (used if no program specific path is present) gc.CONFIG_FILE_SOURCE_PATH \ .replace('{program}', 'default') \ .replace('{assay}', assay) \ .replace('{source_id}', source_id) ) # get the source location config file path cfg_source_location_path = gc.CONFIG_FILE_SOURCE_LOCATION_PATH.replace( '{source_id}', source_id) # attempt to load configuration for the program specific path cfg_source = ConfigData(Path(cfg_source_path[0])) if not cfg_source.loaded: # if config was not loaded from the program specific path, load the default one cfg_source = ConfigData(Path(cfg_source_path[1])) if cfg_source.loaded: # proceed here if the source config was loaded # load source location config with location specific settings for the current source cfg_source_location = ConfigData( Path(cfg_source_location_path)) if cfg_source_location.loaded: # if the source location config was loaded, update cfg_source config with the source location config cfg_source.update( cfg_source_location.get_whole_dictionary()) # get unique id of the datasource and check if the same id was used already, reuse that in such case inq_line_datasource_id = self.get_inquiry_line_datasource_id( inq_line) self.logger.info( 'Current inquiry row #{} was identified with the following data source id: {}' .format(cur_row, inq_line_datasource_id)) # assign source id (inq_line_datasource_id) to the current inquiry line self.inq_line_sources[cur_row] = inq_line_datasource_id if inq_line_datasource_id in self.inq_sources: # reuse existing datasource self.logger.info( 'Identified data source id for the current inquiry row #{} was identified as ' 'earlier retrieved one (for this or another row) and will be re-used for ' 'the current row.'.format(cur_row)) else: # create a new datasource object inq_line_datasource = DataSource(self, cfg_source, inq_line, inq_line_datasource_id) self.inq_sources[ inq_line_datasource_id] = inq_line_datasource else: sub_al = self.get_inquiry_value_by_field_name( 'sub-aliquot', inq_line, False) _str = 'Datasource config file for the row #{} (sub_aliquot: {}) cannot be loaded. ' \ 'None of the expected to exist files is accessible: {}'\ .format(cur_row, sub_al, ' | '.join(cfg_source_path)) self.logger.warning(_str) self.disqualify_inquiry_item( sub_al, _str, cur_row ) # TODO: verify if inq_line should be used instead of curr_row cur_row += 1 pass def process_inquiry(self): self.process_inquiry_sources() self.match_inquiry_items_to_sources() self.create_download_request_file() self.create_inquiry_file_for_disqualified_entries() # check for errors and put final log entry for the inquiry. 
if self.error.exist(): _str = 'Processing of the current inquiry was finished with the following errors: {}\n'.format( self.error.get_errors_to_str()) self.logger.error(_str) else: _str = 'Processing of the current inquiry was finished successfully.\n' self.logger.info(_str) def match_inquiry_items_to_sources(self): cur_row = -1 for inq_line in self.lines_arr: cur_row += 1 # increase row counter if cur_row == self.header_row_num - 1: continue # program_code = str(inq_line[0]) # get program code that must be a first column program_code = self.get_inquiry_value_by_field_name( 'program_code', inq_line) # create a local DictConfigData object and copy there a dictionary object conf_dict = DictConfigData(None, self.conf_dict.get_dictionary_copy()) # update dictionary config (conf_dict) with the program specific settings loaded into conf_dict_program conf_dict_program_path = gc.CONFIG_FILE_DICTIONARY_PROGRAM.replace( '{program}', program_code) conf_dict_program = ConfigData(conf_dict_program_path) conf_dict.update(conf_dict_program.get_whole_dictionary()) # print (inq_study_path) bulk_location = self.get_inquiry_value_by_field_name( 'bulk_location', inq_line, False) assay = self.get_inquiry_value_by_field_name('assay', inq_line) sub_al = self.get_inquiry_value_by_field_name( 'sub-aliquot', inq_line, False) # inq_study_path = '/'.join([program_code, bulk_location, assay]) inq_study_path = self.conf_main.get_value( 'Destination/study_path_template') inq_study_path = inq_study_path.replace('{program_code}', program_code) inq_study_path = inq_study_path.replace('{bulk_location}', bulk_location) inq_study_path = inq_study_path.replace('{assay}', assay) # check if current sub-aliquot is not part of disqualified items array if self.disqualified_items and sub_al in self.disqualified_items.keys( ): # if sub-aliquot was disqualifed already, skip this line continue # identify aliquot for the given sub-aliquot al = conf_dict.convert_sub_aliq_to_aliquot( sub_al, assay) # identify aliquot for the current inquiry line match = False # get reference to the Datasource object assigned to the current row if cur_row in self.inq_line_sources: cur_source = self.inq_sources[self.inq_line_sources[cur_row]] else: # if the data source was not assigned to the current row, skip the row using this datasource cur_source = None continue # check if any source types were disqualified during loading the datasource if cur_source.disqualified_data_sources: # if at least one source of the datasource was disqualified, skip the row using this datasource # and disqualify the current sub-aliquot as well self.disqualify_inquiry_item( sub_al, 'Datasource associated with this aliquot_id was marked as disqualified.', inq_line) continue # get a copy of the source type ids of the current datasource; # it will track number of items found for each source type cur_source_types = copy.deepcopy(cur_source.source_types) # loop through items of the source for src_item in cur_source.source_content_arr: match_out = False # attempt match by the sub-aliquot match_out, match_details = \ self.is_item_found_soft_match(sub_al, src_item['name'], src_item['soft_comparisions'], sub_al) if match_out: match = True # if sub-aliquot match was not success, attempt to match by the aliquot elif src_item['aliquot_match']: match_out, match_details = \ self.is_item_found_soft_match(al, src_item['name'], src_item['soft_comparisions'], sub_al) if match_out: match = True # if a match was found using one of the above methods, record the item to inq_match_arr if match_out: # 
since a match was found, verify that the source path is accessible (except for web locations) web_loc = src_item['web_location'] # real_path = os.path.realpath(src_item['path']) # real path of the current item if web_loc or not web_loc and os.path.exists( src_item['path']): item_details = { 'sub-aliquot': sub_al, 'study': inq_study_path, # 'source': src_item, 'source_item_name': src_item['name'], 'target_subfolder': src_item['target_subfolder'], 'real_path': src_item['path'], 'target_copied_item_name': src_item['target_copied_item_name'], 'match_details': match_details, 'source_type_id': src_item['source_type_id'], 'obj_type': src_item['obj_type'], 'source_name_generic': cur_source.source_name_generic } self.inq_match_arr.append(item_details) # record the source type id of an item to track quantity of found matches for each source type cur_source_types[ src_item['source_type_id']]['items_count'] += 1 else: self.disqualify_inquiry_item( sub_al, 'A match was found, but the identified source path is not accessible. Match details: {}. ' 'Source path: "{}". Real source path: "{}".'. format(match_details, src_item['path'], src_item['path']), inq_line) # report if no match was found and # verify that a match was found for each of the source types of the current datasource if not match: # no matches were found for the current datasource self.disqualify_inquiry_item( sub_al, 'No matching items (files/folders) were found in the current data source.', inq_line) else: if not cur_source.allow_nomatch_per_sourcetype: # some matches were found; verify that a match was found for each of the source types for src_type in cur_source_types: if cur_source_types[src_type]['items_count'] == 0: # no matches were found for this source type self.disqualify_inquiry_item( sub_al, 'No matches were found for the "{}" source type id in the datasource.' .format(src_type), inq_line) def is_item_found_soft_match(self, srch_item, srch_in_str, soft_match_arr, item_to_be_reported): out = False _str = '' # identify if the search is performed for sub_aliquot (full value) or aliquot (partial value) if srch_item == item_to_be_reported: entity = 'sub-aliquot' else: entity = 'aliquot' soft_match = False self.logger.debug("srch_item = {}| srch_in_str = {}".format( srch_item, srch_in_str)) if srch_item in srch_in_str: out = True self.logger.debug("Exact match found between: {} | {}".format( srch_item, srch_in_str)) else: if soft_match_arr: self.logger.debug("Starting soft match for: {} | {}".format( srch_item, srch_in_str)) for item in soft_match_arr: srch_in_str = srch_in_str.replace(item['find'], item['replace']) srch_item = srch_item.replace(item['find'], item['replace']) self.logger.debug( "Updated for soft match: srch_item = {}| srch_in_str = {}". format(srch_item, srch_in_str)) if srch_item in srch_in_str: out = True soft_match = True self.logger.debug( "Soft match found between: {} | {}".format( srch_item, srch_in_str)) # prepare log entry if out: _str = str('Loose' if soft_match else 'Exact') + \ ' match was ' + \ 'found for {} item "{}". 
Match values are as following: "{}" and "{}".'\ .format(entity, item_to_be_reported, srch_item, srch_in_str) # log outcome of the match process, the "soft" match will logged as warning if out: if entity == 'aliquot': # if match was found by aliquot (partial id value), always report it as "warning" self.logger.warning(_str) else: # proceed here if match was found by sub-aliquot (full id value) if soft_match: self.logger.warning(_str) else: self.logger.info(_str) # prepare match details to output from this function match_type = '' if soft_match: # this was a soft match if entity == 'aliquot': match_type = 'loose/aliquot' else: match_type = 'loose' else: # this was an exact match if entity == 'aliquot': match_type = 'exact/aliquot' else: match_type = 'exact' out_details = {'match_type': match_type, 'details': _str} return out, out_details def create_download_request_file(self): self.logger.info("Start preparing download_request file.") # path for the script file being created rf_path = Path(gc.OUTPUT_REQUESTS_DIR + "/" + time.strftime("%Y%m%d_%H%M%S", time.localtime()) + '_' + self.filename.replace(' ', '') + '.tsv') self.download_request_path = rf_path if not self.inq_match_arr: self.logger.warning( 'No inquiries with matched datasources exists for the current inquiry file. ' 'Skipping creating a download request file.') return with open(rf_path, "w") as rf: # write headers to the file headers = '\t'.join([ 'Source', 'Destination', 'Aliquot_id', 'Obj_Type', 'Target_Item_Name' ]) rf.write(headers + '\n') for item in self.inq_match_arr: src_path = item['real_path'] # item['source']['path'] #prepare values for the current inquiry row to put into the outcome file # project_path = self.conf_process_entity.get_value('Destination/location/project_path') bulk_data_path = self.conf_main.get_value( 'Destination/bulk_data_path') study_path = item['study'] target_subfolder = item[ 'target_subfolder'] # item['source']['target_subfolder'] sub_aliquot = item['sub-aliquot'] obj_type = item['obj_type'] target_copied_item_name = item['target_copied_item_name'] # check if current sub-aliquot is not part of disqualified items array if self.disqualified_items and sub_aliquot in self.disqualified_items.keys( ): # if sub-aliquot was disqualifed already, skip this line continue # get template for the destination path and replace placeholders with values # "{project_path}/{study_path}/{target_subfolder}" dest_path = self.conf_main.get_value( 'Destination/path_template') dest_path = dest_path.replace('{bulk_data_path}', bulk_data_path) dest_path = dest_path.replace('{study_path}', study_path) dest_path = dest_path.replace('{target_subfolder}', target_subfolder) line = '\t'.join([ str(src_path), str(Path(dest_path)), str(sub_aliquot), str(obj_type), target_copied_item_name ]) rf.write(line + '\n') self.logger.info( "Finish preparing download_request file '{}'.".format(rf_path)) def disqualify_inquiry_item(self, sa, disqualify_status, inquiry_item): # adds a sub aliquots to the dictionary of disqualified items # key = sub-aliquot, values: dictionary with 2 values: # 'status' - reason for disqualification # 'inquiry_item: array of values for inquiry row from an inquiry file details = {'status': disqualify_status, 'inquiry_item': inquiry_item} if not sa in self.disqualified_items: self.disqualified_items[sa] = details self.logger.warning( 'Sub-aliquot "{}" was disqualified with the following status: "{}"' .format(sa, disqualify_status)) else: self.logger.warning( 'Sub-aliquot "{}" was already disqualified earlier. 
' 'The following disqualification call will be ignored: "{}"'.format(sa, disqualify_status))
def create_inquiry_file_for_disqualified_entries(self): if self.disqualified_items: self.logger.info( "Start preparing inquiry file for disqualified sub-aliquots.") # create the workbook that will hold the disqualified rows wb = xlwt.Workbook() # create empty workbook object sh = wb.add_sheet( 'Re-process_inquiry' ) # an Excel sheet name cannot be longer than 31 characters cur_row = 0 # first row for 0-based array cur_col = 0 # first col for 0-based array # write headers to the file headers = self.lines_arr[0] for val in headers: sh.write(cur_row, cur_col, val) cur_col += 1 cur_row += 1 for di in self.disqualified_items: fields = self.disqualified_items[di]['inquiry_item'] cur_col = 0 for val in fields: sh.write(cur_row, cur_col, val) cur_col += 1 cur_row += 1 if not os.path.isabs(gc.DISQUALIFIED_INQUIRIES): disq_dir = Path(self.wrkdir) / gc.DISQUALIFIED_INQUIRIES else: disq_dir = Path(gc.DISQUALIFIED_INQUIRIES) # if the DISQUALIFIED_INQUIRIES folder does not exist, it will be created os.makedirs(disq_dir, exist_ok=True) # identify the path for the disqualified inquiry file self.disqualified_inquiry_path = Path( str(disq_dir) + '/' + time.strftime("%Y%m%d_%H%M%S", time.localtime()) + '_reprocess_disqualified_' + # the .stem method is used to get the file name without an extension Path(self.filename).stem.replace(' ', '') + '.xls') wb.save(str(self.disqualified_inquiry_path)) self.logger.info( "Successfully prepared the inquiry file for disqualified sub-aliquots and saved it in '{}'." .format(str(self.disqualified_inquiry_path)))
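# Illustrative sketch (not part of the original module): get_file_content above
# normalizes xlrd cell values -- numeric cells (ctype 2) are stringified, dropping
# the trailing '.0' from whole numbers, and date cells (ctype 3) are converted via
# the workbook's datemode. The same rule as a standalone helper (the helper name
# is hypothetical):
import xlrd

def _example_normalize_cell(cell, datemode):
    value = cell.value
    if cell.ctype == 2:  # XL_CELL_NUMBER: Excel returns all numbers as floats
        return str(int(value)) if int(value) == value else str(value)
    if cell.ctype == 3:  # XL_CELL_DATE: convert the Excel serial date to text
        return xlrd.xldate_as_datetime(value, datemode).strftime("%Y-%m-%d")
    return value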
class Monitor(): def __init__(self, cfg_monitor_path, log_obj): self.action_completed = False self.status = [] self.mtr_cfg_path = cfg_monitor_path self.log = log_obj self.error = MonitorError(self) self.mtr_cfg = ConfigData(cfg_monitor_path) if self.validate_config_file(): self.loaded = True else: self.loaded = False cur_cfg_dir = os.path.dirname(cfg_monitor_path) cur_cfg_file_name = Path(os.path.abspath(cfg_monitor_path)).name stamp_dir = Path(str(cur_cfg_dir) + '/' + gc.STAMPS_FILES_FOLDER_NAME) if not os.path.exists(stamp_dir): os.mkdir(stamp_dir) stamp_file = Path( str(stamp_dir) + '/' + cur_cfg_file_name.replace('.yaml', '_stamp.yaml')) self.verify_config_stamp_file(stamp_file) self.mtr_cfg_stamp = ConfigData(stamp_file) self.mtr_source = None self.mtr_source_path = None if self.loaded: # get config file values self.mtr_source_dir = Path( cm.eval_cfg_value( self.mtr_cfg.get_value('Location/source_dir'), self.log, None)) self.mtr_source_file = Path( self.mtr_cfg.get_value('Location/source_file')) found_files = cm.find_file_in_dir(self.mtr_source_dir, self.mtr_source_file, False) if found_files: ff_stamp = None # pick the most recently modified of the matched files for file_match in found_files: if not ff_stamp or ff_stamp < os.stat( Path(self.mtr_source_dir) / file_match).st_mtime: ff_stamp = os.stat( Path(self.mtr_source_dir) / file_match).st_mtime self.mtr_source = file_match # self.mtr_source = found_files[0] self.mtr_source_path = Path( self.mtr_source_dir) / self.mtr_source # else: # self.mtr_source = None # self.mtr_source_path = None self.mtr_destin = self.mtr_cfg.get_value('Location/destination') self.mtr_item = self.mtr_cfg.get_value('Monitoring/item') self.mtr_type = self.mtr_cfg.get_value('Monitoring/type') self.mtr_action = self.mtr_cfg.get_value('Monitoring/action') self.mtr_frequency = self.mtr_cfg.get_value('Monitoring/frequency') # self.mtr_email = self.mtr_cfg.get_value('Monitoring/email_notification') # self.mtr_email_cc = self.mtr_cfg.get_value('Monitoring/email_cc') # load stamp info from the stamp config file self.mtr_sync_date = self.mtr_cfg_stamp.get_value( 'Last_sync/date_time') self.mtr_watch_value = self.mtr_cfg_stamp.get_value( 'Last_sync/watch_value')
def verify_config_stamp_file(self, file_path): if not cm.file_exists(file_path): # if the file is not present, create it f = open(file_path, "w+") f.close()
def validate_config_file(self): # TODO: add some rules to validate the current monitoring config file return True
def start_monitor(self): if self.mtr_source_path: next_sync_datetime = None # default value # check if the delay between monitoring events has elapsed if self.mtr_sync_date and str(self.mtr_frequency).isnumeric(): try: # int() handles a frequency value that was loaded from the config as a string next_sync_datetime = datetime.strptime(self.mtr_sync_date, gc.STAMP_DATETIME_FORMAT) + \ timedelta(seconds=int(self.mtr_frequency)) except Exception as ex: # report unexpected error to log file _str = 'Unexpected Error "{}" occurred during calculating next sync datetime. ' \ 'Saved sync date: "{}", sync frequency: "{}"' \ .format(ex, self.mtr_sync_date, self.mtr_frequency) self.status.append(_str) _str = _str + '\n{} '.format(traceback.format_exc()) self.log.error(_str) self.error.add_error(_str) if not next_sync_datetime or next_sync_datetime < datetime.now(): self.log.info( 'Monitoring delay of "{}" seconds has expired since the last synchronization event on {}. 
' 'Proceeding to monitor "{}" file.'.format( self.mtr_frequency if self.mtr_frequency else 'N/A', self.mtr_sync_date if self.mtr_sync_date else 'N/A', self.mtr_source)) custom_action = self.action_copy # set default value if self.mtr_action == 'copy': custom_action = self.action_copy watcher = Watcher( self.mtr_source_path, custom_action, self, self.mtr_watch_value) # self.mtr_item, self.mtr_type) watcher.watch() # start watching # update stats in the config file datetime_stamp = time.strftime(gc.STAMP_DATETIME_FORMAT, time.localtime()) self.mtr_cfg_stamp.set_value(datetime_stamp, 'Last_sync/date_time') self.log.info( 'Datetime information for the monitored file was recorded: Last_sync/date_time: {}' .format(datetime_stamp)) else: _str = 'Monitoring delay of "{}" seconds has not expired since the last synchronization event on {}. '\ .format(self.mtr_frequency if self.mtr_frequency else 'N/A', self.mtr_sync_date if self.mtr_sync_date else 'N/A') self.log.info(_str) self.status.append(_str) else: _str = 'Source file "{}" was not found in the source directory "{}". '\ .format(self.mtr_source_file, self.mtr_source_dir) self.log.warning(_str) self.status.append(_str)
def action_copy(self, file_time_stamp): self.log.info('Start copying "{}" to "{}"'.format( self.mtr_source, self.mtr_destin)) self.new_file_time_stamp = file_time_stamp try: shutil.copy(self.mtr_source_path, self.mtr_destin) _str = 'Copying of "{}" to "{}" completed successfully.'.format( self.mtr_source_path, self.mtr_destin) self.log.info(_str) self.action_completed = True self.status.append(_str) # update stats in the config file self.mtr_cfg_stamp.set_value(file_time_stamp, 'Last_sync/watch_value') self.log.info( 'Stamp information for the just-copied file was recorded: ' 'Last_sync/watch_value: {}'.format(file_time_stamp)) except Exception as ex: # report unexpected error to log file _str = 'Unexpected Error "{}" occurred while copying file "{}" to "{}"\n{} ' \ .format(ex, self.mtr_source, self.mtr_destin, traceback.format_exc()) self.log.error(_str) self.error.add_error(_str) self.status.append(_str)
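# Illustrative sketch (not part of the original module): start_monitor above only
# fires its action when the configured delay has elapsed since the stamp-file
# timestamp. The gating rule, reduced to a standalone function; the fmt default is
# an assumption standing in for gc.STAMP_DATETIME_FORMAT, and the name is hypothetical:
from datetime import datetime, timedelta

def _example_sync_is_due(last_sync, frequency_sec, fmt="%Y-%m-%d %H:%M:%S"):
    # no recorded stamp or no numeric frequency -> the sync is always due
    if not last_sync or not str(frequency_sec).isnumeric():
        return True
    next_sync = datetime.strptime(last_sync, fmt) + timedelta(seconds=int(frequency_sec))
    return next_sync < datetime.now()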
def process_download_inquiries(): # load main config file and get required values m_cfg = ConfigData(gc.CONFIG_FILE_MAIN) if not m_cfg.loaded: print( 'Specified main config file ({}) was not loaded. Aborting execution.' .format(gc.CONFIG_FILE_MAIN)) return 1 # load location config file (with local value specific for the location) cfg_location = ConfigData(gc.CONFIG_FILE_LOCATION) if not cfg_location.loaded: print( 'Specified location config file ({}) was not loaded. Aborting execution.' .format(gc.CONFIG_FILE_LOCATION)) return 1 # if both configs were loaded, update the main config with the location config m_cfg.update(cfg_location.get_whole_dictionary()) # print ('m_cfg = {}'.format(m_cfg.cfg)) # assign values common_logger_name = gc.MAIN_LOG_NAME # m_cfg.get_value('Logging/main_log_name') # get path configuration values logging_level = m_cfg.get_value('Logging/main_log_level') # path to the folder where all new inquiry files will be posted inquiries_loc = m_cfg.get_value('Location/inquiries') gc.DISQUALIFIED_INQUIRIES = m_cfg.get_value( 'Location/inquiries_disqualified') # get path configuration values and save them to global_const module # path to the folder where all application level log files will be stored (one file per run) gc.APP_LOG_DIR = m_cfg.get_value('Location/app_logs') # path to the folder where all log files for processing inquiry files will be stored # (one file per inquiry) gc.INQUIRY_LOG_DIR = m_cfg.get_value('Location/inquiry_logs_relative_path') # path to the folder where all processed (and renamed) inquiries will be stored gc.INQUIRY_PROCESSED_DIR = m_cfg.get_value( 'Location/inquiries_processed_relative_path') # get config setting for the processed_add_datestamp and save it to global const module processed_add_datestamp = m_cfg.get_value( 'Location/processed_add_datestamp') if processed_add_datestamp: gc.PROCESSED_ADD_DATESTAMP = processed_add_datestamp # path to the folder where created submission packages will be located. One package sub_folder per inquiry. 
gc.OUTPUT_REQUESTS_DIR = m_cfg.get_value('Location/output_requests') # path to dir with dynamically created inquiry files for disqualified aliquots gc.DISQUALIFIED_INQUIRIES = m_cfg.get_value( 'Location/inquiries_disqualified_path') log_folder_name = gc.APP_LOG_DIR # gc.LOG_FOLDER_NAME # this variable define if Data Downloader app will be executed at the end of processing inquiries run_data_download = m_cfg.get_value('Execute/run_data_downloader') # path to the Data Downloader tool gc.DATA_DOWNLOADER_PATH = m_cfg.get_value('Location/data_downloader_path') prj_wrkdir = os.path.dirname(os.path.abspath(__file__)) email_msgs = [] # email_attchms = [] inquiries_path = Path(inquiries_loc) # get current location of the script and create Log folder # if a relative path provided, convert it to the absolute address based on the application working dir if not os.path.isabs(log_folder_name): logdir = Path(prj_wrkdir) / log_folder_name else: logdir = Path(log_folder_name) # logdir = Path(prj_wrkdir) / log_folder_name # 'logs' lg_filename = time.strftime("%Y%m%d_%H%M%S", time.localtime()) + '.log' lg = setup_logger_common(common_logger_name, logging_level, logdir, lg_filename) # logging_level mlog = lg['logger'] mlog.info( 'Start processing download inquiries in "{}"'.format(inquiries_path)) try: (root, source_inq_dirs, _) = next(walk(inquiries_path)) inq_proc_cnt = 0 errors_present = 'OK' for inq_dir in source_inq_dirs: source_inquiry_path = Path(root) / inq_dir mlog.info( 'Selected for processing inquiry source: "{}", full path: {}'. format(inq_dir, source_inquiry_path)) (_, _, inq_files) = next(walk(source_inquiry_path)) # filter only excel files for processing as inquiries inquiries = [ fl for fl in inq_files if fl.endswith(('xlsx', 'xls')) ] # filter out temp files (starting with '~$') created when an excel file is open inquiries = [fl for fl in inquiries if not fl.startswith('~$')] mlog.info('Inquiry files presented (count = {}): "{}"'.format( len(inquiries), inquiries)) for inq_file in inquiries: inq_path = Path(source_inquiry_path) / inq_file # email_msgs = [] # email_attchms = [] try: # print('--------->Process file {}'.format(inq_path)) mlog.info('The following Inquiry file was selected: "{}".'. format(inq_path)) # save timestamp of beginning of the file processing ts = time.strftime("%Y%m%d_%H%M%S", time.localtime()) inq_obj = Inquiry(inq_path, m_cfg) if inq_obj and inq_obj.loaded: # proceed processing inquiry mlog.info('Inquiry file was successfully loaded.') mlog.info( 'Starting processing Download Inquiry file: "{}".'. format(inq_path)) inq_obj.process_inquiry() mlog.info( 'Processing of Download Inquiry was finished for {}' .format(inq_path)) inq_proc_cnt += 1 # identify if any errors were identified and set status variable accordingly if not inq_obj.error.exist(): if not inq_obj.disqualified_items: # no disqualified sub-aliquots present fl_status = 'OK' _str = 'Processing status: "{}". Download Inquiry: {}'.format( fl_status, inq_path) # errors_present = 'OK' # this variable is set to OK by default, no update needed else: # some disqualified sub-aliquots are presetn fl_status = 'OK_with_Disqualifications' _str = 'Processing status: "{}". Download Inquiry: {}'.format( fl_status, inq_path) if not errors_present == 'ERROR': errors_present = 'DISQUALIFY' else: fl_status = 'ERROR' _str = 'Processing status: "{}". 
Check processing log file for this inquiry: {}' \ .format(fl_status, inq_obj.logger.handlers[0]) errors_present = 'ERROR' if fl_status == "OK": mlog.info(_str) else: mlog.warning(_str) processed_dir = inq_obj.processed_folder # 'Processed' # combine the name of the processed file inq_processed_name = fl_status + '_' + str( inq_file).replace(' ', '_').replace('__', '_') if gc.PROCESSED_ADD_DATESTAMP: inq_processed_name = ts + '_' + inq_processed_name # move processed files to the Processed folder fl_processed_name = cm.move_file_to_processed( inq_path, inq_processed_name, processed_dir, inq_obj.logger, inq_obj.error) if fl_processed_name: mlog.info( 'Processed file "{}" was moved(renamed) to: "{}"'. format(inq_path, processed_dir / fl_processed_name)) else: errors_present = errors_present + '|MoveProcessedError' mlog.warning( 'Moving the processed file "{}" was not successful due to some errors ' 'reported in the request\'s log file {}.'.format( inq_path, inq_obj.log_handler.baseFilename)) # preps for email notification # create a dictionary to feed into the template for preparing an email body template_feeder = { 'file_num': inq_proc_cnt, 'file_path': str(inq_path), 'file_path_new': (str(processed_dir / fl_processed_name) if processed_dir and fl_processed_name else None), 'inq_obj_errors_cnt': inq_obj.error.count, 'log_file_path': inq_obj.log_handler.baseFilename, 'dld_request_file_path': str(inq_obj.download_request_path), 'inq_sources': inq_obj.inq_sources, 'inq_match_aliquots': inq_obj.inq_match_arr, 'inq_disqul_aliquots': inq_obj.disqualified_items, 'inq_disqul_reprocess_path': str(inq_obj.disqualified_inquiry_path) } email_body_part = cm.populate_email_template( 'processed_inquiry.html', template_feeder) email_msgs.append(email_body_part) # deactivate the current Inquiry logger deactivate_logger_common(inq_obj.logger, inq_obj.log_handler) inq_obj = None except Exception as ex: # report the error to the log file and re-raise it to stop processing mlog.error( 'Error "{}" occurred during processing file: {}\n{} '. format(ex, inq_path, traceback.format_exc())) raise mlog.info('Number of successfully processed Inquiries = {}'.format( inq_proc_cnt)) # start a Data Download request if the proper config setting was provided dd_status = {'status': '', 'message': ''} if run_data_download: # start process mlog.info( 'Starting asynchronously Data Downloader app: "{}".'.format( gc.DATA_DOWNLOADER_PATH)) try: dd_process = cm.start_external_process_async( gc.DATA_DOWNLOADER_PATH) # check if it is running dd_status = cm.check_external_process(dd_process) mlog.info( 'Status of running Data Downloader app: "{}".'.format( dd_status)) except Exception as ex: # report unexpected error during starting Data Downloader _str = 'Unexpected Error "{}" occurred during an attempt to start Data Downloader app ({})\n{} ' \ .format(ex, gc.DATA_DOWNLOADER_PATH, traceback.format_exc()) mlog.critical(_str) dd_status = {'status': 'Error', 'message': _str} mlog.info('Preparing to send notification email.') email_to = m_cfg.get_value('Email/send_to_emails') email_subject = 'processing of download inquiry. ' if inq_proc_cnt > 0: # inquiries and len(inquiries) > 0: # collect final details and send an email about the results err_present = errors_present.split( '|' ) # get all statuses into an array; the 1st element is the main status if err_present: # set the email subject based on the main status if err_present[0] == 'OK': email_subject = 'SUCCESSFUL ' + email_subject elif err_present[0] == 'DISQUALIFY': email_subject = 'SUCCESSFUL (with disqualifications) ' + email_subject else: email_subject = 'ERROR(s) present during ' + email_subject if len(err_present) > 1: if err_present[1] == 'MoveProcessedError': email_subject = email_subject + ' Error moving inquiry to processed.' if dd_status and 'status' in dd_status.keys( ) and dd_status['status'].lower() == 'error': email_subject = email_subject + ' Errors starting Data Downloader.' # create a dictionary to feed into the template for preparing an email body template_feeder = { 'inq_cnt': inq_proc_cnt, 'run_data_download': run_data_download, 'downloader_path': gc.DATA_DOWNLOADER_PATH, 'downloader_start_status': dd_status['status'].lower(), 'processed_details': '<br/>'.join(email_msgs) } email_body = cm.populate_email_template('processed_inquiries.html', template_feeder) # remove return characters from the body of the email, to keep just clean html code email_body = email_body.replace("\r", "") email_body = email_body.replace("\n", "") # print ('email_subject = {}'.format(email_subject)) # print('email_body = {}'.format(email_body)) mlog.info( 'Sending a status email with subject "{}" to "{}".'.format( email_subject, email_to)) try: if m_cfg.get_value('Email/send_emails'): email.send_yagmail( emails_to=email_to, subject=email_subject, message=email_body # attachments are intentionally not added, since some log files exceed the email attachment size limit and would fail sending # ,attachment_path=email_attchms ) except Exception as ex: # report an unexpected error during sending emails to the log file and continue _str = 'Unexpected Error "{}" occurred during an attempt to send email upon ' \ 'finishing processing "{}" study: {}\n{} ' \ .format(ex, inq_path, os.path.abspath(__file__), traceback.format_exc()) mlog.critical(_str) mlog.info( 'End of processing of download inquiries in "{}".'.format( inquiries_path)) except Exception as ex: # report unexpected error to the log file _str = 'Unexpected Error "{}" occurred during processing file: {}\n{} ' \ .format(ex, os.path.abspath(__file__), traceback.format_exc()) mlog.critical(_str) raise sys.exit()
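# Illustrative sketch (not part of the original module): process_download_inquiries
# above folds per-inquiry outcomes into a pipe-delimited status string and derives
# the email subject prefix from its first element. That mapping as a standalone
# function (subject fragments copied from the code above; the name is hypothetical):
def _example_subject_prefix(errors_present):
    main_status = errors_present.split('|')[0]
    if main_status == 'OK':
        return 'SUCCESSFUL '
    if main_status == 'DISQUALIFY':
        return 'SUCCESSFUL (with disqualifications) '
    return 'ERROR(s) present during '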
def init_specific_settings(self): self.source_name_generic = self.inq_obj.get_inquiry_value_by_field_name( 'source_id', self.inq_line, False) # get configuration of all source location types of the given datasource source_locations = self.conf_process_entity.get_value('sources') self.source_locations = source_locations # get (if present) configuration values to adjust the source path with the current user mountpoint specics path_to_replace = self.conf_process_entity.get_value( 'Location/path_to_replace') path_local_mountpoint = self.conf_process_entity.get_value( 'Location/path_local_mountpoint') map_file_path = self.conf_process_entity.get_value( 'Location/map_file_path') # default search_by parameters from source config file search_by_default = self.conf_process_entity.get_value( 'search_method_default/search_by') search_deep_level_defalult = self.conf_process_entity.get_value( 'search_method_default/search_deep_level_max') exclude_dirs_defalult = self.conf_process_entity.get_value( 'search_method_default/exclude_folders') ext_match_defalult = self.conf_process_entity.get_value( 'search_method_default/file_ext') aliquot_match_default = self.conf_process_entity.get_value( 'search_method_default/aliquot_match') soft_comparisons_default = self.conf_process_entity.get_value( 'soft_comparision') map_file_default = self.conf_process_entity.get_value( 'search_method_default/map_file') # get source main value from the inquiry file for the current row source_main = self.inq_obj.get_inquiry_value_by_field_name( 'source_main', self.inq_line, False) if len(source_main.strip()) == 0: # if source_main was no provided, set flag allow_nomatch_per_sourcetype = True self.allow_nomatch_per_sourcetype = True ds_count = 0 for loc_item in source_locations: ds_count += 1 current_source_type_id = loc_item[ 'source_id'] if 'source_id' in loc_item else '' source_subfolder = loc_item['source_subfolder'] \ if 'source_subfolder' in loc_item and loc_item['source_subfolder'] else '' self.logger.info( 'Start processing data source #{}, source_id: "{}"'.format( ds_count, current_source_type_id)) current_source_id_path = \ self.inq_obj.get_inquiry_value_by_field_name('source_' + current_source_type_id, self.inq_line, False)\ .strip() # if a special path for the current source id was not provided in the inquiry row, use the default one if len(current_source_id_path) == 0: current_source_id_path = source_main + '/' + source_subfolder # check if partial path replacement in the current_source_id_path is required for the current datasource if path_to_replace and path_local_mountpoint: current_source_id_path = str( Path( current_source_id_path.replace(path_to_replace, path_local_mountpoint))) # add each source type id to a source_types dictionary to hold all source types required for this datasource # save path associated with the current source type here as well self.source_types[current_source_type_id] = { 'source_path': current_source_id_path, 'items_count': 0 } # check if a current source has specific search_by parameters, otherwise use default ones src_sm = loc_item[ 'search_method'] if 'search_method' in loc_item.keys( ) else None search_by = src_sm['search_by'] \ if src_sm and 'search_by' in src_sm.keys() else search_by_default search_deep_level = src_sm['search_deep_level_max'] \ if src_sm and 'search_deep_level_max' in src_sm.keys() else search_deep_level_defalult exclude_dirs = src_sm['exclude_folders'] \ if src_sm and 'exclude_folders' in src_sm.keys() else exclude_dirs_defalult ext_match = src_sm['file_ext'] \ if 
src_sm and 'file_ext' in src_sm.keys() else ext_match_defalult soft_comparisons = src_sm['soft_comparision'] \ if src_sm and 'soft_comparision' in src_sm.keys() else soft_comparisons_default aliquot_match = src_sm['aliquot_match'] \ if src_sm and 'aliquot_match' in src_sm.keys() else aliquot_match_default map_file = src_sm['map_file'] \ if src_sm and 'map_file' in src_sm.keys() else map_file_default # if a file with custom soft-comparison rules was supplied as a command-line argument, use it # and overwrite the current value of the soft_comparisons variable if gc.CONFIG_CUSTOM_SOFT_MATCH: custom_soft_match_cfg = ConfigData(gc.CONFIG_CUSTOM_SOFT_MATCH) soft_comparisons = custom_soft_match_cfg.get_value( "soft_comparision") # update map_file's "file_path" variable with the value of "map_file_path" from the local config if map_file: if 'file_path' in map_file: if len(map_file['file_path'].strip() ) == 0 and map_file_path and len( map_file_path.strip()) > 0: map_file['file_path'] = map_file_path.strip() error_on_disqualification = loc_item['report_error_on_disqualification'] \ if 'report_error_on_disqualification' in loc_item.keys() else False web_location = loc_item[ 'web_location'] if 'web_location' in loc_item.keys() else None xpath = loc_item['xpath'] if 'xpath' in loc_item.keys( ) else '/' # default option - start with the root element # make sure that web URLs end with "/"; if not, add the character if web_location and current_source_id_path[-1:] != '/': current_source_id_path += '/' # set a default value for target_subfolder target_subfolder = '' # if a target_subfolder value is provided in the config, get it from there if 'target_subfolder' in loc_item.keys(): target_subfolder = loc_item['target_subfolder'] if loc_item[ 'target_subfolder'] else '' self.logger.info( 'Current data source config details: ' 'source_type_id: "{}", ' 'web_location: "{}", ' 'search_by: "{}", ' 'search_deep_level_max: "{}", ' 'exclude_folders: "{}", ' 'file_ext: "{}", ' 'soft_comparison (loose comparison): "{}", ' 'aliquot_match: "{}", ' 'target_subfolder: "{}", ' 'xpath: "{}"' ''.format(current_source_type_id, (web_location if web_location else False), search_by, (search_deep_level if search_deep_level else 0 if web_location else 'No limit'), exclude_dirs, (ext_match if ext_match else ''), (soft_comparisons if soft_comparisons else ''), aliquot_match, target_subfolder, xpath)) self.logger.info( 'Current data source path: {}'.format(current_source_id_path)) # start processing the current source items = [] disqualify = None if search_by == 'folder_name': if not web_location: items, disqualify = self.get_data_by_folder_name( current_source_id_path, search_deep_level, exclude_dirs) else: items, disqualify = self.get_web_data( current_source_id_path, xpath, exclude_dirs) elif search_by == 'file_name': if not web_location: items, disqualify = self.get_data_by_file_name( current_source_id_path, search_deep_level, exclude_dirs, ext_match) else: items, disqualify = self.get_web_data( current_source_id_path, xpath, exclude_dirs, ext_match) elif search_by == 'map_file': if not web_location: items, disqualify = self.get_data_by_map_file( current_source_id_path, map_file) else: _str = 'Web locations are not currently set to work with the "map_file" ' \ 'search_by configuration parameter' self.logger.warning(_str) disqualify = (loc_item['path'], _str) else: _str = 'Unexpected "search_by" configuration parameter "{}" was provided.'.format( search_by) _str2 = 'Skipping processing of the current source "{}"'.format( current_source_id_path)
            self.logger.warning('{} {}'.format(_str, _str2))
            disqualify = (loc_item['path'], _str)

        if disqualify:
            # if disqualification was reported for the current source location, disqualify it and skip to the next one
            self.disqualify_source(current_source_type_id, disqualify[1], error_on_disqualification)
            continue

        if items and len(items) > 0:
            for item in items:
                # this variable will be filled with a value if the realpath file name does not match
                # the name of the link provided as the file to be retrieved
                target_copied_item_name = ''
                if not web_location:
                    # identify the real path of the item for cases when symlinks are part of the path;
                    # this should be done for all non-web locations.
                    # get values to be searched in the item's path to adapt the provided path to the local mount point
                    path_to_replace = self.conf_process_entity.get_value('Location/path_to_replace')
                    path_local_mountpoint = self.conf_process_entity.get_value('Location/path_local_mountpoint')
                    real_path = os.path.realpath(item['path'])  # real path of the current item
                    if path_to_replace and path_local_mountpoint and path_to_replace != path_local_mountpoint:
                        # check if the real path needs to be adapted to use the local mount point;
                        # loop until the final path no longer contains the string requiring the mountpoint adaptation
                        while path_to_replace in real_path:
                            real_path = real_path.replace(path_to_replace, path_local_mountpoint)
                            real_path = os.path.realpath(real_path)
                    if real_path != item['path']:
                        # if the realpath file/dir name differs from the file/dir name of the given location,
                        # save the given name into the target_copied_item_name variable
                        target_copied_item_name = os.path.basename(item['path'])

                # identify the obj_type of the current item
                if web_location:
                    # the datasource is a web location
                    if search_by == 'folder_name':
                        obj_type = 'dir'
                    elif search_by == 'file_name':
                        obj_type = 'file'
                    else:
                        obj_type = 'UNKNOWN'
                else:
                    # the datasource is a network location
                    if os.path.isfile(real_path):
                        obj_type = 'file'
                    elif os.path.isdir(real_path):
                        obj_type = 'dir'
                    else:
                        obj_type = 'UNKNOWN'

                item_details = {
                    'path': real_path if not web_location else current_source_id_path + item['path'],
                    'name': os.path.basename(item['aliquot_search']) if not web_location else item['aliquot_search'],
                    'target_copied_item_name': target_copied_item_name,
                    'source_type_id': current_source_type_id,
                    'target_subfolder': target_subfolder,
                    'soft_comparisions': soft_comparisons,  # key spelling preserved; consumed elsewhere as-is
                    'aliquot_match': aliquot_match,
                    'search_by': search_by,
                    'obj_type': obj_type,
                    'web_location': web_location
                }
                self.source_content_arr.append(item_details)
        else:
            self.logger.warning('No available files/folders were found in the current source. '
                                'Configuration settings of the source might need to be reviewed.')
        self.logger.info('Processing data source #{} was completed. '
                         'Total number of files/folders available in the source = {}.'
                         .format(ds_count, len(items) if items else 0))
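# For reference, a minimal sketch of the parsed structure that init_specific_settings() above
# expects a single entry of the 'sources' config list to have. The key names mirror the lookups
# performed in the loop; every concrete value below is a hypothetical illustration, not taken
# from any real config file, and optional keys may be omitted (in which case the values from
# search_method_default are used).
example_loc_item = {
    'source_id': 'raw',                           # becomes current_source_type_id
    'source_subfolder': 'raw_files',              # appended to source_main when no per-source path is given
    'report_error_on_disqualification': False,    # disqualifications are warnings unless set to True
    'search_method': {                            # per-source overrides of search_method_default
        'search_by': 'file_name',                 # 'folder_name', 'file_name', or 'map_file'
        'search_deep_level_max': 2,
        'exclude_folders': ['temp'],
        'file_ext': '.csv',
    },
    # 'web_location': True,                       # optional: treat the source as a web location
    # 'xpath': '/',                               # optional: starting element for web sources
    # 'target_subfolder': 'assay_data',           # optional: subfolder used on the target side
}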
class ApiProcess:
    def __init__(self, api_cfg_file, log_obj):
        self.loaded = False
        # set logger object
        self.logger = log_obj
        self.dataset = None
        # set error object
        self.error = ApiError(self)

        self.logger.info('Start processing API call for the following config file: {}'.format(api_cfg_file))

        # load config file for the current api process
        cfg_file_path = gc.CONFIGS_DIR + api_cfg_file
        self.api_cfg = ConfigData(cfg_file_path)
        if not self.api_cfg.loaded:
            _str = 'Cannot load the config file: "{}"'.format(cfg_file_path)
            self.logger.error(_str)
            self.error.add_error(_str)
            return

        # get values from the config file
        self.api_name = self.api_cfg.get_value('API/name')
        self.api_url = self.api_cfg.get_value('API/url')
        self.post_fields = self.api_cfg.get_value('API/post_fields')

        # verify if "eval" is present in any of the post fields and perform the evaluation, if needed
        if self.post_fields:
            for pf in self.post_fields:
                self.post_fields[pf] = cm.eval_cfg_value(self.post_fields[pf], self.logger, self.error)

        # if no errors were generated during init, set loaded = True
        if not self.error.errors_exist():
            self.loaded = True

    def process_api_call(self):
        # perform the actual API call; collect output in api_output and status in errors_reported (True/False)
        api_output, errors_reported = cm.perform_api_call(self.api_url, self.post_fields, self.logger, self.error)

        # check if errors were reported
        if errors_reported:
            # stop processing the API call if an error was reported
            self.logger.warning('Aborting processing the current API call, since errors were reported '
                                '(see earlier entries)')
            return

        # validate the returned dataset
        if api_output and len(api_output.strip()) != 0:
            # proceed with processing the API dataset
            self.dataset = ApiDataset(api_output, self.api_cfg, self.logger, self.error, self.api_name)
            if self.dataset.loaded:
                self.dataset.submit_rows_to_db()
            else:
                self.logger.warning('Application failed to process the API response. See previous log entries '
                                    'for more details. Aborting processing the current API call.')
                return
        else:
            # stop processing the API call if the returned dataset is empty
            self.logger.warning('API call returned an empty dataset, aborting processing the current API call')
            return
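# A usage sketch for ApiProcess, assuming a standard logging.Logger instance and an API config
# file that resolves under gc.CONFIGS_DIR; the file name 'example_api.yaml' is hypothetical and
# only illustrates the calling convention shown by __init__ and process_api_call above.
import logging

logging.basicConfig(level=logging.INFO)
log = logging.getLogger(__name__)

api_process = ApiProcess('example_api.yaml', log)
if api_process.loaded:
    # perform the configured API call and, on success, submit the returned dataset to the DB
    api_process.process_api_call()
else:
    log.error('ApiProcess was not initialized; see earlier log entries for details.')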