import csv
import logging
import os
import subprocess
import sys
import time
from multiprocessing import Pool

import matplotlib.pyplot as plt
import mpwt
import pandas as pa
from padmet.classes import PadmetRef
from padmet.utils.connection import padmet_to_padmet, sbmlGenerator
from padmet.utils.exploration import compare_padmet, dendrogram_reactions_distance
from supervenn import supervenn

# Project-local helpers used below (parse_config_file, analysis_on_group,
# create_padmet_from_pgdb, create_sbml, create_draft, create_output,
# orthogroup_to_sbml, convert_sbml_db, check_create_faa, create_faa_model,
# prot2genome) are assumed to be imported from the surrounding package.

logger = logging.getLogger(__name__)


def run_analysis(run_id, nb_cpu_to_use, pvclust, verbose):
    """Create input data for the creation of reaction dendrograms from the TSV reaction files.

    Args:
        run_id (str): ID of the run
        nb_cpu_to_use (int): number of CPUs for multiprocessing
        pvclust (boolean): also use pvclust to create the reaction dendrogram
        verbose (boolean): verbose
    """
    if verbose:
        print('--- Running analysis step ---')
    analysis_start_time = time.time()

    config_data = parse_config_file(run_id)
    analysis_group_file_path = config_data['analysis_group_file_path']

    # Run the analysis on each group declared in the group_template.tsv file:
    # one row per group, first the group name, then the organism names.
    with open(analysis_group_file_path, 'r') as group_file:
        group_reader = csv.reader(group_file, delimiter='\t')
        for row in group_reader:
            group_name = row[0]
            groups = [org_name for org_name in row[1:] if org_name]
            analysis_on_group(group_name, groups, config_data, pvclust,
                              nb_cpu_to_use, verbose)

    analysis_end_time = (time.time() - analysis_start_time)
    integer_part, decimal_part = str(analysis_end_time).split('.')
    analysis_time = ".".join([integer_part, decimal_part[:3]])

    if verbose:
        print("--- analysis step done in: %ss ---" % analysis_time)
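# Illustrative content of the analysis group file (tab-separated; organism
# names are hypothetical). The first row is expected to be the 'all' group
# covering every studied organism (see run_check below):
#
#     all        org_A   org_B   org_C
#     group_1    org_A   org_B
#     group_2    org_B   org_C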
def run_reconstruction(run_id, nb_cpu_to_use, verbose):
    config_data = parse_config_file(run_id)
    pgdb_from_annotation_path = config_data['pgdb_from_annotation_path']
    studied_organisms_path = config_data['studied_organisms_path']
    log_path = config_data['log_path']

    # For each study, check whether a PGDB folder exists in the PGDBs folder;
    # if it is missing, run Pathway Tools.
    chronoDepart = time.time()
    mpwt.multiprocess_pwt(input_folder=studied_organisms_path,
                          output_folder=pgdb_from_annotation_path,
                          patho_inference=True,
                          dat_creation=True,
                          dat_extraction=True,
                          number_cpu=nb_cpu_to_use,
                          patho_log=log_path,
                          verbose=verbose)
    chrono = (time.time() - chronoDepart)
    partie_entiere, partie_decimale = str(chrono).split('.')
    chrono = ".".join([partie_entiere, partie_decimale[:3]])

    if os.listdir(pgdb_from_annotation_path) == []:
        print('Pathway-Tools inference failed!')
        return

    if verbose:
        print("Pathway-Tools done in: %ss" % chrono)

    create_padmet_sbml_from_pgdb(run_id, nb_cpu_to_use, verbose)
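# Example invocation (sketch; the run ID and CPU count are illustrative):
#
#     run_reconstruction('run_2021', nb_cpu_to_use=4, verbose=True)
#
# mpwt writes one PGDB folder per studied organism into
# pgdb_from_annotation_path; the emptiness check above treats a missing
# output as a failed Pathway Tools inference.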
def create_padmet_sbml_from_pgdb(run_id, nb_cpu_to_use, verbose, veryverbose=None):
    config_data = parse_config_file(run_id)
    padmet_from_annotation_path = config_data['padmet_from_annotation_path']
    study_from_annot_prefix = config_data['study_from_annot_prefix']
    sbml_from_annotation_path = config_data['sbml_from_annotation_path']
    padmet_utils_path = config_data['padmet_utils_path']
    database_path = config_data['database_path']
    pgdb_from_annotation_path = config_data['pgdb_from_annotation_path']

    aucome_pool = Pool(nb_cpu_to_use)

    all_study_name = set(next(os.walk(config_data['studied_organisms_path']))[1])

    all_study_pgdb = dict([(study_name, "{0}/{1}".format(pgdb_from_annotation_path, study_name))
                           if os.path.isdir("{0}/{1}".format(pgdb_from_annotation_path, study_name))
                           else (study_name, '')
                           for study_name in all_study_name])

    study_padmet_data = []
    for study_name in all_study_name:
        padmet_file = "{0}/{1}{2}.padmet".format(padmet_from_annotation_path,
                                                 study_from_annot_prefix, study_name)
        pgdb_folder = all_study_pgdb[study_name]
        tmp_padmet_data = {
            'study_name': study_name,
            'pgdb_folder': pgdb_folder,
            'padmet_utils_path': padmet_utils_path,
            'verbose': verbose,
            'veryverbose': veryverbose,
            'padmet_file': padmet_file,
            'database_path': database_path
        }
        study_padmet_data.append(tmp_padmet_data)
    aucome_pool.map(create_padmet_from_pgdb, study_padmet_data)

    all_study_padmet = dict([(study_name, "{0}/{1}{2}.padmet".format(
                                  padmet_from_annotation_path, study_from_annot_prefix, study_name))
                             if os.path.isfile("{0}/{1}{2}.padmet".format(
                                  padmet_from_annotation_path, study_from_annot_prefix, study_name))
                             else (study_name, '')
                             for study_name in all_study_name])

    study_sbml_data = []
    for study_name in all_study_padmet:
        sbml_file = "{0}/{1}{2}.sbml".format(sbml_from_annotation_path,
                                             study_from_annot_prefix, study_name)
        padmet_file = all_study_padmet[study_name]
        tmp_sbml_data = {
            'sbml_file': sbml_file,
            'padmet_file': padmet_file,
            'padmet_utils_path': padmet_utils_path,
            'study_name': study_name,
            'verbose': verbose,
            'veryverbose': veryverbose
        }
        study_sbml_data.append(tmp_sbml_data)
    aucome_pool.map(create_sbml, study_sbml_data)

    aucome_pool.close()
    aucome_pool.join()
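# Minimal sketch of a Pool.map-compatible worker consuming one of the
# study_sbml_data dicts built above. The real create_sbml lives elsewhere in
# the package; this hypothetical stand-in only makes the expected dict shape
# explicit, reusing padmet's sbmlGenerator call as run_merge does below.
def _sketch_create_sbml(sbml_data):
    # Skip studies whose annotation-based padmet is missing ('' entries).
    if sbml_data['padmet_file']:
        sbmlGenerator.padmet_to_sbml(padmet=sbml_data['padmet_file'],
                                     output=sbml_data['sbml_file'],
                                     verbose=sbml_data['verbose'])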
def run_merge(run_id, nb_cpu_to_use, verbose, veryverbose=None):
    if verbose:
        print('--- Running merge step ---')
    merge_start_time = time.time()

    aucome_pool = Pool(nb_cpu_to_use)

    config_data = parse_config_file(run_id)
    padmet_from_annotation_path = config_data['padmet_from_annotation_path']
    padmet_from_networks_path = config_data['padmet_from_networks_path']
    sbml_from_networks_path = config_data['sbml_from_networks_path']
    database_path = config_data['database_path']
    structural_padmets_path = config_data['structural_padmets_path']
    orthofinder_filtered_path = config_data['orthofinder_filtered_path']
    orthofinder_padmet_path = config_data['orthofinder_padmet_path']
    networks_path = config_data['networks_path']

    structural_padmets = [padmet for padmet in os.listdir(structural_padmets_path)
                          if padmet.endswith('.padmet')]
    orthofinder_filtered_padmets = [padmet for padmet in os.listdir(orthofinder_filtered_path)
                                    if padmet.endswith('.padmet')]
    orthofinder_padmets = [padmet for padmet in os.listdir(orthofinder_padmet_path)
                           if padmet.endswith('.padmet')]
    pathway_tools_padmets = [padmet for padmet in os.listdir(padmet_from_annotation_path)
                             if padmet.endswith('.padmet')]

    # Use the most advanced padmets available: structural > filtered
    # orthofinder > orthofinder > pathway-tools.
    if len(structural_padmets) > 0:
        padmets = [(padmet, structural_padmets_path + '/' + padmet) for padmet in structural_padmets]
    elif len(orthofinder_filtered_padmets) > 0:
        padmets = [(padmet, orthofinder_filtered_path + '/' + padmet) for padmet in orthofinder_filtered_padmets]
    elif len(orthofinder_padmets) > 0:
        padmets = [(padmet, orthofinder_padmet_path + '/' + padmet) for padmet in orthofinder_padmets]
    elif len(pathway_tools_padmets) > 0:
        padmets = [(padmet, padmet_from_annotation_path + '/' + padmet) for padmet in pathway_tools_padmets]
    else:
        sys.exit('No padmets have been created, run reconstruction or workflow.')

    study_draft_data = []
    for study_name, padmet_path in padmets:
        tmp_study_data = {
            'padmet_path': padmet_path,
            'study_padmet': study_name,
            'padmet_from_networks_path': padmet_from_networks_path,
            'sbml_from_networks_path': sbml_from_networks_path,
            'database_path': database_path,
            'verbose': verbose,
            'veryverbose': veryverbose
        }
        study_draft_data.append(tmp_study_data)
    aucome_pool.map(create_output, study_draft_data)

    aucome_pool.close()
    aucome_pool.join()

    padmet_to_padmet.padmet_to_padmet(padmet_from_networks_path,
                                      networks_path + '/panmetabolism.padmet',
                                      verbose=veryverbose)
    sbmlGenerator.padmet_to_sbml(padmet=networks_path + '/panmetabolism.padmet',
                                 output=networks_path + '/panmetabolism.sbml',
                                 verbose=veryverbose)

    merge_end_time = (time.time() - merge_start_time)
    integer_part, decimal_part = str(merge_end_time).split('.')
    merge_time = ".".join([integer_part, decimal_part[:3]])

    if verbose:
        print("--- merge step done in: %ss ---" % merge_time)
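# padmet_to_padmet.padmet_to_padmet is given the folder of per-organism
# .padmet networks and merges them into a single pan-metabolism padmet, from
# which the pan-metabolism SBML is then exported. Expected outputs
# (illustrative placeholder for the configured networks path):
#
#     <networks_path>/panmetabolism.padmet
#     <networks_path>/panmetabolism.sbml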
def run_draft(run_id, nb_cpu_to_use, verbose):
    aucome_pool = Pool(nb_cpu_to_use)

    config_data = parse_config_file(run_id)
    studied_organisms_path = config_data['studied_organisms_path']
    padmet_from_annotation_path = config_data['padmet_from_annotation_path']
    study_from_annot_prefix = config_data['study_from_annot_prefix']
    networks_path = config_data['networks_path']
    orthology_based_path = config_data['orthology_based_path']
    padmet_utils_path = config_data['padmet_utils_path']
    database_path = config_data['database_path']
    padmet_from_networks_path = config_data['padmet_from_networks_path']
    sbml_from_networks_path = config_data['sbml_from_networks_path']

    all_study_name = set(next(os.walk(studied_organisms_path))[1])

    all_study_padmet = dict([(study_name, "{0}/{1}{2}.padmet".format(
                                  padmet_from_annotation_path, study_from_annot_prefix, study_name))
                             if os.path.isfile("{0}/{1}{2}.padmet".format(
                                  padmet_from_annotation_path, study_from_annot_prefix, study_name))
                             else (study_name, '')
                             for study_name in all_study_name])

    study_draft_data = []
    for study_name in all_study_name:
        tmp_study_data = {
            'study_name': study_name,
            'study_padmet': all_study_padmet[study_name],
            'networks_path': networks_path,
            'orthology_based_path': orthology_based_path,
            'padmet_utils_path': padmet_utils_path,
            'database_path': database_path,
            'padmet_from_networks_path': padmet_from_networks_path,
            'sbml_from_networks_path': sbml_from_networks_path,
            'verbose': verbose
        }
        study_draft_data.append(tmp_study_data)
    aucome_pool.map(create_draft, study_draft_data)

    aucome_pool.close()
    aucome_pool.join()
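# Shape of the all_study_padmet mapping built above (organism names are
# illustrative); entries fall back to '' when no annotation-based padmet
# exists for a study:
#
#     {'org_A': '<padmet_from_annotation_path>/<prefix>org_A.padmet',
#      'org_B': ''}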
def run_reconstruction(run_id, nb_cpu_to_use, verbose, veryverbose=None):
    if verbose:
        logger.setLevel(logging.DEBUG)
        logging.getLogger("mpwt").setLevel(logging.DEBUG)
        print('--- Running reconstruction step ---')
    start_time = time.time()

    config_data = parse_config_file(run_id)
    pgdb_from_annotation_path = config_data['pgdb_from_annotation_path']
    studied_organisms_path = config_data['studied_organisms_path']
    log_path = config_data['log_path']

    # mpwt accepts an optional taxon_id.tsv at the root of the input folder.
    taxon_file = None
    if 'taxon_id.tsv' in set(next(os.walk(studied_organisms_path))[2]):
        taxon_file = True

    mpwt.multiprocess_pwt(input_folder=studied_organisms_path,
                          output_folder=pgdb_from_annotation_path,
                          patho_inference=True,
                          flat_creation=True,
                          dat_extraction=True,
                          number_cpu=nb_cpu_to_use,
                          patho_log=log_path,
                          taxon_file=taxon_file,
                          verbose=verbose)

    if os.listdir(pgdb_from_annotation_path) == []:
        print('Pathway-Tools inference failed!')
        return

    create_padmet_sbml_from_pgdb(run_id, nb_cpu_to_use, verbose, veryverbose)

    end_time = (time.time() - start_time)
    integer_part, decimal_part = str(end_time).split('.')
    reconstruction_time = ".".join([integer_part, decimal_part[:3]])

    if verbose:
        print("--- reconstruction step done in: %ss ---" % reconstruction_time)
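# Illustrative taxon_id.tsv, placed at the root of studied_organisms_path
# (column layout as commonly used with mpwt; organism names and NCBI taxon
# IDs below are hypothetical):
#
#     species    taxon_id
#     org_A      511145
#     org_B      224308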
def run_structural(run_id, keep_tmp, nb_cpu_to_use, verbose):
    if verbose:
        print('--- Running structural check step ---')
    structural_start_time = time.time()

    config_data = parse_config_file(run_id)
    database_path = config_data['database_path']

    prot2genome.fromAucome(run_id, nb_cpu_to_use, database_path,
                           blastp=True, tblastn=True, exonerate=True,
                           keep_tmp=keep_tmp, debug=False)

    structural_end_time = (time.time() - structural_start_time)
    integer_part, decimal_part = str(structural_end_time).split('.')
    structural_time = ".".join([integer_part, decimal_part[:3]])

    if verbose:
        print("--- structural step done in: %ss ---" % structural_time)
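# The elapsed-time formatting used by every step (split the float on '.' and
# keep three decimals) can be expressed more directly. A sketch (helper name
# is ours, not part of the original module; note '%.3f' rounds where the
# original truncates):
def format_elapsed(seconds):
    """Return a duration in seconds as a string with millisecond precision."""
    return '%.3f' % seconds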
def run_compare(run_id, nb_cpu_to_use, verbose):
    """Compare the groups specified by the user.

    Args:
        run_id (str): ID of the run
        nb_cpu_to_use (int): number of CPUs for multiprocessing
        verbose (boolean): verbose
    """
    config_data = parse_config_file(run_id)
    analysis_path = config_data['analysis_path']
    analysis_group_file_path = config_data['analysis_group_file_path']
    upset_path = analysis_path + '/upset_graph'
    upset_tmp_data_path = upset_path + '/tmp_data'
    upset_tmp_reaction_path = upset_tmp_data_path + '/tmp'
    padmet_utils_path = config_data['padmet_utils_path']
    database_path = config_data['database_path']
    padmet_from_networks_path = config_data['padmet_from_networks_path']

    # Create a dictionary mapping each group name to the species inside the group.
    group_data = {}
    padmets = []
    with open(analysis_group_file_path, 'r') as group_file:
        group_reader = csv.reader(group_file, delimiter='\t')
        cluster_reactions = {}
        for row in group_reader:
            group_name = row[0]
            groups = [species for species in row[1:] if species != '']
            group_data[group_name] = groups
            if group_name != 'all':
                padmets.extend([padmet_from_networks_path + '/' + species + '.padmet'
                                for species in groups])

    padmets = list(set(padmets))

    if not os.path.isdir(upset_path):
        os.mkdir(upset_path)
    if not os.path.isdir(upset_tmp_data_path):
        os.mkdir(upset_tmp_data_path)
    if not os.path.isdir(upset_tmp_reaction_path):
        os.mkdir(upset_tmp_reaction_path)

    # Create the reactions.csv file needed for the upset graph and the dendrogram.
    cmds = ["python3", padmet_utils_path + "/padmet_utils/exploration/compare_padmet.py",
            "--padmet", ','.join(padmets),
            "--output", upset_tmp_reaction_path,
            "--padmetRef", database_path]
    if verbose:
        cmds.append('-v')
    subprocess.call(cmds)

    # Read the reactions.csv file and drop the unused columns.
    reactions_file = upset_tmp_reaction_path + '/' + 'reactions.csv'
    reactions_dataframe = pa.read_csv(reactions_file, sep='\t')
    columns = [column for column in reactions_dataframe.columns if '(sep=;)' not in column]
    columns = [column for column in columns if '_formula' not in column]
    reactions_dataframe = reactions_dataframe[columns].copy()
    reactions_dataframe.set_index('reaction', inplace=True)

    # Translate 'present'/NaN data into a True/False presence-absence matrix.
    for column in reactions_dataframe.columns.tolist():
        reactions_dataframe[column] = [True if data == 'present' else False
                                       for data in reactions_dataframe[column]]

    # For each group, extract the reactions present in its species,
    # then create a TSV file containing these reactions.
    for group_name in group_data:
        if group_name != 'all':
            groups = group_data[group_name]
            reactions_temp = []
            for species in groups:
                species_reactions_dataframe = reactions_dataframe[reactions_dataframe[species] == True]
                reactions_temp.extend(species_reactions_dataframe.index.tolist())
            cluster_reactions[group_name] = set(reactions_temp)
            df = pa.DataFrame({group_name: list(cluster_reactions[group_name])})
            df.to_csv(upset_tmp_data_path + '/' + group_name + '.tsv',
                      sep='\t', index=None, header=None)

    # Launch Intervene to create the upset graph using each group file.
    upset_data_path = [upset_tmp_data_path + '/' + tsv_file
                       for tsv_file in os.listdir(upset_tmp_data_path)
                       if tsv_file.endswith('.tsv')]
    cmds = ['intervene', 'upset', '-i', *upset_data_path,
            '--type', 'list', '-o', upset_path, '--figtype', 'svg']
    if verbose:
        subprocess.call(cmds)
    else:
        with open(os.devnull, 'w') as FNULL:
            subprocess.call(cmds, stdout=FNULL, stderr=subprocess.STDOUT)

    cmds = ["python3", padmet_utils_path + "/padmet_utils/exploration/dendrogram_reactions_distance.py",
            "--reactions", reactions_file,
            "--output", upset_path + '/dendrogram_output',
            "--padmetRef", database_path]
    if verbose:
        cmds.append('-v')
    subprocess.call(cmds)
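# Each per-group file under tmp_data is a single headerless column of
# reaction identifiers, which intervene consumes with '--type list'
# (illustrative MetaCyc-style IDs):
#
#     GLUCOKIN-RXN
#     RXN-11501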
def run_orthology(run_id, orthogroups, sequence_search_prg, nb_cpu_to_use, verbose):
    aucome_pool = Pool(nb_cpu_to_use)

    config_data = parse_config_file(run_id)
    orthofinder_wd_path = config_data['orthofinder_wd_path']
    orthofinder_bin_path = config_data['orthofinder_bin_path']
    orthology_based_path = config_data['orthology_based_path']
    padmet_utils_path = config_data['padmet_utils_path']
    studied_organisms_path = config_data['studied_organisms_path']
    model_organisms_path = config_data['model_organisms_path']
    mnx_cpd_path = config_data['mnx_cpd_path']
    mnx_rxn_path = config_data['mnx_rxn_path']

    all_study_name = set(next(os.walk(studied_organisms_path))[1])
    all_model_name = set(next(os.walk(model_organisms_path))[1])

    all_study_faa = dict([(study_name, "{0}/{1}/{1}.faa".format(studied_organisms_path, study_name))
                          if os.path.isfile("{0}/{1}/{1}.faa".format(studied_organisms_path, study_name))
                          else (study_name, '')
                          for study_name in all_study_name])
    all_model_faa = dict([(model_name, "{0}/{1}/{1}.faa".format(model_organisms_path, model_name))
                          if os.path.isfile("{0}/{1}/{1}.faa".format(model_organisms_path, model_name))
                          else (model_name, '')
                          for model_name in all_model_name])

    # Check if Orthofinder already ran; if yes, get the last workdir.
    try:
        if orthogroups:
            orthodata_path = max(["%s/%s" % (x[0], 'Orthogroups/Orthogroups.tsv')
                                  for x in os.walk(orthofinder_wd_path) if 'Orthogroups' in x[1]])
        else:
            orthodata_path = max(["%s/%s" % (x[0], 'Orthologues')
                                  for x in os.walk(orthofinder_wd_path) if 'Orthologues' in x[1]])
    except ValueError:
        if verbose:
            print("Unable to find file Orthogroups.tsv in {0}, need to run Orthofinder..."
                  .format(orthofinder_wd_path))
        for name, faa_path in list(all_study_faa.items()):
            if not os.path.isfile("{0}/{1}.faa".format(orthofinder_wd_path, name)):
                if verbose:
                    print("Copying {0}'s faa to {1}".format(name, orthofinder_wd_path))
                cmds = ["cp", faa_path, orthofinder_wd_path]
                subprocess.call(cmds)
        for name, faa_path in list(all_model_faa.items()):
            if not os.path.isfile("{0}/{1}.faa".format(orthofinder_wd_path, name)):
                if verbose:
                    print("Copying {0}'s faa to {1}".format(name, orthofinder_wd_path))
                cmds = ["cp", faa_path, orthofinder_wd_path]
                subprocess.call(cmds)

        if verbose:
            print("Running Orthofinder on %s CPUs" % nb_cpu_to_use)

        chronoDepart = time.time()
        cmds = [orthofinder_bin_path, "-f", orthofinder_wd_path,
                "-t", str(nb_cpu_to_use), "-S", sequence_search_prg]
        subprocess.call(cmds)
        chrono = (time.time() - chronoDepart)
        partie_entiere, partie_decimale = str(chrono).split('.')
        chrono = ".".join([partie_entiere, partie_decimale[:3]])
        if verbose:
            print("Orthofinder done in: %ss" % chrono)

        if orthogroups:
            orthodata_path = max(["%s/%s" % (x[0], 'Orthogroups/Orthogroups.tsv')
                                  for x in os.walk(orthofinder_wd_path) if 'Orthogroups' in x[1]])
        else:
            orthodata_path = max(["%s/%s" % (x[0], 'Orthologues')
                                  for x in os.walk(orthofinder_wd_path) if 'Orthologues' in x[1]])

    if verbose:
        print("Parsing Orthofinder output %s" % orthodata_path)
        print("Start sbml creation...")

    all_dict_data = []
    for study_name in all_study_name:
        dict_data = {
            'sbml': run_id,
            'orthodata_path': orthodata_path,
            'study_name': study_name,
            'padmet_utils_path': padmet_utils_path,
            'verbose': verbose,
            'orthogroups': orthogroups,
            'output': orthology_based_path + '/' + study_name
        }
        all_dict_data.append(dict_data)

    chronoDepart = time.time()
    aucome_pool.map(orthogroup_to_sbml, all_dict_data)
    chrono = (time.time() - chronoDepart)
    integer_part, decimal_part = str(chrono).split('.')
    chrono = ".".join([integer_part, decimal_part[:3]])
    if verbose:
        print("Orthofinder output parsed in: %ss" % chrono)

    # Map the orthology-based SBMLs to the MetaCyc database using the
    # MetaNetX reaction and compound mapping files.
    data_convert_sbml_db = []
    for dict_data in all_dict_data:
        tmp_dict_data = {
            'sbml': orthology_based_path + '/' + dict_data['study_name'],
            'padmet_utils_path': padmet_utils_path,
            'mnx_rxn_path': mnx_rxn_path,
            'mnx_cpd_path': mnx_cpd_path,
            'verbose': verbose
        }
        data_convert_sbml_db.append(tmp_dict_data)
    aucome_pool.map(convert_sbml_db, data_convert_sbml_db)

    aucome_pool.close()
    aucome_pool.join()
def run_compare(run_id, nb_cpu_to_use, verbose):
    """Compare the groups specified by the user.

    Args:
        run_id (str): ID of the run
        nb_cpu_to_use (int): number of CPUs for multiprocessing
        verbose (boolean): verbose
    """
    if verbose:
        print('--- Running compare step ---')
    compare_start_time = time.time()

    config_data = parse_config_file(run_id)
    analysis_path = config_data['analysis_path']
    analysis_group_file_path = config_data['analysis_group_file_path']
    compare_output_path = analysis_path + '/compare_group'
    database_path = config_data['database_path']
    padmet_from_networks_path = config_data['padmet_from_networks_path']

    # Create a dictionary mapping each group name to the species inside the group.
    group_data = {}
    padmets = []
    with open(analysis_group_file_path, 'r') as group_file:
        group_reader = csv.reader(group_file, delimiter='\t')
        cluster_reactions = {}
        for row in group_reader:
            group_name = row[0]
            groups = [species for species in row[1:] if species != '']
            group_data[group_name] = groups
            if group_name != 'all':
                padmets.extend([padmet_from_networks_path + '/' + species + '.padmet'
                                for species in groups])

    padmets = list(set(padmets))

    if not os.path.isdir(compare_output_path):
        os.mkdir(compare_output_path)

    padmetref = PadmetRef(database_path)

    # Create the reactions.tsv file needed to create the dendrogram.
    padmet_path = ','.join(padmets)
    compare_padmet.compare_padmet(padmet_path=padmet_path,
                                  output=compare_output_path,
                                  padmetRef=padmetref,
                                  verbose=verbose)

    # Read the reactions.tsv file and drop the unused columns.
    reactions_file = compare_output_path + '/' + 'reactions.tsv'
    reactions_dataframe = pa.read_csv(reactions_file, sep='\t')
    columns = [column for column in reactions_dataframe.columns
               if '(sep=;)' not in column and '_formula' not in column]
    reactions_dataframe = reactions_dataframe[columns].copy()
    reactions_dataframe.set_index('reaction', inplace=True)

    # For each group, extract the reactions present in its species to create
    # the supervenn sets.
    supervenn_sets = []
    supervenn_labels = []
    for group_name in group_data:
        if group_name != 'all':
            groups = group_data[group_name]
            reactions_temp = []
            for species in groups:
                species_reactions_dataframe = reactions_dataframe[reactions_dataframe[species] == 1]
                reactions_temp.extend(species_reactions_dataframe.index.tolist())
            supervenn_sets.append(set(reactions_temp))
            supervenn_labels.append(group_name)
            cluster_reactions[group_name] = set(reactions_temp)

    supervenn(supervenn_sets, supervenn_labels,
              chunks_ordering='occurence', sets_ordering='minimize gaps')
    plt.savefig(compare_output_path + '/compare_group.png', bbox_inches='tight')
    plt.clf()

    dendrogram_reactions_distance.reaction_figure_creation(
        reactions_file,
        os.path.join(compare_output_path, "dendrogram_output"),
        padmetRef_file=database_path,
        verbose=verbose)

    compare_end_time = (time.time() - compare_start_time)
    integer_part, decimal_part = str(compare_end_time).split('.')
    compare_time = ".".join([integer_part, decimal_part[:3]])

    if verbose:
        print("--- compare step done in: %ss ---" % compare_time)
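# Illustrative first lines of the reactions.tsv produced by compare_padmet,
# after the '(sep=;)' and '_formula' columns are dropped above (reaction IDs
# and organism names are hypothetical; 1 = present, 0 = absent):
#
#     reaction        org_A   org_B
#     GLUCOKIN-RXN    1       0
#     RXN-11501       1       1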
def run_check(run_id, nb_cpu_to_use, verbose, veryverbose):
    if verbose:
        print('--- Running check step ---')
    start_time = time.time()

    config_data = parse_config_file(run_id)
    padmet_from_annotation_path = config_data['padmet_from_annotation_path']
    study_from_annot_prefix = config_data['study_from_annot_prefix']
    sbml_from_annotation_path = config_data['sbml_from_annotation_path']
    database_path = config_data['database_path']
    pgdb_from_annotation_path = config_data['pgdb_from_annotation_path']
    studied_organisms_path = config_data['studied_organisms_path']
    model_organisms_path = config_data['model_organisms_path']
    analysis_group_file_path = config_data['analysis_group_file_path']

    # Create the dicts for the ortho data.
    all_study_name = set(next(os.walk(studied_organisms_path))[1])
    all_model_name = set(next(os.walk(model_organisms_path))[1])

    all_study_pgdb = dict([(study_name, "{0}/{1}".format(pgdb_from_annotation_path, study_name))
                           if os.path.isdir("{0}/{1}".format(pgdb_from_annotation_path, study_name))
                           else (study_name, '')
                           for study_name in all_study_name])

    all_study_gbk = dict([(study_name, "{0}/{1}/{1}.gbk".format(studied_organisms_path, study_name))
                          if os.path.isfile("{0}/{1}/{1}.gbk".format(studied_organisms_path, study_name))
                          else (study_name, '')
                          for study_name in all_study_name])

    # key = folder name in model_organisms_path, value = path to the gbk in
    # this folder; the gbk name should be folder_name.gbk.
    all_model_gbk = dict([(model_name, "{0}/{1}/{1}.gbk".format(model_organisms_path, model_name))
                          if os.path.isfile("{0}/{1}/{1}.gbk".format(model_organisms_path, model_name))
                          else (model_name, '')
                          for model_name in all_model_name])

    # Update the group file in analysis.
    if not os.path.exists(analysis_group_file_path):
        with open(analysis_group_file_path, 'w') as group_file:
            group_writer = csv.writer(group_file, delimiter='\t')
            group_writer.writerow(['all', *all_study_name])
    else:
        groups_data = []
        with open(analysis_group_file_path, 'r') as group_file:
            group_reader = csv.reader(group_file, delimiter='\t')
            for row in group_reader:
                groups = [org_name for org_name in row[1:] if org_name]
                groups_data.append((row[0], groups))
        # Check if the 'all' row matches the species in studied organisms.
        if sorted(groups_data[0][1]) != sorted(all_study_name):
            with open(analysis_group_file_path, 'w') as group_file:
                group_writer = csv.writer(group_file, delimiter='\t')
                group_writer.writerow(['all', *all_study_name])
                for group in groups_data:
                    if group[0] != 'all':
                        group_writer.writerow([group[0], *group[1]])

    aucome_pool = Pool(nb_cpu_to_use)

    if verbose:
        print('Checking genbank file.')

    study_faa_data = []
    for study_name in all_study_name:
        faa_path = "{0}/{1}/{1}.faa".format(studied_organisms_path, study_name)
        tmp_faa_data = {
            'study_name': study_name,
            'faa_path': faa_path,
            'gbk_file': all_study_gbk[study_name],
            'studied_organisms_path': studied_organisms_path,
            'verbose': verbose
        }
        study_faa_data.append(tmp_faa_data)
    aucome_pool.map(check_create_faa, study_faa_data)

    # key = folder name in studied_organisms_path, value = path to the faa in
    # this folder; the faa name should be folder_name.faa.
    all_study_faa = dict([(study_name, "{0}/{1}/{1}.faa".format(studied_organisms_path, study_name))
                          if os.path.isfile("{0}/{1}/{1}.faa".format(studied_organisms_path, study_name))
                          else (study_name, '')
                          for study_name in all_study_name])

    study_model_data = []
    for model_name in all_model_name:
        faa_path = "{0}/{1}/{1}.faa".format(model_organisms_path, model_name)
        tmp_model_data = {
            'model_name': model_name,
            'faa_path': faa_path,
            'gbk_file': all_model_gbk[model_name],
            'verbose': verbose
        }
        study_model_data.append(tmp_model_data)
    aucome_pool.map(create_faa_model, study_model_data)

    # key = folder name in model_organisms_path, value = path to the faa in
    # this folder; the faa name should be folder_name.faa.
    all_model_faa = dict([(model_name, "{0}/{1}/{1}.faa".format(model_organisms_path, model_name))
                          if os.path.isfile("{0}/{1}/{1}.faa".format(model_organisms_path, model_name))
                          else (model_name, '')
                          for model_name in all_model_name])

    study_padmet_data = []
    for study_name in all_study_name:
        padmet_file = "{0}/{1}{2}.padmet".format(padmet_from_annotation_path,
                                                 study_from_annot_prefix, study_name)
        pgdb_folder = all_study_pgdb[study_name]
        tmp_padmet_data = {
            'study_name': study_name,
            'pgdb_folder': pgdb_folder,
            'verbose': verbose,
            'padmet_file': padmet_file,
            'database_path': database_path,
            'veryverbose': veryverbose
        }
        study_padmet_data.append(tmp_padmet_data)
    aucome_pool.map(create_padmet_from_pgdb, study_padmet_data)

    all_study_padmet = dict([(study_name, "{0}/{1}{2}.padmet".format(
                                  padmet_from_annotation_path, study_from_annot_prefix, study_name))
                             if os.path.isfile("{0}/{1}{2}.padmet".format(
                                  padmet_from_annotation_path, study_from_annot_prefix, study_name))
                             else (study_name, '')
                             for study_name in all_study_name])

    study_sbml_data = []
    for study_name in all_study_padmet:
        sbml_file = "{0}/{1}{2}.sbml".format(sbml_from_annotation_path,
                                             study_from_annot_prefix, study_name)
        padmet_file = all_study_padmet[study_name]
        tmp_sbml_data = {
            'sbml_file': sbml_file,
            'padmet_file': padmet_file,
            'study_name': study_name,
            'verbose': verbose,
            'veryverbose': veryverbose
        }
        study_sbml_data.append(tmp_sbml_data)
    aucome_pool.map(create_sbml, study_sbml_data)

    # The study SBMLs are obtained from annotation, so they should be in
    # sbml_from_annotation_path.
    # key = study name (folder name in studied_organisms_path, or taken from
    # the sbml name), value = path to the sbml:
    # study_from_annot_prefix + study_name + .sbml.
    all_study_sbml = dict([(study_name, "{0}/{1}{2}.sbml".format(
                                sbml_from_annotation_path, study_from_annot_prefix, study_name))
                           if os.path.isfile("{0}/{1}{2}.sbml".format(
                                sbml_from_annotation_path, study_from_annot_prefix, study_name))
                           else (study_name, '')
                           for study_name in all_study_name])

    # key = folder name in model_organisms_path, value = path to the sbml in
    # this folder; the sbml name should be folder_name.sbml.
    all_model_sbml = dict([(model_name, "{0}/{1}/{1}.sbml".format(model_organisms_path, model_name))
                           if os.path.isfile("{0}/{1}/{1}.sbml".format(model_organisms_path, model_name))
                           else (model_name, '')
                           for model_name in all_model_name])

    if verbose:
        print("Input summary:")
        print("* %s studied organisms:" % (len(all_study_name)))
        for study_name in all_study_name:
            print("%s:" % study_name)
            if all_study_gbk[study_name]:
                print("\tGBK: OK")
            else:
                print("\t[WARNING] No GBK found, should be in {1}/{0}/{0}.gbk".format(
                    study_name, studied_organisms_path))
            if all_study_pgdb[study_name]:
                print("\tPGDB: OK")
            else:
                print("\t[WARNING] No PGDB found, should be in {1}/{0}".format(
                    study_name, pgdb_from_annotation_path))
            if all_study_padmet[study_name]:
                print("\tPADMET: OK")
            else:
                print("\t[WARNING] No PADMET found, should be in {1}/{2}{0}.padmet".format(
                    study_name, padmet_from_annotation_path, study_from_annot_prefix))
            if all_study_faa[study_name]:
                print("\tFAA: OK")
            else:
                print("\t[WARNING] No FAA found, should be in {1}/{0}/{0}.faa".format(
                    study_name, studied_organisms_path))
            if all_study_sbml[study_name]:
                print("\tSBML: OK")
            else:
                print("\t[WARNING] No SBML found, should be in {1}/{2}{0}.sbml".format(
                    study_name, sbml_from_annotation_path, study_from_annot_prefix))
        print("* %s model organisms:" % (len(all_model_name)))
        for model_name in all_model_name:
            print("%s:" % model_name)
            if all_model_faa[model_name]:
                print("\tFAA: OK")
            else:
                print("\t[WARNING] No FAA found, should be in {1}/{0}/{0}.faa".format(
                    model_name, model_organisms_path))
            if all_model_sbml[model_name]:
                print("\tSBML: OK")
            else:
                print("\t[WARNING] No SBML found, should be in {1}/{0}/{0}.sbml".format(
                    model_name, model_organisms_path))

    aucome_pool.close()
    aucome_pool.join()

    end_time = (time.time() - start_time)
    integer_part, decimal_part = str(end_time).split('.')
    check_time = ".".join([integer_part, decimal_part[:3]])

    if verbose:
        print("--- check step done in: %ss ---" % check_time)
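# Expected on-disk layout verified above (per-organism file names mirror their
# folder names; the bracketed placeholders are the config entries, organism
# and model names are illustrative):
#
#     <studied_organisms_path>/org_A/org_A.gbk          (org_A.faa created by the check)
#     <model_organisms_path>/model_1/model_1.gbk|.faa|.sbml
#     <pgdb_from_annotation_path>/org_A/
#     <padmet_from_annotation_path>/<study_from_annot_prefix>org_A.padmet
#     <sbml_from_annotation_path>/<study_from_annot_prefix>org_A.sbml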