Example #1
def run_analysis(run_id, nb_cpu_to_use, pvclust, verbose):
    """Create input data for creationf of reaction dendrogram tsv reactions files.

    Args:
        run_id (str): ID of the run
        nb_cpu_to_use (int): number of CPU for multiprocessing
        pvclust (boolean): use also pvclust to create reaction dendrogram
        verbose (boolean): verbose
    """
    if verbose:
        print('--- Running analysis step ---')
    analysis_start_time = time.time()
    config_data = parse_config_file(run_id)

    analysis_group_file_path = config_data['analysis_group_file_path']

    # Run the analysis on each group defined in the group_template.tsv file.
    with open(analysis_group_file_path, 'r') as group_file:
        group_reader = csv.reader(group_file, delimiter='\t')
        for row in group_reader:
            group_name = row[0]
            groups = [org_name for org_name in row[1:] if org_name]
            analysis_on_group(group_name, groups, config_data, pvclust,
                              nb_cpu_to_use, verbose)

    analysis_end_time = (time.time() - analysis_start_time)
    integer_part, decimal_part = str(analysis_end_time).split('.')
    analysis_time = ".".join([integer_part, decimal_part[:3]])

    if verbose:
        print("--- analysis step done in: %ss ---" % analysis_time)
Example #2
def run_reconstruction(run_id, nb_cpu_to_use, verbose):
    """Infer PGDBs with Pathway Tools, then create padmet and SBML files.

    Args:
        run_id (str): ID of the run
        nb_cpu_to_use (int): number of CPU for multiprocessing
        verbose (boolean): verbose
    """
    config_data = parse_config_file(run_id)

    pgdb_from_annotation_path = config_data['pgdb_from_annotation_path']
    studied_organisms_path = config_data['studied_organisms_path']
    log_path = config_data['log_path']
    # For each study, check if a PGDB folder exists in the PGDBs folder; if missing, run Pathway Tools.
    chronoDepart = time.time()

    mpwt.multiprocess_pwt(input_folder=studied_organisms_path,
                          output_folder=pgdb_from_annotation_path,
                          patho_inference=True,
                          dat_creation=True,
                          dat_extraction=True,
                          number_cpu=nb_cpu_to_use,
                          patho_log=log_path,
                          verbose=verbose)

    chrono = (time.time() - chronoDepart)
    partie_entiere, partie_decimale = str(chrono).split('.')
    chrono = ".".join([partie_entiere, partie_decimale[:3]])

    if not os.listdir(pgdb_from_annotation_path):
        print('Pathway-Tools inference failed!')
        return
    if verbose:
        print("Pathway-Tools done in: %ss" % chrono)

    create_padmet_sbml_from_pgdb(run_id, nb_cpu_to_use, verbose)
Example #3
def create_padmet_sbml_from_pgdb(run_id, nb_cpu_to_use, verbose):
    """Create padmet files from the PGDBs, then SBML files from the padmets.

    Args:
        run_id (str): ID of the run
        nb_cpu_to_use (int): number of CPU for multiprocessing
        verbose (boolean): verbose
    """
    config_data = parse_config_file(run_id)

    padmet_from_annotation_path = config_data['padmet_from_annotation_path']
    study_from_annot_prefix = config_data['study_from_annot_prefix']
    sbml_from_annotation_path = config_data['sbml_from_annotation_path']
    padmet_utils_path = config_data['padmet_utils_path']
    database_path = config_data['database_path']
    pgdb_from_annotation_path = config_data['pgdb_from_annotation_path']

    aucome_pool = Pool(nb_cpu_to_use)

    all_study_name = set(
        next(os.walk(config_data['studied_organisms_path']))[1])

    all_study_pgdb = dict([(study_name, "{0}/{1}".format(
        pgdb_from_annotation_path, study_name)) if os.path.isdir(
            "{0}/{1}".format(pgdb_from_annotation_path, study_name)) else
                           (study_name, '') for study_name in all_study_name])

    study_padmet_data = []
    for study_name in all_study_name:
        padmet_file = "{0}/{1}{2}.padmet".format(padmet_from_annotation_path,
                                                 study_from_annot_prefix,
                                                 study_name)
        pgdb_folder = all_study_pgdb[study_name]
        tmp_padmet_data = {
            'study_name': study_name,
            'pgdb_folder': pgdb_folder,
            'padmet_utils_path': padmet_utils_path,
            'verbose': verbose,
            'padmet_file': padmet_file,
            'database_path': database_path
        }
        study_padmet_data.append(tmp_padmet_data)
    aucome_pool.map(create_padmet_from_pgdb, study_padmet_data)

    all_study_padmet = dict([(study_name, "{0}/{1}{2}.padmet".format(
        padmet_from_annotation_path, study_from_annot_prefix,
        study_name)) if os.path.isfile("{0}/{1}{2}.padmet".format(
            padmet_from_annotation_path, study_from_annot_prefix, study_name))
                             else (study_name, '')
                             for study_name in all_study_name])

    study_sbml_data = []
    for study_name in all_study_padmet:
        sbml_file = "{0}/{1}{2}.sbml".format(sbml_from_annotation_path,
                                             study_from_annot_prefix,
                                             study_name)
        padmet_file = all_study_padmet[study_name]
        tmp_sbml_data = {
            'sbml_file': sbml_file,
            'padmet_file': padmet_file,
            'padmet_utils_path': padmet_utils_path,
            'study_name': study_name,
            'verbose': verbose
        }
        study_sbml_data.append(tmp_sbml_data)
    aucome_pool.map(create_sbml, study_sbml_data)

    aucome_pool.close()
    aucome_pool.join()
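
Pool.map forwards exactly one argument to the worker per task, which is why each worker in these examples receives a single dict of parameters. A minimal, self-contained sketch of the same pattern (the worker and field names are hypothetical):

from multiprocessing import Pool

def worker(task):
    # Each task is one dict, since Pool.map passes a single argument.
    print(task['study_name'], task['padmet_file'])

tasks = [{'study_name': 'speciesA', 'padmet_file': 'speciesA.padmet'},
         {'study_name': 'speciesB', 'padmet_file': 'speciesB.padmet'}]

if __name__ == '__main__':
    with Pool(2) as pool:
        pool.map(worker, tasks)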
Example #4
File: merge.py Project: AuReMe/aucome
def run_merge(run_id, nb_cpu_to_use, verbose, veryverbose=None):
    """Merge the most refined padmets into per-organism networks and a panmetabolism network.

    Args:
        run_id (str): ID of the run
        nb_cpu_to_use (int): number of CPU for multiprocessing
        verbose (boolean): verbose
        veryverbose (boolean): more verbose
    """
    if verbose:
        print('--- Running merge step ---')
    merge_start_time = time.time()
    aucome_pool = Pool(nb_cpu_to_use)

    config_data = parse_config_file(run_id)

    padmet_from_annotation_path = config_data['padmet_from_annotation_path']
    padmet_from_networks_path = config_data['padmet_from_networks_path']
    sbml_from_networks_path = config_data['sbml_from_networks_path']
    database_path = config_data['database_path']

    structural_padmets_path = config_data['structural_padmets_path']
    orthofinder_filtered_path = config_data['orthofinder_filtered_path']
    orthofinder_padmet_path = config_data['orthofinder_padmet_path']
    networks_path = config_data['networks_path']

    structural_padmets = [padmet for padmet in os.listdir(structural_padmets_path) if padmet.endswith('.padmet')]
    orthofinder_filtered_padmets = [padmet for padmet in os.listdir(orthofinder_filtered_path) if padmet.endswith('.padmet')]
    orthofinder_padmets = [padmet for padmet in os.listdir(orthofinder_padmet_path) if padmet.endswith('.padmet')]
    pathway_tools_padmets = [padmet for padmet in os.listdir(padmet_from_annotation_path) if padmet.endswith('.padmet')]

    if len(structural_padmets) > 0:
        padmets = [(padmet, structural_padmets_path + '/' + padmet) for padmet in structural_padmets]
    elif len(orthofinder_filtered_padmets) > 0:
        padmets = [(padmet, orthofinder_filtered_path + '/' + padmet) for padmet in orthofinder_filtered_padmets]
    elif len(orthofinder_padmets) > 0:
        padmets = [(padmet, orthofinder_padmet_path + '/' + padmet) for padmet in orthofinder_padmets]
    elif len(pathway_tools_padmets) > 0:
        padmets = [(padmet, padmet_from_annotation_path + '/' + padmet) for padmet in pathway_tools_padmets]
    else:
        sys.exit('No padmets have been created, run reconstruction or workflow.')

    study_draft_data = []
    for study_name, padmet_path in padmets:
        tmp_study_data = {'padmet_path': padmet_path, 'study_padmet': study_name,
                          'padmet_from_networks_path': padmet_from_networks_path,
                          'sbml_from_networks_path': sbml_from_networks_path,
                          'database_path': database_path,
                          'verbose': verbose, 'veryverbose': veryverbose}
        study_draft_data.append(tmp_study_data)
    aucome_pool.map(create_output, study_draft_data)

    aucome_pool.close()
    aucome_pool.join()

    padmet_to_padmet.padmet_to_padmet(padmet_from_networks_path, networks_path + '/panmetabolism.padmet', verbose=veryverbose)
    sbmlGenerator.padmet_to_sbml(padmet=networks_path + '/panmetabolism.padmet', output=networks_path + '/panmetabolism.sbml', verbose=veryverbose)

    merge_end_time = (time.time() - merge_start_time)
    integer_part, decimal_part = str(merge_end_time).split('.')
    merge_time = ".".join([integer_part, decimal_part[:3]])

    if verbose:
        print("--- merge step done in: %ss ---" %merge_time)
Example #5
File: draft.py Project: AuReMe/aureme
def run_draft(run_id, nb_cpu_to_use, verbose):
    """Create a draft metabolic network for each studied organism.

    Args:
        run_id (str): ID of the run
        nb_cpu_to_use (int): number of CPU for multiprocessing
        verbose (boolean): verbose
    """
    aucome_pool = Pool(nb_cpu_to_use)

    config_data = parse_config_file(run_id)

    studied_organisms_path = config_data['studied_organisms_path']
    padmet_from_annotation_path = config_data['padmet_from_annotation_path']
    study_from_annot_prefix = config_data['study_from_annot_prefix']
    networks_path = config_data['networks_path']
    orthology_based_path = config_data['orthology_based_path']
    padmet_utils_path = config_data['padmet_utils_path']
    database_path = config_data['database_path']
    padmet_from_networks_path = config_data['padmet_from_networks_path']
    sbml_from_networks_path = config_data['sbml_from_networks_path']

    all_study_name = set(next(os.walk(studied_organisms_path))[1])

    all_study_padmet = dict([(study_name, "{0}/{1}{2}.padmet".format(
        padmet_from_annotation_path, study_from_annot_prefix,
        study_name)) if os.path.isfile("{0}/{1}{2}.padmet".format(
            padmet_from_annotation_path, study_from_annot_prefix, study_name))
                             else (study_name, '')
                             for study_name in all_study_name])

    study_draft_data = []
    for study_name in all_study_name:
        tmp_study_data = {
            'study_name': study_name,
            'study_padmet': all_study_padmet[study_name],
            'networks_path': networks_path,
            'orthology_based_path': orthology_based_path,
            'padmet_utils_path': padmet_utils_path,
            'database_path': database_path,
            'padmet_from_networks_path': padmet_from_networks_path,
            'sbml_from_networks_path': sbml_from_networks_path,
            'verbose': verbose
        }
        study_draft_data.append(tmp_study_data)
    aucome_pool.map(create_draft, study_draft_data)

    aucome_pool.close()
    aucome_pool.join()
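
The conditional dict comprehensions that map each organism to an existing file path (or '' when the file is missing) recur throughout these examples. An equivalent, more explicit helper (the name is hypothetical; the originals use os.path.isfile or os.path.isdir depending on the target):

import os

def existing_path_or_empty(path):
    # Mirrors the comprehensions above: keep the path only if it exists on disk.
    return path if os.path.exists(path) else ''

# Hypothetical usage with placeholder directory, prefix and species names:
all_study_padmet = {
    study_name: existing_path_or_empty(
        '{0}/{1}{2}.padmet'.format('/run/padmets', 'output_', study_name))
    for study_name in {'speciesA', 'speciesB'}
}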
Example #6
def run_reconstruction(run_id, nb_cpu_to_use, verbose, veryverbose=None):
    """Infer PGDBs with Pathway Tools, then create padmet and SBML files.

    Args:
        run_id (str): ID of the run
        nb_cpu_to_use (int): number of CPU for multiprocessing
        verbose (boolean): verbose
        veryverbose (boolean): more verbose
    """
    if verbose:
        logger.setLevel(logging.DEBUG)
        logging.getLogger("mpwt").setLevel(logging.DEBUG)
        print('--- Running reconstruction step ---')
    start_time = time.time()
    config_data = parse_config_file(run_id)

    pgdb_from_annotation_path = config_data['pgdb_from_annotation_path']
    studied_organisms_path = config_data['studied_organisms_path']
    log_path = config_data['log_path']

    # Use the taxon_id.tsv file if it is present in the input folder.
    taxon_file = None
    if 'taxon_id.tsv' in set(next(os.walk(studied_organisms_path))[2]):
        taxon_file = True

    mpwt.multiprocess_pwt(input_folder=studied_organisms_path,
                            output_folder=pgdb_from_annotation_path,
                            patho_inference=True,
                            flat_creation=True,
                            dat_extraction=True,
                            number_cpu=nb_cpu_to_use,
                            patho_log=log_path,
                            taxon_file=taxon_file,
                            verbose=verbose)

    if not os.listdir(pgdb_from_annotation_path):
        print('Pathway-Tools inference failed!')
        return

    create_padmet_sbml_from_pgdb(run_id, nb_cpu_to_use, verbose, veryverbose)

    end_time = (time.time() - start_time)
    integer_part, decimal_part = str(end_time).split('.')
    reconstruction_time = ".".join([integer_part, decimal_part[:3]])

    if verbose:
        print("--- reconstruction step done in: %ss ---" %reconstruction_time)
Example #7
def run_structural(run_id, keep_tmp, nb_cpu_to_use, verbose):
    """Run the structural annotation check on the studied organisms.

    Args:
        run_id (str): ID of the run
        keep_tmp (boolean): keep temporary files
        nb_cpu_to_use (int): number of CPU for multiprocessing
        verbose (boolean): verbose
    """
    if verbose:
        print('--- Running structural check step ---')
    structural_start_time = time.time()

    config_data = parse_config_file(run_id)
    database_path = config_data['database_path']

    prot2genome.fromAucome(run_id,
                           nb_cpu_to_use,
                           database_path,
                           blastp=True,
                           tblastn=True,
                           exonerate=True,
                           keep_tmp=keep_tmp,
                           debug=False)

    structural_end_time = (time.time() - structural_start_time)
    integer_part, decimal_part = str(structural_end_time).split('.')
    structural_time = ".".join([integer_part, decimal_part[:3]])

    if verbose:
        print("--- structural step done in: %ss ---" % structural_time)
Example #8
File: compare.py Project: AuReMe/aureme
def run_compare(run_id, nb_cpu_to_use, verbose):
    """Compare the gorup specified by the user.

    Args:
        run_id (str): ID of the run
        nb_cpu_to_use (int): number of CPU for multiprocessing
        verbose (boolean): verbose
    """
    config_data = parse_config_file(run_id)

    analysis_path = config_data['analysis_path']
    analysis_group_file_path = config_data['analysis_group_file_path']
    upset_path = analysis_path + '/upset_graph'
    upset_tmp_data_path = upset_path + '/tmp_data'
    upset_tmp_reaction_path = upset_tmp_data_path + '/tmp'

    padmet_utils_path = config_data['padmet_utils_path']
    database_path = config_data['database_path']
    padmet_from_networks_path = config_data['padmet_from_networks_path']

    # Create a dictionary containing the group name and the species inside the group.
    group_data = {}
    padmets = []
    with open(analysis_group_file_path, 'r') as group_file:
        group_reader = csv.reader(group_file, delimiter='\t')
        cluster_reactions = {}
        for row in group_reader:
            group_name = row[0]
            groups = [species for species in row[1:] if species != '']
            group_data[group_name] = groups
            if group_name != 'all':
                padmets.extend([padmet_from_networks_path + '/' + species + '.padmet' for species in groups])

    padmets = list(set(padmets))

    if not os.path.isdir(upset_path):
        os.mkdir(upset_path)
    # Build the tmp data only on the first run; reuse the cached reactions.csv afterwards.
    if not os.path.isdir(upset_tmp_data_path):
        os.mkdir(upset_tmp_data_path)
        if not os.path.isdir(upset_tmp_reaction_path):
            os.mkdir(upset_tmp_reaction_path)

        # Create the reactions.csv file needed for the upset graph and the dendrogram.
        cmds = ["python3", padmet_utils_path + "/padmet_utils/exploration/compare_padmet.py", "--padmet", ','.join(padmets),
                "--output", upset_tmp_reaction_path, "--padmetRef", database_path]

        if verbose:
            cmds.append('-v')

        subprocess.call(cmds)

    # Read the reactions.csv file and remove the unused columns.
    reactions_file = upset_tmp_reaction_path + '/' + 'reactions.csv'
    reactions_dataframe = pa.read_csv(reactions_file, sep='\t')
    columns = [column for column in reactions_dataframe.columns if '(sep=;)' not in column]
    columns = [column for column in columns if '_formula' not in column]
    reactions_dataframe = reactions_dataframe[columns].copy()
    reactions_dataframe.set_index('reaction', inplace=True)

    # Translate 'present'/NaN values into a True/False presence/absence matrix.
    for column in reactions_dataframe.columns.tolist():
        reactions_dataframe[column] = [data == 'present' for data in reactions_dataframe[column]]

    # For each group, extract the reactions present in its species.
    # Then create a TSV file containing these reactions.
    for group_name in group_data:
        if group_name != 'all':
            groups = group_data[group_name]
            reactions_temp = []
            for species in groups:
                species_reactions_dataframe = reactions_dataframe[reactions_dataframe[species] == True]
                reactions_temp.extend(species_reactions_dataframe.index.tolist())
            cluster_reactions[group_name] = set(reactions_temp)

            df = pa.DataFrame({group_name: list(cluster_reactions[group_name])})
            df.to_csv(upset_tmp_data_path+'/'+group_name+'.tsv', sep='\t', index=None, header=None)

    # Launch Intervene to create upset graph using each group file.
    upset_data_path = [upset_tmp_data_path + '/' + tsv_file for tsv_file in os.listdir(upset_tmp_data_path) if tsv_file.endswith('.tsv')]
    cmds = ['intervene', 'upset', '-i', *upset_data_path, '--type', 'list', '-o', upset_path, '--figtype', 'svg']

    if verbose:
        subprocess.call(cmds)
    else:
        FNULL = open(os.devnull, 'w')
        subprocess.call(cmds, stdout=FNULL, stderr=subprocess.STDOUT)

    cmds = ["python3",  padmet_utils_path + "/padmet_utils/exploration/dendrogram_reactions_distance.py", "--reactions", reactions_file,
            "--output", upset_path + '/dendrogram_output', "--padmetRef", database_path]

    if verbose:
        cmds.append('-v')

    subprocess.call(cmds)
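
The FNULL handle opened above to silence the intervene subprocess is never closed; since Python 3.3, subprocess.DEVNULL gives the same effect without managing a file object. A minimal sketch (the command is a placeholder):

import subprocess

cmds = ['echo', 'silenced']  # placeholder command
subprocess.call(cmds, stdout=subprocess.DEVNULL, stderr=subprocess.STDOUT)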
Example #9
File: orthology.py Project: AuReMe/aureme
def run_orthology(run_id, orthogroups, sequence_search_prg, nb_cpu_to_use,
                  verbose):
    """Run OrthoFinder on the studied and model organisms, then create SBML files.

    Args:
        run_id (str): ID of the run
        orthogroups (boolean): use the Orthogroups output instead of the Orthologues output
        sequence_search_prg (str): sequence search program given to OrthoFinder (-S option)
        nb_cpu_to_use (int): number of CPU for multiprocessing
        verbose (boolean): verbose
    """
    aucome_pool = Pool(nb_cpu_to_use)

    config_data = parse_config_file(run_id)

    orthofinder_wd_path = config_data['orthofinder_wd_path']
    orthofinder_bin_path = config_data['orthofinder_bin_path']
    orthology_based_path = config_data['orthology_based_path']
    padmet_utils_path = config_data['padmet_utils_path']
    studied_organisms_path = config_data['studied_organisms_path']
    model_organisms_path = config_data['model_organisms_path']
    mnx_cpd_path = config_data['mnx_cpd_path']
    mnx_rxn_path = config_data['mnx_rxn_path']

    all_study_name = set(next(os.walk(studied_organisms_path))[1])
    all_model_name = set(next(os.walk(model_organisms_path))[1])

    all_study_faa = dict([(study_name, "{0}/{1}/{1}.faa".format(
        studied_organisms_path, study_name)) if os.path.isfile(
            "{0}/{1}/{1}.faa".format(studied_organisms_path, study_name)) else
                          (study_name, '') for study_name in all_study_name])

    all_model_faa = dict([(model_name, "{0}/{1}/{1}.faa".format(
        model_organisms_path, model_name)) if os.path.isfile(
            "{0}/{1}/{1}.faa".format(model_organisms_path, model_name)) else
                          (model_name, '') for model_name in all_model_name])

    # Check whether OrthoFinder has already been run; if so, reuse its latest working directory.
    try:
        if orthogroups:
            orthodata_path = max([
                "%s/%s" % (x[0], 'Orthogroups/Orthogroups.tsv')
                for x in os.walk(orthofinder_wd_path) if 'Orthogroups' in x[1]
            ])
        else:
            orthodata_path = max([
                "%s/%s" % (x[0], 'Orthologues')
                for x in os.walk(orthofinder_wd_path) if 'Orthologues' in x[1]
            ])
    except ValueError:
        if verbose:
            print(
                "Unable to find OrthoFinder output in {0}, need to run OrthoFinder..."
                .format(orthofinder_wd_path))
        for name, faa_path in list(all_study_faa.items()):
            if not os.path.isfile("{0}/{1}.faa".format(orthofinder_wd_path,
                                                       name)):
                if verbose:
                    print("Copying {0}'s faa to {1}".format(
                        name, orthofinder_wd_path))
                cmds = ["cp", faa_path, orthofinder_wd_path]
                subprocess.call(cmds)
        for name, faa_path in list(all_model_faa.items()):
            if not os.path.isfile("{0}/{1}.faa".format(orthofinder_wd_path,
                                                       name)):
                if verbose:
                    print("Copying {0}'s faa to {1}".format(
                        name, orthofinder_wd_path))
                cmds = ["cp", faa_path, orthofinder_wd_path]
                subprocess.call(cmds)

        if verbose:
            print("Running Orthofinder on %s cpu" % nb_cpu_to_use)

        chronoDepart = time.time()
        cmds = [
            orthofinder_bin_path, "-f", orthofinder_wd_path, "-t",
            str(nb_cpu_to_use), "-S", sequence_search_prg
        ]
        subprocess.call(cmds)
        chrono = (time.time() - chronoDepart)
        partie_entiere, partie_decimale = str(chrono).split('.')
        chrono = ".".join([partie_entiere, partie_decimale[:3]])
        if verbose:
            print("Orthofinder done in: %ss" % chrono)
        if orthogroups:
            orthodata_path = max([
                "%s/%s" % (x[0], 'Orthogroups/Orthogroups.tsv')
                for x in os.walk(orthofinder_wd_path) if 'Orthogroups' in x[1]
            ])
        else:
            orthodata_path = max([
                "%s/%s" % (x[0], 'Orthologues')
                for x in os.walk(orthofinder_wd_path) if 'Orthologues' in x[1]
            ])
    if verbose:
        print("Parsing Orthofinder output %s" % orthodata_path)
        print("Start sbml creation...")
    all_dict_data = []
    for study_name in all_study_name:
        dict_data = {
            'sbml': run_id,
            'orthodata_path': orthodata_path,
            'study_name': study_name,
            'padmet_utils_path': padmet_utils_path,
            'verbose': verbose,
            'orthogroups': orthogroups,
            'output': orthology_based_path + '/' + study_name
        }
        all_dict_data.append(dict_data)

    chronoDepart = time.time()
    aucome_pool.map(orthogroup_to_sbml, all_dict_data)
    chrono = (time.time() - chronoDepart)
    integer_part, decimal_part = str(chrono).split('.')
    chrono = ".".join([integer_part, decimal_part[:3]])
    if verbose:
        print("Orthofinder output parsed in: %ss" % chrono)
    # Map the SBML files to the reference database using the MetaNetX mapping files.
    data_convert_sbml_db = []
    for dict_data in all_dict_data:
        tmp_dict_data = {
            'sbml': dict_data['output'],
            'padmet_utils_path': padmet_utils_path,
            'mnx_rxn_path': mnx_rxn_path,
            'mnx_cpd_path': mnx_cpd_path,
            'verbose': verbose
        }
        data_convert_sbml_db.append(tmp_dict_data)

    aucome_pool.map(convert_sbml_db, data_convert_sbml_db)

    aucome_pool.close()
    aucome_pool.join()
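
The max() over os.walk above keeps the lexicographically last path whose directory contains the OrthoFinder output folder, which approximates "the most recent run" for OrthoFinder's dated Results_* directories. The same idiom as a hypothetical helper:

import os

def latest_orthofinder_output(wd_path, marker='Orthogroups'):
    # Collect every path whose directory holds the marker subfolder,
    # then take the lexicographically last one.
    candidates = ["%s/%s" % (root, marker)
                  for root, dirs, _files in os.walk(wd_path)
                  if marker in dirs]
    if not candidates:
        raise FileNotFoundError("no '%s' folder under %s" % (marker, wd_path))
    return max(candidates)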
Example #10
File: compare.py Project: AuReMe/aucome
def run_compare(run_id, nb_cpu_to_use, verbose):
    """Compare the gorup specified by the user.

    Args:
        run_id (str): ID of the run
        nb_cpu_to_use (int): number of CPU for multiprocessing
        verbose (boolean): verbose
    """
    if verbose:
        print('--- Running compare step ---')
    compare_start_time = time.time()
    config_data = parse_config_file(run_id)

    analysis_path = config_data['analysis_path']
    analysis_group_file_path = config_data['analysis_group_file_path']
    compare_output_path = analysis_path + '/compare_group'

    database_path = config_data['database_path']
    padmet_from_networks_path = config_data['padmet_from_networks_path']

    # Create a dictionary containing the group name and the species inside the group.
    group_data = {}
    padmets = []
    with open(analysis_group_file_path, 'r') as group_file:
        group_reader = csv.reader(group_file, delimiter='\t')
        cluster_reactions = {}
        for row in group_reader:
            group_name = row[0]
            groups = [species for species in row[1:] if species != '']
            group_data[group_name] = groups
            if group_name != 'all':
                padmets.extend([
                    padmet_from_networks_path + '/' + species + '.padmet'
                    for species in groups
                ])

    padmets = list(set(padmets))

    if not os.path.isdir(compare_output_path):
        os.mkdir(compare_output_path)

    padmetref = PadmetRef(database_path)
    # Create the reactions.tsv file needed to create dendrogram.
    padmet_path = ','.join(padmets)
    compare_padmet.compare_padmet(padmet_path=padmet_path,
                                  output=compare_output_path,
                                  padmetRef=padmetref,
                                  verbose=verbose)

    # Read the reactions.tsv file and remove the unused columns.
    reactions_file = compare_output_path + '/' + 'reactions.tsv'
    reactions_dataframe = pa.read_csv(reactions_file, sep='\t')
    columns = [
        column for column in reactions_dataframe.columns
        if '(sep=;)' not in column and '_formula' not in column
    ]
    reactions_dataframe = reactions_dataframe[columns].copy()
    reactions_dataframe.set_index('reaction', inplace=True)

    # For each group, extract the reactions present in its species to create supervenn sets.
    supervenn_sets = []
    supervenn_labels = []
    for group_name in group_data:
        if group_name != 'all':
            groups = group_data[group_name]
            reactions_temp = []
            for species in groups:
                species_reactions_dataframe = reactions_dataframe[
                    reactions_dataframe[species] == 1]
                reactions_temp.extend(
                    species_reactions_dataframe.index.tolist())
            supervenn_sets.append(set(reactions_temp))
            supervenn_labels.append(group_name)
            cluster_reactions[group_name] = set(reactions_temp)

    supervenn(supervenn_sets,
              supervenn_labels,
              chunks_ordering='occurence',
              sets_ordering='minimize gaps')
    plt.savefig(compare_output_path + '/compare_group.png',
                bbox_inches='tight')
    plt.clf()

    dendrogram_reactions_distance.reaction_figure_creation(
        reactions_file,
        os.path.join(compare_output_path, "dendrogram_output"),
        padmetRef_file=database_path,
        verbose=verbose)

    compare_end_time = (time.time() - compare_start_time)
    integer_part, decimal_part = str(compare_end_time).split('.')
    compare_time = ".".join([integer_part, decimal_part[:3]])

    if verbose:
        print("--- compare step done in: %ss ---" % compare_time)
Example #11
File: check.py Project: AuReMe/aucome
def run_check(run_id, nb_cpu_to_use, verbose, veryverbose):
    """Check the run's input files and create the missing faa, padmet and SBML files.

    Args:
        run_id (str): ID of the run
        nb_cpu_to_use (int): number of CPU for multiprocessing
        verbose (boolean): verbose
        veryverbose (boolean): more verbose
    """
    if verbose:
        print('--- Running check step ---')
    start_time = time.time()

    config_data = parse_config_file(run_id)

    padmet_from_annotation_path = config_data['padmet_from_annotation_path']
    study_from_annot_prefix = config_data['study_from_annot_prefix']
    sbml_from_annotation_path = config_data['sbml_from_annotation_path']
    database_path = config_data['database_path']
    pgdb_from_annotation_path = config_data['pgdb_from_annotation_path']
    studied_organisms_path = config_data['studied_organisms_path']
    model_organisms_path = config_data['model_organisms_path']
    analysis_group_file_path = config_data['analysis_group_file_path']

    #create dicts of the input files for each study and model organism
    all_study_name = set(next(os.walk(studied_organisms_path))[1])
    all_model_name = set(next(os.walk(model_organisms_path))[1])
    all_study_pgdb = dict([(study_name, "{0}/{1}".format(
        pgdb_from_annotation_path, study_name)) if os.path.isdir(
            "{0}/{1}".format(pgdb_from_annotation_path, study_name)) else
                           (study_name, '') for study_name in all_study_name])
    all_study_gbk = dict([(study_name, "{0}/{1}/{1}.gbk".format(
        studied_organisms_path, study_name)) if os.path.isfile(
            "{0}/{1}/{1}.gbk".format(studied_organisms_path, study_name)) else
                          (study_name, '') for study_name in all_study_name])
    #k = folder_name in model_organisms_path, v = path to gbk in this folder, gbk name should be folder_name.gbk
    all_model_gbk = dict([(model_name, "{0}/{1}/{1}.gbk".format(
        model_organisms_path, model_name)) if os.path.isfile(
            "{0}/{1}/{1}.gbk".format(model_organisms_path, model_name)) else
                          (model_name, '') for model_name in all_model_name])

    # Update group file in analysis
    if not os.path.exists(analysis_group_file_path):
        with open(analysis_group_file_path, 'w') as group_file:
            group_writer = csv.writer(group_file, delimiter='\t')
            group_writer.writerow(['all', *all_study_name])
    else:
        groups_data = []
        with open(analysis_group_file_path, 'r') as group_file:
            group_reader = csv.reader(group_file, delimiter='\t')
            for row in group_reader:
                groups = [org_name for org_name in row[1:] if org_name]
                groups_data.append((row[0], groups))

        # Check if 'all' row matches species in study_organisms.
        if sorted(groups_data[0][1]) != sorted(all_study_name):
            with open(analysis_group_file_path, 'w') as group_file:
                group_writer = csv.writer(group_file, delimiter='\t')
                group_writer.writerow(['all', *all_study_name])
                for group in groups_data:
                    if group[0] != 'all':
                        group_writer.writerow([group[0], *group[1]])

    aucome_pool = Pool(nb_cpu_to_use)

    if verbose:
        print('Checking genbank file.')
    study_faa_data = []
    for study_name in all_study_name:
        faa_path = "{0}/{1}/{1}.faa".format(studied_organisms_path, study_name)
        tmp_faa_data = {
            'study_name': study_name,
            'faa_path': faa_path,
            'gbk_file': all_study_gbk[study_name],
            'studied_organisms_path': studied_organisms_path,
            'verbose': verbose
        }
        study_faa_data.append(tmp_faa_data)
    aucome_pool.map(check_create_faa, study_faa_data)

    #k = folder_name in studied_org_path, v = path to faa in this folder, faa name should be folder_name.faa
    all_study_faa = dict([(study_name, "{0}/{1}/{1}.faa".format(
        studied_organisms_path, study_name)) if os.path.isfile(
            "{0}/{1}/{1}.faa".format(studied_organisms_path, study_name)) else
                          (study_name, '') for study_name in all_study_name])

    study_model_data = []
    for model_name in all_model_name:
        faa_path = "{0}/{1}/{1}.faa".format(model_organisms_path, model_name)
        tmp_model_data = {
            'model_name': model_name,
            'faa_path': faa_path,
            'gbk_file': all_model_gbk[model_name],
            'verbose': verbose
        }
        study_model_data.append(tmp_model_data)
    aucome_pool.map(create_faa_model, study_model_data)

    #k = folder_name in model_organisms_path, v = path to faa in this folder, faa name should be folder_name.faa
    all_model_faa = dict([(model_name, "{0}/{1}/{1}.faa".format(
        model_organisms_path, model_name)) if os.path.isfile(
            "{0}/{1}/{1}.faa".format(model_organisms_path, model_name)) else
                          (model_name, '') for model_name in all_model_name])

    study_padmet_data = []
    for study_name in all_study_name:
        padmet_file = "{0}/{1}{2}.padmet".format(padmet_from_annotation_path,
                                                 study_from_annot_prefix,
                                                 study_name)
        pgdb_folder = all_study_pgdb[study_name]
        tmp_padmet_data = {
            'study_name': study_name,
            'pgdb_folder': pgdb_folder,
            'verbose': verbose,
            'padmet_file': padmet_file,
            'database_path': database_path,
            'veryverbose': veryverbose
        }
        study_padmet_data.append(tmp_padmet_data)
    aucome_pool.map(create_padmet_from_pgdb, study_padmet_data)

    all_study_padmet = dict([(study_name, "{0}/{1}{2}.padmet".format(
        padmet_from_annotation_path, study_from_annot_prefix,
        study_name)) if os.path.isfile("{0}/{1}{2}.padmet".format(
            padmet_from_annotation_path, study_from_annot_prefix, study_name))
                             else (study_name, '')
                             for study_name in all_study_name])

    study_sbml_data = []
    for study_name in all_study_padmet:
        sbml_file = "{0}/{1}{2}.sbml".format(sbml_from_annotation_path,
                                             study_from_annot_prefix,
                                             study_name)
        padmet_file = all_study_padmet[study_name]
        tmp_sbml_data = {
            'sbml_file': sbml_file,
            'padmet_file': padmet_file,
            'study_name': study_name,
            'verbose': verbose,
            'veryverbose': veryverbose
        }
        study_sbml_data.append(tmp_sbml_data)
    aucome_pool.map(create_sbml, study_sbml_data)

    #study SBML files are obtained from annotation, they should be in sbml_from_annotation_path
    #k = study_name (== folder_name in studied_org_path or obtained from sbml name), v = path to sbml, study_from_annot_prefix+study_name+.sbml
    all_study_sbml = dict([(study_name, "{0}/{1}{2}.sbml".format(
        sbml_from_annotation_path, study_from_annot_prefix,
        study_name)) if os.path.isfile("{0}/{1}{2}.sbml".format(
            sbml_from_annotation_path, study_from_annot_prefix, study_name))
                           else (study_name, '')
                           for study_name in all_study_name])

    #k = folder_name in model_organisms_path, v = path to sbml in this folder, sbml name should be folder_name.sbml
    all_model_sbml = dict([(model_name, "{0}/{1}/{1}.sbml".format(
        model_organisms_path, model_name)) if os.path.isfile(
            "{0}/{1}/{1}.sbml".format(model_organisms_path, model_name)) else
                           (model_name, '') for model_name in all_model_name])

    if verbose:
        print("Input summary:")
        print("* %s Studied organims:" % (len(all_study_name)))
        for study_name in all_study_name:
            print("%s:" % study_name)
            if all_study_gbk[study_name]:
                print("\tGBK: OK")
            else:
                print("\t[WARNING] No GBK found, should be in {1}/{0}/{0}.gbk".
                      format(study_name, studied_organisms_path))
            if all_study_pgdb[study_name]:
                print("\tPGDB: OK")
            else:
                print("\t[WARNING] No PGDB found, should be in {1}/{0}".format(
                    study_name, pgdb_from_annotation_path))
            if all_study_padmet[study_name]:
                print("\tPADMET: OK")
            else:
                print(
                    "\t[WARNING] No PADMET found, should be in {1}/{2}{0}.padmet"
                    .format(study_name, padmet_from_annotation_path,
                            study_from_annot_prefix))
            if all_study_faa[study_name]:
                print("\tFAA: OK")
            else:
                print("\t[WARNING] No FAA found, should be in {1}/{0}/{0}.faa".
                      format(study_name, studied_organisms_path))
            if all_study_sbml[study_name]:
                print("\tSBML: OK")
            else:
                print(
                    "\t[WARNING] No SBML found, should be in {1}/{2}{0}.sbml".
                    format(study_name, sbml_from_annotation_path,
                           study_from_annot_prefix))
        print("* %s models organims:" % (len(all_model_name)))
        for model_name in all_model_name:
            print("%s:" % model_name)
            if all_model_faa[model_name]:
                print("\tFAA: OK")
            else:
                print("\t[WARNING] No FAA found, should be in {1}/{0}/{0}.faa".
                      format(model_name, model_organisms_path))
            if all_model_sbml[model_name]:
                print("\tSBML: OK")
            else:
                print(
                    "\t[WARNING] No SBML found, should be in {1}/{0}/{0}.sbml".
                    format(model_name, model_organisms_path))

    aucome_pool.close()
    aucome_pool.join()

    end_time = (time.time() - start_time)
    integer_part, decimal_part = str(end_time).split('.')
    check_time = ".".join([integer_part, decimal_part[:3]])

    if verbose:
        print("--- check step done in: %ss ---" % check_time)