Example #1
0
def _build_fabo_padmet(removed_rxn, output_file):
    """Convert the test PGDB with the padmet CLI, drop one reaction and save.

    Runs ``padmet pgdb_to_padmet`` on ``test_data/pgdb`` (with gene
    extraction), loads the resulting ``test.padmet``, deletes the node
    *removed_rxn* and writes the modified network to *output_file*.
    The intermediate ``test.padmet`` is always removed.
    """
    # check_call: a failing CLI step must fail the test here, not later
    # with a confusing "file not found" error.
    subprocess.check_call([
        'padmet', 'pgdb_to_padmet', '--pgdb', 'test_data/pgdb', '--output',
        'test.padmet', '--extract-gene'
    ])
    try:
        padmet_spec = PadmetSpec('test.padmet')
    finally:
        if os.path.exists('test.padmet'):
            os.remove('test.padmet')
    padmet_spec.delNode(removed_rxn)
    padmet_spec.generateFile(output_file)


def _read_presence_columns(tsv_path):
    """Return two sets of IDs flagged '1' in columns 1 and 2 of a TSV file.

    Column 0 holds the ID; an ID is added to the first set when column 1
    equals ``'1'`` and to the second set when column 2 equals ``'1'``.
    A header row is skipped implicitly because its cells are not ``'1'``.
    """
    first, second = set(), set()
    with open(tsv_path, 'r') as tsv_file:
        for row in csv.reader(tsv_file, delimiter='\t'):
            if row[1] == '1':
                first.add(row[0])
            if row[2] == '1':
                second.add(row[0])
    return first, second


def test_compare_padmet_cli():
    """Check the ``padmet compare_padmet`` CLI on two variants of one network.

    Two padmet files are built from the same PGDB, each missing a different
    reaction, then compared through the CLI. The genes, reactions, pathways
    and metabolites tables written to ``output/`` are checked against
    ``FABO_GENES``, ``FABO_RXNS`` and ``FABO_CPDS``.
    """
    try:
        _build_fabo_padmet('ACYLCOASYN-RXN', 'fabo_1.padmet')
        _build_fabo_padmet('ACYLCOADEHYDROG-RXN', 'fabo_2.padmet')

        subprocess.check_call([
            'padmet', 'compare_padmet', '--padmet', 'fabo_1.padmet,fabo_2.padmet',
            '--output', 'output'
        ])

        # Genes: deleting a reaction must not remove any gene.
        genes_fabo_1, genes_fabo_2 = _read_presence_columns('output/genes.tsv')
        assert set(FABO_GENES).issubset(genes_fabo_1)
        assert set(FABO_GENES).issubset(genes_fabo_2)

        # Reactions: each model keeps everything except its deleted reaction.
        reactions_fabo_1, reactions_fabo_2 = _read_presence_columns('output/reactions.tsv')
        expected_fabo_1_rxns = {
            rxn for rxn in FABO_RXNS if rxn != 'ACYLCOASYN-RXN'
        }
        expected_fabo_2_rxns = {
            rxn for rxn in FABO_RXNS if rxn != 'ACYLCOADEHYDROG-RXN'
        }
        assert expected_fabo_1_rxns.issubset(reactions_fabo_1)
        assert expected_fabo_2_rxns.issubset(reactions_fabo_2)

        # Pathways: initialise the association lists so that a header-only
        # file fails on the assertions below instead of raising NameError.
        pathways = []
        pwy_reactions_fabo_1 = []
        pwy_reactions_fabo_2 = []
        with open('output/pathways.tsv', 'r') as pathways_file:
            for row in csv.reader(pathways_file, delimiter='\t'):
                if row[0] != 'pathway':
                    pathways.append(row[0])
                if row[3] != 'fabo_1_rxn_assoc (sep=;)':
                    pwy_reactions_fabo_1 = row[3].split(';')
                if row[4] != 'fabo_2_rxn_assoc (sep=;)':
                    pwy_reactions_fabo_2 = row[4].split(';')

        assert pathways == ['FAO-PWY']
        assert expected_fabo_1_rxns.issubset(set(pwy_reactions_fabo_1))
        assert expected_fabo_2_rxns.issubset(set(pwy_reactions_fabo_2))

        # Metabolites.
        # NOTE(review): this check is intentionally not per-model. The earlier
        # per-column conditions had the form `x != header or x != ''`, which
        # is always true, so every metabolite ID of the file was attributed
        # to both models. That effective behaviour is kept here explicitly.
        # A real per-model check would need per-model expected compound
        # lists (compounds only used by the deleted reaction) - TODO.
        with open('output/metabolites.tsv', 'r') as metabolites_file:
            all_metabolites = {
                row[0]
                for row in csv.reader(metabolites_file, delimiter='\t')
                if row[0] != 'metabolite'
            }

        assert set(FABO_CPDS).issubset(all_metabolites)
    finally:
        # Always clean up generated artifacts, even when an assertion fails.
        for padmet_file in ('fabo_1.padmet', 'fabo_2.padmet'):
            if os.path.exists(padmet_file):
                os.remove(padmet_file)
        shutil.rmtree('output', ignore_errors=True)
Example #2
0
def _parse_gene_associations(genes_file, proteins_file, enzrxns_file, padmet, source, verbose):
    """Parse genes, proteins and enzymatic reactions to link genes to reactions."""
    if verbose: print("parsing genes")
    map_gene_ids = genes_parser(genes_file, padmet, verbose)
    if verbose: print("parsing proteins")
    dict_protein_gene_id = proteins_parser(proteins_file, padmet, verbose)
    mapped_dict_protein_gene_id = map_gene_id(dict_protein_gene_id, map_gene_ids)
    if verbose: print("parsing association enzrxns")
    enzrxns_parser(enzrxns_file, padmet, mapped_dict_protein_gene_id, source, verbose)


def from_pgdb_to_padmet(pgdb_folder, db='NA', version='NA', source='GENOME', extract_gene=False, no_orphan=False, enhanced_db=False, padmetRef_file=None, verbose=False):
    """
    Build a padmet object from the .dat files of a PGDB folder.

    When ``padmetRef_file`` is given, reactions listed in ``reactions.dat``
    are copied from that reference into a new PadmetSpec. Otherwise a
    PadmetRef is built from scratch by parsing classes, compounds, reactions
    and pathways.

    The module-level globals ``regex_purge``, ``regex_xref``,
    ``list_of_relation``, ``def_compart_in`` and ``def_compart_out`` are
    (re)initialised on every call.

    Parameters
    ----------
    pgdb_folder: str
        path to pgdb
    db: str
        pgdb name, default is 'NA' (overridden by the padmetRef value when
        padmetRef_file is given)
    version: str
        pgdb version, default is 'NA' (overridden by the padmetRef value when
        padmetRef_file is given)
    source: str
        tag reactions for traceability, default is 'GENOME' (upper-cased)
    extract_gene: bool
        if true extract genes information
    no_orphan: bool
        if true, remove reactions without genes associated (only applied
        together with extract_gene)
    enhanced_db: bool
        if true, read metabolic-reactions.xml sbml file and add information in final padmet
        (only used when padmetRef_file is not given)
    padmetRef_file: str
        path to padmetRef corresponding to metacyc in padmet format
    verbose: bool
        if True print information

    Returns
    -------
    padmet.padmetSpec or padmet.padmetRef:
        PadmetSpec when padmetRef_file is given, PadmetRef otherwise,
        filled with the pgdb data
    """
    global regex_purge, regex_xref, list_of_relation, def_compart_in, def_compart_out
    # Raw strings: same patterns as before, without invalid escape sequences.
    regex_purge = re.compile(r"<.*?>|\|")
    regex_xref = re.compile(r'^\((?P<DB>\S*)\s*"(?P<ID>\S*)"')
    list_of_relation = []
    def_compart_in = "c"
    def_compart_out = "e"
    #parsing args
    source = source.upper()

    classes_file, compounds_file, proteins_file, reactions_file, enzrxns_file, pathways_file = \
    [os.path.join(pgdb_folder, _file) for _file in ["classes.dat", "compounds.dat", "proteins.dat", "reactions.dat", "enzrxns.dat", "pathways.dat"]]
    metabolic_reactions = os.path.join(pgdb_folder, "metabolic-reactions.xml") if enhanced_db else None
    genes_file = os.path.join(pgdb_folder, "genes.dat") if extract_gene else None

    now = datetime.now()
    today_date = now.strftime("%Y-%m-%d")
    if padmetRef_file:
        padmet = PadmetSpec()
        padmetRef = PadmetRef(padmetRef_file)
        # DB name and version come from the reference, not from the arguments.
        version = padmetRef.info["DB_info"]["version"]
        db = padmetRef.info["DB_info"]["DB"]
        dbNotes = {"PADMET":{"creation":today_date,"version":"2.6"},"DB_info":{"DB":db,"version":version}}
        padmet.setInfo(dbNotes)

        padmet.setPolicy(padmetRef)
        with open(reactions_file, 'r') as f:
            rxns_id = [line.split(" - ")[1] for line in f.read().splitlines() if line.startswith("UNIQUE-ID")]
        nb_rxns = len(rxns_id)
        for count, rxn_id in enumerate(rxns_id, start=1):
            if verbose: print("%s/%s Copy %s" %(count, nb_rxns, rxn_id))
            try:
                padmet.copyNode(padmetRef, rxn_id)
                reconstructionData_id = rxn_id+"_reconstructionData_"+source
                if reconstructionData_id in padmet.dicOfNode and verbose:
                    print("Warning: The reaction %s seems to be already added from the same source %s" %(rxn_id, source))
                reconstructionData = {"SOURCE":[source],"TOOL":["PATHWAYTOOLS"],"CATEGORY":["ANNOTATION"]}
                reconstructionData_rlt = Relation(rxn_id,"has_reconstructionData",reconstructionData_id)
                padmet.dicOfNode[reconstructionData_id] = Node("reconstructionData", reconstructionData_id, reconstructionData)
                padmet._addRelation(reconstructionData_rlt)

            except TypeError:
                # presumably raised by copyNode when the reaction is unknown
                # to the reference - verify against PadmetSpec.copyNode
                print("%s not in padmetRef" %(rxn_id))

        if extract_gene:
            _parse_gene_associations(genes_file, proteins_file, enzrxns_file, padmet, source, verbose)

    else:
        POLICY_IN_ARRAY = [['class','is_a_class','class'], ['class','has_name','name'], ['class','has_xref','xref'], ['class','has_suppData','suppData'],
                        ['compound','is_a_class','class'], ['compound','has_name','name'], ['compound','has_xref','xref'], ['compound','has_suppData','suppData'],
                        ['gene','is_a_class','class'], ['gene','has_name','name'], ['gene','has_xref','xref'], ['gene','has_suppData','suppData'], ['gene','codes_for','protein'],
                        ['pathway','is_a_class','class'], ['pathway','has_name','name'], ['pathway','has_xref','xref'], ['pathway','is_in_pathway','pathway'],
                        ['protein','is_a_class','class'], ['protein','has_name','name'], ['protein','has_xref','xref'], ['protein','has_suppData','suppData'], ['protein','catalyses','reaction'],
                        ['protein','is_in_species','class'],
                        ['reaction','is_a_class','class'], ['reaction','has_name','name'], ['reaction','has_xref','xref'], ['reaction','has_suppData','suppData'], ['reaction','has_reconstructionData','reconstructionData'], ['reaction','is_in_pathway','pathway'],
                        ['reaction','consumes','class','STOICHIOMETRY','X','COMPARTMENT','Y'], ['reaction','produces','class','STOICHIOMETRY','X','COMPARTMENT','Y'],
                        ['reaction','consumes','compound','STOICHIOMETRY','X','COMPARTMENT','Y'], ['reaction','produces','compound','STOICHIOMETRY','X','COMPARTMENT','Y'],
                        ['reaction','consumes','protein','STOICHIOMETRY','X','COMPARTMENT','Y'], ['reaction','produces','protein','STOICHIOMETRY','X','COMPARTMENT','Y'],
                        ['reaction','is_linked_to','gene','SOURCE:ASSIGNMENT','X:Y']]
        dbNotes = {"PADMET":{"creation":today_date,"version":"2.6"},"DB_info":{"DB":db,"version":version}}
        padmet = PadmetRef()
        if verbose: print("setting policy")
        padmet.setPolicy(POLICY_IN_ARRAY)
        if verbose: print("setting dbInfo")
        padmet.setInfo(dbNotes)

        if verbose: print("parsing classes")
        classes_parser(classes_file, padmet, verbose)

        if verbose: print("parsing compounds")
        compounds_parser(compounds_file, padmet, verbose)

        if verbose: print("parsing reactions")
        reactions_parser(reactions_file, padmet, extract_gene, source, verbose)

        if verbose: print("parsing pathways")
        pathways_parser(pathways_file, padmet, verbose)

        if extract_gene:
            _parse_gene_associations(genes_file, proteins_file, enzrxns_file, padmet, source, verbose)

        if metabolic_reactions is not None:
            if verbose: print("enhancing db from metabolic-reactions.xml")
            padmet = enhance_db(metabolic_reactions, padmet, extract_gene, verbose)

    # Index the relations accumulated in the module-level list by both ends.
    for rlt in list_of_relation:
        padmet.dicOfRelationIn.setdefault(rlt.id_in, []).append(rlt)
        padmet.dicOfRelationOut.setdefault(rlt.id_out, []).append(rlt)

    if extract_gene and no_orphan:
        all_reactions = [node for node in padmet.dicOfNode.values() if node.type == "reaction"]
        # .get: a reaction without any recorded relation has no gene link
        # either, so it is treated as orphan instead of raising KeyError.
        rxn_to_del = [
            r for r in all_reactions
            if not any(rlt.type == "is_linked_to" for rlt in padmet.dicOfRelationIn.get(r.id, []))
        ]
        for rxn in rxn_to_del:
            padmet.delNode(rxn.id)
        if verbose:
            print("%s/%s orphan reactions (without gene association) deleted" %(len(rxn_to_del), len(all_reactions)))
        all_genes_linked = {rlt.id_out for rlt in padmet.getAllRelation() if rlt.type == "is_linked_to"}
        all_genes = {node.id for node in padmet.dicOfNode.values() if node.type == "gene"}
        orphan_genes = all_genes - all_genes_linked
        for gene_id in orphan_genes:
            #if verbose: print("Removing gene without gene assoc %s" %gene_id)
            padmet.dicOfNode.pop(gene_id)
        if verbose:
            print("%s/%s orphan genes (not linked to any reactions) deleted" %(len(orphan_genes), len(all_genes)))

    # Drop reactions that only consume or only produce: they have a single
    # kind of consumes/produces relation.
    rxns = [node.id for node in padmet.dicOfNode.values() if node.type == "reaction"]
    for rxn_id in rxns:
        cp_rlts = {rlt.type for rlt in padmet.dicOfRelationIn.get(rxn_id, []) if rlt.type in ("consumes", "produces")}
        if len(cp_rlts) == 1:
            print("rxn only consume or produce, transport ???: %s" %rxn_id)
            padmet.delNode(rxn_id)

    return padmet