def test_lipoprotein(self):
    """Parsing ENZYME record for lipoprotein lipase (3.1.1.34)."""
    filename = os.path.join('Enzymes', 'lipoprotein.txt')
    # Use a context manager so the handle is closed even if Enzyme.read
    # raises; the original open()/close() pair leaked the handle on error.
    with open(filename) as handle:
        record = Enzyme.read(handle)
    self.assertEqual(record["ID"], "3.1.1.34")
    self.assertEqual(record["DE"], "Lipoprotein lipase.")
    self.assertEqual(len(record["AN"]), 3)
    self.assertEqual(record["AN"][0], "Clearing factor lipase.")
    self.assertEqual(record["AN"][1], "Diacylglycerol lipase.")
    self.assertEqual(record["AN"][2], "Diglyceride lipase.")
    self.assertEqual(record["CA"],
                     "Triacylglycerol + H(2)O = diacylglycerol + a carboxylate.")
    self.assertEqual(record["CC"][0],
                     'Hydrolyzes triacylglycerols in chylomicrons and very low-density lipoproteins (VLDL).')
    self.assertEqual(record["CC"][1], "Also hydrolyzes diacylglycerol.")
    self.assertEqual(record['PR'], ["PDOC00110"])
    self.assertEqual(record["DR"][0], ["P11151", "LIPL_BOVIN"])
    self.assertEqual(record["DR"][1], ["P11153", "LIPL_CAVPO"])
    self.assertEqual(record["DR"][2], ["P11602", "LIPL_CHICK"])
    self.assertEqual(record["DR"][3], ["P55031", "LIPL_FELCA"])
    self.assertEqual(record["DR"][4], ["P06858", "LIPL_HUMAN"])
    self.assertEqual(record["DR"][5], ["P11152", "LIPL_MOUSE"])
    self.assertEqual(record["DR"][6], ["O46647", "LIPL_MUSVI"])
    self.assertEqual(record["DR"][7], ["P49060", "LIPL_PAPAN"])
    self.assertEqual(record["DR"][8], ["P49923", "LIPL_PIG"])
    self.assertEqual(record["DR"][9], ["Q06000", "LIPL_RAT"])
    self.assertEqual(record["DR"][10], ["Q29524", "LIPL_SHEEP"])
def test_lipoprotein(self):
    """Parsing ENZYME record for lipoprotein lipase (3.1.1.34)."""
    with open(os.path.join("Enzymes", "lipoprotein.txt")) as handle:
        record = Enzyme.read(handle)
    self.assertEqual(record["ID"], "3.1.1.34")
    self.assertEqual(record["DE"], "Lipoprotein lipase.")
    self.assertEqual(len(record["AN"]), 3)
    # Alternative names, in file order.
    alt_names = ["Clearing factor lipase.",
                 "Diacylglycerol lipase.",
                 "Diglyceride lipase."]
    for position, name in enumerate(alt_names):
        self.assertEqual(record["AN"][position], name)
    self.assertEqual(
        record["CA"],
        "Triacylglycerol + H(2)O = diacylglycerol + a carboxylate.")
    self.assertEqual(
        record["CC"][0],
        "Hydrolyzes triacylglycerols in chylomicrons and very low-density lipoproteins (VLDL).",
    )
    self.assertEqual(record["CC"][1], "Also hydrolyzes diacylglycerol.")
    self.assertEqual(record["PR"], ["PDOC00110"])
    # Swiss-Prot cross-references: (accession, entry name) pairs.
    cross_refs = [["P11151", "LIPL_BOVIN"],
                  ["P11153", "LIPL_CAVPO"],
                  ["P11602", "LIPL_CHICK"],
                  ["P55031", "LIPL_FELCA"],
                  ["P06858", "LIPL_HUMAN"],
                  ["P11152", "LIPL_MOUSE"],
                  ["O46647", "LIPL_MUSVI"],
                  ["P49060", "LIPL_PAPAN"],
                  ["P49923", "LIPL_PIG"],
                  ["Q06000", "LIPL_RAT"],
                  ["Q29524", "LIPL_SHEEP"]]
    for position, reference in enumerate(cross_refs):
        self.assertEqual(record["DR"][position], reference)
    self.assertTrue(
        str(record).startswith("ID: 3.1.1.34\nDE: Lipoprotein lipase.\n"),
        "Did not expect:\n%s" % record,
    )
def test_lipoprotein(self):
    """Parsing ENZYME record for lipoprotein lipase (3.1.1.34)."""
    filename = os.path.join('Enzymes', 'lipoprotein.txt')
    # Context manager replaces the manual open()/close() pair, which
    # leaked the handle whenever a parse error or assertion fired.
    with open(filename) as handle:
        record = Enzyme.read(handle)
    self.assertEqual(record["ID"], "3.1.1.34")
    self.assertEqual(record["DE"], "Lipoprotein lipase.")
    self.assertEqual(len(record["AN"]), 3)
    self.assertEqual(record["AN"][0], "Clearing factor lipase.")
    self.assertEqual(record["AN"][1], "Diacylglycerol lipase.")
    self.assertEqual(record["AN"][2], "Diglyceride lipase.")
    self.assertEqual(record["CA"],
                     "Triacylglycerol + H(2)O = diacylglycerol + a carboxylate.")
    self.assertEqual(record["CC"][0],
                     'Hydrolyzes triacylglycerols in chylomicrons and very low-density lipoproteins (VLDL).')
    self.assertEqual(record["CC"][1], "Also hydrolyzes diacylglycerol.")
    self.assertEqual(record['PR'], ["PDOC00110"])
    self.assertEqual(record["DR"][0], ["P11151", "LIPL_BOVIN"])
    self.assertEqual(record["DR"][1], ["P11153", "LIPL_CAVPO"])
    self.assertEqual(record["DR"][2], ["P11602", "LIPL_CHICK"])
    self.assertEqual(record["DR"][3], ["P55031", "LIPL_FELCA"])
    self.assertEqual(record["DR"][4], ["P06858", "LIPL_HUMAN"])
    self.assertEqual(record["DR"][5], ["P11152", "LIPL_MOUSE"])
    self.assertEqual(record["DR"][6], ["O46647", "LIPL_MUSVI"])
    self.assertEqual(record["DR"][7], ["P49060", "LIPL_PAPAN"])
    self.assertEqual(record["DR"][8], ["P49923", "LIPL_PIG"])
    self.assertEqual(record["DR"][9], ["Q06000", "LIPL_RAT"])
    self.assertEqual(record["DR"][10], ["Q29524", "LIPL_SHEEP"])
def test_valine(self):
    """Parsing ENZYME record for valine decarboxylase (4.1.1.14)."""
    filename = os.path.join('Enzymes', 'valine.txt')
    # BUG FIX: the handle was opened but never closed; 'with' guarantees
    # it is released even on assertion failure.
    with open(filename) as handle:
        record = Enzyme.read(handle)
    self.assertEqual(record["ID"], "4.1.1.14")
    self.assertEqual(record["DE"], "Valine decarboxylase.")
    self.assertEqual(record["CA"], "L-valine = 2-methylpropanamine + CO(2).")
    self.assertEqual(record["CF"], "Pyridoxal 5'-phosphate.")
    self.assertEqual(record["CC"], ["Also acts on L-leucine."])
    self.assertEqual(len(record["DR"]), 0)
def get_EC_RXNs():
    """Build an EC-number ontology dictionary from the ExPASy ENZYME data.

    Reads the release identifier from ``enzclass.txt`` and every record from
    ``enzyme.dat`` (both fetched with curl unless ``args.test`` is set, in
    which case local copies are used), then writes the result to
    ``EBI_EC_ontologyDictionary.json``.

    Relies on the module-level ``args`` (``test``/``summary`` flags) and
    ``timestamp`` values.
    """
    ec_count = 0
    ec_ontologyDictionary_filename = 'EBI_EC_ontologyDictionary.json'
    # Get the EC release version.
    if not args.test:  # idiomatic truth test instead of '== False'
        ebi_ec_call = 'curl ftp://ftp.ebi.ac.uk/pub/databases/enzyme/enzclass.txt'
        ebi_ec_release = os.popen(ebi_ec_call).read()
    else:
        with open('enzclass.txt', 'r') as myfile:
            ebi_ec_release = myfile.read()
    # Line 8 of enzclass.txt carries the release info; take its second token.
    ebi_ec_release = ebi_ec_release.split("\n")[7].split()[1]
    # Create the ontology dictionary skeleton.
    ec_dict = {
        'data_version': ebi_ec_release,
        'date': timestamp,
        'format_version': 'N/A',
        'ontology': 'ec_orthology',
        'term_hash': {}
    }
    # Parse the enzyme data.
    ebi_ec_enzyme = 'enzyme.dat'
    if not args.test:
        ebi_ec_call = 'curl ftp://ftp.ebi.ac.uk/pub/databases/enzyme/enzyme.dat > enzyme.dat'
        os.system(ebi_ec_call)
    # BUG FIX: the parse handle was opened inline and never closed.
    with open(ebi_ec_enzyme) as handle:
        for record in Enzyme.parse(handle):
            ec_dict['term_hash'][record['ID']] = {
                'id': record['ID'],
                'name': record['DE'],
                'synonyms': record['AN']
            }
            ec_count += 1
    # Save the JSON file.
    with open(ec_ontologyDictionary_filename, 'w') as outfile:
        json.dump(ec_dict, outfile, indent=2)
    # Print a one-line summary if requested.
    if args.summary:
        print("ec_orthology", ec_count, ebi_ec_release,
              ec_ontologyDictionary_filename, sep="\t")
def test_valine(self):
    """Parsing ENZYME record for valine decarboxylase (4.1.1.14)."""
    filename = os.path.join('Enzymes', 'valine.txt')
    # Context manager replaces the manual open()/close() pair, which
    # leaked the handle whenever an assertion fired before close().
    with open(filename) as handle:
        record = Enzyme.read(handle)
    self.assertEqual(record["ID"], "4.1.1.14")
    self.assertEqual(record["DE"], "Valine decarboxylase.")
    self.assertEqual(record["CA"], "L-valine = 2-methylpropanamine + CO(2).")
    self.assertEqual(record["CF"], "Pyridoxal 5'-phosphate.")
    self.assertEqual(record["CC"], ["Also acts on L-leucine."])
    self.assertEqual(len(record["DR"]), 0)
def get_expasy_enzyme():
    """Download enzyme.dat from ExPASy and build a list of enzyme records.

    Each returned record is a dict keyed by: 'ECNumber', 'PreferedName',
    'Reaction(s)' (list of reaction strings), 'Substrates'/'Products'
    (compound name -> ChEBI id via link_compound2chebi) and 'UniProt'
    (accession -> entry name).
    """
    url = "ftp://ftp.expasy.org/databases/enzyme/enzyme.dat"
    enzyme = urllib.request.urlretrieve(url)
    enz_records = []
    # Close the downloaded file deterministically.
    with open(enzyme[0], 'r') as handle:
        for record in bee.parse(handle):
            # One record per enzyme, keyed by its EC number.
            enz_rec = {}
            enz_rec['PreferedName'] = record['DE']
            enz_rec['ECNumber'] = record['ID']
            enz_rec['Reaction(s)'] = []
            enz_rec['Substrates'] = {}
            enz_rec['Products'] = {}
            enz_rec['UniProt'] = {}
            # Split on '.' to separate multiple reactions.
            reaction1 = record['CA'].split('.')
            for rxn in reaction1:
                if len(reaction1) > 2:
                    # Drop the leading numbering prefix of multi-reaction
                    # entries -- presumably "(1) " etc.; TODO confirm.
                    rxn = rxn[3:]
                enz_rec['Reaction(s)'].append(rxn)
                # Split the reaction into [substrates, products].
                constituents = rxn.split('=')
                # Split each side on '+' but not on '(+)'.
                r = re.compile(r'(?:[^\+(]|\([^)]*\))+')
                for sub in r.findall(constituents[0]):
                    sub = replace_strings(sub.strip())
                    enz_rec['Substrates'][sub] = link_compound2chebi(sub)
                for prod in r.findall(constituents[-1]):
                    prod = replace_strings(prod.strip())
                    enz_rec['Products'][prod] = link_compound2chebi(prod)
            # accession -> entry-name pairs for each cross-referenced protein.
            for unpid in record['DR']:
                enz_rec['UniProt'][unpid[0]] = unpid[1]
            # BUG FIX: the original appended enz_rec both before and after
            # filling it in, so every enzyme appeared twice (same dict
            # object) in the returned list. Append exactly once.
            enz_records.append(enz_rec)
    return enz_records
def test_lactate(self):
    """Parsing ENZYME record for lactate racemase (5.1.2.1)."""
    filename = os.path.join('Enzymes', 'lactate.txt')
    # BUG FIX: the handle was opened but never closed; 'with' releases it.
    with open(filename) as handle:
        record = Enzyme.read(handle)
    self.assertEqual(record["ID"], "5.1.2.1")
    self.assertEqual(record["DE"], "Lactate racemase.")
    self.assertEqual(len(record["AN"]), 3)
    self.assertEqual(record["AN"][0], "Hydroxyacid racemase.")
    self.assertEqual(record["AN"][1], "Lactic acid racemase.")
    self.assertEqual(record["AN"][2], "Lacticoracemase.")
    self.assertEqual(record["CA"], "(S)-lactate = (R)-lactate.")
    self.assertEqual(len(record["DR"]), 0)
def test_lactate(self):
    """Parsing ENZYME record for lactate racemase (5.1.2.1)."""
    filename = os.path.join('Enzymes', 'lactate.txt')
    # BUG FIX: the handle was opened but never closed; 'with' releases it.
    with open(filename) as handle:
        record = Enzyme.read(handle)
    self.assertEqual(record["ID"], "5.1.2.1")
    self.assertEqual(record["DE"], "Lactate racemase.")
    self.assertEqual(len(record["AN"]), 3)
    self.assertEqual(record["AN"][0], "Hydroxyacid racemase.")
    self.assertEqual(record["AN"][1], "Lactic acid racemase.")
    self.assertEqual(record["AN"][2], "Lacticoracemase.")
    self.assertEqual(record["CA"], "(S)-lactate = (R)-lactate.")
    self.assertEqual(len(record["DR"]), 0)
def test_valine(self):
    """Parsing ENZYME record for valine decarboxylase (4.1.1.14)."""
    filename = os.path.join('Enzymes', 'valine.txt')
    # Context manager replaces the manual open()/close() pair, which
    # leaked the handle whenever an assertion fired before close().
    with open(filename) as handle:
        record = Enzyme.read(handle)
    self.assertEqual(record["ID"], "4.1.1.14")
    self.assertEqual(record["DE"], "Valine decarboxylase.")
    self.assertEqual(record["CA"], "L-valine = 2-methylpropanamine + CO(2).")
    self.assertEqual(record["CF"], "Pyridoxal 5'-phosphate.")
    self.assertEqual(record["CC"], ["Also acts on L-leucine."])
    self.assertEqual(len(record["DR"]), 0)
    # The string representation leads with the ID and description lines.
    self.assertTrue(str(record).startswith("ID: 4.1.1.14\nDE: Valine decarboxylase.\n"),
                    "Did not expect:\n%s" % record)
def test_parse_many(self):
    """Check parse function with multiple records."""
    filenames = ("Enzymes/lipoprotein.txt",
                 "Enzymes/proline.txt",
                 "Enzymes/valine.txt")
    # Concatenate the three single-record files into one in-memory stream.
    chunks = []
    for name in filenames:
        with open(name) as handle:
            chunks.append(handle.read())
    records = list(Enzyme.parse(StringIO("".join(chunks))))
    self.assertEqual(len(records), 3)
    # Records come back in file order.
    expected_ids = ["3.1.1.34", "5.1.1.4", "4.1.1.14"]
    for rec, ec_number in zip(records, expected_ids):
        self.assertEqual(rec["ID"], ec_number)
def test_valine(self):
    """Parsing ENZYME record for valine decarboxylase (4.1.1.14)."""
    filename = os.path.join("Enzymes", "valine.txt")
    # Context manager replaces the manual open()/close() pair, which
    # leaked the handle whenever an assertion fired before close().
    with open(filename) as handle:
        record = Enzyme.read(handle)
    self.assertEqual(record["ID"], "4.1.1.14")
    self.assertEqual(record["DE"], "Valine decarboxylase.")
    self.assertEqual(record["CA"], "L-valine = 2-methylpropanamine + CO(2).")
    self.assertEqual(record["CF"], "Pyridoxal 5'-phosphate.")
    self.assertEqual(record["CC"], ["Also acts on L-leucine."])
    self.assertEqual(len(record["DR"]), 0)
    # The string representation leads with the ID and description lines.
    self.assertTrue(str(record).startswith("ID: 4.1.1.14\nDE: Valine decarboxylase.\n"),
                    "Did not expect:\n%s" % record)
def get_expasy_enzyme():
    """Download enzyme.dat from ExPASy and build a list of enzyme records.

    Each returned record is a dict keyed by: 'ECNumber', 'PreferedName',
    'Reaction(s)' (list of reaction strings), 'Substrates'/'Products'
    (compound name -> ChEBI id via link_compound2chebi) and 'UniProt'
    (accession -> entry name).
    """
    url = "ftp://ftp.expasy.org/databases/enzyme/enzyme.dat"
    enzyme = urllib.request.urlretrieve(url)
    enz_records = []
    # Close the downloaded file deterministically.
    with open(enzyme[0], 'r') as handle:
        for record in bee.parse(handle):
            # One record per enzyme, keyed by its EC number.
            enz_rec = {}
            enz_rec['PreferedName'] = record['DE']
            enz_rec['ECNumber'] = record['ID']
            enz_rec['Reaction(s)'] = []
            enz_rec['Substrates'] = {}
            enz_rec['Products'] = {}
            enz_rec['UniProt'] = {}
            # Split on '.' to separate multiple reactions.
            reaction1 = record['CA'].split('.')
            for rxn in reaction1:
                if len(reaction1) > 2:
                    # Drop the leading numbering prefix of multi-reaction
                    # entries -- presumably "(1) " etc.; TODO confirm.
                    rxn = rxn[3:]
                enz_rec['Reaction(s)'].append(rxn)
                # Split the reaction into [substrates, products].
                constituents = rxn.split('=')
                # Split each side on '+' but not on '(+)'.
                r = re.compile(r'(?:[^\+(]|\([^)]*\))+')
                for sub in r.findall(constituents[0]):
                    sub = replace_strings(sub.strip())
                    enz_rec['Substrates'][sub] = link_compound2chebi(sub)
                for prod in r.findall(constituents[-1]):
                    prod = replace_strings(prod.strip())
                    enz_rec['Products'][prod] = link_compound2chebi(prod)
            # accession -> entry-name pairs for each cross-referenced protein.
            for unpid in record['DR']:
                enz_rec['UniProt'][unpid[0]] = unpid[1]
            # BUG FIX: the original appended enz_rec both before and after
            # filling it in, so every enzyme appeared twice (same dict
            # object) in the returned list. Append exactly once. The unused
            # 'count' accumulator was dropped.
            enz_records.append(enz_rec)
    return enz_records
def test_lactate(self):
    """Parsing ENZYME record for lactate racemase (5.1.2.1)."""
    path = os.path.join("Enzymes", "lactate.txt")
    with open(path) as handle:
        record = Enzyme.read(handle)
    self.assertEqual(record["ID"], "5.1.2.1")
    self.assertEqual(record["DE"], "Lactate racemase.")
    # Three alternative names are expected, in file order.
    alternative_names = record["AN"]
    self.assertEqual(len(alternative_names), 3)
    self.assertEqual(alternative_names[0], "Hydroxyacid racemase.")
    self.assertEqual(alternative_names[1], "Lactic acid racemase.")
    self.assertEqual(alternative_names[2], "Lacticoracemase.")
    self.assertEqual(record["CA"], "(S)-lactate = (R)-lactate.")
    # No database cross-references for this entry.
    self.assertEqual(len(record["DR"]), 0)
    self.assertTrue(
        str(record).startswith("ID: 5.1.2.1\nDE: Lactate racemase.\n"),
        "Did not expect:\n%s" % record)
def test_lactate(self):
    """Parsing ENZYME record for lactate racemase (5.1.2.1)."""
    filename = os.path.join('Enzymes', 'lactate.txt')
    # Context manager replaces the manual open()/close() pair, which
    # leaked the handle whenever an assertion fired before close().
    with open(filename) as handle:
        record = Enzyme.read(handle)
    self.assertEqual(record["ID"], "5.1.2.1")
    self.assertEqual(record["DE"], "Lactate racemase.")
    self.assertEqual(len(record["AN"]), 3)
    self.assertEqual(record["AN"][0], "Hydroxyacid racemase.")
    self.assertEqual(record["AN"][1], "Lactic acid racemase.")
    self.assertEqual(record["AN"][2], "Lacticoracemase.")
    self.assertEqual(record["CA"], "(S)-lactate = (R)-lactate.")
    self.assertEqual(len(record["DR"]), 0)
    # The string representation leads with the ID and description lines.
    self.assertTrue(str(record).startswith("ID: 5.1.2.1\nDE: Lactate racemase.\n"),
                    "Did not expect:\n%s" % record)
def test_proline(self):
    """Parsing ENZYME record for proline racemase (5.1.1.4)."""
    filename = os.path.join('Enzymes', 'proline.txt')
    # BUG FIX: the handle was opened but never closed; 'with' releases it.
    with open(filename) as handle:
        record = Enzyme.read(handle)
    self.assertEqual(record["ID"], "5.1.1.4")
    self.assertEqual(record["DE"], "Proline racemase.")
    self.assertEqual(record["CA"], "L-proline = D-proline.")
    self.assertEqual(len(record["DR"]), 9)
    self.assertEqual(record["DR"][0], ["Q17ZY4", "PRAC_CLOD6"])
    self.assertEqual(record["DR"][1], ["A8DEZ8", "PRAC_CLODI"])
    self.assertEqual(record["DR"][2], ["Q4DA80", "PRCMA_TRYCR"])
    self.assertEqual(record["DR"][3], ["Q868H8", "PRCMB_TRYCR"])
    self.assertEqual(record["DR"][4], ["Q3SX04", "PRCM_BOVIN"])
    self.assertEqual(record["DR"][5], ["Q96EM0", "PRCM_HUMAN"])
    self.assertEqual(record["DR"][6], ["Q9CXA2", "PRCM_MOUSE"])
    self.assertEqual(record["DR"][7], ["Q5RC28", "PRCM_PONAB"])
    self.assertEqual(record["DR"][8], ["Q66II5", "PRCM_XENTR"])
def test_proline(self):
    """Parsing ENZYME record for proline racemase (5.1.1.4)."""
    filename = os.path.join('Enzymes', 'proline.txt')
    # BUG FIX: the handle was opened but never closed; 'with' releases it.
    with open(filename) as handle:
        record = Enzyme.read(handle)
    self.assertEqual(record["ID"], "5.1.1.4")
    self.assertEqual(record["DE"], "Proline racemase.")
    self.assertEqual(record["CA"], "L-proline = D-proline.")
    self.assertEqual(len(record["DR"]), 9)
    self.assertEqual(record["DR"][0], ["Q17ZY4", "PRAC_CLOD6"])
    self.assertEqual(record["DR"][1], ["A8DEZ8", "PRAC_CLODI"])
    self.assertEqual(record["DR"][2], ["Q4DA80", "PRCMA_TRYCR"])
    self.assertEqual(record["DR"][3], ["Q868H8", "PRCMB_TRYCR"])
    self.assertEqual(record["DR"][4], ["Q3SX04", "PRCM_BOVIN"])
    self.assertEqual(record["DR"][5], ["Q96EM0", "PRCM_HUMAN"])
    self.assertEqual(record["DR"][6], ["Q9CXA2", "PRCM_MOUSE"])
    self.assertEqual(record["DR"][7], ["Q5RC28", "PRCM_PONAB"])
    self.assertEqual(record["DR"][8], ["Q66II5", "PRCM_XENTR"])
def _create_ec_num_enzyme_name_association_file(enzdat_file, ec_id_file):
    """Map EC numbers to enzyme names and dump the mapping as JSON.

    Uses Biopython's Enz parser on the enzyme.dat file, which contains all
    fully specified enzymes (i.e., with 4 levels) and annotations, builds an
    {EC number: description} dictionary, and writes it to ``ec_id_file``
    for easy reading later.

    :param enzdat_file: path to enzyme.dat file
    :param ec_id_file: path to JSON dump
    :return: None
    """
    with open(enzdat_file) as handle:
        ec_to_name = {record['ID']: record['DE']
                      for record in Enz.parse(handle)}
    with open(ec_id_file, 'w') as dump_handle:
        json.dump(ec_to_name, dump_handle)
def keggMet(tag):
    """Fetch KEGG entry ``lpn:<tag>`` and write a filtered copy under kegg/.

    Caches the raw entry to a file named ``lpn:<tag>``, then copies only the
    ORTHOLOGY, PATHWAY, MOTIF and REACTIONS sections (each copied until its
    terminating keyword appears) plus a "GENE NAME:" line derived from the
    NAME field into ``kegg/lpn:<tag>``.
    """
    request = REST.kegg_get("lpn:" + tag)
    # BUG FIX: files were opened without ever being closed; use 'with'.
    with open("lpn:" + tag, "w") as cache:
        cache.write(request.read())
    with open("lpn:" + tag) as parse_handle:
        records = Enzyme.parse(parse_handle)
        record = list(records)[0]  # kept for parity with the original; unused below
    # Section flags: 1 = outside the section, 0 = currently copying it.
    # BUG FIX: flagReactions was never initialised, so the REACTIONS test
    # below raised NameError on the first line that did not contain
    # "REACTIONS".
    flagPath = flagMotifs = flagOrtho = flagReactions = 1
    with open("lpn:" + tag, "r") as ofile, open("kegg/lpn:" + tag, "w") as owrite:
        for line in ofile:
            # BUG FIX: the original tested 'flagPath == 0' here; every other
            # section tests its own flag, so this must be flagOrtho.
            if "ORTHOLOGY" in line or flagOrtho == 0:
                if flagOrtho == 1:
                    flagOrtho = 0
                    owrite.write(line)
                elif "ORGANISM" in line:
                    flagOrtho = 1
                else:
                    owrite.write(line)
            if "PATHWAY" in line or flagPath == 0:
                if flagPath == 1:
                    flagPath = 0
                    owrite.write(line)
                elif "BRITE" in line or "MODULE" in line:
                    flagPath = 1
                else:
                    owrite.write(line)
            if "MOTIF" in line or flagMotifs == 0:
                if flagMotifs == 1:
                    flagMotifs = 0
                    owrite.write(line)
                elif "DBLINKS" in line:
                    flagMotifs = 1
                else:
                    owrite.write(line)
            if "NAME" in line:
                name = re.split(r' ', line)
                owrite.write("GENE NAME: " + name[-1])
            if "REACTIONS" in line or flagReactions == 0:
                if flagReactions == 1:
                    flagReactions = 0
                    owrite.write(line)
                elif "COMPOUND" in line:
                    flagReactions = 1
                else:
                    owrite.write(line)
def test_proline(self):
    """Parsing ENZYME record for proline racemase (5.1.1.4)."""
    with open(os.path.join("Enzymes", "proline.txt")) as handle:
        record = Enzyme.read(handle)
    self.assertEqual(record["ID"], "5.1.1.4")
    self.assertEqual(record["DE"], "Proline racemase.")
    self.assertEqual(record["CA"], "L-proline = D-proline.")
    # Swiss-Prot cross-references: (accession, entry name) pairs in order.
    expected_refs = [
        ["Q17ZY4", "PRAC_CLOD6"],
        ["A8DEZ8", "PRAC_CLODI"],
        ["Q4DA80", "PRCMA_TRYCR"],
        ["Q868H8", "PRCMB_TRYCR"],
        ["Q3SX04", "PRCM_BOVIN"],
        ["Q96EM0", "PRCM_HUMAN"],
        ["Q9CXA2", "PRCM_MOUSE"],
        ["Q5RC28", "PRCM_PONAB"],
        ["Q66II5", "PRCM_XENTR"],
    ]
    self.assertEqual(len(record["DR"]), 9)
    for position, reference in enumerate(expected_refs):
        self.assertEqual(record["DR"][position], reference)
    self.assertTrue(
        str(record).startswith("ID: 5.1.1.4\nDE: Proline racemase.\n"),
        "Did not expect:\n%s" % record)
def get_EC_RXNs():
    """Build an {EC number: term} dictionary from the ExPASy ENZYME data.

    Reads the release identifier from ``enzclass.txt`` and every record from
    ``enzyme.dat`` (both fetched with curl unless ``args.test`` is set, in
    which case local copies are used), then writes the result to
    ``EBI_EC_ontologyDictionary.json``.

    Relies on the module-level ``args`` and ``timestamp`` values.
    """
    ec_dict = {}
    # Version: line 8 of enzclass.txt carries the release info.
    if not args.test:  # idiomatic truth test instead of '== False'
        ebi_ec_call = 'curl ftp://ftp.ebi.ac.uk/pub/databases/enzyme/enzclass.txt'
        ebi_ec_release = os.popen(ebi_ec_call).read()
    else:
        with open('enzclass.txt', 'r') as myfile:
            ebi_ec_release = myfile.read()
    ebi_ec_release = ebi_ec_release.split("\n")[7].split()[1]
    ec_dict = {
        'data_version': ebi_ec_release,
        'date': timestamp,
        'format_version': 'N/A',
        'ontology': 'ec_orthology'
    }
    # Parser.
    ebi_ec_enzyme = 'enzyme.dat'
    if not args.test:
        ebi_ec_call = 'curl ftp://ftp.ebi.ac.uk/pub/databases/enzyme/enzyme.dat > enzyme.dat'
        os.system(ebi_ec_call)
    # BUG FIX: the parse handle was opened inline and never closed.
    with open(ebi_ec_enzyme) as handle:
        for record in Enzyme.parse(handle):
            ec_dict[record['ID']] = {
                'id': record['ID'],
                'name': record['DE'],
                'synonyms': record['AN']
            }
    with open('EBI_EC_ontologyDictionary.json', 'w') as outfile:
        json.dump(ec_dict, outfile, indent=2)
def getEC():
    """Print the list of EC numbers found in the local enzyme.dat file."""
    # BUG FIX: the handle was opened but never closed; 'with' releases it
    # once the (lazy) parser has been fully consumed.
    with open("enzyme.dat") as handle:
        ecnumbers = [record["ID"] for record in Enzyme.parse(handle)]
    print(ecnumbers)
def get_knndataset(cdhit, output_dir, database):
    """Build the kNN training dataset from the ExPASy ENZYME database.

    Downloads enzyme.dat if missing, writes database/enzyme/enzyme.tsv
    (EC number, description, '|'-separated sequence IDs, transferred flag),
    then — using Swiss-Prot metadata — selects, de-duplicates and
    redundancy-reduces (via cd-hit, run in child processes) the sequences
    for the six main EC classes and for every EC subclass with >= 10
    members.

    NOTE(review): uses the Python 2 print statement — this function targets
    Python 2.

    :param cdhit: path to the cd-hit executable
    :param output_dir: directory receiving the per-class list/fasta files
    :param database: BLAST database name passed to blastdbcmd
    """
    #Reads a Expasy Enzyme .dat file and writes a numpy data frame where the first column is
    #EC number, the second column is the reaction description, the third column is the associated
    #sequenceID ids separated by '|', and the fourth column indicates whether the reactions described
    #by this EC have been transferred to other EC numbers.
    if not os.path.exists(os.path.join("database", "enzyme", "enzyme.dat")):
        curl_enzyme = os.path.join("ftp://ftp.expasy.org", "databases",
                                   "enzyme", "enzyme.dat")
        subprocess.check_output("wget -cq -P database/enzyme " + curl_enzyme,
                                shell=True)
    if not os.path.exists(os.path.join("database", "enzyme", "enzyme.dat")):
        print("%s\n", "Missing enzyme database!")
        exit(0)
    input_name = os.path.join("database", "enzyme", "enzyme.dat")
    output_name = os.path.join("database", "enzyme", "enzyme.tsv")
    records = Enzyme.parse(open(input_name))
    out = dict()  # dict of dicts, first key: EC number, second key: field
    transferred = dict()  #dict of lists
    for record in records:
        if 'Transferred entry:' in record['DE']:
            # Normalise "Transferred entry: x.x.x.x, y.y.y.y and z.z.z.z"
            # into a bare list of target EC numbers.
            record['DE'] = record['DE'].rstrip('.')
            record['DE'] = record['DE'].replace('Transferred entry:', ' ')
            record['DE'] = record['DE'].replace(',', ' ')
            record['DE'] = record['DE'].replace('and', ' ')
            point_to = record['DE'].split()
            transferred[record['ID']] = point_to
        else:
            out[record['ID']] = dict()
            out[record['ID']]['sequenceID'] = '|'.join(
                [x[0] for x in record['DR']])
            out[record['ID']]['description'] = record['DE']
            out[record['ID']]['transferred'] = False
    # Transferred entries inherit the sequence IDs of their target entries.
    for id in transferred:
        out[id] = dict()
        out[id]['sequenceID'] = '|'.join(
            [out[x]['sequenceID'] for x in transferred[id]])
        out[id]['description'] = 'Transferred entry: ' + ' '.join(
            transferred[id])
        out[id]['transferred'] = True
    df = pd.DataFrame.from_dict(out, orient='index')
    df.index.name = 'EC'
    # write all data in a enzyme.csv file
    df.to_csv(output_name, sep='\t')
    # ignore EC numbers with no sequenceID ids associated
    df.dropna(subset=['sequenceID'], inplace=True)
    # ignore EC numbers that are obsolete due to transfer
    df = df[df.transferred == False]
    # The numpy data frame is converted to a python dictionnary
    mydic = df.to_dict()
    # Flatten all '|'-separated ID lists into one de-duplicated ID list.
    enzyme_protIDS = [
        mydic["sequenceID"][ec].split("|") for ec in mydic["sequenceID"].keys()
    ]
    enzyme_protIDS = list(set(reduce(lambda x, y: x + y, enzyme_protIDS)))
    enzyme_protIDS = [
        elt.strip(" \n\t\r") for elt in enzyme_protIDS if elt != ""
    ]
    # One protein-ID set per main EC class (1-6).
    dic_ecs = dict()
    dic_ecs["1"] = set()
    dic_ecs["2"] = set()
    dic_ecs["3"] = set()
    dic_ecs["4"] = set()
    dic_ecs["5"] = set()
    dic_ecs["6"] = set()
    if not os.path.exists(os.path.join("database", "uniprot", "sp.tab")):
        url = os.path.join("http://www.uniprot.org", "uniprot",
                           "?query=reviewed:yes&format=tab")
        subprocess.check_output("wget -cq -P database/uniprot '" + url + "'",
                                shell=True)
        subprocess.check_output(
            "mv database/uniprot/*=tab database/uniprot/sp.tab", shell=True)
    if not os.path.exists(os.path.join("database", "uniprot", "sp.tab")):
        print("%s\n", "Missing uniprot database!")
        exit(0)
    csvfile = os.path.join("database", "uniprot", "sp.tab")
    # NOTE(review): csvfile is a path string, not an open file object;
    # csv.reader will iterate over its characters. This looks like it
    # should be csv.reader(open(csvfile), ...) — confirm upstream.
    readCSV = csv.reader(csvfile, delimiter='\t')
    non_valids_enzyme = set()
    dic_sp = dict()
    # Index Swiss-Prot rows by accession: name (column 3), length (column 6).
    for row in readCSV:
        if row[0] != "Entry":
            seqID = row[0]
            seqName = row[3]
            seqLength = row[6]
            dic_sp[seqID] = dict()
            dic_sp[seqID]['name'] = seqName
            dic_sp[seqID]['length'] = seqLength
    #===================================================o========================================
    # Selection rules for the Main functional classes
    #===================================================o========================================
    # step 1
    # those enzymes whose sequences were annotated with ``fragment'' were excluded
    # those enzymes whose sequences had less than 50 amino acids were excluded
    for ec in mydic["description"].keys():
        sequenceID_iDs = mydic["sequenceID"][ec]
        protIDs = sequenceID_iDs.strip(" \n\t\r").split("|")
        protIDs = [elt for elt in protIDs if elt != ""]
        frag_seqs = list()
        short_seqs = list()
        for seqID in protIDs:
            if "Fragment" in dic_sp[seqID]['name']:
                frag_seqs.append(seqID)
            if int(dic_sp[seqID]['length']) < 50:
                short_seqs.append(seqID)
        protIDs = [
            e for e in protIDs if not e in frag_seqs and not e in short_seqs
        ]
        # Assign the surviving IDs to the main class named by the EC's
        # first digit.
        if ec.startswith("1"):
            dic_ecs["1"].update(protIDs)
        elif ec.startswith("2"):
            dic_ecs["2"].update(protIDs)
        elif ec.startswith("3"):
            dic_ecs["3"].update(protIDs)
        elif ec.startswith("4"):
            dic_ecs["4"].update(protIDs)
        elif ec.startswith("5"):
            dic_ecs["5"].update(protIDs)
        elif ec.startswith("6"):
            dic_ecs["6"].update(protIDs)
        non_valids_enzyme.update(frag_seqs)
        non_valids_enzyme.update(short_seqs)
    # step 2
    # for the uniqueness, those enzymes that occur in two or more classes were excluded
    # NOTE(review): each difference() below runs before the matching
    # intersection(), so the recorded intersection is always empty —
    # confirm whether the two statements were meant to be swapped.
    for ec in ["2", "3", "4", "5", "6"]:
        dic_ecs["1"] = dic_ecs["1"].difference(dic_ecs[ec])
        non_valids_enzyme.update(dic_ecs["1"].intersection(dic_ecs[ec]))
    for ec in ["1", "3", "4", "5", "6"]:
        dic_ecs["2"] = dic_ecs["2"].difference(dic_ecs[ec])
        non_valids_enzyme.update(dic_ecs["2"].intersection(dic_ecs[ec]))
    for ec in ["2", "1", "4", "5", "6"]:
        dic_ecs["3"] = dic_ecs["3"].difference(dic_ecs[ec])
        non_valids_enzyme.update(dic_ecs["3"].intersection(dic_ecs[ec]))
    for ec in ["2", "3", "1", "5", "6"]:
        dic_ecs["4"] = dic_ecs["4"].difference(dic_ecs[ec])
        non_valids_enzyme.update(dic_ecs["4"].intersection(dic_ecs[ec]))
    for ec in ["2", "3", "4", "1", "6"]:
        dic_ecs["5"] = dic_ecs["5"].difference(dic_ecs[ec])
        non_valids_enzyme.update(dic_ecs["5"].intersection(dic_ecs[ec]))
    for ec in ["2", "3", "4", "5", "1"]:
        dic_ecs["6"] = dic_ecs["6"].difference(dic_ecs[ec])
        non_valids_enzyme.update(dic_ecs["6"].intersection(dic_ecs[ec]))
    # these following two functions are internal and allow to create processes to parallel the fasta
    # files downloading and their passage to the cd-hit program
    def run_process(list_seqs, filename, output_dir, database, cdhit):
        # @nested function
        # Download and construct the fasta file for one (sub)class, then
        # cluster it with cd-hit at 40% identity.
        file = open(os.path.join(output_dir, filename + ".ids.list"), 'w')
        for seqID in list_seqs:
            file.write("%s\n" % seqID)
        file.close()
        fasta = os.path.join(output_dir, filename + ".faa")
        batch = os.path.join(output_dir, filename + ".ids.list")
        print commands.getoutput("blastdbcmd -db " + database +
                                 " -entry_batch " + batch + " > " + fasta)
        os.remove(batch)
        # run cdhit program
        cdhitout = os.path.join(output_dir, filename + ".cdhit.faa")
        cdhitverbose = os.path.join(output_dir, filename + ".out")
        print commands.getoutput(
            cdhit + " -i " + fasta + " -d 0 -o " + cdhitout +
            " -c 0.4 -n 2 -G 1 -g 1 -b 20 -s 0.0 -aL 0.0 -aS 0.0 -T 4 -M 32000 > "
            + cdhitverbose)
    def create_process(list_seqs, filename, output_dir, database, cdhit):
        # @nested function: fire-and-return a child process for run_process.
        p = Process(target=run_process, args=(
            list_seqs,
            filename,
            output_dir,
            database,
            cdhit,
        ))
        p.start()
        return p
    # step 3:
    # to reduce the homology bias, a redundancy cutoff was operated by cd-hit program to winnow
    # those sequences which have >=40% sequence identity to any other in a same functional class
    # making fasta files for the six main classes
    for ec in dic_ecs:
        create_process(dic_ecs[ec], str(ec), output_dir, database, cdhit)
    #===================================================o===========================================
    # Selection rules for the subclasses: same screening procedures than the Main functional classes
    #===================================================o===========================================
    # step 1
    # those enzymes whose sequences were annotated with 'fragment' were excluded
    # those enzymes whose sequences had less than 50 amino acids were excluded
    dic_subclasses = dict()
    for ec in mydic["description"].keys():
        sequenceID_iDs = mydic["sequenceID"][ec]
        protIDs = sequenceID_iDs.strip(" \n\t\r").split("|")
        protIDs = [elt for elt in protIDs if elt != ""]
        frag_seqs = list()
        short_seqs = list()
        for seqID in protIDs:
            if "Fragment" in dic_sp[seqID]['name']:
                frag_seqs.append(seqID)
            if int(dic_sp[seqID]['length']) < 50:
                short_seqs.append(seqID)
        protIDs = [
            e for e in protIDs if not e in frag_seqs and not e in short_seqs
        ]
        # Group by the first two EC levels; '-' placeholders are dropped.
        list_ec_digits = [x for x in ec.split(".") if x != "-"]
        if len(list_ec_digits) >= 2:
            ec_on_l2 = '.'.join(list_ec_digits[:2])
            if ec_on_l2 in dic_subclasses:
                dic_subclasses[ec_on_l2].update(protIDs)
            else:
                dic_subclasses[ec_on_l2] = set(protIDs)
    # step 2
    # for the uniqueness, those enzymes that occur in two or more classes were excluded
    for ec1 in dic_subclasses.keys():
        for ec2 in dic_subclasses.keys():
            if ec1 != ec2:
                dic_subclasses[ec1] = dic_subclasses[ec1].difference(
                    dic_subclasses[ec2])
    #print(len(dic_subclasses))
    # Subclasses with fewer than 10 remaining sequences are discarded.
    excluded_ecs = list()
    for ec in dic_subclasses:
        if len(dic_subclasses[ec]) < 10:
            excluded_ecs.append(ec)
    dic_subclasses = {
        k: v
        for k, v in dic_subclasses.items() if k not in excluded_ecs
    }
    # step 3:
    # to reduce the homology bias, a redundancy cutoff was operated by cd-hit program to winnow
    # those sequences which have >=40% sequence identity to any other in a same functional class
    for ec in dic_subclasses:
        # making fasta files for the subclasses: after retrieving associated fasta file and
        # reducing redundancy with cd-hit program
        create_process(dic_subclasses[ec], str(ec), output_dir, database, cdhit)
'The name of the strain in the input file. This will be used to name the output file. The default behaviour is to take the input filename minus the ".top" part.' ) args = parser.parse_args() inputFile = args.inputFile enzymeDB = args.enzymeDB outputDir = args.outputDir sepGenes = args.sepGenes sepDist = args.sepDist minClusterSize = args.minClusterSize strainName = args.strainName enzymeDB_dict = {} #db = open(enzymeDB) with open(enzymeDB) as db: for record in Enzyme.parse(db): id_ec = record["ID"] de = record["DE"] enzymeDB_dict[id_ec] = de #db.close() fileName = inputFile.split("/")[-1] if not (fileName.split(".")[-1] == "top"): sys.exit('ERROR! Wrong filetype! Input should be a ".top" file!') if not strainName: strainName = fileName.split(".")[0] # open the input file (for reading by default) #fh = open(inputFile) # initialise dictionary to hold enzyme data for each contig group_enzymes = defaultdict(list)
def test_parse_one(self):
    """Check parse function with one record."""
    with open("Enzymes/lipoprotein.txt") as handle:
        parsed = [entry for entry in Enzyme.parse(handle)]
    # The file holds exactly one record, for lipoprotein lipase.
    self.assertEqual(len(parsed), 1)
    self.assertEqual(parsed[0]["ID"], "3.1.1.34")
# Quick exploration of one ExPASy ENZYME flat-file record with Biopython.
from Bio.ExPASy import Enzyme

with open("/home/koreanraichu/RuBisCO.txt") as handle:
    record = Enzyme.read(handle)
print(record['ID'])  # EC number
print(record['DE'])  # description (enzyme name)
print(record['AN'])  # roughly synonyms? "also known as" names for the enzyme
print(record["CA"])  # the catalyzed reaction (oh, this comes out as an equation)
print(record["PR"])  # not sure about this one... some database identifier? -- TODO confirm
print(record["CC"])  # probably a free-text description of what the enzyme does
print(record['DR'])  # unclear, but a long list of paired identifiers appeared
def do_oxyphen(proteome, output_filename, ec_classes_file): ''' Read and parse enzyme.dat file ''' input_name = "DATA/enzyme.dat" output_name = "DATA/ec_uniprot.tsv" ### program ### handle = open(input_name) records = Enzyme.parse(handle) out = dict() #dict of dicts, first key: EC number, second key: field transferred = dict() #dict of lists for record in records: if 'Transferred entry:' in record['DE']: record['DE'] = record['DE'].rstrip('.') #remove period record['DE'] = record['DE'].replace('Transferred entry:', ' ') #remove title record['DE'] = record['DE'].replace(',', ' ') #remove commas record['DE'] = record['DE'].replace('and', ' ') #remove and point_to = record['DE'].split() transferred[record['ID']] = point_to else: out[record['ID']] = dict() out[record['ID']]['uniprot'] = ' '.join( [x[0] for x in record['DR']]) out[record['ID']]['description'] = record['DE'] out[record['ID']]['transferred'] = False # for id in transferred: # out[id] = dict() # out[id]['uniprot'] = ' '.join([out[x]['uniprot'] for x in transferred[id]]) # out[id]['description'] = 'Transferred entry: ' + ' '.join(transferred[id]) # out[id]['transferred'] = True df = pd.DataFrame.from_dict(out, orient='index') df.index.name = 'EC' df.to_csv(output_name, sep='\t') ''' Take a subset of ecs of interest ''' oxidases = tuple(open("DATA/oxygen_ecclasses", "r").read().splitlines()) infile = open("DATA/ec_uniprot.tsv", "r").readlines() outfile = open("DATA/ec_uniprot_oxidases.tsv", "w") for line in infile: if line.startswith("EC"): outfile.write(line) elif line.startswith(oxidases): outfile.write(line) outfile.close() ''' write a file with one uniprot ID per line, containing all of the uniprot IDs mentioned in uniprot column of the input file Ignore EC numbers that have been transferred ''' input = "DATA/ec_uniprot_oxidases.tsv" output = "DATA/uniprot_ids.txt" df = pd.read_table(input) df.dropna(subset=['uniprot'], inplace=True) #ignore EC numbers with no uniprot ids associated #df = 
df[df.transferred == False] #ignore EC numbers that are obsolete due to transfer unique_uniprot = set(" ".join(df.uniprot.values).split(" ")) with open(output, "w") as outfile: for id in unique_uniprot: outfile.write(id + "\n") outfile.close() ''' Make blastdb out of the swissprot subset ''' blast_path, num_threads, multinome_folder = read_config() os.system( "%s -in DATA/sprot_subset.fasta -dbtype prot -out DATA/sprot_subset -hash_index" % (os.path.join(blast_path, "makeblastdb"))) ''' Blast our pre-selected proteomes against the uniprot subset ''' print "Performing Blast searches against oxygen-utilizing database..." os.system( "%s -max_target_seqs 1 -outfmt '6 qseqid sseqid pident evalue qcovs' -query %s -db DATA/sprot_subset -out DATA/new_sequences_sprot_enzyme.tab -num_threads %d" % (os.path.join(blast_path, "blastp"), proteome, num_threads)) ''' Filter Blast output. ''' evalue = 10e-3 identity = 40.0 coverage = 40.0 print "Filtering Blast output: evalue", evalue, " identity", identity, " coverage", coverage hits_table_file_name = "DATA/new_sequences_sprot_enzyme.tab" hits_table_file_name_filtered_out = open( "DATA/new_sequences_sprot_enzyme_filtered.tab", "w") hits_table_file_name_filtered_out.write( "\t".join(["hit", "subject", "id", "len", "eval", "cov"]) + "\n") for line in open(hits_table_file_name, "r").read().splitlines(): if line.startswith("#"): continue query, target, ident, eval, cover = line.split("\t") eval = float(eval) ident = float(ident) cover = float(cover) if eval <= evalue and ident >= identity and cover >= coverage: hits_table_file_name_filtered_out.write(line + "\n") hits_table_file_name_filtered_out.close() hits_table_file_name_filtered = "DATA/new_sequences_sprot_enzyme_filtered.tab" enzyme_table_file_name = 'DATA/ec_uniprot_oxidases.tsv' hits = pd.read_csv(hits_table_file_name_filtered, sep="\t", header=0) enzyme = pd.read_csv(enzyme_table_file_name, sep="\t", header=0) hits.fillna('', inplace=True) #replace empty values with blank 
spaces enzyme.fillna('', inplace=True) enzyme = enzyme[enzyme.transferred == False] #drop transferred EC numbers hits.subject = hits.subject.str[3: 9] #take just the uniprot ID from the name def get_ecs(uniprot): if uniprot == '': #ignore invalid uniprot ids return '' else: return ' '.join( enzyme.EC[enzyme.uniprot.str.contains(uniprot)].values) hits['EC'] = hits.subject.apply(get_ecs) output_file_name = output_filename hits.to_csv(output_file_name, sep="\t", index=False) ### read final mapping output mapping_out = open(output_file_name, "r").read().splitlines() ecs_dict = {} for line in mapping_out[1:]: splitted = line.split("\t") ecs = splitted[-1] for ec in ecs.split(): if ec not in ecs_dict: ecs_dict[ec] = [] ecs_dict[ec].append(splitted[0]) print "\n\n" print len( ecs_dict ), "oxygen-utilizing enzymes were found from classes", ecs_dict.keys() ec_out = open(ec_classes_file, "w") ec_out.write("\t".join(ecs_dict.keys())) ec_out.close() GLOBAL_RESULTS.write( os.path.basename(proteome) + "\t" + str(len(ecs_dict)) + "\t" + ",".join(ecs_dict.keys()) + "\n") #print "Detailed mapping can be found in OUTPUT/oxygen_utilizing_annot.tsv file" #print "Executing SVM classifier..." infile = open("DATA/model_svm", "r").read().splitlines() classifier_input = [] classes = [] ec_classes = [] for line in infile: if line.startswith("@attribute") and "class" not in line: ec_classes.append(line.split()[1].replace("'", ""))
def get_knndataset(cdhit, output_dir, database):
    '''
    Build the kNN training dataset from the ExPASy ENZYME flat file.

    Parses enzyme.dat into a tab-separated summary (EC number, reaction
    description, '|'-separated UniProt accessions, transferred flag), then
    applies the dataset selection rules (drop fragment/short sequences,
    enforce class uniqueness, drop subclasses with <10 members) and writes
    one accession list (.ids.list) plus one FASTA file (.faa, extracted via
    blastdbcmd) per retained EC subclass into *output_dir*.

    cdhit      -- path to the cd-hit binary (only referenced by the
                  commented-out redundancy-reduction step 3)
    output_dir -- directory receiving the per-subclass output files
    database   -- BLAST database name passed to blastdbcmd
    '''
    # Fetch enzyme.dat from the ExPASy FTP mirror if not already cached.
    if not os.path.exists(os.path.join("database", "enzyme", "enzyme.dat")):
        curl_enzyme = os.path.join("ftp://ftp.expasy.org", "databases","enzyme", "enzyme.dat")
        subprocess.check_output("wget -cq -P database/enzyme " + curl_enzyme, shell = True)
    # Bail out if the download did not produce the file.
    if not os.path.exists(os.path.join("database", "enzyme", "enzyme.dat")):
        print ("%s\n", "Missing enzyme database!")
        exit(0)
    input_name = os.path.join("database", "enzyme", "enzyme.dat")
    output_name = os.path.join("database", "enzyme", "enzyme.tsv")
    records = Enzyme.parse(open(input_name))
    out = dict()  # dict of dicts, first key: EC number, second key: field
    transferred = dict()  # dict of lists: obsolete EC -> ECs it was merged into
    for record in records:
        if 'Transferred entry:' in record['DE']:
            # Obsolete entry: strip the prose so only target EC numbers remain.
            record['DE'] = record['DE'].rstrip('.')
            record['DE'] = record['DE'].replace('Transferred entry:',' ')
            record['DE'] = record['DE'].replace(',',' ')
            # NOTE(review): replaces the substring 'and' anywhere in the text,
            # not just the standalone word — confirm this cannot mangle an EC.
            record['DE'] = record['DE'].replace('and',' ')
            point_to = record['DE'].split()
            transferred[record['ID']] = point_to
        else:
            out[record['ID']] = dict()
            out[record['ID']]['uniprot'] = '|'.join([x[0] for x in record['DR']])
            out[record['ID']]['description'] = record['DE']
            out[record['ID']]['transferred'] = False
    # Resolve transferred entries by pooling the accessions of their targets.
    # NOTE(review): raises KeyError if a target EC is itself transferred
    # (out[x] missing) — confirm enzyme.dat never chains transfers.
    for id in transferred:
        out[id] = dict()
        out[id]['uniprot'] = '|'.join([out[x]['uniprot'] for x in transferred[id]])
        out[id]['description'] = 'Transferred entry: ' + ' '.join(transferred[id])
        out[id]['transferred'] = True
    df = pd.DataFrame.from_dict(out, orient = 'index')
    df.index.name = 'EC'
    # write all data in a enzyme.tsv file
    df.to_csv(output_name, sep = '\t')
    # ignore EC numbers with no uniprot ids associated
    df.dropna(subset = ['uniprot'], inplace = True)
    # ignore EC numbers that are obsolete due to transfer
    df = df[df.transferred == False]
    # construct a dictionnary from dataframe
    mydic = df.to_dict()
    # Flatten every '|'-separated accession list into one deduplicated list.
    enzyme_protIDS = [mydic["uniprot"][ec].split("|") for ec in mydic["uniprot"].keys()]
    enzyme_protIDS = list(set(reduce(lambda x, y: x + y, enzyme_protIDS)))
    enzyme_protIDS = [elt.strip(" \n\t\r") for elt in enzyme_protIDS if elt != ""]
    # One accession set per top-level EC class (1. oxidoreductases ... 6. ligases).
    dic_ecs = dict()
    dic_ecs["1."] = set()
    dic_ecs["2."] = set()
    dic_ecs["3."] = set()
    dic_ecs["4."] = set()
    dic_ecs["5."] = set()
    dic_ecs["6."] = set()
    # Load per-accession name/length metadata from the reviewed Swiss-Prot dump.
    # NOTE(review): assumes columns 0/3/6 are Entry/Protein name/Length — verify
    # against the actual 'uniprot-reviewed%3Ayes.tab' export.
    csvfile = open('uniprot-reviewed%3Ayes.tab', 'r')
    readCSV = csv.reader(csvfile, delimiter = '\t')
    non_valids_enzyme = set()
    dic_sp = dict()
    for row in readCSV:
        if row[0] != "Entry":  # skip the header row
            seqID = row[0]
            seqName = row[3]
            seqLength = row[6]
            dic_sp[seqID] = dict()
            dic_sp[seqID]['name'] = seqName
            dic_sp[seqID]['length'] = seqLength
    #===================================================o========================================
    # Selection rules for the Main functional classes
    #===================================================o========================================
    # step 1
    # those enzymes whose sequences were annotated with ''fragment'' were excluded
    # those enzymes whose sequences had less than 50 amino acids were excluded
    for ec in mydic["description"].keys():
        uniprot_iDs = mydic["uniprot"][ec]
        protIDs = uniprot_iDs.strip(" \n\t\r").split("|")
        protIDs = [elt for elt in protIDs if elt != ""]
        frag_seqs = list()
        short_seqs = list()
        for seqID in protIDs:
            if "Fragment" in dic_sp[seqID]['name']:
                frag_seqs.append(seqID)
            if int(dic_sp[seqID]['length']) < 50:
                short_seqs.append(seqID)
        protIDs=[e for e in protIDs if not e in frag_seqs and not e in short_seqs]
        # Route surviving accessions to their top-level EC class bucket.
        if ec.startswith("1."):
            dic_ecs["1."].update(protIDs)
        elif ec.startswith("2."):
            dic_ecs["2."].update(protIDs)
        elif ec.startswith("3."):
            dic_ecs["3."].update(protIDs)
        elif ec.startswith("4."):
            dic_ecs["4."].update(protIDs)
        elif ec.startswith("5."):
            dic_ecs["5."].update(protIDs)
        elif ec.startswith("6."):
            dic_ecs["6."].update(protIDs)
        non_valids_enzyme.update(frag_seqs)
        non_valids_enzyme.update(short_seqs)
    # step 2
    # for the uniqueness, those enzymes that occur in two or more classes were excluded
    # NOTE(review): each intersection below is computed AFTER the difference has
    # already removed the shared accessions, so it is always empty and
    # non_valids_enzyme never records cross-class enzymes — confirm intent.
    for ec in ["2.", "3.", "4.", "5.", "6."]:
        dic_ecs["1."] = dic_ecs["1."].difference(dic_ecs[ec])
        non_valids_enzyme.update(dic_ecs["1."].intersection(dic_ecs[ec]))
    for ec in ["1.", "3.", "4.", "5.", "6."]:
        dic_ecs["2."] = dic_ecs["2."].difference(dic_ecs[ec])
        non_valids_enzyme.update(dic_ecs["2."].intersection(dic_ecs[ec]))
    for ec in ["2.", "1.", "4.", "5.", "6."]:
        dic_ecs["3."] = dic_ecs["3."].difference(dic_ecs[ec])
        non_valids_enzyme.update(dic_ecs["3."].intersection(dic_ecs[ec]))
    for ec in ["2.", "3.", "1.", "5.", "6."]:
        dic_ecs["4."] = dic_ecs["4."].difference(dic_ecs[ec])
        non_valids_enzyme.update(dic_ecs["4."].intersection(dic_ecs[ec]))
    for ec in ["2.", "3.", "4.", "1.", "6."]:
        dic_ecs["5."] = dic_ecs["5."].difference(dic_ecs[ec])
        non_valids_enzyme.update(dic_ecs["5."].intersection(dic_ecs[ec]))
    for ec in ["2.", "3.", "4.", "5.", "1."]:
        dic_ecs["6."] = dic_ecs["6."].difference(dic_ecs[ec])
        non_valids_enzyme.update(dic_ecs["6."].intersection(dic_ecs[ec]))
    # step 3:
    # to reduce the homology bias, a redundancy cutoff was operated by cd-hit program to winnow
    # those sequences which have >=40% sequence identity to any other in a same functional class
    #
    # Downlod and constructing fasta file of the Main functional classes
    def split_sequence(seq, l):
        # Wrap *seq* into newline-separated chunks of *l* characters
        # (FASTA-style line wrapping); always ends with a newline.
        new_seq = ""
        if len(seq) > l:
            new_seq = seq[:l]
            k = l
            while k + l < len(seq):
                new_seq+= "\n"+str(seq[k:k+l])
                k+= l
            new_seq+= "\n" + str(seq[k:])
            return new_seq + "\n"
        else:
            return seq + "\n"
    def run_process(list_seqs, filename):
        # @nested function
        # Append the sequences of *list_seqs* to *filename*, fetching each
        # accession from the EBI proteins REST API and writing FASTA records.
        session = requests.Session()
        outfile = open(filename, "a")
        for seqID in list_seqs:
            #handle = ExPASy.get_sprot_raw(seqID.strip(" \n\r\t"))
            #record = SeqIO.read(handle, "swiss")
            #SeqIO.write(record, outfile, "fasta")
            req = "http://wwwdev.ebi.ac.uk/proteins/api/proteins?offset=0&size=100&accession="+str(seqID)
            res = session.get(req, headers = {'User-Agent' : 'application/XML Mozilla/5.0 (X11; U; Linux i686) Gecko/20071127 Firefox/2.0.0.11', "content-type":"text"})
            # parse the returned XML
            uniprot = ET.fromstring(res.text)
            for isoform in uniprot.getchildren():
                # get the sequence
                iso_sequence = isoform.find('{http://uniprot.org/uniprot}sequence')
                # get the accession number
                iso_accession = isoform.find('{http://uniprot.org/uniprot}accession')
                outfile.write(">"+str(iso_accession.text)+"\n")
                outfile.write(split_sequence(str(iso_sequence.text), 60))
        outfile.close()
    def create_process(list_seqs, filename):
        # @nested function: run run_process in a separate worker process.
        p = Process(target = run_process, args = (list_seqs, filename,))
        p.start()
        return p
    #ec1 = create_process(dic_ecs["1."], "knnDataset/ec_1.*.faa")
    #ec2 = create_process(dic_ecs["2."], "knnDataset/ec_2.*.faa")
    #ec3 = create_process(dic_ecs["3."], "knnDataset/ec_3.*.faa")
    #ec4 = create_process(dic_ecs["4."], "knnDataset/ec_4.*.faa")
    #ec5 = create_process(dic_ecs["5."], "knnDataset/ec_5.*.faa")
    #ec6 = create_process(dic_ecs["6."], "knnDataset/ec_6.*.faa")
    #===================================================o===========================================
    # Selection rules for the subclasses: same screening procedures than the Main functional classes
    #===================================================o===========================================
    # step 1
    # those enzymes whose sequences were annotated with 'fragment' were excluded
    # those enzymes whose sequences had less than 50 amino acids were excluded
    dic_subclasses = dict()
    for ec in mydic["description"].keys():
        uniprot_iDs = mydic["uniprot"][ec]
        protIDs = uniprot_iDs.strip(" \n\t\r").split("|")
        protIDs = [elt for elt in protIDs if elt != ""]
        frag_seqs = list()
        short_seqs = list()
        for seqID in protIDs:
            if "Fragment" in dic_sp[seqID]['name']:
                frag_seqs.append(seqID)
            if int(dic_sp[seqID]['length']) < 50:
                short_seqs.append(seqID)
        protIDs=[e for e in protIDs if not e in frag_seqs and not e in short_seqs]
        # Group accessions at EC level 2 ("a.b"), skipping '-' placeholders.
        list_ec_digits = [x for x in ec.split(".") if x != "-"]
        if len(list_ec_digits) >= 2:
            ec_on_l2 = '.'.join(list_ec_digits[:2])
            if ec_on_l2 in dic_subclasses:
                dic_subclasses[ec_on_l2].update(protIDs)
            else:
                dic_subclasses[ec_on_l2] = set(protIDs)
    # step 2
    # for the uniqueness, those enzymes that occur in two or more classes were excluded
    for ec1 in dic_subclasses.keys():
        for ec2 in dic_subclasses.keys():
            if ec1 != ec2:
                dic_subclasses[ec1] = dic_subclasses[ec1].difference(dic_subclasses[ec2])
    #print(len(dic_subclasses))
    # Drop subclasses left with fewer than 10 accessions.
    excluded_ecs = list()
    for ec in dic_subclasses:
        if len(dic_subclasses[ec]) < 10:
            excluded_ecs.append(ec)
    dic_subclasses = {k: v for k, v in dic_subclasses.items() if k not in excluded_ecs}
    # making fasta files
    # list_process = list()
    # for ec in dic_subclasses:
    #     process = create_process(dic_subclasses[ec], os.path.join(output_dir, str(ec)+".faa"))
    #     list_process.append(process)
    # for i in range(len(list_process)):
    #     while list_process[i].is_alive(): time.sleep(60)
    # Write one accession list per retained subclass...
    for ec in dic_subclasses:
        file = open(os.path.join(output_dir, str(ec)+".ids.list"), 'w')
        for seqID in dic_subclasses[ec]:
            file.write("%s\n" % seqID)
        file.close()
    # ...then extract the matching FASTA records from the local BLAST
    # database (Python 2 print statement + commands module).
    for ec in dic_subclasses:
        batch = os.path.join(output_dir, str(ec)+".ids.list")
        fasta = os.path.join(output_dir, str(ec)+".faa")
        print commands.getoutput("blastdbcmd -db "+ database +" -entry_batch "+ batch +" > "+ fasta)
        #outfile = open(os.path.join(output_dir, str(ec)+".faa"), "a")
        #for seqID in dic_subclasses[ec]:
        #    handle = ExPASy.get_sprot_raw(seqID.strip(" \n\r\t"))
        #    record = SeqIO.read(handle, "swiss")
        #    SeqIO.write(record, outfile, "fasta")
        #    req = "http://wwwdev.ebi.ac.uk/proteins/api/proteins?offset=0&size=100&accession="+str(seqID)
        #    #res = requests.get(req, headers = {'User-Agent' : 'application/XML Mozilla/5.0 (X11; U; Linux i686) Gecko/20071127 Firefox/2.0.0.11'})
        #    print commands.getoutput("wget -cq -P "+ output_dir +" '" + req + "'")
        #    tree = ET.parse(os.path.join(output_dir, os.path.basename(req)))
        #    uniprot = tree.getroot()
        #    # parse the returned XML
        #    #uniprot = ET.fromstring(res.text)
        #    for isoform in uniprot.getchildren():
        #        # get the sequence
        #        iso_sequence = isoform.find('{http://uniprot.org/uniprot}sequence')
        #        # get the accession number
        #        iso_accession = isoform.find('{http://uniprot.org/uniprot}accession')
        #        outfile.write(">"+str(iso_accession.text)+"\n")
        #        outfile.write(split_sequence(str(iso_sequence.text), 60))
        #    os.remove(os.path.join(output_dir, os.path.basename(req)))
        #outfile.close()
        #process = create_process(dic_subclasses[ec], os.path.join(output_dir, str(ec)+".faa"))
        #while process.is_alive():
        #    time.sleep(60)
    # step 3:
    # to reduce the homology bias, a redundancy cutoff was operated by cd-hit program to winnow
    # those sequences which have >=40% sequence identity to any other in a same functional class
    # for ec in dic_subclasses:
    #     print commands.getoutput(cdhit +" -i "+os.path.join(output_dir, str(ec)+".faa")
    #                              +" -d 0 -o "+ os.path.join(output_dir, str(ec) +".cdhit.faa")
    #                              +" -c 0.4 -n 2 -G 1 -g 1 -b 20 -s 0.0 -aL 0.0 -aS 0.0 -T 4 -M 32000 > "
    #                              + os.path.join(output_dir, str(ec) +".out"))
    print "\tFINISHED"
def get_expasy_enzyme():
    """
    Download the ExPASy ENZYME flat file and extract, for every EC record,
    its reaction text plus ChEBI-linked substrates and products.

    Writes the collected ChEBI ids to chebi_list.txt and the per-enzyme
    annotation dicts to annotations_out.txt, then returns the list of dicts
    (keys: 'ECNumber', 'Reaction(s)', 'Substrates', 'Products').

    NOTE(review): depends on module-level names `bee` (presumably an alias of
    Bio.ExPASy.Enzyme — confirm), `replace_strings` and `link_compound2chebi`;
    the downloaded file handle and both output handles are never closed.
    """
    url = "ftp://ftp.expasy.org/databases/enzyme/enzyme.dat"
    print("Retrieving enzyme records from Expasy Enzyme")
    # urlretrieve returns (local_filename, headers); index 0 is the temp file.
    enzyme = urllib.request.urlretrieve(url)
    enzyme_p = bee.parse(open(enzyme[0], 'r'))
    chebiout = open('chebi_list.txt', 'w')
    annotations = open('annotations_out.txt', 'w')
    enz_records = []
    chebi_list = []
    count = 0
    tester = []  # NOTE(review): never used below
    for record in enzyme_p:
        enz_rec = {}
        count += 1
        print(count)  # progress indicator, one line per record
        enz_rec['ECNumber'] = record['ID']
        enz_rec['Reaction(s)'] = []
        enz_rec['Substrates'] = {}
        enz_rec['Products'] = {}
        #enz_records.append(enz_rec)
        # split split to seperate multiple reactions
        reaction1 = record['CA'].split('.')
        for rxn in reaction1:
            try:
                if len(reaction1) > 2:
                    # Multiple reactions: drop the leading 3 chars,
                    # presumably the "(1)"-style numbering — confirm.
                    rxn = rxn[3:]
                enz_rec['Reaction(s)'].append(rxn)
                #split reactions into [substrates, products]
                constituents = rxn.split('=')
                # split each side of reaction on '+' not '(+)'
                r = re.compile(r'(?:[^\+(]|\([^)]*\))+')
                subr = r.findall(constituents[0])
                for sub in subr:
                    sub = sub.lstrip().rstrip()
                    sub = replace_strings(sub)
                    schebi = link_compound2chebi(sub)
                    enz_rec['Substrates'][sub] = schebi
                    if schebi:
                        chebi_list.append(schebi)
                prodr = r.findall(constituents[-1])
                for prod in prodr:
                    prod = prod.lstrip().rstrip()
                    prod = replace_strings(prod)
                    pchebi = link_compound2chebi(prod)
                    enz_rec['Products'][prod] = pchebi
                    if pchebi:
                        chebi_list.append(pchebi)
            except Exception as e:
                # Best-effort parsing: report the failure, keep going with
                # the next reaction fragment.
                print(e)
                continue
        enz_records.append(enz_rec)
    print(chebi_list, file=chebiout)
    print(enz_records, file=annotations)
    return enz_records
#Reads a Expasy Enzyme .dat file and writes a numpy data frame where the first column is #EC number, the second column is the reaction description, the third column is the associated #sequenceID ids separated by '|', and the fourth column indicates whether the reactions described #by this EC have been transferred to other EC numbers. if not os.path.exists(os.path.join("database", "enzyme", "enzyme.dat")): curl_enzyme = os.path.join("ftp://ftp.expasy.org", "databases", "enzyme", "enzyme.dat") subprocess.check_output("wget -cq -P database/enzyme " + curl_enzyme, shell=True) if not os.path.exists(os.path.join("database", "enzyme", "enzyme.dat")): print("%s\n", "Missing enzyme database!") exit(0) input_name = os.path.join("database", "enzyme", "enzyme.dat") output_name = os.path.join("database", "enzyme", "enzyme.tsv") records = Enzyme.parse(open(input_name)) out = dict() # dict of dicts, first key: EC number, second key: field transferred = dict() #dict of lists for record in records: if 'Transferred entry:' in record['DE']: record['DE'] = record['DE'].rstrip('.') record['DE'] = record['DE'].replace('Transferred entry:', ' ') record['DE'] = record['DE'].replace(',', ' ') record['DE'] = record['DE'].replace('and', ' ') point_to = record['DE'].split() transferred[record['ID']] = point_to else: out[record['ID']] = dict() out[record['ID']]['sequenceID'] = '|'.join( [x[0] for x in record['DR']]) out[record['ID']]['description'] = record['DE']
def test_parse_zero(self):
    """Parsing empty input yields no ENZYME records."""
    handle = StringIO("")
    records = list(Enzyme.parse(handle))
    self.assertEqual(len(records), 0)
if line.startswith("NUM_THREADS"): num_threads = float(line.split("=")[1]) return input_file, blast_path, num_threads print(read_config()) ''' Read and parse enzyme.dat file ''' input_name = "DATA/enzyme.dat" output_name = 'DATA/ec_uniprot.tsv' ### program ### handle = open(input_name) records = Enzyme.parse(handle) out = dict() #dict of dicts, first key: EC number, second key: field transferred = dict() #dict of lists for record in records: if 'Transferred entry:' in record['DE']: record['DE'] = record['DE'].rstrip('.') #remove period record['DE'] = record['DE'].replace('Transferred entry:', ' ') #remove title record['DE'] = record['DE'].replace(',', ' ') #remove commas record['DE'] = record['DE'].replace('and', ' ') #remove and point_to = record['DE'].split() transferred[record['ID']] = point_to else: out[record['ID']] = dict() out[record['ID']]['uniprot'] = ' '.join([x[0] for x in record['DR']])
def get_enzyme_ecs(level):
    '''
    Collect every EC number present in the ExPASy ENZYME database, truncated
    to the requested classification depth.

    Reads (downloading first if necessary) enzyme.dat and writes a tab
    separated file where the first column is EC number, the second column is
    the reaction description, the third column is the associated uniprot ids
    separated by '|', and the fourth column indicates whether the reactions
    described by this EC have been transferred to other EC numbers.

    level -- classification depth 1..4 (e.g. 2 -> "a.b" prefixes)
    Returns the deduplicated list of EC prefixes that are fully specified
    (no '-' placeholder) down to *level*, keeping only non-transferred
    entries with at least one UniProt accession.
    '''
    # Fetch enzyme.dat from the ExPASy FTP mirror if not already cached.
    if not os.path.exists(os.path.join("database", "enzyme", "enzyme.dat")):
        curl_enzyme = os.path.join("ftp://ftp.expasy.org", "databases", "enzyme", "enzyme.dat")
        subprocess.check_output("wget -cq -P database/enzyme " + curl_enzyme, shell=True)
    # Bail out if the download did not produce the file.
    if not os.path.exists(os.path.join("database", "enzyme", "enzyme.dat")):
        print("%s\n", "Missing enzyme database!")
        exit(0)
    input_name = os.path.join("database", "enzyme", "enzyme.dat")
    output_name = os.path.join("database", "enzyme", "enzyme.tsv")
    records = Enzyme.parse(open(input_name))
    out = dict()  # dict of dicts, first key: EC number, second key: field
    transferred = dict()  # dict of lists: obsolete EC -> ECs it was merged into
    for record in records:
        if 'Transferred entry:' in record['DE']:
            # Obsolete entry: strip the prose so only target EC numbers remain.
            record['DE'] = record['DE'].rstrip('.')
            record['DE'] = record['DE'].replace('Transferred entry:', ' ')
            record['DE'] = record['DE'].replace(',', ' ')
            record['DE'] = record['DE'].replace('and', ' ')
            point_to = record['DE'].split()
            transferred[record['ID']] = point_to
        else:
            out[record['ID']] = dict()
            out[record['ID']]['uniprot'] = '|'.join(
                [x[0] for x in record['DR']])
            out[record['ID']]['description'] = record['DE']
            out[record['ID']]['transferred'] = False
    # Resolve transferred entries by pooling the accessions of their targets.
    # NOTE(review): raises KeyError if a target EC is itself transferred —
    # confirm enzyme.dat never chains transfers.
    for id in transferred:
        out[id] = dict()
        out[id]['uniprot'] = '|'.join(
            [out[x]['uniprot'] for x in transferred[id]])
        out[id]['description'] = 'Transferred entry: ' + ' '.join(
            transferred[id])
        out[id]['transferred'] = True
    df = pd.DataFrame.from_dict(out, orient='index')
    df.index.name = 'EC'
    # write all data in a enzyme.csv file
    df.to_csv(output_name, sep='\t')
    # ignore EC numbers with no uniprot ids associated
    df.dropna(subset=['uniprot'], inplace=True)
    # ignore EC numbers that are obsolete due to transfer
    df = df[df.transferred == False]
    all_ECs = list(set(df.index.values))
    if 4 - int(level) == 0:
        # Full depth requested: keep ECs with exactly 4 explicit components.
        all_ECs = [
            ec for ec in all_ECs
            if len([x for x in ec.split(".") if x != "-"]) == int(level)
        ]
    else:
        # Shallower depth: truncate each EC to *level* components and keep it
        # only when all retained components are explicit (no '-').
        all_ECs = ['.'.join(ec.split('.')[:-4 + int(level)]) for ec in all_ECs \
            if len([x for x in ec.split(".")[:-4 + int(level)] if x != "-"]) == int(level)]
    return list(set(all_ECs))
def load_enzyme_nomenclature_table():
    '''
    download all SIB enzyme nomenclature from FTP (ftp://ftp.expasy.org/databases/enzyme/)
    create the enzyme.enzymes table with the list of all EC with associated description
    create the enzyme.enzymes_dat with detailed information about each EC
    todo: remove existing tables for uptade if rerun
    :return: nothing
    '''
    # Function-local imports keep this loader self-contained.
    from Bio.ExPASy import Enzyme
    import MySQLdb
    import urllib.request
    import os
    from io import StringIO
    # DB password comes from the environment; raises KeyError if unset.
    sqlpsw = os.environ['SQLPSW']
    conn = MySQLdb.connect(
        host="localhost",  # your host, usually localhost
        user="******",  # your username
        passwd=sqlpsw,  # your password
        db="enzyme")  # name of the data base
    cursor = conn.cursor()
    # Force UTF-8 on the connection (ENZYME descriptions contain non-ASCII).
    conn.set_character_set('utf8')
    cursor.execute('SET NAMES utf8;')
    cursor.execute('SET CHARACTER SET utf8;')
    cursor.execute('SET character_set_connection=utf8;')
    # ENZYME flat-file line codes handled below:
    '''
    ID  Identification                 (Begins each entry; 1 per entry)
    DE  Description (official name)    (>=1 per entry)
    AN  Alternate name(s)              (>=0 per entry)
    CA  Catalytic activity             (>=1 per entry)
    CF  Cofactor(s)                    (>=0 per entry)
    CC  Comments                       (>=0 per entry)
    PR  Cross-references to PROSITE    (>=0 per entry)
    DR  Cross-references to Swiss-Prot (>=0 per entry)
    '''
    # Download the whole flat file into memory, then parse it from a StringIO.
    enzyme_file = 'ftp://ftp.expasy.org/databases/enzyme/enzyme.dat'
    data = urllib.request.urlopen(enzyme_file).read().decode('utf-8')
    sql1 = 'CREATE TABLE IF NOT EXISTS enzymes (enzyme_id INT AUTO_INCREMENT PRIMARY KEY,' \
           ' ec VARCHAR(200));'
    sql2 = 'CREATE TABLE IF NOT EXISTS enzymes_dat (enzyme_dat_id INT,' \
           ' line VARCHAR(20),' \
           ' value LONG,' \
           ' CONSTRAINT fk_enzyme_id' \
           ' FOREIGN KEY(enzyme_dat_id) REFERENCES enzymes(enzyme_id)' \
           ' ON DELETE CASCADE);'
    print('create enzyme table')
    print(sql1)
    cursor.execute(sql1, )
    print('create dat table')
    print(sql2)
    cursor.execute(sql2)
    # NOTE(review): every INSERT below interpolates values with %-formatting;
    # any value containing a double quote breaks the statement, and this is
    # an SQL-injection-style risk — should use parameterized queries
    # (cursor.execute(sql, params)). Also note the loop variable `data`
    # shadows the downloaded text after the first iteration.
    for n, data in enumerate(Enzyme.parse(StringIO(data))):
        enzyme = data['ID']
        # insert enzyme id into primary TABLE
        sql = 'INSERT into enzymes (ec) values ("%s");' % enzyme
        print(n, sql)
        cursor.execute(sql, )
        conn.commit()
        # Fetch the auto-increment id just assigned to this EC row.
        sql = 'SELECT LAST_INSERT_ID();'
        cursor.execute(sql, )
        id = cursor.fetchall()[0][0]
        # description
        sql = 'INSERT into enzymes_dat (enzyme_dat_id, line, value) values (%s, "description", "%s");' % (
            id, data['DE'])
        cursor.execute(sql, )
        # alternative names
        for i in data['AN']:
            sql = 'INSERT into enzymes_dat (enzyme_dat_id,line, value) values(%s, "alternative name", "%s");' % (
                id, i)
            cursor.execute(sql, )
        # Catalytic activity
        sql = 'INSERT into enzymes_dat (enzyme_dat_id,line, value) values(%s,"catalytic activity", "%s");' % (
            id, data['CA'])
        cursor.execute(sql, )
        # Cofactors
        sql = 'INSERT into enzymes_dat (enzyme_dat_id,line, value) values(%s, "cofactors", "%s");' % (
            id, data['CF'])
        cursor.execute(sql, )
        # prosite crossref
        for i in data['PR']:
            sql = 'INSERT into enzymes_dat (enzyme_dat_id,line, value) values(%s, "prosite", "%s");' % (
                id, i)
            cursor.execute(sql, )
        # comments
        for i in data['CC']:
            sql = 'INSERT into enzymes_dat (enzyme_dat_id,line, value) values(%s, "comment", "%s");' % (
                id, i)
            cursor.execute(sql, )
        conn.commit()