def test_parse_many(self):
    """Check parse function with multiple records."""
    data = ""
    for filename in ["Enzymes/lipoprotein.txt",
                     "Enzymes/proline.txt",
                     "Enzymes/valine.txt"]:
        with open(filename) as handle:
            data += handle.read()
    handle = StringIO(data)
    records = list(Enzyme.parse(handle))
    self.assertEqual(len(records), 3)
    self.assertEqual(records[0]["ID"], "3.1.1.34")
    self.assertEqual(records[1]["ID"], "5.1.1.4")
    self.assertEqual(records[2]["ID"], "4.1.1.14")
def get_expasy_enzyme():
    """Download enzyme.dat from ExPASy and build one annotation dict per enzyme."""
    url = "ftp://ftp.expasy.org/databases/enzyme/enzyme.dat"
    enzyme = urllib.request.urlretrieve(url)
    enzyme_p = bee.parse(open(enzyme[0], 'r'))
    enz_records = []
    for record in enzyme_p:
        # create a record for each enzyme, with the EC number as primary key
        enz_rec = {}
        enz_rec['PreferedName'] = record['DE']
        enz_rec['ECNumber'] = record['ID']
        enz_rec['Reaction(s)'] = []
        enz_rec['Substrates'] = {}
        enz_rec['Products'] = {}
        enz_rec['UniProt'] = {}
        # split on '.' to separate multiple reactions
        reaction1 = record['CA'].split('.')
        for rxn in reaction1:
            if len(reaction1) > 2:
                rxn = rxn[3:]  # drop the '(n)' reaction marker
            enz_rec['Reaction(s)'].append(rxn)
            # split reactions into [substrates, products]
            constituents = rxn.split('=')
            # split each side of the reaction on '+' but not '(+)'
            r = re.compile(r'(?:[^\+(]|\([^)]*\))+')
            for sub in r.findall(constituents[0]):
                sub = replace_strings(sub.strip())
                schebi = link_compound2chebi(sub)
                enz_rec['Substrates'][sub] = schebi
            for prod in r.findall(constituents[-1]):
                prod = replace_strings(prod.strip())
                pchebi = link_compound2chebi(prod)
                enz_rec['Products'][prod] = pchebi
        # populate enz_rec['UniProt'] with uniprot_id:name pairs for each protein
        for unpid in record['DR']:
            enz_rec['UniProt'][unpid[0]] = unpid[1]
        enz_records.append(enz_rec)
    return enz_records
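# A minimal sketch (the sample reaction text is illustrative, not from
# enzyme.dat) of the regex used above: it splits one side of a reaction on
# '+' while leaving parenthesised spans such as 'O(2)' or '(+)-camphor' intact.
import re

r = re.compile(r'(?:[^\+(]|\([^)]*\))+')
side = "2 H(+) + O(2) + (+)-camphor"
print([token.strip() for token in r.findall(side)])
# ['2 H(+)', 'O(2)', '(+)-camphor']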
def test_parse_one(self):
    """Check parse function with one record."""
    with open("Enzymes/lipoprotein.txt") as handle:
        records = list(Enzyme.parse(handle))
    self.assertEqual(len(records), 1)
    self.assertEqual(records[0]["ID"], "3.1.1.34")
def test_parse_zero(self):
    """Check parse function with no records."""
    handle = StringIO("")
    records = list(Enzyme.parse(handle))
    self.assertEqual(len(records), 0)
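# Hedged sketch of what the tests above exercise: Enzyme.parse yields one
# dict-like record per entry, keyed by the enzyme.dat line codes (ID, DE,
# CA, DR, ...). The entry below is a minimal illustrative example, not a
# verbatim excerpt of the database.
from io import StringIO
from Bio.ExPASy import Enzyme

entry = (
    "ID   1.1.1.1\n"
    "DE   Alcohol dehydrogenase.\n"
    "CA   A primary alcohol + NAD(+) = an aldehyde + NADH.\n"
    "//\n"
)
record = next(Enzyme.parse(StringIO(entry)))
print(record["ID"], "->", record["DE"])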
from Bio.ExPASy import Enzyme

infile = "/Users/jagodajablonska/oxyphen/DATA/enzyme.dat"
handle = open(infile)
records = Enzyme.parse(handle)
O2_ec_list = []
existing_list = open("/Users/jagodajablonska/oxyphen/DATA/oxygen_ecclasses").read().splitlines()
for record in records:
    EC_num = record['ID']
    reaction = record['CA']
    if "=" in reaction:
        substrates = [x.strip() for x in reaction.split("=")[0].split("+")]
        products = [x.strip() for x in reaction.split("=")[1].split("+")]
        if "O(2)" in substrates or "O(2)" in products:
            if EC_num not in existing_list:
                print(EC_num)
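# Illustrative check (the reaction string is a sketch) of the membership test
# the loop above relies on: a plain '+' split is crude, it also cuts inside
# 'NAD(+)' and 'H(+)', but the exact token 'O(2)' still survives intact.
reaction = "NADH + H(+) + O(2) = NAD(+) + H(2)O(2)"
substrates = [x.strip() for x in reaction.split("=")[0].split("+")]
print(substrates)            # ['NADH', 'H(', ')', 'O(2)']
print("O(2)" in substrates)  # True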
# Reads an ExPASy Enzyme .dat file and writes a pandas data frame where the first column is
# EC number, the second column is the reaction description, the third column is the associated
# sequenceID ids separated by '|', and the fourth column indicates whether the reactions described
# by this EC have been transferred to other EC numbers.
if not os.path.exists(os.path.join("database", "enzyme", "enzyme.dat")):
    curl_enzyme = os.path.join("ftp://ftp.expasy.org", "databases", "enzyme", "enzyme.dat")
    subprocess.check_output("wget -cq -P database/enzyme " + curl_enzyme, shell=True)
if not os.path.exists(os.path.join("database", "enzyme", "enzyme.dat")):
    print("Missing enzyme database!")
    exit(1)
input_name = os.path.join("database", "enzyme", "enzyme.dat")
output_name = os.path.join("database", "enzyme", "enzyme.tsv")
records = Enzyme.parse(open(input_name))
out = dict()  # dict of dicts, first key: EC number, second key: field
transferred = dict()  # dict of lists
for record in records:
    if 'Transferred entry:' in record['DE']:
        record['DE'] = record['DE'].rstrip('.')
        record['DE'] = record['DE'].replace('Transferred entry:', ' ')
        record['DE'] = record['DE'].replace(',', ' ')
        record['DE'] = record['DE'].replace('and', ' ')
        point_to = record['DE'].split()
        transferred[record['ID']] = point_to
    else:
        out[record['ID']] = dict()
        out[record['ID']]['sequenceID'] = '|'.join([x[0] for x in record['DR']])
        out[record['ID']]['description'] = record['DE']
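# Minimal sketch of the 'Transferred entry' normalisation performed above
# (the DE text is the kind of value enzyme.dat carries for a transferred EC):
# strip the label and punctuation, then split out the target EC numbers.
de = "Transferred entry: 1.1.1.198, 1.1.1.227 and 1.1.1.228."
de = de.rstrip('.')
de = de.replace('Transferred entry:', ' ').replace(',', ' ').replace('and', ' ')
print(de.split())  # ['1.1.1.198', '1.1.1.227', '1.1.1.228']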
def get_enzyme_ecs(level):
    '''
    Reads an ExPASy Enzyme .dat file and writes a tab separated file where the first column is
    EC number, the second column is the reaction description, the third column is the associated
    uniprot ids separated by '|', and the fourth column indicates whether the reactions described
    by this EC have been transferred to other EC numbers.
    '''
    if not os.path.exists(os.path.join("database", "enzyme", "enzyme.dat")):
        curl_enzyme = os.path.join("ftp://ftp.expasy.org", "databases", "enzyme", "enzyme.dat")
        subprocess.check_output("wget -cq -P database/enzyme " + curl_enzyme, shell=True)
    if not os.path.exists(os.path.join("database", "enzyme", "enzyme.dat")):
        print("Missing enzyme database!")
        exit(1)
    input_name = os.path.join("database", "enzyme", "enzyme.dat")
    output_name = os.path.join("database", "enzyme", "enzyme.tsv")
    records = Enzyme.parse(open(input_name))
    out = dict()  # dict of dicts, first key: EC number, second key: field
    transferred = dict()  # dict of lists
    for record in records:
        if 'Transferred entry:' in record['DE']:
            record['DE'] = record['DE'].rstrip('.')
            record['DE'] = record['DE'].replace('Transferred entry:', ' ')
            record['DE'] = record['DE'].replace(',', ' ')
            record['DE'] = record['DE'].replace('and', ' ')
            point_to = record['DE'].split()
            transferred[record['ID']] = point_to
        else:
            out[record['ID']] = dict()
            out[record['ID']]['uniprot'] = '|'.join([x[0] for x in record['DR']])
            out[record['ID']]['description'] = record['DE']
            out[record['ID']]['transferred'] = False
    for id in transferred:
        out[id] = dict()
        out[id]['uniprot'] = '|'.join([out[x]['uniprot'] for x in transferred[id]])
        out[id]['description'] = 'Transferred entry: ' + ' '.join(transferred[id])
        out[id]['transferred'] = True
    df = pd.DataFrame.from_dict(out, orient='index')
    df.index.name = 'EC'
    # write all data to the enzyme.tsv file
    df.to_csv(output_name, sep='\t')
    #df = pd.read_table(output_name)
    # ignore EC numbers with no uniprot ids associated
    df.dropna(subset=['uniprot'], inplace=True)
    # ignore EC numbers that are obsolete due to transfer
    df = df[df.transferred == False]
    all_ECs = list(set(df.index.values))
    if 4 - int(level) == 0:
        all_ECs = [ec for ec in all_ECs
                   if len([x for x in ec.split(".") if x != "-"]) == int(level)]
    else:
        all_ECs = ['.'.join(ec.split('.')[:-4 + int(level)]) for ec in all_ECs
                   if len([x for x in ec.split(".")[:-4 + int(level)] if x != "-"]) == int(level)]
    return list(set(all_ECs))
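# Worked sketch (hypothetical inputs) of the level filter at the end of
# get_enzyme_ecs: at level 2 each EC is truncated to its first two fields,
# and entries that are still dashes at that depth are dropped.
level = 2
all_ECs = ["1.1.1.1", "2.3.-.-", "4.-.-.-"]
trimmed = ['.'.join(ec.split('.')[:-4 + level]) for ec in all_ECs
           if len([x for x in ec.split(".")[:-4 + level] if x != "-"]) == level]
print(sorted(set(trimmed)))  # ['1.1', '2.3']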
def get_expasy_enzyme():
    """Download enzyme.dat from ExPASy, map reaction compounds to ChEBI, and dump annotations."""
    url = "ftp://ftp.expasy.org/databases/enzyme/enzyme.dat"
    print("Retrieving enzyme records from Expasy Enzyme")
    enzyme = urllib.request.urlretrieve(url)
    enzyme_p = bee.parse(open(enzyme[0], 'r'))
    chebiout = open('chebi_list.txt', 'w')
    annotations = open('annotations_out.txt', 'w')
    enz_records = []
    chebi_list = []
    count = 0
    for record in enzyme_p:
        enz_rec = {}
        count += 1
        print(count)
        enz_rec['ECNumber'] = record['ID']
        enz_rec['Reaction(s)'] = []
        enz_rec['Substrates'] = {}
        enz_rec['Products'] = {}
        # split on '.' to separate multiple reactions
        reaction1 = record['CA'].split('.')
        for rxn in reaction1:
            try:
                if len(reaction1) > 2:
                    rxn = rxn[3:]  # drop the '(n)' reaction marker
                enz_rec['Reaction(s)'].append(rxn)
                # split reactions into [substrates, products]
                constituents = rxn.split('=')
                # split each side of the reaction on '+' but not '(+)'
                r = re.compile(r'(?:[^\+(]|\([^)]*\))+')
                subr = r.findall(constituents[0])
                for sub in subr:
                    sub = replace_strings(sub.strip())
                    schebi = link_compound2chebi(sub)
                    enz_rec['Substrates'][sub] = schebi
                    if schebi:
                        chebi_list.append(schebi)
                prodr = r.findall(constituents[-1])
                for prod in prodr:
                    prod = replace_strings(prod.strip())
                    pchebi = link_compound2chebi(prod)
                    enz_rec['Products'][prod] = pchebi
                    if pchebi:
                        chebi_list.append(pchebi)
            except Exception as e:
                print(e)
                continue
        enz_records.append(enz_rec)
    print(chebi_list, file=chebiout)
    print(enz_records, file=annotations)
    return enz_records
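# Sketch of the multi-reaction handling above, assuming the parsed CA text
# joins numbered reactions as '(1) ....(2) ....' so that rxn[3:] drops the
# '(n)' marker; the reaction strings here are placeholders.
ca = "(1) A + B = C.(2) D = E + F."
reactions = [p[3:].strip() for p in ca.split('.') if p]
print(reactions)  # ['A + B = C', 'D = E + F']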
def get_knndataset(cdhit, output_dir, database):
    '''
    Reads an ExPASy Enzyme .dat file and writes a tab separated file where the first column is
    EC number, the second column is the reaction description, the third column is the associated
    uniprot ids separated by '|', and the fourth column indicates whether the reactions described
    by this EC have been transferred to other EC numbers.
    '''
    if not os.path.exists(os.path.join("database", "enzyme", "enzyme.dat")):
        curl_enzyme = os.path.join("ftp://ftp.expasy.org", "databases", "enzyme", "enzyme.dat")
        subprocess.check_output("wget -cq -P database/enzyme " + curl_enzyme, shell=True)
    if not os.path.exists(os.path.join("database", "enzyme", "enzyme.dat")):
        print("Missing enzyme database!")
        exit(1)
    input_name = os.path.join("database", "enzyme", "enzyme.dat")
    output_name = os.path.join("database", "enzyme", "enzyme.tsv")
    records = Enzyme.parse(open(input_name))
    out = dict()  # dict of dicts, first key: EC number, second key: field
    transferred = dict()  # dict of lists
    for record in records:
        if 'Transferred entry:' in record['DE']:
            record['DE'] = record['DE'].rstrip('.')
            record['DE'] = record['DE'].replace('Transferred entry:', ' ')
            record['DE'] = record['DE'].replace(',', ' ')
            record['DE'] = record['DE'].replace('and', ' ')
            point_to = record['DE'].split()
            transferred[record['ID']] = point_to
        else:
            out[record['ID']] = dict()
            out[record['ID']]['uniprot'] = '|'.join([x[0] for x in record['DR']])
            out[record['ID']]['description'] = record['DE']
            out[record['ID']]['transferred'] = False
    for id in transferred:
        out[id] = dict()
        out[id]['uniprot'] = '|'.join([out[x]['uniprot'] for x in transferred[id]])
        out[id]['description'] = 'Transferred entry: ' + ' '.join(transferred[id])
        out[id]['transferred'] = True
    df = pd.DataFrame.from_dict(out, orient='index')
    df.index.name = 'EC'
    # write all data to the enzyme.tsv file
    df.to_csv(output_name, sep='\t')
    # ignore EC numbers with no uniprot ids associated
    df.dropna(subset=['uniprot'], inplace=True)
    # ignore EC numbers that are obsolete due to transfer
    df = df[df.transferred == False]
    # construct a dictionary from the dataframe
    mydic = df.to_dict()
    enzyme_protIDS = [mydic["uniprot"][ec].split("|") for ec in mydic["uniprot"].keys()]
    enzyme_protIDS = list(set(reduce(lambda x, y: x + y, enzyme_protIDS)))
    enzyme_protIDS = [elt.strip(" \n\t\r") for elt in enzyme_protIDS if elt != ""]
    dic_ecs = dict()
    dic_ecs["1."] = set()
    dic_ecs["2."] = set()
    dic_ecs["3."] = set()
    dic_ecs["4."] = set()
    dic_ecs["5."] = set()
    dic_ecs["6."] = set()
    csvfile = open('uniprot-reviewed%3Ayes.tab', 'r')
    readCSV = csv.reader(csvfile, delimiter='\t')
    non_valids_enzyme = set()
    dic_sp = dict()
    for row in readCSV:
        if row[0] != "Entry":
            seqID = row[0]
            seqName = row[3]
            seqLength = row[6]
            dic_sp[seqID] = dict()
            dic_sp[seqID]['name'] = seqName
            dic_sp[seqID]['length'] = seqLength
    #===================================================o========================================
    # Selection rules for the Main functional classes
    #===================================================o========================================
    # step 1
    # those enzymes whose sequences were annotated with 'fragment' were excluded
    # those enzymes whose sequences had less than 50 amino acids were excluded
    for ec in mydic["description"].keys():
        uniprot_iDs = mydic["uniprot"][ec]
        protIDs = uniprot_iDs.strip(" \n\t\r").split("|")
        protIDs = [elt for elt in protIDs if elt != ""]
        frag_seqs = list()
        short_seqs = list()
        for seqID in protIDs:
            if "Fragment" in dic_sp[seqID]['name']:
                frag_seqs.append(seqID)
            if int(dic_sp[seqID]['length']) < 50:
                short_seqs.append(seqID)
        protIDs = [e for e in protIDs if e not in frag_seqs and e not in short_seqs]
        if ec.startswith("1."):
            dic_ecs["1."].update(protIDs)
        elif ec.startswith("2."):
            dic_ecs["2."].update(protIDs)
        elif ec.startswith("3."):
            dic_ecs["3."].update(protIDs)
        elif ec.startswith("4."):
            dic_ecs["4."].update(protIDs)
        elif ec.startswith("5."):
            dic_ecs["5."].update(protIDs)
        elif ec.startswith("6."):
            dic_ecs["6."].update(protIDs)
        non_valids_enzyme.update(frag_seqs)
        non_valids_enzyme.update(short_seqs)
    # step 2
    # for uniqueness, those enzymes that occur in two or more classes were excluded
    # (record the intersection before taking the difference, otherwise it is always empty)
    for ec in ["2.", "3.", "4.", "5.", "6."]:
        non_valids_enzyme.update(dic_ecs["1."].intersection(dic_ecs[ec]))
        dic_ecs["1."] = dic_ecs["1."].difference(dic_ecs[ec])
    for ec in ["1.", "3.", "4.", "5.", "6."]:
        non_valids_enzyme.update(dic_ecs["2."].intersection(dic_ecs[ec]))
        dic_ecs["2."] = dic_ecs["2."].difference(dic_ecs[ec])
    for ec in ["2.", "1.", "4.", "5.", "6."]:
        non_valids_enzyme.update(dic_ecs["3."].intersection(dic_ecs[ec]))
        dic_ecs["3."] = dic_ecs["3."].difference(dic_ecs[ec])
    for ec in ["2.", "3.", "1.", "5.", "6."]:
        non_valids_enzyme.update(dic_ecs["4."].intersection(dic_ecs[ec]))
        dic_ecs["4."] = dic_ecs["4."].difference(dic_ecs[ec])
    for ec in ["2.", "3.", "4.", "1.", "6."]:
        non_valids_enzyme.update(dic_ecs["5."].intersection(dic_ecs[ec]))
        dic_ecs["5."] = dic_ecs["5."].difference(dic_ecs[ec])
    for ec in ["2.", "3.", "4.", "5.", "1."]:
        non_valids_enzyme.update(dic_ecs["6."].intersection(dic_ecs[ec]))
        dic_ecs["6."] = dic_ecs["6."].difference(dic_ecs[ec])
    # step 3:
    # to reduce the homology bias, a redundancy cutoff was applied with the cd-hit program to winnow
    # those sequences which have >=40% sequence identity to any other in the same functional class
    #
    # download and construct fasta files of the Main functional classes
    def split_sequence(seq, l):  # @nested function
        new_seq = ""
        if len(seq) > l:
            new_seq = seq[:l]
            k = l
            while k + l < len(seq):
                new_seq += "\n" + str(seq[k:k + l])
                k += l
            new_seq += "\n" + str(seq[k:])
            return new_seq + "\n"
        else:
            return seq + "\n"

    def run_process(list_seqs, filename):  # @nested function
        session = requests.Session()
        outfile = open(filename, "a")
        for seqID in list_seqs:
            #handle = ExPASy.get_sprot_raw(seqID.strip(" \n\r\t"))
            #record = SeqIO.read(handle, "swiss")
            #SeqIO.write(record, outfile, "fasta")
            req = "http://wwwdev.ebi.ac.uk/proteins/api/proteins?offset=0&size=100&accession=" + str(seqID)
            res = session.get(req, headers={'User-Agent': 'application/XML Mozilla/5.0 (X11; U; Linux i686) Gecko/20071127 Firefox/2.0.0.11',
                                            "content-type": "text"})
            # parse the returned XML
            uniprot = ET.fromstring(res.text)
            for isoform in uniprot.getchildren():
                # get the sequence
                iso_sequence = isoform.find('{http://uniprot.org/uniprot}sequence')
                # get the accession number
                iso_accession = isoform.find('{http://uniprot.org/uniprot}accession')
                outfile.write(">" + str(iso_accession.text) + "\n")
                outfile.write(split_sequence(str(iso_sequence.text), 60))
        outfile.close()

    def create_process(list_seqs, filename):  # @nested function
        p = Process(target=run_process, args=(list_seqs, filename,))
        p.start()
        return p

    #ec1 = create_process(dic_ecs["1."], "knnDataset/ec_1.*.faa")
    #ec2 = create_process(dic_ecs["2."], "knnDataset/ec_2.*.faa")
    #ec3 = create_process(dic_ecs["3."], "knnDataset/ec_3.*.faa")
    #ec4 = create_process(dic_ecs["4."], "knnDataset/ec_4.*.faa")
    #ec5 = create_process(dic_ecs["5."], "knnDataset/ec_5.*.faa")
    #ec6 = create_process(dic_ecs["6."],
"knnDataset/ec_6.*.faa") #===================================================o=========================================== # Selection rules for the subclasses: same screening procedures than the Main functional classes #===================================================o=========================================== # step 1 # those enzymes whose sequences were annotated with 'fragment' were excluded # those enzymes whose sequences had less than 50 amino acids were excluded dic_subclasses = dict() for ec in mydic["description"].keys(): uniprot_iDs = mydic["uniprot"][ec] protIDs = uniprot_iDs.strip(" \n\t\r").split("|") protIDs = [elt for elt in protIDs if elt != ""] frag_seqs = list() short_seqs = list() for seqID in protIDs: if "Fragment" in dic_sp[seqID]['name']: frag_seqs.append(seqID) if int(dic_sp[seqID]['length']) < 50: short_seqs.append(seqID) protIDs=[e for e in protIDs if not e in frag_seqs and not e in short_seqs] list_ec_digits = [x for x in ec.split(".") if x != "-"] if len(list_ec_digits) >= 2: ec_on_l2 = '.'.join(list_ec_digits[:2]) if ec_on_l2 in dic_subclasses: dic_subclasses[ec_on_l2].update(protIDs) else: dic_subclasses[ec_on_l2] = set(protIDs) # step 2 # for the uniqueness, those enzymes that occur in two or more classes were excluded for ec1 in dic_subclasses.keys(): for ec2 in dic_subclasses.keys(): if ec1 != ec2: dic_subclasses[ec1] = dic_subclasses[ec1].difference(dic_subclasses[ec2]) #print(len(dic_subclasses)) excluded_ecs = list() for ec in dic_subclasses: if len(dic_subclasses[ec]) < 10: excluded_ecs.append(ec) dic_subclasses = {k: v for k, v in dic_subclasses.items() if k not in excluded_ecs} # making fasta files # list_process = list() # for ec in dic_subclasses: # process = create_process(dic_subclasses[ec], os.path.join(output_dir, str(ec)+".faa")) # list_process.append(process) # for i in range(len(list_process)): # while list_process[i].is_alive(): time.sleep(60) for ec in dic_subclasses: file = open(os.path.join(output_dir, str(ec)+".ids.list"), 'w') for seqID in dic_subclasses[ec]: file.write("%s\n" % seqID) file.close() for ec in dic_subclasses: batch = os.path.join(output_dir, str(ec)+".ids.list") fasta = os.path.join(output_dir, str(ec)+".faa") print commands.getoutput("blastdbcmd -db "+ database +" -entry_batch "+ batch +" > "+ fasta) #outfile = open(os.path.join(output_dir, str(ec)+".faa"), "a") #for seqID in dic_subclasses[ec]: # handle = ExPASy.get_sprot_raw(seqID.strip(" \n\r\t")) # record = SeqIO.read(handle, "swiss") # SeqIO.write(record, outfile, "fasta") # req = "http://wwwdev.ebi.ac.uk/proteins/api/proteins?offset=0&size=100&accession="+str(seqID) # #res = requests.get(req, headers = {'User-Agent' : 'application/XML Mozilla/5.0 (X11; U; Linux i686) Gecko/20071127 Firefox/2.0.0.11'}) # print commands.getoutput("wget -cq -P "+ output_dir +" '" + req + "'") # tree = ET.parse(os.path.join(output_dir, os.path.basename(req))) # uniprot = tree.getroot() # # parse the returned XML # #uniprot = ET.fromstring(res.text) # for isoform in uniprot.getchildren(): # # get the sequence # iso_sequence = isoform.find('{http://uniprot.org/uniprot}sequence') # # get the accession number # iso_accession = isoform.find('{http://uniprot.org/uniprot}accession') # outfile.write(">"+str(iso_accession.text)+"\n") # outfile.write(split_sequence(str(iso_sequence.text), 60)) # os.remove(os.path.join(output_dir, os.path.basename(req))) #outfile.close() #process = create_process(dic_subclasses[ec], os.path.join(output_dir, str(ec)+".faa")) #while process.is_alive(): # 
        #    time.sleep(60)
    # step 3:
    # to reduce the homology bias, a redundancy cutoff was applied with the cd-hit program to winnow
    # those sequences which have >=40% sequence identity to any other in the same functional class
    # for ec in dic_subclasses:
    #     print(subprocess.getoutput(cdhit + " -i " + os.path.join(output_dir, str(ec)+".faa")
    #                                + " -d 0 -o " + os.path.join(output_dir, str(ec)+".cdhit.faa")
    #                                + " -c 0.4 -n 2 -G 1 -g 1 -b 20 -s 0.0 -aL 0.0 -aS 0.0 -T 4 -M 32000 > "
    #                                + os.path.join(output_dir, str(ec)+".out")))
    print("\tFINISHED")
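# Toy illustration (made-up IDs) of the step-2 uniqueness rule used in both
# loops above: sequences appearing in more than one class are recorded as
# non-valid and removed from every class; the intersection has to be taken
# before the difference, or it is always empty.
classes = {"1.": {"A", "B"}, "2.": {"B", "C"}}
non_valid = classes["1."] & classes["2."]  # {'B'}
classes["1."] -= non_valid
classes["2."] -= non_valid
print(classes, non_valid)  # {'1.': {'A'}, '2.': {'C'}} {'B'}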
def do_oxyphen(proteome, output_filename, ec_classes_file):
    '''
    Read and parse the enzyme.dat file.
    '''
    input_name = "DATA/enzyme.dat"
    output_name = "DATA/ec_uniprot.tsv"
    ### program ###
    handle = open(input_name)
    records = Enzyme.parse(handle)
    out = dict()  # dict of dicts, first key: EC number, second key: field
    transferred = dict()  # dict of lists
    for record in records:
        if 'Transferred entry:' in record['DE']:
            record['DE'] = record['DE'].rstrip('.')  # remove period
            record['DE'] = record['DE'].replace('Transferred entry:', ' ')  # remove title
            record['DE'] = record['DE'].replace(',', ' ')  # remove commas
            record['DE'] = record['DE'].replace('and', ' ')  # remove 'and'
            point_to = record['DE'].split()
            transferred[record['ID']] = point_to
        else:
            out[record['ID']] = dict()
            out[record['ID']]['uniprot'] = ' '.join([x[0] for x in record['DR']])
            out[record['ID']]['description'] = record['DE']
            out[record['ID']]['transferred'] = False
    # for id in transferred:
    #     out[id] = dict()
    #     out[id]['uniprot'] = ' '.join([out[x]['uniprot'] for x in transferred[id]])
    #     out[id]['description'] = 'Transferred entry: ' + ' '.join(transferred[id])
    #     out[id]['transferred'] = True
    df = pd.DataFrame.from_dict(out, orient='index')
    df.index.name = 'EC'
    df.to_csv(output_name, sep='\t')

    '''
    Take a subset of ECs of interest.
    '''
    oxidases = tuple(open("DATA/oxygen_ecclasses", "r").read().splitlines())
    infile = open("DATA/ec_uniprot.tsv", "r").readlines()
    outfile = open("DATA/ec_uniprot_oxidases.tsv", "w")
    for line in infile:
        if line.startswith("EC"):
            outfile.write(line)
        elif line.startswith(oxidases):
            outfile.write(line)
    outfile.close()

    '''
    Write a file with one uniprot ID per line, containing all of the uniprot IDs
    mentioned in the uniprot column of the input file.
    Ignore EC numbers that have been transferred.
    '''
    input = "DATA/ec_uniprot_oxidases.tsv"
    output = "DATA/uniprot_ids.txt"
    df = pd.read_table(input)
    df.dropna(subset=['uniprot'], inplace=True)  # ignore EC numbers with no uniprot ids associated
    #df = df[df.transferred == False]  # ignore EC numbers that are obsolete due to transfer
    unique_uniprot = set(" ".join(df.uniprot.values).split(" "))
    with open(output, "w") as outfile:
        for id in unique_uniprot:
            outfile.write(id + "\n")

    '''
    Make a blastdb out of the swissprot subset.
    '''
    blast_path, num_threads, multinome_folder = read_config()
    os.system(
        "%s -in DATA/sprot_subset.fasta -dbtype prot -out DATA/sprot_subset -hash_index"
        % (os.path.join(blast_path, "makeblastdb")))

    '''
    Blast our pre-selected proteomes against the uniprot subset.
    '''
    print("Performing Blast searches against oxygen-utilizing database...")
    os.system(
        "%s -max_target_seqs 1 -outfmt '6 qseqid sseqid pident evalue qcovs' -query %s -db DATA/sprot_subset -out DATA/new_sequences_sprot_enzyme.tab -num_threads %d"
        % (os.path.join(blast_path, "blastp"), proteome, num_threads))

    '''
    Filter Blast output.
    '''
    evalue = 10e-3
    identity = 40.0
    coverage = 40.0
    print("Filtering Blast output: evalue", evalue, " identity", identity, " coverage", coverage)
    hits_table_file_name = "DATA/new_sequences_sprot_enzyme.tab"
    hits_table_file_name_filtered_out = open("DATA/new_sequences_sprot_enzyme_filtered.tab", "w")
    hits_table_file_name_filtered_out.write("\t".join(["hit", "subject", "id", "len", "eval", "cov"]) + "\n")
    for line in open(hits_table_file_name, "r").read().splitlines():
        if line.startswith("#"):
            continue
        query, target, ident, eval, cover = line.split("\t")
        eval = float(eval)
        ident = float(ident)
        cover = float(cover)
        if eval <= evalue and ident >= identity and cover >= coverage:
            hits_table_file_name_filtered_out.write(line + "\n")
    hits_table_file_name_filtered_out.close()
    hits_table_file_name_filtered = "DATA/new_sequences_sprot_enzyme_filtered.tab"
    enzyme_table_file_name = 'DATA/ec_uniprot_oxidases.tsv'
    hits = pd.read_csv(hits_table_file_name_filtered, sep="\t", header=0)
    enzyme = pd.read_csv(enzyme_table_file_name, sep="\t", header=0)
    hits.fillna('', inplace=True)  # replace empty values with blank strings
    enzyme.fillna('', inplace=True)
    enzyme = enzyme[enzyme.transferred == False]  # drop transferred EC numbers
    hits.subject = hits.subject.str[3:9]  # take just the uniprot ID from the name

    def get_ecs(uniprot):
        if uniprot == '':  # ignore invalid uniprot ids
            return ''
        else:
            return ' '.join(enzyme.EC[enzyme.uniprot.str.contains(uniprot)].values)

    hits['EC'] = hits.subject.apply(get_ecs)
    output_file_name = output_filename
    hits.to_csv(output_file_name, sep="\t", index=False)
    ### read the final mapping output ###
    mapping_out = open(output_file_name, "r").read().splitlines()
    ecs_dict = {}
    for line in mapping_out[1:]:
        splitted = line.split("\t")
        ecs = splitted[-1]
        for ec in ecs.split():
            if ec not in ecs_dict:
                ecs_dict[ec] = []
            ecs_dict[ec].append(splitted[0])
    print("\n\n")
    print(len(ecs_dict), "oxygen-utilizing enzymes were found from classes", list(ecs_dict.keys()))
    ec_out = open(ec_classes_file, "w")
    ec_out.write("\t".join(ecs_dict.keys()))
    ec_out.close()
    GLOBAL_RESULTS.write(os.path.basename(proteome) + "\t" + str(len(ecs_dict)) + "\t" + ",".join(ecs_dict.keys()) + "\n")
    #print("Detailed mapping can be found in OUTPUT/oxygen_utilizing_annot.tsv file")
    #print("Executing SVM classifier...")
    infile = open("DATA/model_svm", "r").read().splitlines()
    classifier_input = []
    classes = []
    ec_classes = []
    for line in infile:
        if line.startswith("@attribute") and "class" not in line:
            ec_classes.append(line.split()[1].replace("'", ""))
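# Hedged sketch (toy DataFrame) of the get_ecs reverse lookup above: each
# BLAST hit's UniProt accession is mapped back to every EC row whose
# space-separated uniprot column mentions it. str.contains does substring
# matching, so well-formed accessions are assumed.
import pandas as pd

enzyme = pd.DataFrame({"EC": ["1.1.3.4", "1.13.11.12"],
                       "uniprot": ["P12345 Q67890", "Q67890"]})
print(' '.join(enzyme.EC[enzyme.uniprot.str.contains("Q67890")].values))
# 1.1.3.4 1.13.11.12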
    'The name of the strain in the input file. This will be used to name the output file. The default behaviour is to take the input filename minus the ".top" part.')
args = parser.parse_args()
inputFile = args.inputFile
enzymeDB = args.enzymeDB
outputDir = args.outputDir
sepGenes = args.sepGenes
sepDist = args.sepDist
minClusterSize = args.minClusterSize
strainName = args.strainName

enzymeDB_dict = {}
with open(enzymeDB) as db:
    for record in Enzyme.parse(db):
        id_ec = record["ID"]
        de = record["DE"]
        enzymeDB_dict[id_ec] = de

fileName = inputFile.split("/")[-1]
if fileName.split(".")[-1] != "top":
    sys.exit('ERROR! Wrong filetype! Input should be a ".top" file!')
if not strainName:
    strainName = fileName.split(".")[0]

# initialise dictionary to hold enzyme data for each contig
group_enzymes = defaultdict(list)
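# Usage sketch for the lookup table built above (the queried EC number is
# arbitrary): enzymeDB_dict maps each EC number to its DE description line.
print(enzymeDB_dict.get("1.1.1.1", "no such EC"))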
def get_knndataset(cdhit, output_dir, database):
    # Reads an ExPASy Enzyme .dat file and writes a pandas data frame where the first column is
    # EC number, the second column is the reaction description, the third column is the associated
    # sequenceID ids separated by '|', and the fourth column indicates whether the reactions described
    # by this EC have been transferred to other EC numbers.
    if not os.path.exists(os.path.join("database", "enzyme", "enzyme.dat")):
        curl_enzyme = os.path.join("ftp://ftp.expasy.org", "databases", "enzyme", "enzyme.dat")
        subprocess.check_output("wget -cq -P database/enzyme " + curl_enzyme, shell=True)
    if not os.path.exists(os.path.join("database", "enzyme", "enzyme.dat")):
        print("Missing enzyme database!")
        exit(1)
    input_name = os.path.join("database", "enzyme", "enzyme.dat")
    output_name = os.path.join("database", "enzyme", "enzyme.tsv")
    records = Enzyme.parse(open(input_name))
    out = dict()  # dict of dicts, first key: EC number, second key: field
    transferred = dict()  # dict of lists
    for record in records:
        if 'Transferred entry:' in record['DE']:
            record['DE'] = record['DE'].rstrip('.')
            record['DE'] = record['DE'].replace('Transferred entry:', ' ')
            record['DE'] = record['DE'].replace(',', ' ')
            record['DE'] = record['DE'].replace('and', ' ')
            point_to = record['DE'].split()
            transferred[record['ID']] = point_to
        else:
            out[record['ID']] = dict()
            out[record['ID']]['sequenceID'] = '|'.join([x[0] for x in record['DR']])
            out[record['ID']]['description'] = record['DE']
            out[record['ID']]['transferred'] = False
    for id in transferred:
        out[id] = dict()
        out[id]['sequenceID'] = '|'.join([out[x]['sequenceID'] for x in transferred[id]])
        out[id]['description'] = 'Transferred entry: ' + ' '.join(transferred[id])
        out[id]['transferred'] = True
    df = pd.DataFrame.from_dict(out, orient='index')
    df.index.name = 'EC'
    # write all data to the enzyme.tsv file
    df.to_csv(output_name, sep='\t')
    # ignore EC numbers with no sequenceID ids associated
    df.dropna(subset=['sequenceID'], inplace=True)
    # ignore EC numbers that are obsolete due to transfer
    df = df[df.transferred == False]
    # the pandas data frame is converted to a python dictionary
    mydic = df.to_dict()
    enzyme_protIDS = [mydic["sequenceID"][ec].split("|") for ec in mydic["sequenceID"].keys()]
    enzyme_protIDS = list(set(reduce(lambda x, y: x + y, enzyme_protIDS)))
    enzyme_protIDS = [elt.strip(" \n\t\r") for elt in enzyme_protIDS if elt != ""]
    dic_ecs = dict()
    dic_ecs["1"] = set()
    dic_ecs["2"] = set()
    dic_ecs["3"] = set()
    dic_ecs["4"] = set()
    dic_ecs["5"] = set()
    dic_ecs["6"] = set()
    if not os.path.exists(os.path.join("database", "uniprot", "sp.tab")):
        url = os.path.join("http://www.uniprot.org", "uniprot", "?query=reviewed:yes&format=tab")
        subprocess.check_output("wget -cq -P database/uniprot '" + url + "'", shell=True)
        subprocess.check_output("mv database/uniprot/*=tab database/uniprot/sp.tab", shell=True)
    if not os.path.exists(os.path.join("database", "uniprot", "sp.tab")):
        print("Missing uniprot database!")
        exit(1)
    csvfile = os.path.join("database", "uniprot", "sp.tab")
    readCSV = csv.reader(open(csvfile), delimiter='\t')  # csv.reader needs a file object, not a path
    non_valids_enzyme = set()
    dic_sp = dict()
    for row in readCSV:
        if row[0] != "Entry":
            seqID = row[0]
            seqName = row[3]
            seqLength = row[6]
            dic_sp[seqID] = dict()
            dic_sp[seqID]['name'] = seqName
            dic_sp[seqID]['length'] = seqLength
    #===================================================o========================================
    # Selection rules for the Main functional classes
    #===================================================o========================================
    # step 1
    # those enzymes whose sequences were annotated with 'fragment' were excluded
    # those enzymes whose sequences had less than 50 amino acids were excluded
    for ec in mydic["description"].keys():
        sequenceID_iDs = mydic["sequenceID"][ec]
        protIDs = sequenceID_iDs.strip(" \n\t\r").split("|")
        protIDs = [elt for elt in protIDs if elt != ""]
        frag_seqs = list()
        short_seqs = list()
        for seqID in protIDs:
            if "Fragment" in dic_sp[seqID]['name']:
                frag_seqs.append(seqID)
            if int(dic_sp[seqID]['length']) < 50:
                short_seqs.append(seqID)
        protIDs = [e for e in protIDs if e not in frag_seqs and e not in short_seqs]
        if ec.startswith("1"):
            dic_ecs["1"].update(protIDs)
        elif ec.startswith("2"):
            dic_ecs["2"].update(protIDs)
        elif ec.startswith("3"):
            dic_ecs["3"].update(protIDs)
        elif ec.startswith("4"):
            dic_ecs["4"].update(protIDs)
        elif ec.startswith("5"):
            dic_ecs["5"].update(protIDs)
        elif ec.startswith("6"):
            dic_ecs["6"].update(protIDs)
        non_valids_enzyme.update(frag_seqs)
        non_valids_enzyme.update(short_seqs)
    # step 2
    # for uniqueness, those enzymes that occur in two or more classes were excluded
    # (record the intersection before taking the difference, otherwise it is always empty)
    for ec in ["2", "3", "4", "5", "6"]:
        non_valids_enzyme.update(dic_ecs["1"].intersection(dic_ecs[ec]))
        dic_ecs["1"] = dic_ecs["1"].difference(dic_ecs[ec])
    for ec in ["1", "3", "4", "5", "6"]:
        non_valids_enzyme.update(dic_ecs["2"].intersection(dic_ecs[ec]))
        dic_ecs["2"] = dic_ecs["2"].difference(dic_ecs[ec])
    for ec in ["2", "1", "4", "5", "6"]:
        non_valids_enzyme.update(dic_ecs["3"].intersection(dic_ecs[ec]))
        dic_ecs["3"] = dic_ecs["3"].difference(dic_ecs[ec])
    for ec in ["2", "3", "1", "5", "6"]:
        non_valids_enzyme.update(dic_ecs["4"].intersection(dic_ecs[ec]))
        dic_ecs["4"] = dic_ecs["4"].difference(dic_ecs[ec])
    for ec in ["2", "3", "4", "1", "6"]:
        non_valids_enzyme.update(dic_ecs["5"].intersection(dic_ecs[ec]))
        dic_ecs["5"] = dic_ecs["5"].difference(dic_ecs[ec])
    for ec in ["2", "3", "4", "5", "1"]:
        non_valids_enzyme.update(dic_ecs["6"].intersection(dic_ecs[ec]))
        dic_ecs["6"] = dic_ecs["6"].difference(dic_ecs[ec])

    # the following two nested functions create processes that parallelise the fasta
    # file downloads and their passage to the cd-hit program
    def run_process(list_seqs, filename, output_dir, database, cdhit):  # @nested function
        # download and construct the fasta file for one (sub)class
        file = open(os.path.join(output_dir, filename + ".ids.list"), 'w')
        for seqID in list_seqs:
            file.write("%s\n" % seqID)
        file.close()
        fasta = os.path.join(output_dir, filename + ".faa")
        batch = os.path.join(output_dir, filename + ".ids.list")
        print(subprocess.getoutput("blastdbcmd -db " + database + " -entry_batch " + batch + " > " + fasta))
        os.remove(batch)
        # run the cd-hit program
        cdhitout = os.path.join(output_dir, filename + ".cdhit.faa")
        cdhitverbose = os.path.join(output_dir, filename + ".out")
        print(subprocess.getoutput(
            cdhit + " -i " + fasta + " -d 0 -o " + cdhitout +
            " -c 0.4 -n 2 -G 1 -g 1 -b 20 -s 0.0 -aL 0.0 -aS 0.0 -T 4 -M 32000 > " + cdhitverbose))

    def create_process(list_seqs, filename, output_dir, database, cdhit):  # @nested function
        p = Process(target=run_process, args=(list_seqs, filename, output_dir, database, cdhit,))
        p.start()
        return p

    # step 3:
    # to reduce the homology bias, a redundancy cutoff was applied with the cd-hit program to winnow
    # those sequences which have >=40% sequence identity to any other in the same functional class
    # making fasta files for the six main classes
    for ec in dic_ecs:
        create_process(dic_ecs[ec], str(ec), output_dir, database, cdhit)
    #===================================================o===========================================
    # Selection rules for the subclasses: same screening procedures as the Main functional classes
    #===================================================o===========================================
    # step 1
    # those enzymes whose sequences were annotated with 'fragment' were excluded
    # those enzymes whose sequences had less than 50 amino acids were excluded
    dic_subclasses = dict()
    for ec in mydic["description"].keys():
        sequenceID_iDs = mydic["sequenceID"][ec]
        protIDs = sequenceID_iDs.strip(" \n\t\r").split("|")
        protIDs = [elt for elt in protIDs if elt != ""]
        frag_seqs = list()
        short_seqs = list()
        for seqID in protIDs:
            if "Fragment" in dic_sp[seqID]['name']:
                frag_seqs.append(seqID)
            if int(dic_sp[seqID]['length']) < 50:
                short_seqs.append(seqID)
        protIDs = [e for e in protIDs if e not in frag_seqs and e not in short_seqs]
        list_ec_digits = [x for x in ec.split(".") if x != "-"]
        if len(list_ec_digits) >= 2:
            ec_on_l2 = '.'.join(list_ec_digits[:2])
            if ec_on_l2 in dic_subclasses:
                dic_subclasses[ec_on_l2].update(protIDs)
            else:
                dic_subclasses[ec_on_l2] = set(protIDs)
    # step 2
    # for uniqueness, those enzymes that occur in two or more classes were excluded
    for ec1 in dic_subclasses.keys():
        for ec2 in dic_subclasses.keys():
            if ec1 != ec2:
                dic_subclasses[ec1] = dic_subclasses[ec1].difference(dic_subclasses[ec2])
    #print(len(dic_subclasses))
    excluded_ecs = list()
    for ec in dic_subclasses:
        if len(dic_subclasses[ec]) < 10:
            excluded_ecs.append(ec)
    dic_subclasses = {k: v for k, v in dic_subclasses.items() if k not in excluded_ecs}
    # step 3:
    # to reduce the homology bias, a redundancy cutoff was applied with the cd-hit program to winnow
    # those sequences which have >=40% sequence identity to any other in the same functional class
    for ec in dic_subclasses:
        # making fasta files for the subclasses: retrieve the associated fasta file and
        # reduce redundancy with the cd-hit program
        create_process(dic_subclasses[ec], str(ec), output_dir, database, cdhit)
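# Worked sketch (made-up EC/protein pairs) of the level-2 subclass pooling
# used above: ECs with at least two non-dash fields are grouped under their
# 'x.y' prefix; incompletely specified ECs are skipped.
dic_subclasses = {}
for ec, prot_ids in [("1.1.1.1", {"P1"}), ("1.1.3.4", {"P2"}), ("2.-.-.-", {"P3"})]:
    digits = [x for x in ec.split(".") if x != "-"]
    if len(digits) >= 2:
        dic_subclasses.setdefault('.'.join(digits[:2]), set()).update(prot_ids)
print(dic_subclasses)  # {'1.1': {'P1', 'P2'}}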