Example #1
def test_parse_many(self):
    """Check parse function with multiple records."""
    data = ""
    for filename in ["Enzymes/lipoprotein.txt",
                     "Enzymes/proline.txt",
                     "Enzymes/valine.txt"]:
        with open(filename) as handle:
            data += handle.read()
    handle = StringIO(data)
    records = list(Enzyme.parse(handle))
    self.assertEqual(len(records), 3)
    self.assertEqual(records[0]["ID"], "3.1.1.34")
    self.assertEqual(records[1]["ID"], "5.1.1.4")
    self.assertEqual(records[2]["ID"], "4.1.1.14")
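A record yielded by Bio.ExPASy.Enzyme.parse behaves like a dict; the keys used throughout
these examples are "ID" (EC number), "DE" (description), "CA" (catalytic activity) and
"DR" (cross-references). A minimal sketch of inspecting the first record, assuming a local
copy of the flat file at the hypothetical path "data/enzyme.dat":

from Bio.ExPASy import Enzyme

with open("data/enzyme.dat") as handle:
    for record in Enzyme.parse(handle):
        print(record["ID"])      # EC number, e.g. "3.1.1.34"
        print(record["DE"])      # description / recommended name
        print(record["CA"])      # catalytic activity (reaction string)
        print(record["DR"][:3])  # list of (accession, entry name) tuples
        break                    # look at the first record only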
Example #2
def get_expasy_enzyme():
    """Download the ExPASy ENZYME flat file and parse it into a list of
    per-enzyme record dictionaries keyed by EC number."""
    url = "ftp://ftp.expasy.org/databases/enzyme/enzyme.dat"
    enzyme = urllib.request.urlretrieve(url)
    # 'bee' is presumably this module's import alias for Bio.ExPASy.Enzyme
    enzyme_p = bee.parse(open(enzyme[0], 'r'))
    enz_records = []
    count = 0
    for record in enzyme_p:
        count += 1
    
        # create a record for each enzyme, with the EC number as primary key
        enz_rec = {}
        enz_rec['ECNumber'] = record['ID']
        enz_rec['PreferedName'] = record['DE']
        enz_rec['Reaction(s)'] = []
        enz_rec['Substrates'] = {}
        enz_rec['Products'] = {}
        enz_rec['UniProt'] = {}

        # split on '.' to separate multiple reactions
        reaction1 = record['CA'].split('.')
        for rxn in reaction1:
            if len(reaction1) > 2:
                # trim the leading '(n)' counter that prefixes each reaction
                # when a record lists several
                rxn = rxn[3:]
            enz_rec['Reaction(s)'].append(rxn)
            #split reactions into [substrates, products]
            constituents = rxn.split('=')
            # split each side of reaction on '+' not '(+)'
            r = re.compile(r'(?:[^\+(]|\([^)]*\))+')
            for sub in r.findall(constituents[0]):
                sub = replace_strings(sub.strip())
                schebi = link_compound2chebi(sub)
                enz_rec['Substrates'][sub] = schebi

            for prod in r.findall(constituents[-1]):
                prod = replace_strings(prod.strip())
                pchebi = link_compound2chebi(prod)
                enz_rec['Products'][prod] = pchebi

        # populate enz_rec['UniProt'] with {uniprot_id: name} pairs for each
        # protein; this only needs to run once per record, not once per reaction
        for unpid in record['DR']:
            enz_rec['UniProt'][unpid[0]] = unpid[1]
        enz_records.append(enz_rec)

    return enz_records
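The regular expression used above splits one side of a reaction on '+' while leaving
parenthesised groups intact, so charged species such as NAD(+) survive the split. A quick
standalone check (the reaction side is illustrative):

import re

# runs of characters that are not '+' or '(', allowing complete '(...)' groups,
# so the '+' inside 'NAD(+)' does not act as a separator
r = re.compile(r'(?:[^\+(]|\([^)]*\))+')

side = "(S)-lactate + NAD(+)"
print([part.strip() for part in r.findall(side)])
# ['(S)-lactate', 'NAD(+)']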
Example #3
def test_parse_one(self):
    """Check parse function with one record."""
    with open("Enzymes/lipoprotein.txt") as handle:
        records = list(Enzyme.parse(handle))
    self.assertEqual(len(records), 1)
    self.assertEqual(records[0]["ID"], "3.1.1.34")
Example #4
def test_parse_zero(self):
    handle = StringIO("")
    records = list(Enzyme.parse(handle))
    self.assertEqual(len(records), 0)
Example #7
from Bio.ExPASy import Enzyme

infile = "/Users/jagodajablonska/oxyphen/DATA/enzyme.dat"
handle = open(infile)
records = Enzyme.parse(handle)

O2_ec_list = []
existing_list = open("/Users/jagodajablonska/oxyphen/DATA/oxygen_ecclasses").read().splitlines()


for record in records:
	EC_num = record['ID']
	reaction = record['CA']

	if "=" in reaction:
		substrates = [x.strip() for x in reaction.split("=")[0].split("+")]
		products = [x.strip() for x in reaction.split("=")[1].split("+")]

		# keep the O(2) test inside this block: otherwise the first record
		# would raise NameError and later ones would reuse stale lists
		if "O(2)" in substrates or "O(2)" in products:
			if EC_num not in existing_list:
				print(EC_num)
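Note that, unlike Example #2, this snippet splits on a bare '+', which would mangle species
like NAD(+); that is harmless here because the test only looks for "O(2)", which contains
no '+'. A quick illustration (both reaction strings are made up for the demo):

reaction = "AH(2) + O(2) = A + H(2)O(2)"
print([x.strip() for x in reaction.split("=")[0].split("+")])
# ['AH(2)', 'O(2)']

print([x.strip() for x in "(S)-lactate + NAD(+)".split("+")])
# ['(S)-lactate', 'NAD(', ')']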



Example #8
#Reads an ExPASy ENZYME .dat file and writes a pandas data frame where the first column is
#EC number, the second column is the reaction description, the third column is the associated
#sequence IDs separated by '|', and the fourth column indicates whether the reactions described
#by this EC have been transferred to other EC numbers.
if not os.path.exists(os.path.join("database", "enzyme", "enzyme.dat")):
    curl_enzyme = os.path.join("ftp://ftp.expasy.org", "databases", "enzyme",
                               "enzyme.dat")
    subprocess.check_output("wget -cq -P database/enzyme " + curl_enzyme,
                            shell=True)
if not os.path.exists(os.path.join("database", "enzyme", "enzyme.dat")):
    print("%s\n", "Missing enzyme database!")
    exit(0)
input_name = os.path.join("database", "enzyme", "enzyme.dat")
output_name = os.path.join("database", "enzyme", "enzyme.tsv")
records = Enzyme.parse(open(input_name))
out = dict()  # dict of dicts, first key: EC number, second key: field
transferred = dict()  #dict of lists
for record in records:
    if 'Transferred entry:' in record['DE']:
        record['DE'] = record['DE'].rstrip('.')
        record['DE'] = record['DE'].replace('Transferred entry:', ' ')
        record['DE'] = record['DE'].replace(',', ' ')
        record['DE'] = record['DE'].replace('and', ' ')
        point_to = record['DE'].split()
        transferred[record['ID']] = point_to
    else:
        out[record['ID']] = dict()
        out[record['ID']]['sequenceID'] = '|'.join(
            [x[0] for x in record['DR']])
        out[record['ID']]['description'] = record['DE']
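Several of the examples here (this one, #9, #11 and #14) normalise a "Transferred entry:"
description into the list of target EC numbers by stripping the label, commas, "and", and
the trailing period. A standalone check of that normalisation on an illustrative DE line:

de = "Transferred entry: 1.1.1.198, 1.1.1.227 and 1.1.1.228."
de = de.rstrip('.')
de = de.replace('Transferred entry:', ' ')
de = de.replace(',', ' ')
de = de.replace('and', ' ')
print(de.split())
# ['1.1.1.198', '1.1.1.227', '1.1.1.228']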
Example #9
def get_enzyme_ecs(level):
    '''
    Reads an ExPASy ENZYME .dat file and writes a tab-separated file where the
    first column is EC number, the second column is the reaction description,
    the third column is the associated uniprot ids separated by '|',
    and the fourth column indicates whether the reactions described
    by this EC have been transferred to other EC numbers.
    '''
    if not os.path.exists(os.path.join("database", "enzyme", "enzyme.dat")):
        curl_enzyme = os.path.join("ftp://ftp.expasy.org", "databases",
                                   "enzyme", "enzyme.dat")
        subprocess.check_output("wget -cq -P database/enzyme " + curl_enzyme,
                                shell=True)
    if not os.path.exists(os.path.join("database", "enzyme", "enzyme.dat")):
        print("%s\n", "Missing enzyme database!")
        exit(0)

    input_name = os.path.join("database", "enzyme", "enzyme.dat")
    output_name = os.path.join("database", "enzyme", "enzyme.tsv")
    records = Enzyme.parse(open(input_name))

    out = dict()  # dict of dicts, first key: EC number, second key: field
    transferred = dict()  #dict of lists
    for record in records:
        if 'Transferred entry:' in record['DE']:
            record['DE'] = record['DE'].rstrip('.')
            record['DE'] = record['DE'].replace('Transferred entry:', ' ')
            record['DE'] = record['DE'].replace(',', ' ')
            record['DE'] = record['DE'].replace('and', ' ')
            point_to = record['DE'].split()
            transferred[record['ID']] = point_to
        else:
            out[record['ID']] = dict()
            out[record['ID']]['uniprot'] = '|'.join(
                [x[0] for x in record['DR']])
            out[record['ID']]['description'] = record['DE']
            out[record['ID']]['transferred'] = False
    for id in transferred:
        out[id] = dict()
        out[id]['uniprot'] = '|'.join(
            [out[x]['uniprot'] for x in transferred[id]])
        out[id]['description'] = 'Transferred entry: ' + ' '.join(
            transferred[id])
        out[id]['transferred'] = True
    df = pd.DataFrame.from_dict(out, orient='index')
    df.index.name = 'EC'

    # write all data in a enzyme.csv file
    df.to_csv(output_name, sep='\t')
    #df = pd.read_table(output_name)
    # ignore EC numbers with no uniprot ids associated
    df.dropna(subset=['uniprot'], inplace=True)
    # ignore EC numbers that are obsolete due to transfer
    df = df[df.transferred == False]
    all_ECs = list(set(df.index.values))
    if int(level) == 4:
        all_ECs = [
            ec for ec in all_ECs
            if len([x for x in ec.split(".") if x != "-"]) == int(level)
        ]
    else:
        all_ECs = [
            '.'.join(ec.split('.')[:int(level) - 4]) for ec in all_ECs
            if len([x for x in ec.split(".")[:int(level) - 4] if x != "-"]) == int(level)
        ]
    return list(set(all_ECs))
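The level filter above keeps only ECs that are fully specified down to the requested depth
and trims the trailing components. A quick sketch of the trimming expression with made-up
inputs and level=2:

level = 2
ecs = ["1.14.13.25", "2.7.11.1", "1.-.-.-"]
trimmed = [
    '.'.join(ec.split('.')[:level - 4]) for ec in ecs
    if len([x for x in ec.split('.')[:level - 4] if x != '-']) == level
]
print(sorted(set(trimmed)))
# ['1.14', '2.7']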
Example #10
def get_expasy_enzyme():
    """Download the ExPASy ENZYME flat file, parse it, and write ChEBI and
    annotation summaries next to the returned list of per-enzyme records."""
    url = "ftp://ftp.expasy.org/databases/enzyme/enzyme.dat"
    print("Retrieving enzyme records from Expasy Enzyme")
    enzyme = urllib.request.urlretrieve(url)
    enzyme_p = bee.parse(open(enzyme[0], 'r'))
    chebiout = open('chebi_list.txt', 'w')
    annotations = open('annotations_out.txt', 'w')

    enz_records = []
    chebi_list = []
    count = 0
    tester = []
    for record in enzyme_p:
        enz_rec = {}
        count += 1
        print(count)
        enz_rec['ECNumber'] = record['ID']
        enz_rec['Reaction(s)'] = []
        enz_rec['Substrates'] = {}
        enz_rec['Products'] = {}
        #enz_records.append(enz_rec)

        # split on '.' to separate multiple reactions
        reaction1 = record['CA'].split('.')

        for rxn in reaction1:
            try:
                if len(reaction1) > 2:
                    rxn = rxn[3:]
                enz_rec['Reaction(s)'].append(rxn)
                #split reactions into [substrates, products]
                constituents = rxn.split('=')
                # split each side of reaction on '+' not '(+)'
                r = re.compile(r'(?:[^\+(]|\([^)]*\))+')
                subr = r.findall(constituents[0])
                for sub in subr:
                    sub = sub.lstrip().rstrip()
                    sub = replace_strings(sub)
                    schebi = link_compound2chebi(sub)
                    enz_rec['Substrates'][sub] = schebi

                    if schebi:
                        chebi_list.append(schebi)
                prodr = r.findall(constituents[-1])
                for prod in prodr:
                    prod = prod.lstrip().rstrip()
                    prod = replace_strings(prod)
                    pchebi = link_compound2chebi(prod)
                    enz_rec['Products'][prod] = pchebi
                    if pchebi:
                        chebi_list.append(pchebi)
            except Exception as e:
                print(e)
                continue

        enz_records.append(enz_rec)
    print(chebi_list, file=chebiout)
    print(enz_records, file=annotations)
    return enz_records
Example #11
def get_knndataset(cdhit, output_dir, database): 
	'''
	Reads an ExPASy ENZYME .dat file and writes a tab-separated file where the first column is
	EC number, the second column is the reaction description, the third column is the associated
	uniprot ids separated by '|', and the fourth column indicates whether the reactions described
	by this EC have been transferred to other EC numbers.
	'''
	if not os.path.exists(os.path.join("database", "enzyme", "enzyme.dat")):
		curl_enzyme = os.path.join("ftp://ftp.expasy.org", "databases","enzyme", "enzyme.dat")
		subprocess.check_output("wget -cq -P database/enzyme " + curl_enzyme, shell = True)
	if not os.path.exists(os.path.join("database", "enzyme", "enzyme.dat")): 
		print ("%s\n", "Missing enzyme database!")
		exit(0)
	input_name = os.path.join("database", "enzyme", "enzyme.dat")
	output_name = os.path.join("database", "enzyme", "enzyme.tsv")
	records = Enzyme.parse(open(input_name))
	out = dict() # dict of dicts, first key: EC number, second key: field
	transferred = dict() #dict of lists
	for record in records:
		if 'Transferred entry:' in record['DE']:
			record['DE'] = record['DE'].rstrip('.')
			record['DE'] = record['DE'].replace('Transferred entry:',' ')
			record['DE'] = record['DE'].replace(',',' ')
			record['DE'] = record['DE'].replace('and',' ')
			point_to = record['DE'].split()
			transferred[record['ID']] = point_to
		else:
			out[record['ID']] = dict()
			out[record['ID']]['uniprot'] = '|'.join([x[0] for x in record['DR']])
			out[record['ID']]['description'] = record['DE']
			out[record['ID']]['transferred'] = False
	for id in transferred:
		out[id] = dict()
		out[id]['uniprot'] = '|'.join([out[x]['uniprot'] for x in transferred[id]])
		out[id]['description'] = 'Transferred entry: ' + ' '.join(transferred[id])
		out[id]['transferred'] = True
	df = pd.DataFrame.from_dict(out, orient = 'index')
	df.index.name = 'EC'
	
	# write all data in a enzyme.csv file
	df.to_csv(output_name, sep = '\t')
	
	# ignore EC numbers with no uniprot ids associated
	df.dropna(subset = ['uniprot'], inplace = True)
	
	# ignore EC numbers that are obsolete due to transfer 
	df = df[df.transferred == False]
	
	# construct a dictionnary from dataframe
	mydic = df.to_dict()
	
	enzyme_protIDS = [mydic["uniprot"][ec].split("|") for ec in mydic["uniprot"].keys()]
	enzyme_protIDS = list(set(reduce(lambda x, y: x + y, enzyme_protIDS)))
	enzyme_protIDS = [elt.strip(" \n\t\r") for elt in enzyme_protIDS if elt != ""]
	dic_ecs = dict()
	dic_ecs["1."] = set()
	dic_ecs["2."] = set()
	dic_ecs["3."] = set()
	dic_ecs["4."] = set()
	dic_ecs["5."] = set()
	dic_ecs["6."] = set()

	csvfile = open('uniprot-reviewed%3Ayes.tab', 'r')
	readCSV = csv.reader(csvfile, delimiter = '\t')
	non_valids_enzyme = set()
	dic_sp = dict()
	for row in readCSV: 
		if row[0] != "Entry":
			seqID = row[0]
			seqName = row[3]
			seqLength = row[6]
			dic_sp[seqID] = dict()
			dic_sp[seqID]['name'] = seqName
			dic_sp[seqID]['length'] = seqLength

	#===================================================o========================================
	# Selection rules for the Main functional classes							 
	#===================================================o========================================
	# step 1
	# those enzymes whose sequences were annotated with ‘‘fragment’’ were excluded
	# those enzymes whose sequences had less than 50 amino acids were excluded
	for ec in mydic["description"].keys():
		uniprot_iDs = mydic["uniprot"][ec]
		protIDs = uniprot_iDs.strip(" \n\t\r").split("|")
		protIDs = [elt for elt in protIDs if elt != ""]
		frag_seqs = list()
		short_seqs = list()
		for seqID in protIDs:
			if "Fragment" in dic_sp[seqID]['name']:
				 frag_seqs.append(seqID)
			if int(dic_sp[seqID]['length']) < 50:
				short_seqs.append(seqID)
		protIDs=[e for e in protIDs if not e in frag_seqs and not e in short_seqs]
		if ec.startswith("1."):
			dic_ecs["1."].update(protIDs)
		elif ec.startswith("2."):
			dic_ecs["2."].update(protIDs)
		elif ec.startswith("3."):
			dic_ecs["3."].update(protIDs)
		elif ec.startswith("4."):
			dic_ecs["4."].update(protIDs)
		elif ec.startswith("5."):
			dic_ecs["5."].update(protIDs)
		elif ec.startswith("6."):
			dic_ecs["6."].update(protIDs)
		non_valids_enzyme.update(frag_seqs)
		non_valids_enzyme.update(short_seqs)
	
	# step 2
	# for the uniqueness, those enzymes that occur in two or more classes were excluded
	for ec in ["2.", "3.", "4.", "5.", "6."]:
		dic_ecs["1."] = dic_ecs["1."].difference(dic_ecs[ec])
		non_valids_enzyme.update(dic_ecs["1."].intersection(dic_ecs[ec]))
	for ec in ["1.", "3.", "4.", "5.", "6."]:
		dic_ecs["2."] = dic_ecs["2."].difference(dic_ecs[ec])
		non_valids_enzyme.update(dic_ecs["2."].intersection(dic_ecs[ec]))
	for ec in ["2.", "1.", "4.", "5.", "6."]:
		dic_ecs["3."] = dic_ecs["3."].difference(dic_ecs[ec])
		non_valids_enzyme.update(dic_ecs["3."].intersection(dic_ecs[ec]))
	for ec in ["2.", "3.", "1.", "5.", "6."]:
		dic_ecs["4."] = dic_ecs["4."].difference(dic_ecs[ec])
		non_valids_enzyme.update(dic_ecs["4."].intersection(dic_ecs[ec]))
	for ec in ["2.", "3.", "4.", "1.", "6."]:
		dic_ecs["5."] = dic_ecs["5."].difference(dic_ecs[ec])
		non_valids_enzyme.update(dic_ecs["5."].intersection(dic_ecs[ec]))
	for ec in ["2.", "3.", "4.", "5.", "1."]:
		dic_ecs["6."] = dic_ecs["6."].difference(dic_ecs[ec])
		non_valids_enzyme.update(dic_ecs["6."].intersection(dic_ecs[ec]))
	
	# step 3: 
	# to reduce the homology bias, a redundancy cutoff was operated by cd-hit program to winnow
	# those sequences which have >=40% sequence identity to any other in a same functional class

	#
	# Download and construct the fasta files of the Main functional classes
	def split_sequence(seq, l):
		new_seq = ""
		if len(seq) > l:
			new_seq = seq[:l]
			k = l
			while k + l < len(seq):
				new_seq+= "\n"+str(seq[k:k+l])
				k+= l
			new_seq+= "\n" + str(seq[k:])
			return new_seq + "\n"
		else: return seq + "\n"
	def run_process(list_seqs, filename):
		# @nested function
		session = requests.Session()
		outfile = open(filename, "a")
		for seqID in list_seqs:
			#handle = ExPASy.get_sprot_raw(seqID.strip(" \n\r\t"))
			#record = SeqIO.read(handle, "swiss")
			#SeqIO.write(record, outfile, "fasta")
			req = "http://wwwdev.ebi.ac.uk/proteins/api/proteins?offset=0&size=100&accession="+str(seqID)
			res = session.get(req, headers = {'User-Agent' : 'application/XML Mozilla/5.0 (X11; U; Linux i686) Gecko/20071127 Firefox/2.0.0.11',
			"content-type":"text"})
			# parse the returned XML
			uniprot = ET.fromstring(res.text)
			for isoform in uniprot:  # iterate children; getchildren() was removed in Python 3.9
				# get the sequence
				iso_sequence = isoform.find('{http://uniprot.org/uniprot}sequence')
				# get the accession number
				iso_accession = isoform.find('{http://uniprot.org/uniprot}accession')
				outfile.write(">"+str(iso_accession.text)+"\n")
				outfile.write(split_sequence(str(iso_sequence.text), 60))		   		
		outfile.close()
	def create_process(list_seqs, filename):
		# @nested function:
		p = Process(target = run_process, args = (list_seqs, filename,))
		p.start()
		return p
	
	#ec1 = create_process(dic_ecs["1."], "knnDataset/ec_1.*.faa")
	#ec2 = create_process(dic_ecs["2."], "knnDataset/ec_2.*.faa")
	#ec3 = create_process(dic_ecs["3."], "knnDataset/ec_3.*.faa")
	#ec4 = create_process(dic_ecs["4."], "knnDataset/ec_4.*.faa")
	#ec5 = create_process(dic_ecs["5."], "knnDataset/ec_5.*.faa")
	#ec6 = create_process(dic_ecs["6."], "knnDataset/ec_6.*.faa")
	
	
	#===================================================o===========================================
	# Selection rules for the subclasses: same screening procedure as for the Main functional classes
	#===================================================o===========================================
	# step 1
	# those enzymes whose sequences were annotated with 'fragment' were excluded
	# those enzymes whose sequences had less than 50 amino acids were excluded
	dic_subclasses = dict()
	for ec in mydic["description"].keys():
		uniprot_iDs = mydic["uniprot"][ec]
		protIDs = uniprot_iDs.strip(" \n\t\r").split("|")
		protIDs = [elt for elt in protIDs if elt != ""]
		frag_seqs = list()
		short_seqs = list()
		for seqID in protIDs:
			if "Fragment" in dic_sp[seqID]['name']:
				 frag_seqs.append(seqID)
			if int(dic_sp[seqID]['length']) < 50:
				short_seqs.append(seqID)
		protIDs=[e for e in protIDs if not e in frag_seqs and not e in short_seqs]
		list_ec_digits = [x for x in ec.split(".") if x != "-"]
		if len(list_ec_digits) >= 2:
			ec_on_l2 = '.'.join(list_ec_digits[:2])
			if ec_on_l2 in dic_subclasses: dic_subclasses[ec_on_l2].update(protIDs)
			else: dic_subclasses[ec_on_l2] = set(protIDs)
	
	# step 2
	# for the uniqueness, those enzymes that occur in two or more classes were excluded
	for ec1 in dic_subclasses.keys():
		for ec2 in dic_subclasses.keys():
			if ec1 != ec2: dic_subclasses[ec1] = dic_subclasses[ec1].difference(dic_subclasses[ec2])
	#print(len(dic_subclasses))
	excluded_ecs = list()
	for ec in dic_subclasses:
		if len(dic_subclasses[ec]) < 10: excluded_ecs.append(ec)
	dic_subclasses = {k: v for k, v in dic_subclasses.items() if k not in excluded_ecs}
	
	# making fasta files
#	list_process = list()
#	for ec in dic_subclasses:
#		process = create_process(dic_subclasses[ec], os.path.join(output_dir, str(ec)+".faa"))
#		list_process.append(process)
#	for i in range(len(list_process)):
#		while list_process[i].is_alive(): time.sleep(60)
	for ec in dic_subclasses:
		file = open(os.path.join(output_dir, str(ec)+".ids.list"), 'w')
		for seqID in dic_subclasses[ec]: file.write("%s\n" % seqID)
		file.close()
	for ec in dic_subclasses:
		batch = os.path.join(output_dir, str(ec)+".ids.list")
		fasta  = os.path.join(output_dir, str(ec)+".faa")
		print(subprocess.getoutput("blastdbcmd -db "+ database +" -entry_batch "+ batch +" > "+ fasta))
		#outfile = open(os.path.join(output_dir, str(ec)+".faa"), "a")
		
		#for seqID in dic_subclasses[ec]:
#			handle = ExPASy.get_sprot_raw(seqID.strip(" \n\r\t"))
#			record = SeqIO.read(handle, "swiss")
#			SeqIO.write(record, outfile, "fasta") 
#			req = "http://wwwdev.ebi.ac.uk/proteins/api/proteins?offset=0&size=100&accession="+str(seqID)
#			#res = requests.get(req, headers = {'User-Agent' : 'application/XML Mozilla/5.0 (X11; U; Linux i686) Gecko/20071127 Firefox/2.0.0.11'})
#			print commands.getoutput("wget -cq -P "+ output_dir +" '" + req + "'")
#			tree = ET.parse(os.path.join(output_dir, os.path.basename(req)))
#			uniprot = tree.getroot()
#			# parse the returned XML
#			#uniprot = ET.fromstring(res.text)
#			for isoform in uniprot.getchildren():
#				# get the sequence
#				iso_sequence = isoform.find('{http://uniprot.org/uniprot}sequence')
#				# get the accession number
#				iso_accession = isoform.find('{http://uniprot.org/uniprot}accession')
#				outfile.write(">"+str(iso_accession.text)+"\n")
#				outfile.write(split_sequence(str(iso_sequence.text), 60))
#			os.remove(os.path.join(output_dir, os.path.basename(req)))		
		#outfile.close()
		#process = create_process(dic_subclasses[ec], os.path.join(output_dir, str(ec)+".faa"))
		#while process.is_alive():
		#	time.sleep(60)
	
	# step 3: 
	# to reduce the homology bias, a redundancy cutoff was operated by cd-hit program to winnow
	# those sequences which have >=40% sequence identity to any other in a same functional class
#	for ec in dic_subclasses:
#		print commands.getoutput(cdhit +" -i "+os.path.join(output_dir, str(ec)+".faa")
#			+" -d 0 -o "+ os.path.join(output_dir, str(ec) +".cdhit.faa")
#			+" -c 0.4 -n 2  -G 1 -g 1 -b 20 -s 0.0 -aL 0.0 -aS 0.0 -T 4 -M 32000 > "
#			+ os.path.join(output_dir, str(ec) +".out"))

	print "\tFINISHED"
Example #12
def do_oxyphen(proteome, output_filename, ec_classes_file):
    '''
    Read and parse enzyme.dat file
    '''
    input_name = "DATA/enzyme.dat"
    output_name = "DATA/ec_uniprot.tsv"

    ### program ###
    handle = open(input_name)
    records = Enzyme.parse(handle)

    out = dict()  #dict of dicts, first key: EC number, second key: field
    transferred = dict()  #dict of lists
    for record in records:
        if 'Transferred entry:' in record['DE']:
            record['DE'] = record['DE'].rstrip('.')  #remove period
            record['DE'] = record['DE'].replace('Transferred entry:',
                                                ' ')  #remove title
            record['DE'] = record['DE'].replace(',', ' ')  #remove commas
            record['DE'] = record['DE'].replace('and', ' ')  #remove and
            point_to = record['DE'].split()
            transferred[record['ID']] = point_to
        else:
            out[record['ID']] = dict()
            out[record['ID']]['uniprot'] = ' '.join(
                [x[0] for x in record['DR']])
            out[record['ID']]['description'] = record['DE']
            out[record['ID']]['transferred'] = False

    # for id in transferred:
    #     out[id] = dict()
    #     out[id]['uniprot'] = ' '.join([out[x]['uniprot'] for x in transferred[id]])
    #     out[id]['description'] = 'Transferred entry: ' + ' '.join(transferred[id])
    #     out[id]['transferred'] = True

    df = pd.DataFrame.from_dict(out, orient='index')
    df.index.name = 'EC'
    df.to_csv(output_name, sep='\t')
    '''
    Take a subset of ecs of interest
    '''

    oxidases = tuple(open("DATA/oxygen_ecclasses", "r").read().splitlines())

    infile = open("DATA/ec_uniprot.tsv", "r").readlines()
    outfile = open("DATA/ec_uniprot_oxidases.tsv", "w")

    for line in infile:
        if line.startswith("EC"):
            outfile.write(line)
        elif line.startswith(oxidases):
            outfile.write(line)

    outfile.close()
    '''
    write a file with one uniprot ID per line, containing all of the
    uniprot IDs mentioned in uniprot column of the input file

    Ignore EC numbers that have been transferred
    '''

    input = "DATA/ec_uniprot_oxidases.tsv"
    output = "DATA/uniprot_ids.txt"

    df = pd.read_table(input)
    df.dropna(subset=['uniprot'],
              inplace=True)  #ignore EC numbers with no uniprot ids associated

    #df = df[df.transferred == False] #ignore EC numbers that are obsolete due to transfer

    unique_uniprot = set(" ".join(df.uniprot.values).split(" "))

    with open(output, "w") as outfile:
        for id in unique_uniprot:
            outfile.write(id + "\n")
    '''
    Make blastdb out of the swissprot subset
    '''

    blast_path, num_threads, multinome_folder = read_config()

    os.system(
        "%s -in DATA/sprot_subset.fasta -dbtype prot -out DATA/sprot_subset -hash_index"
        % (os.path.join(blast_path, "makeblastdb")))
    '''
    Blast our pre-selected proteomes against the uniprot subset
    '''
    print "Performing Blast searches against oxygen-utilizing database..."
    os.system(
        "%s -max_target_seqs 1 -outfmt '6 qseqid sseqid pident evalue qcovs' -query %s -db DATA/sprot_subset -out DATA/new_sequences_sprot_enzyme.tab -num_threads %d"
        % (os.path.join(blast_path, "blastp"), proteome, num_threads))
    '''
    Filter Blast output.
    '''
    evalue = 10e-3
    identity = 40.0
    coverage = 40.0

    print "Filtering Blast output: evalue", evalue, " identity", identity, " coverage", coverage
    hits_table_file_name = "DATA/new_sequences_sprot_enzyme.tab"
    hits_table_file_name_filtered_out = open(
        "DATA/new_sequences_sprot_enzyme_filtered.tab", "w")

    hits_table_file_name_filtered_out.write(
        "\t".join(["hit", "subject", "id", "len", "eval", "cov"]) + "\n")

    for line in open(hits_table_file_name, "r").read().splitlines():
        if line.startswith("#"): continue

        query, target, ident, e_val, cover = line.split("\t")
        e_val = float(e_val)  # renamed from 'eval', which shadows the builtin
        ident = float(ident)
        cover = float(cover)

        if e_val <= evalue and ident >= identity and cover >= coverage:
            hits_table_file_name_filtered_out.write(line + "\n")

    hits_table_file_name_filtered_out.close()

    hits_table_file_name_filtered = "DATA/new_sequences_sprot_enzyme_filtered.tab"
    enzyme_table_file_name = 'DATA/ec_uniprot_oxidases.tsv'

    hits = pd.read_csv(hits_table_file_name_filtered, sep="\t", header=0)
    enzyme = pd.read_csv(enzyme_table_file_name, sep="\t", header=0)

    hits.fillna('', inplace=True)  #replace NaN values with empty strings
    enzyme.fillna('', inplace=True)

    enzyme = enzyme[enzyme.transferred == False]  #drop transferred EC numbers

    hits.subject = hits.subject.str[3:9]  #take just the uniprot ID from the name

    def get_ecs(uniprot):
        if uniprot == '':  #ignore invalid uniprot ids
            return ''
        else:
            return ' '.join(
                enzyme.EC[enzyme.uniprot.str.contains(uniprot)].values)

    hits['EC'] = hits.subject.apply(get_ecs)

    output_file_name = output_filename
    hits.to_csv(output_file_name, sep="\t", index=False)

    ### read final mapping output

    mapping_out = open(output_file_name, "r").read().splitlines()
    ecs_dict = {}

    for line in mapping_out[1:]:
        splitted = line.split("\t")
        ecs = splitted[-1]

        for ec in ecs.split():
            if ec not in ecs_dict:
                ecs_dict[ec] = []
            ecs_dict[ec].append(splitted[0])

    print "\n\n"
    print len(
        ecs_dict
    ), "oxygen-utilizing enzymes were found from classes", ecs_dict.keys()

    ec_out = open(ec_classes_file, "w")
    ec_out.write("\t".join(ecs_dict.keys()))

    ec_out.close()

    GLOBAL_RESULTS.write(
        os.path.basename(proteome) + "\t" + str(len(ecs_dict)) + "\t" +
        ",".join(ecs_dict.keys()) + "\n")
    #print "Detailed mapping can be found in OUTPUT/oxygen_utilizing_annot.tsv file"
    #print "Executing SVM classifier..."

    infile = open("DATA/model_svm", "r").read().splitlines()

    classifier_input = []
    classes = []
    ec_classes = []

    for line in infile:

        if line.startswith("@attribute") and "class" not in line:
            ec_classes.append(line.split()[1].replace("'", ""))
Example #13
    'The name of the strain in the input file. This will be used to name the output file. The default behaviour is to take the input filename minus the ".top" part.'
)

args = parser.parse_args()
inputFile = args.inputFile
enzymeDB = args.enzymeDB
outputDir = args.outputDir
sepGenes = args.sepGenes
sepDist = args.sepDist
minClusterSize = args.minClusterSize
strainName = args.strainName

enzymeDB_dict = {}
with open(enzymeDB) as db:
    for record in Enzyme.parse(db):
        id_ec = record["ID"]
        de = record["DE"]
        enzymeDB_dict[id_ec] = de

fileName = inputFile.split("/")[-1]
if not (fileName.split(".")[-1] == "top"):
    sys.exit('ERROR! Wrong filetype! Input should be a ".top" file!')
if not strainName:
    strainName = fileName.split(".")[0]

# open the input file (for reading by default)
#fh = open(inputFile)
# initialise dictionary to hold enzyme data for each contig
group_enzymes = defaultdict(list)
Example #14
def get_knndataset(cdhit, output_dir, database):
    #Reads an ExPASy ENZYME .dat file and writes a pandas data frame where the first column is
    #EC number, the second column is the reaction description, the third column is the associated
    #sequence IDs separated by '|', and the fourth column indicates whether the reactions described
    #by this EC have been transferred to other EC numbers.
    if not os.path.exists(os.path.join("database", "enzyme", "enzyme.dat")):
        curl_enzyme = os.path.join("ftp://ftp.expasy.org", "databases",
                                   "enzyme", "enzyme.dat")
        subprocess.check_output("wget -cq -P database/enzyme " + curl_enzyme,
                                shell=True)
    if not os.path.exists(os.path.join("database", "enzyme", "enzyme.dat")):
        print("%s\n", "Missing enzyme database!")
        exit(0)
    input_name = os.path.join("database", "enzyme", "enzyme.dat")
    output_name = os.path.join("database", "enzyme", "enzyme.tsv")
    records = Enzyme.parse(open(input_name))
    out = dict()  # dict of dicts, first key: EC number, second key: field
    transferred = dict()  #dict of lists
    for record in records:
        if 'Transferred entry:' in record['DE']:
            record['DE'] = record['DE'].rstrip('.')
            record['DE'] = record['DE'].replace('Transferred entry:', ' ')
            record['DE'] = record['DE'].replace(',', ' ')
            record['DE'] = record['DE'].replace('and', ' ')
            point_to = record['DE'].split()
            transferred[record['ID']] = point_to
        else:
            out[record['ID']] = dict()
            out[record['ID']]['sequenceID'] = '|'.join(
                [x[0] for x in record['DR']])
            out[record['ID']]['description'] = record['DE']
            out[record['ID']]['transferred'] = False
    for id in transferred:
        out[id] = dict()
        out[id]['sequenceID'] = '|'.join(
            [out[x]['sequenceID'] for x in transferred[id]])
        out[id]['description'] = 'Transferred entry: ' + ' '.join(
            transferred[id])
        out[id]['transferred'] = True
    df = pd.DataFrame.from_dict(out, orient='index')
    df.index.name = 'EC'
    # write all data in a enzyme.csv file
    df.to_csv(output_name, sep='\t')
    # ignore EC numbers with no sequenceID ids associated
    df.dropna(subset=['sequenceID'], inplace=True)
    # ignore EC numbers that are obsolete due to transfer
    df = df[df.transferred == False]

    # The pandas data frame is converted to a Python dictionary
    mydic = df.to_dict()
    enzyme_protIDS = [
        mydic["sequenceID"][ec].split("|")
        for ec in mydic["sequenceID"].keys()
    ]
    enzyme_protIDS = list(set(reduce(lambda x, y: x + y, enzyme_protIDS)))
    enzyme_protIDS = [
        elt.strip(" \n\t\r") for elt in enzyme_protIDS if elt != ""
    ]
    dic_ecs = dict()
    dic_ecs["1"] = set()
    dic_ecs["2"] = set()
    dic_ecs["3"] = set()
    dic_ecs["4"] = set()
    dic_ecs["5"] = set()
    dic_ecs["6"] = set()
    if not os.path.exists(os.path.join("database", "uniprot", "sp.tab")):
        url = os.path.join("http://www.uniprot.org", "uniprot",
                           "?query=reviewed:yes&format=tab")
        subprocess.check_output("wget -cq -P database/uniprot '" + url + "'",
                                shell=True)
        subprocess.check_output(
            "mv database/uniprot/*=tab database/uniprot/sp.tab", shell=True)
    if not os.path.exists(os.path.join("database", "uniprot", "sp.tab")):
        print("%s\n", "Missing uniprot database!")
        exit(0)
    csvfile = open(os.path.join("database", "uniprot", "sp.tab"), 'r')
    readCSV = csv.reader(csvfile, delimiter='\t')
    non_valids_enzyme = set()
    dic_sp = dict()
    for row in readCSV:
        if row[0] != "Entry":
            seqID = row[0]
            seqName = row[3]
            seqLength = row[6]
            dic_sp[seqID] = dict()
            dic_sp[seqID]['name'] = seqName
            dic_sp[seqID]['length'] = seqLength

    #===================================================o========================================
    # Selection rules for the Main functional classes
    #===================================================o========================================
    # step 1
    # those enzymes whose sequences were annotated with ‘‘fragment’’ were excluded
    # those enzymes whose sequences had less than 50 amino acids were excluded
    for ec in mydic["description"].keys():
        sequenceID_iDs = mydic["sequenceID"][ec]
        protIDs = sequenceID_iDs.strip(" \n\t\r").split("|")
        protIDs = [elt for elt in protIDs if elt != ""]
        frag_seqs = list()
        short_seqs = list()
        for seqID in protIDs:
            if "Fragment" in dic_sp[seqID]['name']:
                frag_seqs.append(seqID)
            if int(dic_sp[seqID]['length']) < 50:
                short_seqs.append(seqID)
        protIDs = [
            e for e in protIDs if not e in frag_seqs and not e in short_seqs
        ]
        if ec.startswith("1"):
            dic_ecs["1"].update(protIDs)
        elif ec.startswith("2"):
            dic_ecs["2"].update(protIDs)
        elif ec.startswith("3"):
            dic_ecs["3"].update(protIDs)
        elif ec.startswith("4"):
            dic_ecs["4"].update(protIDs)
        elif ec.startswith("5"):
            dic_ecs["5"].update(protIDs)
        elif ec.startswith("6"):
            dic_ecs["6"].update(protIDs)
        non_valids_enzyme.update(frag_seqs)
        non_valids_enzyme.update(short_seqs)

    # step 2
    # for the uniqueness, those enzymes that occur in two or more classes were excluded
    for ec in ["2", "3", "4", "5", "6"]:
        dic_ecs["1"] = dic_ecs["1"].difference(dic_ecs[ec])
        non_valids_enzyme.update(dic_ecs["1"].intersection(dic_ecs[ec]))
    for ec in ["1", "3", "4", "5", "6"]:
        dic_ecs["2"] = dic_ecs["2"].difference(dic_ecs[ec])
        non_valids_enzyme.update(dic_ecs["2"].intersection(dic_ecs[ec]))
    for ec in ["2", "1", "4", "5", "6"]:
        dic_ecs["3"] = dic_ecs["3"].difference(dic_ecs[ec])
        non_valids_enzyme.update(dic_ecs["3"].intersection(dic_ecs[ec]))
    for ec in ["2", "3", "1", "5", "6"]:
        dic_ecs["4"] = dic_ecs["4"].difference(dic_ecs[ec])
        non_valids_enzyme.update(dic_ecs["4"].intersection(dic_ecs[ec]))
    for ec in ["2", "3", "4", "1", "6"]:
        dic_ecs["5"] = dic_ecs["5"].difference(dic_ecs[ec])
        non_valids_enzyme.update(dic_ecs["5"].intersection(dic_ecs[ec]))
    for ec in ["2", "3", "4", "5", "1"]:
        dic_ecs["6"] = dic_ecs["6"].difference(dic_ecs[ec])
        non_valids_enzyme.update(dic_ecs["6"].intersection(dic_ecs[ec]))

    # the following two helper functions spawn processes to parallelize the
    # construction of the fasta files and their processing by the cd-hit program
    def run_process(list_seqs, filename, output_dir, database, cdhit):
        # @nested function
        # download and construct the fasta file for one class
        file = open(os.path.join(output_dir, filename + ".ids.list"), 'w')
        for seqID in list_seqs:
            file.write("%s\n" % seqID)
        file.close()
        fasta = os.path.join(output_dir, filename + ".faa")
        batch = os.path.join(output_dir, filename + ".ids.list")
        print(subprocess.getoutput("blastdbcmd -db " + database +
                                   " -entry_batch " + batch + " > " + fasta))
        os.remove(batch)
        # run cdhit program
        cdhitout = os.path.join(output_dir, filename + ".cdhit.faa")
        cdhitverbose = os.path.join(output_dir, filename + ".out")
        print(subprocess.getoutput(
            cdhit + " -i " + fasta + " -d 0 -o " + cdhitout +
            " -c 0.4 -n 2  -G 1 -g 1 -b 20 -s 0.0 -aL 0.0 -aS 0.0 -T 4 -M 32000 > "
            + cdhitverbose))

    def create_process(list_seqs, filename, output_dir, database, cdhit):
        # @nested function:
        p = Process(target=run_process,
                    args=(
                        list_seqs,
                        filename,
                        output_dir,
                        database,
                        cdhit,
                    ))
        p.start()
        return p

    # step 3:
    # to reduce the homology bias, a redundancy cutoff was operated by cd-hit program to winnow
    # those sequences which have >=40% sequence identity to any other in a same functional class
    # making fasta files for the six main classes
    for ec in dic_ecs:
        create_process(dic_ecs[ec], str(ec), output_dir, database, cdhit)

    #===================================================o===========================================
    # Selection rules for the subclasses: same screening procedure as for the Main functional classes
    #===================================================o===========================================
    # step 1
    # those enzymes whose sequences were annotated with 'fragment' were excluded
    # those enzymes whose sequences had less than 50 amino acids were excluded
    dic_subclasses = dict()
    for ec in mydic["description"].keys():
        sequenceID_iDs = mydic["sequenceID"][ec]
        protIDs = sequenceID_iDs.strip(" \n\t\r").split("|")
        protIDs = [elt for elt in protIDs if elt != ""]
        frag_seqs = list()
        short_seqs = list()
        for seqID in protIDs:
            if "Fragment" in dic_sp[seqID]['name']:
                frag_seqs.append(seqID)
            if int(dic_sp[seqID]['length']) < 50:
                short_seqs.append(seqID)
        protIDs = [
            e for e in protIDs if not e in frag_seqs and not e in short_seqs
        ]
        list_ec_digits = [x for x in ec.split(".") if x != "-"]
        if len(list_ec_digits) >= 2:
            ec_on_l2 = '.'.join(list_ec_digits[:2])
            if ec_on_l2 in dic_subclasses:
                dic_subclasses[ec_on_l2].update(protIDs)
            else:
                dic_subclasses[ec_on_l2] = set(protIDs)

    # step 2
    # for the uniqueness, those enzymes that occur in two or more classes were excluded
    for ec1 in dic_subclasses.keys():
        for ec2 in dic_subclasses.keys():
            if ec1 != ec2:
                dic_subclasses[ec1] = dic_subclasses[ec1].difference(
                    dic_subclasses[ec2])
    #print(len(dic_subclasses))
    excluded_ecs = list()
    for ec in dic_subclasses:
        if len(dic_subclasses[ec]) < 10: excluded_ecs.append(ec)
    dic_subclasses = {
        k: v
        for k, v in dic_subclasses.items() if k not in excluded_ecs
    }

    # step 3:
    # to reduce the homology bias, a redundancy cutoff was operated by cd-hit program to winnow
    # those sequences which have >=40% sequence identity to any other in a same functional class
    for ec in dic_subclasses:
        # making fasta files for the subclasses: after retrieving associated fasta file and
        # reducing redundancy with cd-hit program
        create_process(dic_subclasses[ec], str(ec), output_dir, database,
                       cdhit)
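A note on the uniqueness steps above (step 2 in Examples #11 and #14): the overlap between
two classes has to be captured before it is removed, since the intersection of an
already-differenced set is always empty. A tiny check of the corrected ordering:

a = {"P1", "P2", "P3"}
b = {"P2", "P3", "P4"}

# corrected order: record the overlap first, then remove it
non_valids = a.intersection(b)
a = a.difference(b)
print(sorted(non_valids), sorted(a))   # ['P2', 'P3'] ['P1']

# buggy order: the overlap is gone before it can be recorded
a2 = {"P1", "P2", "P3"}
a2 = a2.difference(b)
print(a2.intersection(b))              # set()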