def read_embl(path_to_embls: list, num_of_entries: int, exclude_csv: str, queue):
    """Read entries from a list of existing EMBL/Swiss-Prot files into a queue.

    Args:
        path_to_embls: Inputs handed one by one to ``SwissProt.parse``.
        num_of_entries: Not used by this function; kept for the callers.
        exclude_csv: Path to a CSV whose first column lists accessions to
            skip, or ``None`` to disable filtering.
        queue: Object with a ``put`` method; receives every kept entry.

    A file that raises while being parsed (or while its entries are queued)
    is reported on stdout and skipped; entries queued before the failure
    stay in the queue.
    """
    if exclude_csv is None:
        # No exclusion list: keep the inner loop free of a per-entry check.
        for input_f in path_to_embls:
            try:
                for entry in SwissProt.parse(input_f):
                    queue.put(entry)
            except Exception as e:
                print("File '{}' could not be parsed and was excluded. Reason: {}".format(input_f, e))
        return

    with open(exclude_csv, newline="") as in_f:
        # csv.reader yields [] for blank lines; skip them instead of
        # failing on row[0].
        exclude_set = {row[0] for row in csv.reader(in_f) if row}

    for input_f in path_to_embls:
        try:
            for entry in SwissProt.parse(input_f):
                # Only the primary accession is compared with the list.
                if entry.accessions[0] in exclude_set:
                    continue
                queue.put(entry)
        except Exception as e:
            print("File '{}' could not be parsed and was excluded. Reason: {}".format(input_f, e))
def access_sequence(accession):
    """Fetch a Swiss-Prot entry through ExPASy and return its sequence.

    Returns the ``sequence`` attribute of the parsed record, or ``None``
    (after printing a warning) when ``SwissProt.read`` raises ``ValueError``
    for the downloaded data.
    """
    handle = ExPASy.get_sprot_raw(accession)
    try:
        record = SwissProt.read(handle)
    except ValueError:
        # The original caught the non-existent name ``ValueException`` and
        # then fell through to an unbound ``record``.
        print("WARNING: Accession %s not found" % accession)
        return None
    return record.sequence
def write_swissprot_annotations(outf, indentation_level, uniprot, uniprot_f):
    """Write the Swiss-Prot annotations of one UniProt entry as indented text.

    Args:
        outf: Output stream passed through to ``indented_write``.
        indentation_level: Base level; annotations are written one level
            deeper, comment bodies two levels deeper.
        uniprot: Object with an ``accession`` attribute, also used to filter
            ``UniProtDatIndex`` rows.
        uniprot_f: Seekable handle on the UniProt flat file.
    """
    uniprot_dat_indices = UniProtDatIndex.objects.filter(uniprot=uniprot)
    # Prefer the index row whose accession matches; if none matches the loop
    # variable is left on the last row, which is then used as-is.
    for uniprot_dat_index in uniprot_dat_indices:
        if uniprot_dat_index.uniprot_accession == uniprot.accession:
            break
    uniprot_f.seek(uniprot_dat_index.file_char)
    # Built-in next() works on both Python 2.6+ and 3, unlike ``.next()``.
    record = next(SwissProt.parse(uniprot_f))
    inner = indentation_level + 1

    indented_write(outf, inner, "Length: %d\n" % record.sequence_length)

    if len(record.gene_name) > 0:
        for name_spec in record.gene_name.replace('\n', ' ').split('; '):
            name_type, names = name_spec.split('=')
            indented_write(outf, inner, "%s: %s\n" % (name_type, names))

    for keyword in record.keywords:
        indented_write(outf, inner, 'Keyword: %s\n' % keyword)

    for comment in record.comments:
        # Comments that start with a dashed rule are skipped.
        if comment[0:5] == '-----':
            continue
        components = comment.replace(':\n', ': ').split(': ')
        comment_type = components[0]
        comment_lines = ': '.join(components[1:]).split('\n')
        indented_write(outf, inner, "%s:\n" % comment_type)
        for line in comment_lines:
            indented_write(outf, inner + 1, "%s\n" % line)

    for cross_reference in record.cross_references:
        indented_write(
            outf, inner,
            "%s: %s\n" % (cross_reference[0], '; '.join(cross_reference[1:])))
def pull_uniprot(repull=False):
    """Map each PRO_ feature sequence in the human Swiss-Prot file to its ids.

    When ``repull`` is true the flat file next to this module is first
    rewritten with the output of ``pull_and_decompress``.

    Returns:
        A ``defaultdict(set)`` keyed by sub-sequence; each value holds
        ``UniProtKB:<accession>#<feature id>`` strings.
    """
    dat_path = os.path.join(os.path.dirname(__file__), 'uniprot_sprot_human.dat')
    if repull:
        payload = pull_and_decompress(
            'ftp.uniprot.org',
            '/pub/databases/uniprot/current_release/knowledgebase/taxonomic_divisions/',
            'uniprot_sprot_human.dat.gz')
        with open(dat_path, 'w') as dat_out:
            dat_out.write(payload)

    seq_to_idlist = defaultdict(set)
    # Only the PRO_ features are collected; the full-length sequence is not.
    with open(dat_path, 'r') as dat_in:
        for record in SwissProt.parse(dat_in):
            uniprotid = f'UniProtKB:{record.accessions[0]}'
            pro_features = []
            for feature in record.features:
                # Features with non-integer (uncertain) bounds are dropped.
                if (feature[4].startswith('PRO_')
                        and isinstance(feature[1], int)
                        and isinstance(feature[2], int)):
                    pro_features.append(feature)
            for feature in pro_features:
                # Bounds are treated as 1-based inclusive positions.
                chain = record.sequence[feature[1] - 1:feature[2]]
                seq_to_idlist[chain].add(f'{uniprotid}#{feature[4]}')
    return seq_to_idlist
def get_genes (self,gene_name=""):
    """Write every record whose gene name matches ``gene_name`` to a FASTA file.

    Records are read from ``self.fd`` and matches are written to
    ``self.fasta_file``. Nothing is done when ``gene_name`` is empty.

    The match is case-insensitive and looks only at the text that follows
    the first five characters of ``record.gene_name`` (the ``Name=`` prefix):
    the queried name must be followed directly by ``;`` or by a space, e.g.
    ``Name=Rho;`` or ``Name=rho {ECO...}``. Gene-name fields that do not
    start with ``Name=...`` are not considered.
    """
    # NOTE(review): the original was collapsed onto one line; this assumes
    # the whole body sat under the ``gene_name != ""`` check -- confirm.
    if gene_name == "":
        return

    print("Finding \"{}\" gene in Uniprot database...".format(gene_name))
    upper_name = gene_name.upper()
    accepted = (upper_name + " ", upper_name + ";")
    name_end = 5 + len(upper_name) + 1

    # ``with`` closes the FASTA file even if parsing fails half-way.
    with open(self.fasta_file, "w") as output_handle:
        for record in SwissProt.parse(self.fd):
            match = record.gene_name[5:name_end].upper()
            if match in accepted:
                print("Add protein to fasta file: " + record.entry_name + ", ...." + record.gene_name)
                output = ">" + record.entry_name + "\n" + record.sequence.format("fasta") + "\n"
                output_handle.write(output)
def get_records(ids):
    """Return the sequences of the Swiss-Prot entries named in ``ids``, in order.

    Each identifier is fetched with ``ExPASy.get_sprot_raw`` and parsed with
    ``SwissProt.read``.
    """
    return [
        SwissProt.read(ExPASy.get_sprot_raw(entry_id)).sequence
        for entry_id in ids
    ]
def download_sequences(accessions):
    """Return ``{accession: sequence}`` for the given Swiss-Prot accessions.

    Each accession is fetched with ``ExPASy.get_sprot_raw`` and parsed with
    ``SwissProt.read``; a repeated accession simply overwrites its entry.
    """
    return {
        acc: SwissProt.read(ExPASy.get_sprot_raw(acc)).sequence
        for acc in accessions
    }
def sync_query_list_with_response(response_fn, query_list):
    """Collect RefSeq/EMBL cross-references per accession and align them with the query ids.

    Args:
        response_fn: Path of a Swiss-Prot flat file.
        query_list: The identifiers that were originally queried; may contain
            isoform ids such as ``P03692-1``.

    Returns:
        Dict mapping an accession to ``{'RefSeq': [...], 'EMBL': [...]}``;
        each list holds the cross-reference fields after the database name.

    Isoform handling: a queried isoform that is missing from the response
    inherits the data of its base accession (the part before the first
    ``-``). If the base itself was not queried, the base entry is removed.
    Queries without ``-`` that are missing from the response are reported
    on stdout.
    """
    db_data = {}
    with open(response_fn, 'r') as fh:
        for record in SwissProt.parse(fh):
            refseq_refs, embl_refs = [], []
            for db_ref in record.cross_references:
                if db_ref[0] == 'RefSeq':
                    refseq_refs.append(db_ref[1:])
                elif db_ref[0] == 'EMBL':
                    embl_refs.append(db_ref[1:])
            db_data[record.accessions[0]] = {'RefSeq': refseq_refs, 'EMBL': embl_refs}

    queried = set(query_list)
    # Bases are removed only after the loop: removing them immediately made
    # a second isoform of the same base (e.g. X-2 and X-3 without X) find
    # nothing to inherit from.
    bases_to_drop = set()
    for prot in query_list:
        if prot in db_data:
            continue
        if '-' not in prot:
            print("I don't know what to do with this id: {}".format(prot))
            continue
        base_name = prot.split('-')[0]
        if base_name in db_data:
            db_data[prot] = db_data[base_name]
            if base_name not in queried:
                bases_to_drop.add(base_name)

    for base_name in bases_to_drop:
        db_data.pop(base_name, None)
    return db_data
def fetch_swp_expasy(uniprot_acc):
    """
    Fetch selected attributes of a SwissProt entry through ExPASy.

    http://biopython.org/DIST/docs/api/Bio.SwissProt.Record-class.html

    Parameters
    ----------
    uniprot_acc : str
        SwissProt accession or identifier.

    Returns
    -------
    tuple
        ``(attrib_names, swp_info_list)``: two lists of equal length, the
        first holding labels and the second the matching record values.
    """
    record = SwissProt.read(ExPASy.get_sprot_raw(uniprot_acc))

    # Label/value pairs in output order; values are read off the record in
    # this same order.
    labelled = [
        ('accessions', record.accessions),
        ('data created', record.created[0]),
        ('date created (ISO)', dating(record.created[0])),
        ('organism', record.organism),
        ('gene names', record.gene_name),
        ('description', record.description),
        ('comments', record.comments),
        ('keywords', record.keywords),
    ]
    attrib_names = [label for label, _ in labelled]
    swp_info_list = [value for _, value in labelled]
    return (attrib_names, swp_info_list)
def sequence_file(*args):
    '''Save the sequence of the protein entered in ``code`` as a FASTA file.

    The entry is fetched through ExPASy and a header of the form
    ``>sp|<code>|<entry name> <description> OS=<organism>`` is built from
    it. If the entry cannot be fetched or parsed, an error window is shown
    and nothing is written. Nothing is written either when the save dialog
    returns an empty file name.
    '''
    a = code.get()
    try:
        from Bio import ExPASy
        from Bio import SwissProt
        with ExPASy.get_sprot_raw(a) as handle:
            record = SwissProt.read(handle)
    except Exception:
        if a == "":
            open_window("No Code", "Please Insert an Uniprot Code", "#FFC3C3", '200x30')
        else:
            open_window("No Valid Code", "Please Insert a valid Uniprot Code", "#FFC3C3", '200x30')
        # Without a record there is nothing to save; the original fell
        # through here and failed on the unbound ``record``.
        return

    # Keep the first ';'-separated description field and drop everything up
    # to and including "Full=".
    descrip = record.description.split(";")[0]
    descrip = descrip[descrip.find("Full=") + 5:]
    fasta_header = ">sp|" + a + "|" + record.entry_name + " " + descrip + " OS=" + record.organism

    filename = filedialog.asksaveasfilename(defaultextension='.fasta',
                                            filetypes=[("fasta", "*.fasta")])
    if not filename:
        return

    with open(filename, "w") as text_file:
        text_file.write(fasta_header + '\n')
        text_file.write(record.sequence)
def parseBlast():
    """Summarise the top BLAST hit of every query in ``./output/blastOut.xml``.

    Returns:
        ``(blastHits, accessions)``: ``blastHits`` maps a query id to a
        ``'; '``-joined string of hit-title text and Swiss-Prot keywords;
        ``accessions`` maps a query id to the list of hit accessions used.

    Only HSPs with an e-value below ``E_VALUE_THRESH`` contribute, and only
    the first alignment of each query is examined.
    """
    # NOTE(review): the original was collapsed onto one line; the nesting of
    # the final assignment and ``break`` is reconstructed -- confirm.
    E_VALUE_THRESH = 1
    blastHits = {}
    accessions = {}

    # ``with`` closes the XML file once all records have been consumed.
    with open("./output/blastOut.xml") as result_handle:
        for blast_record in NCBIXML.parse(result_handle):
            keyword_list = []
            # Query id: second ':'-separated part of the first word.
            queryID = blast_record.query.split()[0].split(':')[1]

            for alignment in blast_record.alignments:
                for hsp in alignment.hsps:
                    if hsp.expect < E_VALUE_THRESH:
                        title = alignment.title
                        splittitle = title.split()
                        # Text before 'OS', minus its first two words.
                        raw_protein_title = title.split('OS')[0]
                        protein_title = " ".join(raw_protein_title.split()[2:])
                        keyword_list.append(protein_title)

                        # Accession: second '|'-separated field of word two.
                        accession = splittitle[1].split('|')[1]
                        accessions.setdefault(queryID, []).append(accession)

                        handle = ExPASy.get_sprot_raw(accession)
                        record = SwissProt.read(handle)
                        keyword_list += record.keywords

                blastHits[queryID] = '; '.join(keyword_list)
                break  # only take top hit for now
    return (blastHits, accessions)
def features(files):
    """Print selected features of every record in a Swiss-Prot file.

    Args:
        files: Path of the Swiss-Prot flat file to read.

    One line ``<type>,<start>-<end>,<description>`` is printed per feature
    whose type is in the list below; fields are taken from positions 0-3 of
    each feature.
    """
    ft = {'ZN_FING', 'REGION', 'METAL', 'SITE', 'SIGNAL', 'REPEAT',
          'NP_REGION', 'BINDING', 'MOTIF', 'MOD_RES', 'LIPID', 'DOMAIN',
          'DNA_BIND', 'DISULFID', 'CROSSLNK', 'CARBOHYD', 'CA_BIND',
          'ACT_SITE'}
    # ``with`` closes the file; the original left the handle open.
    with open(files) as handle:
        for record in SwissProt.parse(handle):
            for l in record.features:
                if l[0] in ft:
                    print(l[0] + ',' + str(l[1]) + '-' + str(l[2]) + ',' + l[3])
def parse_uniprot(input_file):
    """Summarise the Pfam cross-references of every record in a Swiss-Prot file.

    Args:
        input_file: Path of the flat file to read.

    Returns:
        ``(dic_pfam, dic_dom, dic_king)``, all keyed by entry name:
        ``dic_pfam`` holds the Pfam ids, each followed by ``;``;
        ``dic_dom`` the number of Pfam cross-references;
        ``dic_king`` is ``'Shared'`` if any Pfam id is a key of the
        module-level ``shared_domains``, ``'None'`` if the entry has no Pfam
        cross-reference, and ``'Unique'`` otherwise.
    """
    dic_pfam = {}
    dic_dom = {}
    dic_king = {}

    # ``with`` closes the file; the original left the handle open.
    with open(input_file) as handle:
        for record in SwissProt.parse(handle):
            entry = record.entry_name
            dic_pfam[entry] = ''
            dic_dom[entry] = 0
            dic_king[entry] = 'Unique'

            for db in record.cross_references:
                if db[0] == "Pfam":
                    dic_pfam[entry] = dic_pfam[entry] + db[1] + ";"
                    dic_dom[entry] += 1
                    if db[1] in shared_domains.keys():
                        dic_king[entry] = "Shared"

            if dic_dom[entry] == 0:
                dic_king[entry] = 'None'

    return dic_pfam, dic_dom, dic_king
def main(filename):
    """Print the GO biological-process terms of the entry named in ``filename``.

    The file content (stripped) is used as the identifier passed to
    ``ExPASy.get_sprot_raw``. A GO cross-reference counts when its third
    field starts with ``P``; the text after the first ``:`` is printed.
    """
    with open(filename) as fin:
        my_seq = fin.read().strip()

    handle = ExPASy.get_sprot_raw(my_seq)
    record = SwissProt.read(handle)

    # Split once so a term that itself contains ':' is printed whole.
    terms = [ref[2].split(':', 1)[1]
             for ref in record.cross_references
             if ref[0] == 'GO' and ref[2][0] == 'P']
    for s in terms:
        print(s)
def find_COG2(self):
    """Find records from uniprotIDs without use of keggIDs.

    Fetches the Swiss-Prot record for ``self.uprotID``, removes a leading
    ``Name=`` and a trailing ``;`` from its gene name, and returns the body
    of the ``rest.genome.jp/oc`` response for that query.
    """
    handle = ExPASy.get_sprot_raw(self.uprotID)
    record = SwissProt.read(handle)

    # The original used str.strip("Name=;"), which strips any of those
    # *characters* from both ends and so also ate leading/trailing letters
    # of the gene name itself (e.g. "Name=ade2;" became "de2").
    query = record.gene_name
    prefix = "Name="
    if query.startswith(prefix):
        query = query[len(prefix):]
    if query.endswith(";"):
        query = query[:-1]

    url_open = urllib.urlopen("http://rest.genome.jp/oc/?" + query)
    return url_open.read()
def get_SwissProt(dict, accession):
    """Fetch one Swiss-Prot record and store it in ``dict`` under ``accession``.

    On ``urllib2.HTTPError`` a message is printed and ``dict`` is left
    unchanged.
    """
    try:
        handle = ExPASy.get_sprot_raw(accession)
        record = SwissProt.read(handle)
        dict[accession] = record
    except urllib2.HTTPError:
        # Syntax valid on Python 2.6+ and 3 (was ``except X, error``).
        print(accession + ": protein not found on UniProt . ")
def main():
    """Write and print the GO biological processes of one UniProt entry.

    The id is read from ``problem_datasets/rosalind_dbpr.txt``; the process
    names go to ``output/rosalind_dbpr_out.txt`` (one per line) and are also
    printed together with the gene name.
    """
    with open('problem_datasets/rosalind_dbpr.txt', 'r') as infile:
        uni_id = infile.read().strip()

    raw_data = ExPASy.get_sprot_raw(uni_id)
    record = SwissProt.read(raw_data)  # use SwissProt.parse for multiple proteins

    # Only GO cross-references are inspected; the original tested the third
    # field of every cross-reference regardless of database.
    go = [ref[2][2:]
          for ref in record.cross_references
          if ref[0] == 'GO' and ref[2].startswith('P:')]

    with open('output/rosalind_dbpr_out.txt', 'w') as outfile:
        outfile.write('\n'.join(go))

    # First space-separated token of the gene-name field without its first
    # five characters (the "Name=" prefix).
    name = record.gene_name.split(' ')[0][5:]
    print('Gene:\n', name, ' (UniProt ID = ', uni_id,
          ')\n\nBiological Processes:\n', '\n'.join(go), sep='')
def go_in_papers(sp_path):
    """Index the GO annotations of a Swiss-Prot file by paper.

    To be used with SP data, not GOA.

    Returns:
        ``(papers, papers_prots)``.
        ``papers``: key is a paper id as returned by ``get_papers``; value
        is a list of dicts with keys ``'sp_id'`` (entry name), ``'go_id'``
        and ``'go_ec'`` (first and second element of each item returned by
        ``get_go_evidence_codes``).
        ``papers_prots``: key is a paper id; value maps an entry name to the
        number of times the paper was listed for it.
    """
    papers = {}
    papers_prots = {}

    # ``with`` closes the file; the original left the handle open.
    with open(sp_path) as sph:
        for sp_rec in SP.parse(sph):
            cur_go_recs = get_go_evidence_codes(sp_rec)
            if not cur_go_recs:
                continue
            for paper in get_papers(sp_rec):
                prot_counts = papers_prots.setdefault(paper, {})
                prot_counts[sp_rec.entry_name] = prot_counts.get(sp_rec.entry_name, 0) + 1
                for cur_go_rec in cur_go_recs:
                    d1 = dict(sp_id=sp_rec.entry_name,
                              go_id=cur_go_rec[0],
                              go_ec=cur_go_rec[1])
                    papers.setdefault(paper, []).append(d1)
    return papers, papers_prots
def test_can_parse_record_into_protein_objects(self):
    """The first parsed record is converted into the expected protein."""
    parsed_records = SwissProt.parse(self.records)
    for first_record in parsed_records:
        obj = parse_record_into_protein(first_record)
        break  # only the first record is needed

    self.assertEqual(obj.uniprot_id, "P31946")
    self.assertEqual(obj.gene_id, "YWHAB")
    self.assertEqual(obj.reviewed, True)
def file_parse():
    """Print the distinct taxonomy ids, organisms and lineages in ``uniprot.gz``.

    Three lists of first-seen distinct values are built independently
    (taxonomy id, organism name without surrounding dots, organism
    classification) and then zipped; each resulting tuple is printed in its
    default string form. Because the lists are de-duplicated separately,
    their rows line up only while the three values change together.
    """
    non_rep_id = []
    non_rep_org = []
    non_rep_tax = []

    # NOTE(review): gzip.open defaults to binary mode; confirm that
    # SwissProt.parse accepts that under the Python version in use.
    # ``with`` closes the archive; the original left the handle open.
    with gzip.open("uniprot.gz") as file:
        for swiss_record in SwissProt.parse(file):
            # NCBI ID
            tax_id = swiss_record.taxonomy_id
            if tax_id not in non_rep_id:
                non_rep_id.append(tax_id)
            # ORGANISM NAME
            organism = swiss_record.organism.strip('.')
            if organism not in non_rep_org:
                non_rep_org.append(organism)
            # TAXONOMY
            taxonomy = swiss_record.organism_classification
            if taxonomy not in non_rep_tax:
                non_rep_tax.append(taxonomy)

    for i in zip(non_rep_id, non_rep_org, non_rep_tax):
        # format(i) is the tuple's string form; the original split it into
        # characters and joined them back, which yields the same text.
        print(format(i))
def gen_uniprot_features_for_pdb(infile):
    """Print UniProt-derived features for each PDB/UniProt pair listed in ``infile``.

    Args:
        infile: Tab-separated file with three columns per line: PDB id, a
            count (unused) and ``|``-separated UniProt ids.

    For every UniProt id whose record has a ``PDB`` cross-reference equal
    (case-insensitively) to the PDB id, one tab-separated line is printed:
    PDB id, UniProt id, ``|``-joined GO ids, last InterPro id, last
    EvolutionaryTrace id, organism, last ``SUBCELLULAR LOCATION`` comment.
    """
    # ``with`` closes the input file; the original left the handle open.
    with open(infile, 'r') as in_handle:
        for line in in_handle:
            (pdb_dom, count, uniprot_ids) = line.replace('\n', '').split('\t')
            for uniprot_id in uniprot_ids.split('|'):
                data = SwissProt.read(ExPASy.get_sprot_raw(uniprot_id)).__dict__

                keep = False
                go = []
                interpro = ''
                evo_trace = ''
                for xref in data['cross_references']:
                    if xref[0] == 'GO':
                        go.append(xref[1])
                    if xref[0] == 'InterPro':
                        interpro = xref[1]
                    if xref[0] == 'EvolutionaryTrace':
                        evo_trace = xref[1]
                    if xref[0] == 'PDB' and xref[1].lower() == pdb_dom.lower():
                        keep = True
                if not keep:
                    continue

                organism = data['organism']
                loc = ''
                for comment in data['comments']:
                    if comment.startswith('SUBCELLULAR LOCATION'):
                        loc = comment

                print('%s\t%s\t%s\t%s\t%s\t%s\t%s' % (
                    pdb_dom, uniprot_id, '|'.join(go), interpro, evo_trace,
                    organism, loc))
def test_compute_features_returns_None_if_target_is_None(self):
    """compute_interaction_features yields None when the target is None."""
    parsed_records = SwissProt.parse(self.records)
    for first_record in parsed_records:
        protein = parse_record_into_protein(first_record)
        break  # only the first record is needed

    protein.save(self.session, commit=True)
    # Reload the protein from the database before using it.
    protein = Protein.query.get(protein.id)

    outcome = compute_interaction_features(protein, None)
    self.assertIsNone(outcome)
def test_compute_features_return_empty_list_if_features_are_empty(self):
    """Every feature list is empty when the protein has no annotations."""
    parsed_records = SwissProt.parse(self.records)
    for first_record in parsed_records:
        protein = parse_record_into_protein(first_record)
        break  # only the first record is needed

    # Blank out every annotation field before saving.
    protein.go_mf = None
    protein.go_bp = None
    protein.go_cc = None
    protein.interpro = None
    protein.pfam = None
    protein.keywords = None
    protein.save(self.session, commit=True)
    # Reload the protein from the database before using it.
    protein = Protein.query.get(protein.id)

    features = compute_interaction_features(protein, protein)
    expected = {
        'go_mf': [], 'go_bp': [], 'go_cc': [],
        'ulca_go_mf': [], 'ulca_go_bp': [], 'ulca_go_cc': [],
        'interpro': [], 'pfam': [], 'keywords': [],
    }
    self.assertEqual(expected, features)
def main(input_string):
    """Print the GO biological-process names of the given Swiss-Prot entry.

    A cross-reference counts when its first field is ``'GO'`` and its third
    field starts with ``'P:'``; that prefix is removed before printing.
    """
    handle = ExPASy.get_sprot_raw(input_string)
    record = SwissProt.read(handle)
    process_names = (
        xref[2][2:]
        for xref in record.cross_references
        if xref[0] == 'GO' and xref[2].startswith('P:')
    )
    for process_name in process_names:
        print(process_name)
def get_SwissProt(dict,accession):
    """Fetch one Swiss-Prot record and store it in ``dict`` under ``accession``.

    On ``urllib2.HTTPError`` a message is printed and ``dict`` is left
    unchanged.
    """
    try:
        handle = ExPASy.get_sprot_raw(accession)
        record = SwissProt.read(handle)
        dict[accession] = record
    except urllib2.HTTPError:
        # Syntax valid on Python 2.6+ and 3 (was ``except X, error``).
        print(accession + ": protein not found on UniProt . ")
def go_in_papers(sp_path):
    """Index the GO annotations of a Swiss-Prot file by paper.

    To be used with SP data, not GOA.

    Returns:
        ``(papers, papers_prots)``.
        ``papers``: key is a paper id as returned by ``get_papers``; value
        is a list of dicts with keys ``'sp_id'`` (entry name), ``'go_id'``
        and ``'go_ec'`` (first and second element of each item returned by
        ``get_go_evidence_codes``).
        ``papers_prots``: key is a paper id; value maps an entry name to the
        number of times the paper was listed for it.
    """
    papers = {}
    papers_prots = {}

    # ``with`` closes the file; the original left the handle open.
    with open(sp_path) as sph:
        for sp_rec in SP.parse(sph):
            cur_go_recs = get_go_evidence_codes(sp_rec)
            if not cur_go_recs:
                continue
            for paper in get_papers(sp_rec):
                prot_counts = papers_prots.setdefault(paper, {})
                prot_counts[sp_rec.entry_name] = prot_counts.get(sp_rec.entry_name, 0) + 1
                for cur_go_rec in cur_go_recs:
                    d1 = dict(sp_id=sp_rec.entry_name,
                              go_id=cur_go_rec[0],
                              go_ec=cur_go_rec[1])
                    papers.setdefault(paper,[]).append(d1)
    return papers, papers_prots
def _parse_features( self ):
    # Dump the features of selected-taxon records from four UniProt flat
    # files into one tab-separated file.
    #
    # Input files are files[11..14] and the output is files[16], all under
    # the module-level `path`. Each output line is the record's first
    # accession followed by the feature fields, with one extra field
    # inserted at index 3.
    #
    # NOTE(review): the original was collapsed onto one line; the nesting
    # below (in particular which `if` the final `else` belongs to) is
    # reconstructed -- confirm against the upstream file.
    print( 'uniprot flat files, to get features...' )
    with open( path + files[16], 'wt' ) as outf:
        for j in [11,12,13,14]:
            print( files[j] + '...' )
            with open(path + files[j], 'rt') as handle:
                for record in SwissProt.parse(handle):
                    # Keep only three taxonomy ids (presumably human, mouse
                    # and rat -- TODO confirm).
                    if record.taxonomy_id[0] in ['9606', '10090', '10116']:
                        accs = record.accessions
                        # pop() removes the first accession from the
                        # record's own list as well.
                        acc = accs.pop(0)
                        feats = record.features
                        for f in feats:
                            f = list(f)
                            # Insert an empty slot at index 3; the field
                            # previously at index 3 is now f[4].
                            f.insert(3, '')
                            # Case 1: f[4] contains a single '.', at its end
                            # -> move the text into f[3] and empty f[4].
                            if re.search(r'^[^\.]+\.\s*$', f[4]):
                                m = re.match(r'^(.+)\.\s*$', f[4])
                                if m:
                                    f[3] = m.group(1)
                                    f[4] = ''
                            # Case 2: "text. {braced}." -> text to f[3],
                            # braced content to f[4].
                            elif re.search(r'.+\.\s+\{', f[4]):
                                m = re.match(r'^(.+)\.\s*\{(.+)\}\.$', f[4])
                                if m:
                                    f[3] = m.group(1)
                                    f[4] = m.group(2)
                            # Case 3: "text. /rest." -> text to f[3], the
                            # part after '/' to f[4].
                            elif re.search(r'.+\.\s+\/', f[4]):
                                m = re.match(r'^(.+)\.\s*\/(.+)\.$', f[4])
                                if m:
                                    f[3] = m.group(1)
                                    f[4] = m.group(2)
                            # Otherwise: delete braces, dots and slashes
                            # from f[4].
                            else :
                                f[4] = re.sub(r'[\{\}\.\/]', '', f[4])
                            #print(f)
                            outf.write( acc + "\t" + '\t'.join(map(str, f)) + '\n')
def MouseHomolog(self, dfs):
    """Flag, per peptide, whether it occurs in the mouse homolog's sequence.

    Args:
        dfs: Sequence of DataFrames aligned with ``self.accs`` (same order,
            same length); each has a ``Sequence`` column.

    For every accession the Swiss-Prot entry ``<NAME>_MOUSE`` is looked up,
    where ``<NAME>`` is the part of the entry name before the first ``_``.
    A ``'Mouse'`` column holding the strings ``'True'``/``'False'`` is added
    to the matching DataFrame, and all such DataFrames are concatenated and
    written to ``<out_folder>/MouseHomologPeptides.xlsx``. Accessions for
    which either lookup fails are reported and skipped. No file is written
    when no homolog was found at all.
    """
    print('\nFinding mouse homologs')
    new_dfs = []
    for ind, acc in enumerate(self.accs):
        try:
            handle = ExPASy.get_sprot_raw(acc)
            record = SwissProt.read(handle)
            name = record.entry_name
        except Exception:  # narrowed from a bare except
            print('\nNo entry for', acc, ',continuing')
            continue

        try:
            mname = name.split('_')[0] + '_MOUSE'
            mhandle = ExPASy.get_sprot_raw(mname)
            mrecord = SwissProt.read(mhandle)
            mseq = mrecord.sequence
            print(f'\nFound mouse homolog for {name}: {mname}')
        except Exception:  # narrowed from a bare except
            print(f'\nNo mouse gene entry for {acc}-{name}, continuing')
            continue

        df = dfs[ind]
        mcol = []
        for row in range(len(df)):
            pepseq = df.Sequence[df.index[row]]
            print(pepseq)
            mcol.append('True' if str(pepseq) in mseq else 'False')
        df['Mouse'] = mcol
        new_dfs.append(df)

    if not new_dfs:
        # pd.concat raises on an empty list; there is nothing to export.
        print('\nNo mouse homologs found, nothing written')
        return

    df_final = pd.concat(new_dfs, sort=True)
    df_final.to_excel(self.out_folder + '/' + 'MouseHomologPeptides.xlsx', index=True)
def get(self,id):
    """Fetch a Swiss-Prot entry from the ExPASy database and parse it.

    ``id`` is the identifier handed to ``ExPASy.get_sprot_raw``; the parsed
    ``SwissProt`` record is returned.
    """
    return SwissProt.read(ExPASy.get_sprot_raw(id))
def test_parses_function_as_None_for_entry_with_no_comment(self):
    """function() gives None once every FUNCTION comment is removed."""
    parsed_records = SwissProt.parse(self.records)
    for first_record in parsed_records:
        r = first_record
        break  # only the first record is needed

    kept_comments = []
    for text in r.comments:
        if "FUNCTION: " not in text:
            kept_comments.append(text)
    r.comments = kept_comments

    self.assertEqual(function(r), None)
def swissprot_search():
    """Print the Swiss-Prot record of every accession in ``output/seq_accession.txt``.

    The first line of the file is skipped; every following non-blank line
    is taken as one accession.
    """
    with open('output/seq_accession.txt') as f:
        f.readline()  # first line is read and ignored
        for line in f:
            # Lines keep their newline when iterated; strip it so it does
            # not end up in the request.
            accession = line.strip()
            if not accession:
                continue
            handle = ExPASy.get_sprot_raw(accession)
            record = SwissProt.read(handle)
            print(record)
def load_uniprot(self):
    """Load every record of ``uniprot.txt`` into ``self.uniprot``.

    ``self.uniprot`` stays ``None`` when ``self.exists('uniprot.txt')`` is
    false; otherwise it becomes the list of parsed records.
    """
    self.uniprot = None
    if self.exists('uniprot.txt'):
        with self.open('uniprot.txt') as fp:
            self.uniprot = []
            self.uniprot.extend(SwissProt.parse(fp))
def main(argv):
    """Print the GO biological processes of the entry whose id is read from stdin.

    ``argv`` is accepted but not used. A cross-reference counts when its
    first field is ``'GO'`` and its third field starts with ``'P:'``; that
    prefix is removed before printing.
    """
    entry_id = input().strip()
    handle = ExPASy.get_sprot_raw(entry_id)
    record = SwissProt.read(handle)  # use SwissProt.parse for multiple proteins
    for xref in record.cross_references:
        if xref[0] == 'GO' and xref[2].startswith('P:'):
            print(xref[2][2:])
def getgo(id):
    """Print the GO biological-process names of a Swiss-Prot entry, one per line.

    A GO cross-reference counts when its third field starts with ``P``; the
    text after the first ``:`` is printed.
    """
    handle = ExPASy.get_sprot_raw(id)
    record = SwissProt.read(handle)
    go = [
        # Split once so a term that itself contains ':' is kept whole.
        r[2].split(":", 1)[1]
        for r in record.cross_references
        if r[0] == "GO" and r[2].startswith("P")
    ]
    print("\n".join(go))
def generate_uniprot_record(self):
    """Yield one parsed dict per accepted record across all UniProt files.

    Files come from ``self._uniprot_file_handle()`` as ``(handle, number)``
    pairs. A record is accepted when ``self._check_id_to_use`` approves its
    first accession; it is then converted with ``self._parse_record``
    together with the source derived from the file number.
    """
    for file_handle, file_number in self._uniprot_file_handle():
        data_source = self._file_number_to_source(file_number)
        for record in SwissProt.parse(file_handle):
            if not self._check_id_to_use(record.accessions[0]):
                continue
            yield self._parse_record(record, data_source)
def main():
    """Write the GO biological processes of the entry named in ``dbpr`` to ``dbpr.out``.

    The id is the first line of the file ``dbpr``. From the GO
    cross-references, third fields starting with ``P`` are kept with their
    first two characters removed, and written one per line.
    """
    with open("dbpr") as f:
        handle = ExPASy.get_sprot_raw(f.readline().strip())
    record = SwissProt.read(handle)

    go_fields = [x[2] for x in record.cross_references if x[0] == 'GO']
    processes = [x[2:] for x in go_fields if x[0] == 'P']

    # Write the file directly. The original rebound sys.stdout to a file
    # that was never closed and printed into it, which redirected all later
    # output of the process as well.
    with open("dbpr.out", "w") as out:
        out.write("\n".join(processes) + "\n")
def main(id):
    """Print the GO biological-process names of a Swiss-Prot entry.

    For each GO cross-reference whose third field has the form ``P:<name>``,
    ``<name>`` is printed.
    """
    handle = ExPASy.get_sprot_raw(id)
    record = SwissProt.read(handle)
    for cr in record.cross_references:
        if cr[0] == "GO":
            # partition splits on the first ':' only, so names containing
            # ':' are printed whole.
            prefix, _, term = cr[2].partition(":")
            if prefix == "P":
                print(term)
def main(argv):
    """Print the GO biological processes of the entry named in the file ``argv[0]``.

    The id is obtained with ``files.read_line(argv[0])``. GO
    cross-references whose third field starts with ``P:`` are printed
    without that prefix, one per line.
    """
    line = files.read_line(argv[0])
    handle = ExPASy.get_sprot_raw(line)
    record = SwissProt.read(handle)
    # startswith rather than a substring test: 'P:' can also occur inside
    # the term text of another category.
    go = [x for x in record.cross_references
          if x[0] == 'GO' and x[2].startswith('P:')]
    print('\n'.join(g[2][2:] for g in go))
def dbpr():
    """Print the GO biological processes of the entry named in ``rosalind_dbpr.txt``.

    GO cross-references whose third field starts with ``P:`` are printed
    without that prefix, one per line.
    """
    # ``with`` closes the input file; the original left it open.
    with open("rosalind_dbpr.txt") as infile:
        uniprot_id = infile.read().strip()

    handle = ExPASy.get_sprot_raw(uniprot_id)
    record = SwissProt.read(handle)

    for ref in record.cross_references:
        if ref[0] == 'GO' and ref[2].startswith('P:'):
            print(ref[2][2:])
def _parse_flat_files( self ):
    """Flatten selected-taxon records of four UniProt flat files into one TSV.

    Input files are ``files[11..14]`` and the output is ``files[15]``, all
    under the module-level ``path``. One line is written per record whose
    first taxonomy id is in the accepted list, with the columns: primary
    accession, entry name, source db (``sp``, or ``tr`` when the file name
    contains ``trembl``), taxonomy id, data class, gene name, RecName or
    SubName, first AltName Full, first AltName Short, flags, secondary
    accessions, GeneID, RefSeq (version suffix removed), HGNC, MGI, then the
    database names / ids / third fields of cross-references whose database
    is in the module-level ``xdoms``, sequence info, sequence. Multi-valued
    columns are ``|``-joined.
    """
    print( 'uniprot flat files...' )
    with open( path + files[15], 'wt' ) as outf:
        for j in [11,12,13,14]:
            print( files[j] + '...' )
            with open(path + files[j], 'rt') as handle:
                for record in SwissProt.parse(handle):
                    # Presumably human, mouse and rat -- TODO confirm.
                    if record.taxonomy_id[0] not in ['9606', '10090', '10116']:
                        continue

                    accs = record.accessions
                    # pop() removes the primary accession from the record's
                    # own list; ``accs`` then holds the secondary ones.
                    acc = accs.pop(0)
                    rev = record.data_class
                    gname = re.sub(r'.*Name=([^;{]+)[{;].*', r'\1', record.gene_name).strip()
                    uid = record.entry_name
                    taxid = record.taxonomy_id[0]
                    seq = record.sequence
                    sinfo = str(record.seqinfo[0])
                    srcdb = 'sp'
                    if re.search(r'trembl', files[j]):
                        srcdb = 'tr'

                    rname = ''
                    fname = ''
                    sname = ''
                    flags = ''
                    desc = record.description
                    # re.sub's fourth positional parameter is ``count``; the
                    # flag has to be passed by keyword to take effect.
                    if 'RecName' in desc:
                        rname = re.sub(r'.*RecName: *Full=([^;{]+)[{;].*', r'\1', desc, flags=re.IGNORECASE).strip()
                    elif 'SubName' in desc:
                        rname = re.sub(r'.*SubName: *Full=([^;{]+) *[;{].*', r'\1', desc, flags=re.IGNORECASE).strip()
                    if 'AltName' in desc:
                        if re.search(r'AltName:[^:]*Full=', desc, re.IGNORECASE):
                            fname = re.sub(r'.*AltName:[^:]*Full=([^;{]+)[{;].*', r'\1', desc, flags=re.IGNORECASE).strip()
                        if re.search(r'AltName:[^:]*Short=', desc, re.IGNORECASE):
                            sname = re.sub(r'.*AltName:[^:]*Short=([^;{]+)[{;].*', r'\1', desc, flags=re.IGNORECASE).strip()
                    if 'Flags:' in desc:
                        flags = re.sub(r'.*Flags: *([^;]+);.*', r'\1', desc, flags=re.IGNORECASE).strip()

                    refs = list()
                    eids = list()
                    mgis = list()
                    hgnc = list()
                    dids = list()
                    dnms = list()
                    ddbs = list()
                    for xref in record.cross_references:
                        # Independent checks: a database listed in ``xdoms``
                        # is recorded there in addition to any list above.
                        if xref[0] == 'GeneID':
                            eids.append(xref[1])
                        if xref[0] == 'RefSeq':
                            refs.append(re.sub(r'\.\d+$', r'', xref[1]))
                        if xref[0] == 'MGI':
                            mgis.append(xref[1])
                        if xref[0] == 'HGNC':
                            hgnc.append(xref[1])
                        if xref[0] in xdoms:
                            dids.append(xref[1])
                            ddbs.append(xref[0])
                            dnms.append(xref[2])

                    outf.write( '\t'.join([
                        acc, uid, srcdb, taxid, rev, gname, rname, fname,
                        sname, flags, '|'.join(accs), '|'.join(eids),
                        '|'.join(refs), '|'.join(hgnc), '|'.join(mgis),
                        '|'.join(ddbs), '|'.join(dids), '|'.join(dnms),
                        sinfo, seq ]) + '\n' )
def get_ancestors_list():
    """Append the first sequence of ``uniprot_sprot.dat`` to ``descriptions`` and print the list.

    ``descriptions`` is a module-level list. Only the first record of the
    file is read.
    """
    # ``with`` closes the file; the original left the handle open.
    with open("uniprot_sprot.dat") as handle:
        for record in SwissProt.parse(handle):
            descriptions.append(record.sequence)
            print(descriptions)
            break  # stop after the first record
def acession(self):
    """Fetch one Swiss-Prot record per id in ``self.ids``.

    The result is stored in ``self.rec`` and returned. An id equal to
    ``'ND'`` is not fetched; the string ``'ND'`` takes its place in the
    list.
    """
    self.rec = []
    for ide in self.ids:
        if ide == 'ND':
            self.rec.append('ND')
            continue
        fetched = ExPASy.get_sprot_raw(ide)
        self.rec.append(SwissProt.read(fetched))
    return self.rec
def get_keywords(lookup):
    """Fetch and parse the Swiss-Prot entry ``lookup``, exiting on failure.

    The process exits with status 1 if the download raises or if
    ``SwissProt.read`` raises ``ValueError``.
    """
    try:
        handle = ExPASy.get_sprot_raw(lookup)
    except Exception:
        # Narrowed from a bare except so Ctrl-C is not reported as an
        # ExPASy error.
        print("Error in ExPASy")
        sys.exit(1)
    try:
        # NOTE(review): the record is parsed but not used or returned in
        # the visible code.
        record = SwissProt.read(handle)
    except ValueError as error:
        print(error)
        sys.exit(1)
def BiologicalProcesses(UniProtID):
    """Return the GO biological-process names of an entry, newline-joined.

    Every field of a cross-reference that contains ``"GO"`` as one of its
    fields is inspected; fields starting with ``P:`` contribute the text
    after that prefix.
    """
    Handle = ExPASy.get_sprot_raw(UniProtID)
    Record = SwissProt.read(Handle)
    Processes = []
    for i in Record.cross_references:
        if "GO" in i:
            for j in i:
                if j.startswith("P:"):
                    # Drop only the "P:" prefix. The original cut at the
                    # last ':' and so truncated names containing a colon.
                    Processes.append(j[2:])
    return "\n".join(Processes)
def fetch(acc) :
    '''Download one entry from UniProt and parse it.

    Input:
        acc: accession code of the record
    Return:
        the record parsed by SwissProt.read
    '''
    import io

    base_url = 'http://www.uniprot.org/uniprot/'
    response = urllib.request.urlopen(base_url + acc + '.txt')
    # urlopen returns a binary stream; decode it so the parser receives
    # text lines.
    handle = io.TextIOWrapper(response, encoding='utf-8')
    try:
        record = SwissProt.read(handle)
    finally:
        # Closing the wrapper also closes the underlying response.
        handle.close()
    return record
def obtain_taxons(self, protein_dict, fh_sprot):
    """Fill ``protein_dict`` with the taxonomy ids of the proteins it lists.

    For each record of ``fh_sprot``, the first of its accessions that is a
    key of ``protein_dict`` gets the record's ``taxonomy_id`` list as its
    value. The whole file is always scanned. The same dict is returned.
    """
    for rec in sp.parse(fh_sprot):
        for accession in rec.accessions:
            if accession in protein_dict.keys():
                # assign rec.taxonomy_id list to the protein
                protein_dict[accession] = rec.taxonomy_id
                break
    return protein_dict
def main(protein_id):
    """Return the GO biological-process names of an entry, newline-joined.

    Every cross-reference is printed as it is visited. For GO
    cross-references whose third field has the form ``P:<name>``, ``<name>``
    is collected. The joined result has surrounding whitespace stripped.
    """
    handle = ExPASy.get_sprot_raw(protein_id)  # you can give several IDs separated by commas
    record = SwissProt.read(handle)  # use SwissProt.parse for multiple proteins
    processes = []
    for r in record.cross_references:
        print(r)
        if r[0] == "GO":
            # partition splits on the first ':' only, so names containing
            # ':' are kept whole.
            prefix, _, term = r[2].partition(":")
            if prefix == 'P':
                processes.append(term)
    return "\n".join(processes).strip()
def __init__(self, sprot_cache='', trembl_cache='', organism='h**o sapien'):
    """Set up the record cache, optionally pre-loading a Swiss-Prot file.

    Args:
        sprot_cache: Path of a Swiss-Prot flat file to pre-load; empty to
            skip loading.
        trembl_cache: Accepted but not used by this method.
        organism: Organism text, stored stripped and lower-cased in
            ``self.organism``.

    Every accession of every loaded record maps to that record in
    ``self.records``. A cache file that cannot be opened is reported on
    stdout and otherwise ignored.
    """
    # NOTE(review): the default organism text looks mangled ('h**o');
    # confirm the intended value against the upstream source.
    self.records = {}
    self.organism = organism.strip().lower()
    if sprot_cache:
        # Load the swissprot records if file can be found
        try:
            with open(sprot_cache) as fp:
                for record in SwissProt.parse(fp):
                    for accession in record.accessions:
                        self.records[accession] = record
        except IOError as e:  # valid on Python 2.6+ and 3
            print(e)
            print("SwissProt cache not loaded")
def download_entry(self, accession):
    """Download a Swiss-Prot record, cache it and return it.

    Raises:
        KeyError: if the record cannot be fetched or parsed, or if
            ``self.organism`` is not a substring of the record's
            lower-cased organism text.
    """
    try:
        handle = ExPASy.get_sprot_raw(accession)
        record = SwissProt.read(handle)
    except Exception:
        # Narrowed from a bare except so KeyboardInterrupt/SystemExit are
        # not converted into KeyError.
        raise KeyError('{}'.format(accession))

    record_org = record.organism.strip().lower()
    if self.organism not in record_org:
        message = '{} ortholog of {} not found.'.format(self.organism, accession)
        print(message)
        raise KeyError(message)

    self.records[accession] = record
    return record
def main():
    """Print the GO biological-process names of the entry given on the command line.

    The id is obtained with ``get_uniprot_id_from_file`` from the
    ``<input>`` entry of the module-level ``arguments``. For each GO
    cross-reference whose third field has the form ``P:<name>``, ``<name>``
    is printed.
    """
    uniprot_id = get_uniprot_id_from_file(arguments['<input>'])

    handle = ExPASy.get_sprot_raw(uniprot_id)
    try:
        record = SwissProt.read(handle)
    finally:
        # Close the handle even when parsing fails.
        handle.close()

    for ref in record.cross_references:
        if ref[0] != 'GO':
            continue
        # partition splits on the first ':' only. The original unpacked
        # split(':') into two names and raised on terms containing ':'.
        pre, _, val = ref[2].partition(':')
        if pre == 'P':
            print(val)
def main(fichier):
    """Print the GO biological processes of the entry named in ``fichier``.

    The id is the first line of the file. GO cross-references whose third
    field starts with ``P`` are printed with their first two characters
    removed, one per line.
    """
    from Bio import ExPASy
    from Bio import SwissProt

    # ``with`` closes the file; the original left the handle open.
    with open(fichier, 'r') as f:
        fline = f.readline().strip()

    handle = ExPASy.get_sprot_raw(fline)
    record = SwissProt.read(handle)
    go = []
    for i in record.cross_references:
        if i[0] == 'GO' and i[2][0] == 'P':
            # Slice off the two-character prefix. lstrip('P:') strips a
            # *set* of characters and would also eat a term's own leading
            # 'P' or ':' characters.
            go.append(i[2][2:])
    print('\n'.join(go))
def __build_NEXP_accession_singleSpecies(fh_sprot, taxon_id, ontType, EXP_default=set([])):
    '''
    Collect the accession lists of proteins that, for one ontology type,
    are annotated only with non-EXP evidence codes.

    A record of the UniProtKB/SwissProt file (file pointer fh_sprot)
    qualifies when it belongs to taxon_id, has at least one GO
    cross-reference in the ontology ontType, and none of those
    cross-references carries an evidence code listed in EXP_default.
    The method returns a list with one entry (the record's accession
    list) per qualifying record.
    '''
    nexp_accessions = []
    print(' Building the accession list with the proteins '
          'that have only non-EXP evidence codes at time t1 ...')
    for rec in sp.parse(fh_sprot):
        # Skip records of other taxa.
        if taxon_id not in rec.taxonomy_id:
            continue

        has_ont_annotation = False  # any GO annotation in ontType seen
        has_exp_evidence = False    # an EXP-coded one seen

        for crossRef in rec.cross_references:
            if crossRef[0] != 'GO':
                continue
            # Evidence code: text before the first ':' of the fourth field.
            evidence_code = crossRef[3].split(':')[0]
            # Ontology letter: first character of the third field.
            ontology_letter = crossRef[2][0]
            if ontology_letter != ontType:
                continue
            has_ont_annotation = True
            if evidence_code in EXP_default:
                has_exp_evidence = True
                break

        if has_ont_annotation and not has_exp_evidence:
            nexp_accessions.append(rec.accessions)
    return nexp_accessions
def obtain_goterms(self, goterm_dict, fh_sprot):
    """Add the GO annotations of listed proteins to ``goterm_dict``.

    For each record of ``fh_sprot``, the first of its accessions that is a
    key of ``goterm_dict`` receives one tuple per GO cross-reference:
    ``(GO id, evidence code, ontology letter)``, added to the set stored
    under that key. The whole file is always scanned. The same dict is
    returned.
    """
    # NOTE(review): the original was collapsed onto one line; this assumes
    # the ``break`` followed the cross-reference loop -- confirm.
    for rec in sp.parse(fh_sprot):
        for accession in rec.accessions:
            if accession not in goterm_dict.keys():
                continue
            for crossRef in rec.cross_references:
                if crossRef[0] == 'GO':
                    # Evidence code is the text before ':' in field four;
                    # the ontology letter is the first char of field three.
                    goDef = (crossRef[1],
                             (crossRef[3].split(':'))[0],
                             crossRef[2][0])
                    goterm_dict[accession].add(goDef)
            break
    return goterm_dict
def UNIPROT_GENE_PLUS(UNIPROT):
    """Return the gene name of a UniProt entry followed by its synonyms.

    Unlike UNIPROT_GENE this also returns synonym genes, if any. The first
    list element is the value of the first ``;``-separated gene-name field.
    When the gene-name text has more than two ``;``-separated parts, the
    comma-separated values of the second field follow, with all whitespace
    removed from each.
    """
    import urllib
    import urllib2
    from Bio import SwissProt

    url = urllib2.urlopen("http://www.uniprot.org/uniprot/%s.txt"%UNIPROT)
    GENES = []
    for record in SwissProt.parse(url):
        fields = record.gene_name.split(";")
        GENES.append(fields[0].split("=")[1])
        if len(fields) > 2:
            for syno in fields[1].split("=")[1].split(","):
                GENES.append("".join(syno.split()))
    return GENES
def count_genes_with_EXP_old(fh_sprot, taxon_id, EXP_default=set([])):
    """Count records with EXP-supported GO annotations per ontology.

    Only records of ``fh_sprot`` whose taxonomy ids include ``taxon_id``
    (such as 559292 for yeast) are considered. A record counts for an
    ontology when one of its GO cross-references in that ontology has an
    evidence code listed in ``EXP_default``.

    Returns:
        ``(exp_bpo_ct, exp_cco_ct, exp_mfo_ct)``: counts for the ``P``,
        ``C`` and ``F`` ontologies.
    """
    exp_bpo_ct = 0
    exp_cco_ct = 0
    exp_mfo_ct = 0

    for rec in sp.parse(fh_sprot):
        if taxon_id not in rec.taxonomy_id:
            continue

        # Ontology letters for which an EXP evidence code was seen.
        seen = set()
        for crossRef in rec.cross_references:
            if crossRef[0] != 'GO':
                continue
            evidence_code = crossRef[3].split(':')[0]
            ontology_letter = crossRef[2][0].upper()
            if evidence_code in EXP_default and ontology_letter in ('P', 'C', 'F'):
                seen.add(ontology_letter)
            # Nothing more to learn once all three are covered.
            if len(seen) == 3:
                break

        if 'P' in seen:
            exp_bpo_ct += 1
        if 'C' in seen:
            exp_cco_ct += 1
        if 'F' in seen:
            exp_mfo_ct += 1
    return (exp_bpo_ct, exp_cco_ct, exp_mfo_ct)
def count_GOterms_with_EXP(fh_sprot, taxon_id, EXP_default=set([]),
                           max_genes=21):
    '''
    Extract the distinct GO terms of each gene that are validated by any of
    the experimental evidence codes.

    For every record parsed from fh_sprot whose taxonomy_id contains
    taxon_id (such as 559292 for yeast), one set of GO terms is built per
    ontological category and stored under the record's first accession in
    the dictionary of that category.  A record that matches the taxon gets
    an entry in all three dictionaries, even when some of its sets are empty.

    For each 'GO' cross reference, element 1 is stored as the term, the
    evidence code is the part of element 3 before the first ':' and the
    category is the first character of element 2 ('F', 'P' or 'C',
    case-insensitive).

    Parameters:
        fh_sprot: handle passed to sp.parse.
        taxon_id: value tested for membership in rec.taxonomy_id.
        EXP_default: collection of accepted evidence codes (read only).
        max_genes: stop after this many matching genes have been stored.
            The default of 21 reproduces the cap that used to be hard-coded
            here; pass None to process the whole file.

    Returns:
        (mfo_terms, bpo_terms, cco_terms): THREE OrderedDicts mapping
        accession -> set of GO terms.
    '''
    mfo_terms = OrderedDict()
    bpo_terms = OrderedDict()
    cco_terms = OrderedDict()

    # A non-positive cap means no gene may be stored at all.
    if max_genes is not None and max_genes <= 0:
        return (mfo_terms, bpo_terms, cco_terms)

    count = 0
    for rec in sp.parse(fh_sprot):
        # SELECT records that are related to a specific
        # taxon_id such as 559292 for yeast:
        if taxon_id not in rec.taxonomy_id:
            continue
        protName = rec.accessions[0]
        # One set of GO terms per ontology letter:
        terms = {'F': set(), 'P': set(), 'C': set()}
        # Go over the list of DB cross references:
        for crossRef in rec.cross_references:
            # Consider only the cross_reference entries that
            # relate to GO DB:
            if crossRef[0] != 'GO':
                continue
            go_term = crossRef[1]
            evidence = crossRef[3].split(':')[0]
            aspect = crossRef[2][0]
            if evidence in EXP_default:
                bucket = terms.get(aspect.upper())
                if bucket is not None:
                    bucket.add(go_term)
        mfo_terms[protName] = terms['F']
        bpo_terms[protName] = terms['P']
        cco_terms[protName] = terms['C']
        count += 1
        if max_genes is not None and count >= max_genes:
            break
    return (mfo_terms, bpo_terms, cco_terms)
def check_sprot_format(fh_sprot):
    """
    Check whether the file behind the handle fh_sprot can be parsed as
    UniProtKB/Swissprot format.

    Only the first record is pulled from the parser.  If that raises an
    error, the method returns False; otherwise it returns True.  A handle
    from which the parser yields no record at all therefore also gives
    True.  The handle is not rewound after the check.
    """
    iter_handle = sp.parse(fh_sprot)  # sp.parse method returns a generator
    try:
        # Fetching one record is enough to trigger the parser's validation.
        next(iter(iter_handle), None)
    except Exception:
        # Only real errors count as "wrong format"; KeyboardInterrupt and
        # SystemExit must keep propagating instead of being reported as a
        # format failure.
        return False
    return True
def count_genes_with_EXP(fh_sprot, taxon_id, EXP_default=set([])):
    """Count genes of one taxon with EXP-supported GO annotations.

    Every record parsed from ``fh_sprot`` whose ``taxonomy_id`` contains
    ``taxon_id`` (such as 559292 for yeast) is inspected.  For each of its
    'GO' cross references, the evidence code is the part of element 3
    before the first ':' and the ontology is the first character of
    element 2 ('F', 'P' or 'C', case-insensitive).  A gene adds one to a
    category when at least one of its annotations in that category has an
    evidence code in ``EXP_default``.

    Returns:
        dict: gene counts under the keys 'MFO', 'BPO' and 'CCO'.
    """
    category_of_aspect = {'F': 'MFO', 'P': 'BPO', 'C': 'CCO'}
    gene_count = {'MFO': 0, 'BPO': 0, 'CCO': 0}

    for entry in sp.parse(fh_sprot):
        # Skip records that do not belong to the requested taxon.
        if taxon_id not in entry.taxonomy_id:
            continue

        # Categories in which this gene has experimental evidence.
        found = set()
        for xref in entry.cross_references:
            # Only cross references into the GO database are relevant.
            if xref[0] == 'GO':
                evidence = xref[3].split(':')[0]
                aspect = xref[2][0]
                if evidence in EXP_default:
                    category = category_of_aspect.get(aspect.upper())
                    if category is not None:
                        found.add(category)
            # All three categories confirmed: the remaining cross
            # references cannot change the outcome for this gene.
            if len(found) == 3:
                break

        for category in found:
            gene_count[category] += 1

    return gene_count
def read_sprot_dat(sprot_dat_file, seq_dict):
    """Fill ``seq_dict`` with annotations parsed from a SwissProt .dat file.

    Every accession of every parsed record is looked up in ``seq_dict``;
    for each accession that is already a key, the value is replaced by a
    dictionary of annotations taken from that record.  Accessions that are
    not keys of ``seq_dict`` are ignored, so no new keys are added.
    Progress and the final size of ``seq_dict`` are written to stderr.

    Args:
        sprot_dat_file: path of the flat file (e.g. uniprot_sprot.dat).
        seq_dict: dict keyed by accession; modified in place.

    Returns:
        The same ``seq_dict`` object, with matched entries mapped to dicts
        holding the keys 'organism', 'EC', 'gene_name', 'OLN', 'ORF', 'GO',
        'KW', 'full_name', 'tax_id' and 'lineage'.
    """
    num_record = 0
    # Use Bio.SwissProt to parse the file; the with-block closes the handle
    # when parsing finishes or fails.
    with open(sprot_dat_file) as sprot_handle:
        for record in SwissProt.parse(sprot_handle):
            for seqID in record.accessions:
                if seqID not in seq_dict:
                    continue
                num_record += 1
                if num_record % 10000 == 0:
                    sys.stderr.write("{} records read so far\n".format(num_record))
                # GO cross references with the first three characters
                # (the 'GO:' prefix) removed, e.g. 'GO:0031012' -> '0031012'
                go_terms = [i[1][3:] for i in record.cross_references if i[0] == 'GO']
                organism = record.organism  # organism name
                # taxonomic classification, e.g. ['Viruses',
                # 'dsDNA viruses, no RNA stage', 'Iridoviridae', 'Chloriridovirus']
                lineage = record.organism_classification
                tax_id = record.taxonomy_id[0]  # taxonomy id, e.g. '345201'
                # GN line: gene names, ordered locus names, and ORF names
                gene_name, OLN, ORF = parse_GN(record.gene_name)
                # DE line with descriptive information:
                # RecName, AltName (Full=, Short=, EC=, ...)
                full_name, EC = parse_DE(record.description)
                # map the accession to its annotation dictionary
                seq_dict[seqID] = {'organism': organism, 'EC': EC,
                                   'gene_name': gene_name, 'OLN': OLN,
                                   'ORF': ORF, 'GO': go_terms,
                                   'KW': record.keywords,
                                   'full_name': full_name, 'tax_id': tax_id,
                                   'lineage': lineage}
    sys.stderr.write("\nnumber of sequences is {}\n".format(len(seq_dict)))
    return seq_dict
def UNIPROT_CHAIN_LIMITS(UNIPROT_ID):
    """Return the limits of the mature protein numbering of a UniProt entry.

    The entry is downloaded as a flat file from uniprot.org and parsed with
    Bio.SwissProt.  The second and third elements of every feature whose
    first element is "CHAIN" are collected; the result is the smallest and
    the largest of those values.  When the record has no CHAIN feature, or
    when any collected value is not made of digits only, the whole sequence
    (1 .. record.sequence_length) is returned instead.

    Args:
        UNIPROT_ID: identifier interpolated into the uniprot.org URL.

    Returns:
        list: [CHAIN_START, CHAIN_END].  If several records are parsed, the
        limits of the last one are returned.

    Raises:
        ValueError: if no record could be parsed from the downloaded page.
    """
    import contextlib
    import urllib2
    from Bio import SwissProt

    limits = None
    request_url = "http://www.uniprot.org/uniprot/%s.txt" % UNIPROT_ID
    # closing() releases the HTTP response even when parsing raises.
    with contextlib.closing(urllib2.urlopen(request_url)) as page:
        for record in SwissProt.parse(page):
            chain_values = []
            for feature in record.features:
                if feature[0] == "CHAIN":
                    chain_values += [str(feature[1]), str(feature[2])]
            if chain_values and all(value.isdigit() for value in chain_values):
                positions = [int(value) for value in chain_values]
                limits = [min(positions), max(positions)]
            else:
                # No CHAIN feature, or a non-numeric boundary: fall back
                # to the full sequence.
                limits = [1, record.sequence_length]

    if limits is None:
        # Without this check the return below would fail with an
        # UnboundLocalError that says nothing about the cause.
        raise ValueError("No SwissProt record found for %s" % UNIPROT_ID)
    return limits