def download_sequences(accessions):
    """Fetch the sequence of every accession from ExPASy.

    Args:
        accessions: iterable of Swiss-Prot accessions or entry names.

    Returns:
        dict mapping each accession to the ``sequence`` of its record.
    """
    records = {}
    for accession in accessions:
        handle = ExPASy.get_sprot_raw(accession)
        try:
            records[accession] = SwissProt.read(handle).sequence
        finally:
            # Release the network connection even if parsing fails.
            handle.close()
    return records
def access_sequence(accession):
    """Return the sequence of one Swiss-Prot entry.

    Args:
        accession: Swiss-Prot accession or entry name.

    Returns:
        The record's sequence, or ``None`` (after printing a warning) when
        the downloaded data cannot be parsed as a Swiss-Prot record.
    """
    handle = ExPASy.get_sprot_raw(accession)
    try:
        record = SwissProt.read(handle)
    except ValueError:
        # The original caught the non-existent name ``ValueException`` and
        # then fell through to use an unbound ``record``.
        print("WARNING: Accession %s not found" % accession)
        return None
    finally:
        handle.close()
    return record.sequence
def parseBlast():
    """Parse ./output/blastOut.xml and collect keywords for each query.

    For every hit HSP whose e-value is below the threshold, the hit's
    protein title and the keywords of its Swiss-Prot record (fetched from
    ExPASy) are appended to the query's running keyword list.

    Returns:
        Tuple ``(blastHits, accessions)``: ``blastHits`` maps a query ID to
        a '; '-joined keyword string, ``accessions`` maps a query ID to the
        list of hit accessions that were looked up.
    """
    # NOTE(review): SOURCE was whitespace-collapsed; the nesting below
    # (in particular the level of ``break`` and of the blastHits
    # assignment) is reconstructed from the comments -- confirm against
    # the original file.
    # NOTE(review): result_handle and the ExPASy handles are never closed.
    result_handle = open("./output/blastOut.xml")
    blast_records = NCBIXML.parse(result_handle)
    E_VALUE_THRESH = 1
    blastHits = {}
    accessions = {}
    #Loop through each protein query results
    for blast_record in blast_records:
        keyword_list = []  #stores running keyword list
        # The query ID is the text after ':' in the first word of the query.
        queryID = blast_record.query.split()[0].split(':')[1]  #parse for the query protein ID
        #Loop through the hits associated with particular sequence
        for alignment in blast_record.alignments:
            for hsp in alignment.hsps:
                #Hit must have e-value < threshold to be considered
                if hsp.expect < E_VALUE_THRESH:
                    title = alignment.title  #title of hit
                    splittitle = title.split()
                    # Everything before the first 'OS' in the title.
                    raw_protein_title = title.split('OS')[0]  #specific keywords in title
                    # Drop the first two words of that prefix.
                    protein_title = " ".join(raw_protein_title.split()[2:])
                    keyword_list.append(protein_title)
                    accession = splittitle[1].split('|')[1]  #parse for the accession number
                    accessions.setdefault(queryID, []).append(accession)
                    handle = ExPASy.get_sprot_raw(accession)
                    record = SwissProt.read(handle)
                    keyword_list += record.keywords
                    keyword_string = '; '.join(keyword_list)
                    blastHits[queryID] = keyword_string
            break  #only take top hit for now
    return (blastHits, accessions)
def get_SwissProt(dict, accession):
    """Fetch one Swiss-Prot record and store it in ``dict``.

    Args:
        dict: mapping that receives ``accession -> record``.
        accession: Swiss-Prot accession or entry name.

    An ``urllib2.HTTPError`` is reported on stdout and otherwise ignored.
    """
    try:
        handle = ExPASy.get_sprot_raw(accession)
        try:
            record = SwissProt.read(handle)
        finally:
            handle.close()
        dict[accession] = record
    except urllib2.HTTPError:
        # ``except X, e`` / ``print x`` were Python-2-only syntax; these
        # forms behave the same on Python 2.6+ and parse on Python 3.
        print(accession + ": protein not found on UniProt . ")
def get_SwissProt(dict, accession):
    """Fetch one Swiss-Prot record and store it in ``dict``.

    Args:
        dict: mapping that receives ``accession -> record``.
        accession: Swiss-Prot accession or entry name.

    An ``urllib2.HTTPError`` is reported on stdout and otherwise ignored.
    """
    try:
        handle = ExPASy.get_sprot_raw(accession)
        try:
            record = SwissProt.read(handle)
        finally:
            handle.close()
        dict[accession] = record
    except urllib2.HTTPError:
        # ``except X, e`` / ``print x`` were Python-2-only syntax; these
        # forms behave the same on Python 2.6+ and parse on Python 3.
        print(accession + ": protein not found on UniProt . ")
def sequence_file(*args):
    '''Save the sequence of the entered UniProt code as a FASTA file.

    The record is downloaded from ExPASy, a FASTA header is built from its
    entry name, description and organism, and the user is asked where to
    save the file. An error window is shown when the code is empty or
    cannot be retrieved; in that case nothing is written.
    '''
    a = code.get()
    try:
        from Bio import ExPASy
        from Bio import SwissProt
        with ExPASy.get_sprot_raw(a) as handle:
            record = SwissProt.read(handle)
    except Exception:
        if a == "":
            open_window("No Code", "Please Insert an Uniprot Code",
                        "#FFC3C3", '200x30')
        else:
            open_window("No Valid Code", "Please Insert a valid Uniprot Code",
                        "#FFC3C3", '200x30')
        # Without a record there is nothing to save; the original fell
        # through and crashed on the unbound ``record``.
        return

    descrip = record.description.split(";")[0]
    # Keep only the text after "Full=" (leave it untouched when absent).
    _, marker, full_name = descrip.partition("Full=")
    if marker:
        descrip = full_name
    fasta_header = (">sp|" + a + "|" + record.entry_name + " " + descrip +
                    " OS=" + record.organism)

    filename = filedialog.asksaveasfilename(defaultextension='.fasta',
                                            filetypes=[("fasta", "*.fasta")])
    if not filename:
        # The save dialog was cancelled.
        return
    with open(filename, "w") as text_file:
        text_file.write(fasta_header + '\n')
        text_file.write(record.sequence)
def find_COG2(self):
    """Find records from uniprotIDs without use of keggIDs.

    Downloads the Swiss-Prot record for ``self.uprotID``, derives a query
    from its gene name and returns the body of the rest.genome.jp response.
    """
    handle = ExPASy.get_sprot_raw(self.uprotID)
    try:
        record = SwissProt.read(handle)
    finally:
        handle.close()
    # NOTE(review): str.strip() removes any of the characters "Name=;" from
    # both ends, not the literal prefix "Name=" -- gene names that begin or
    # end with one of those letters get clipped. Kept as-is because the
    # intended query format is not visible here; confirm and fix.
    query = record.gene_name.strip("Name=;")
    url_open = urllib.urlopen("http://rest.genome.jp/oc/?" + query)
    try:
        return url_open.read()
    finally:
        url_open.close()
def fetch_swp_expasy(uniprot_acc):
    """
    Fetch information on SwissProt accession (manually reviewed UniProt entry).

    http://biopython.org/DIST/docs/api/Bio.SwissProt.Record-class.html

    Parameters
    ----------
    uniprot_acc : str
        SwissProt accession or identifier.

    Returns
    -------
    tuple
        ``(attrib_names, swp_info_list)``: the names of the attributes
        collected and, in the same order, their values.
    """
    # Generate the record object for the SwissProt identifier.
    handle = ExPASy.get_sprot_raw(uniprot_acc)
    try:
        record = SwissProt.read(handle)
    finally:
        handle.close()

    # NOTE(review): 'data created' looks like a typo for 'date created';
    # left unchanged because callers may match on the exact label.
    attrib_names = [
        'accessions', 'data created', 'date created (ISO)', 'organism',
        'gene names', 'description', 'comments', 'keywords'
    ]
    created = record.created[0]
    swp_info_list = [
        record.accessions, created, dating(created), record.organism,
        record.gene_name, record.description, record.comments,
        record.keywords
    ]
    return (attrib_names, swp_info_list)
def main(input_string):
    """Print the GO biological-process names of one UniProt entry.

    Args:
        input_string: Swiss-Prot accession or entry name.
    """
    handle = ExPASy.get_sprot_raw(input_string)
    try:
        record = SwissProt.read(handle)
    finally:
        handle.close()
    for ref in record.cross_references:
        # Keep Gene Ontology references that describe a biological process.
        if ref[0] == 'GO' and ref[2].startswith('P:'):
            print(ref[2][2:])
def gen_uniprot_features_for_pdb(infile):
    """Print UniProt-derived features for each PDB domain listed in a file.

    Args:
        infile: path of a tab-separated file with three columns per line:
            PDB domain, a count (unused) and '|'-separated UniProt IDs.

    For every UniProt entry that cross-references the PDB domain, one
    tab-separated line is printed: domain, UniProt ID, GO ids, InterPro id,
    EvolutionaryTrace id, organism and subcellular-location comment.
    """
    with open(infile, 'r') as lines:
        for line in lines:
            (pdb_dom, count, uniprot_ids) = line.replace('\n', '').split('\t')
            for uniprot_id in uniprot_ids.split('|'):
                handle = ExPASy.get_sprot_raw(uniprot_id)
                try:
                    data = SwissProt.read(handle).__dict__
                finally:
                    handle.close()

                keep = False
                go = []
                interpro = ''
                evo_trace = ''
                for xref in data['cross_references']:
                    if xref[0] == 'GO':
                        go.append(xref[1])
                    if xref[0] == 'InterPro':
                        interpro = xref[1]
                    if xref[0] == 'EvolutionaryTrace':
                        evo_trace = xref[1]
                    if xref[0] == 'PDB' and xref[1].lower() == pdb_dom.lower():
                        keep = True
                if not keep:
                    # This entry does not reference the PDB domain.
                    continue

                organism = data['organism']
                loc = ''
                for comment in data['comments']:
                    if comment.startswith('SUBCELLULAR LOCATION'):
                        loc = comment
                # print() with a single pre-formatted string behaves the
                # same on Python 2 and 3.
                print('%s\t%s\t%s\t%s\t%s\t%s\t%s' % (
                    pdb_dom, uniprot_id, '|'.join(go), interpro, evo_trace,
                    organism, loc))
def main(filename):
    """Print the GO biological processes of the UniProt ID stored in a file.

    Args:
        filename: path of a text file whose content is one UniProt ID.
    """
    with open(filename) as fin:
        my_seq = fin.read().strip()
    handle = ExPASy.get_sprot_raw(my_seq)
    try:
        record = SwissProt.read(handle)
    finally:
        handle.close()
    for f in record.cross_references:
        if f[0] == 'GO' and f[2].startswith('P'):
            # Split once so process names containing ':' stay intact.
            print(f[2].split(':', 1)[1])
def get_records(ids):
    """Return the sequences of the given Swiss-Prot entries, in order.

    Args:
        ids: iterable of Swiss-Prot accessions or entry names.

    Returns:
        list of ``record.sequence`` values, one per input ID.
    """
    records = []
    for accession in ids:
        handle = ExPASy.get_sprot_raw(accession)
        try:
            records.append(SwissProt.read(handle).sequence)
        finally:
            handle.close()
    return records
def main():
    """Solve the DBPR exercise for the ID in problem_datasets/rosalind_dbpr.txt.

    Writes the GO biological-process names of the entry to
    output/rosalind_dbpr_out.txt and prints them with the gene name.
    """
    # Read the UniProt ID from a txt file.
    with open('problem_datasets/rosalind_dbpr.txt', 'r') as infile:
        uni_id = infile.read().strip()

    # Retrieve the data from UniProt.
    raw_data = ExPASy.get_sprot_raw(uni_id)
    try:
        record = SwissProt.read(raw_data)  # use SwissProt.parse for multiple proteins
    finally:
        raw_data.close()

    # Collect the relevant information. Only Gene Ontology references are
    # considered; the original tested the third field of every database.
    go = [
        ref[2][2:] for ref in record.cross_references
        if ref[0] == 'GO' and ref[2].startswith('P:')
    ]

    # Output answer.
    with open('output/rosalind_dbpr_out.txt', 'w') as outfile:
        outfile.write('\n'.join(go))

    # Optional: Print answer and gene ID/name.
    # Drops the first five characters of the first word of gene_name
    # (presumably the "Name=" prefix -- confirm).
    name = record.gene_name.split(' ')[0][5:]
    print('Gene:\n', name, ' (UniProt ID = ', uni_id,
          ')\n\nBiological Processes:\n', '\n'.join(go), sep='')
def swissprot_search():
    """Print the Swiss-Prot record of every accession in seq_accession.txt.

    The first line of output/seq_accession.txt is skipped; each following
    non-blank line is treated as one accession.
    """
    with open('output/seq_accession.txt') as f:
        f.readline()  # first line is not an accession (was read into `db`)
        for line in f:
            # Lines keep their trailing newline; strip it before querying.
            accession = line.strip()
            if not accession:
                continue
            handle = ExPASy.get_sprot_raw(accession)
            try:
                record = SwissProt.read(handle)
            finally:
                handle.close()
            print(record)
def get(self, id):
    """Download and parse one Swiss-Prot entry from the ExPASy database.

    Args:
        id: accession number (or entry name) stored on the Swiss-Prot site.

    Returns:
        The parsed Swiss-Prot record.
    """
    handle = ExPASy.get_sprot_raw(id)
    try:
        return SwissProt.read(handle)
    finally:
        handle.close()
def getgo(id):
    """Print the GO biological-process names of a UniProt entry, one per line.

    Args:
        id: Swiss-Prot accession or entry name.
    """
    handle = ExPASy.get_sprot_raw(id)
    try:
        record = SwissProt.read(handle)
    finally:
        handle.close()
    go = [
        # Split once so process names containing ':' stay intact.
        r[2].split(":", 1)[1] for r in record.cross_references
        if r[0] == "GO" and r[2].startswith("P")
    ]
    print("\n".join(go))
def main(id):
    """Print the GO biological-process names of a UniProt entry.

    Args:
        id: Swiss-Prot accession or entry name.
    """
    handle = ExPASy.get_sprot_raw(id)
    try:
        record = SwissProt.read(handle)
    finally:
        handle.close()
    for cr in record.cross_references:
        if cr[0] == "GO":
            # "P:name" -> aspect "P", name (which may itself contain ':').
            aspect, _, name = cr[2].partition(":")
            if aspect == "P":
                print(name)
def main(argv):
    """Read a UniProt ID from stdin and print its GO biological processes.

    Args:
        argv: command-line arguments (not used).
    """
    # input() reads stdin
    handle = ExPASy.get_sprot_raw(input().strip())
    try:
        record = SwissProt.read(handle)  # use SwissProt.parse for multiple proteins
    finally:
        handle.close()
    for ref in record.cross_references:
        if ref[0] == 'GO' and ref[2].startswith('P:'):
            print(ref[2][2:])
def main():
    """Write the GO biological processes of the ID in "dbpr" to "dbpr.out".

    The first line of the file "dbpr" is the UniProt ID. The process names
    are written one per line. Unlike the original, ``sys.stdout`` is not
    rebound, so later prints in the same process still reach the console.
    """
    with open("dbpr") as f:
        handle = ExPASy.get_sprot_raw(f.readline().strip())
    try:
        record = SwissProt.read(handle)
    finally:
        handle.close()
    go_terms = [x[2] for x in record.cross_references if x[0] == 'GO']
    processes = [term[2:] for term in go_terms if term.startswith('P')]
    with open("dbpr.out", "w") as out:
        # Same bytes the redirected print statement used to emit.
        out.write("\n".join(processes) + "\n")
def MouseHomolog(self, dfs):
    """Flag peptides that also occur in the mouse homolog of each protein.

    For every accession in ``self.accs`` the mouse entry is looked up by
    replacing the species suffix of the entry name with ``_MOUSE``. The
    data frame at the same position in ``dfs`` gets a 'Mouse' column holding
    'True'/'False' strings (peptide found in the mouse sequence or not).
    All annotated frames are concatenated and written to
    ``MouseHomologPeptides.xlsx`` in ``self.out_folder``.

    Args:
        dfs: sequence of data frames, parallel to ``self.accs``, each with a
            ``Sequence`` column.
    """
    print('\nFinding mouse homologs')
    new_dfs = []
    # enumerate keeps accession and data frame aligned even when an
    # accession is skipped.
    for ind, acc in enumerate(self.accs):
        try:
            handle = ExPASy.get_sprot_raw(acc)
            try:
                name = SwissProt.read(handle).entry_name
            finally:
                handle.close()
        except Exception:
            print('\nNo entry for', acc, ',continuing')
            continue

        try:
            mname = name.split('_')[0] + '_MOUSE'
            mhandle = ExPASy.get_sprot_raw(mname)
            try:
                mseq = SwissProt.read(mhandle).sequence
            finally:
                mhandle.close()
            print(f'\nFound mouse homolog for {name}: {mname}')
        except Exception:
            print(f'\nNo mouse gene entry for {acc}-{name}, continuing')
            continue

        df = dfs[ind]
        mcol = []
        for pepseq in df.Sequence:
            print(pepseq)
            mcol.append('True' if str(pepseq) in mseq else 'False')
        df['Mouse'] = mcol
        new_dfs.append(df)

    if not new_dfs:
        # pd.concat raises ValueError on an empty list.
        print('\nNo mouse homologs found, nothing to write')
        return
    df_final = pd.concat(new_dfs, sort=True)
    df_final.to_excel(self.out_folder + '/' + 'MouseHomologPeptides.xlsx',
                      index=True)
def protfunction(query_proteins):
    """Return the FUNCTION annotation of each protein.

    Args:
        query_proteins: iterable of protein names or ids.

    Returns:
        list of ``(protein, function_text)`` tuples; the text is empty when
        the record has no FUNCTION comment.
    """
    prefix = "FUNCTION:"
    function_list = []
    for prot in query_proteins:
        with ExPASy.get_sprot_raw(prot) as handle:
            record = SwissProt.read(handle)
        # The original sliced comments[0][10:], which raises on records
        # without comments and returns the wrong text when the first
        # comment is not the FUNCTION one.
        function = ""
        for comment in record.comments:
            if comment.startswith(prefix):
                function = comment[len(prefix):].lstrip()
                break
        function_list.append((prot, function))
    return function_list
def print_bio_process(file):
    """Print the GO biological processes of the UniProt ID read from ``file``.

    Args:
        file: open text file object whose content is one UniProt ID.
    """
    Uniprot_id = file.read().rstrip()
    handle = ExPASy.get_sprot_raw(Uniprot_id)
    try:
        rec = SwissProt.read(handle)
    finally:
        handle.close()
    bio_process = [
        i[2][2:] for i in rec.cross_references
        if i[0] == 'GO' and i[2].startswith('P')
    ]
    print('\n'.join(bio_process))
def dbpr():
    """Print the GO biological processes of the ID in rosalind_dbpr.txt."""
    with open("rosalind_dbpr.txt") as infile:
        uniprot_id = infile.read().strip()
    handle = ExPASy.get_sprot_raw(uniprot_id)
    try:
        record = SwissProt.read(handle)
    finally:
        handle.close()
    # Print the list of biological processes.
    for ref in record.cross_references:
        if ref[0] == 'GO' and ref[2].startswith('P:'):
            print(ref[2][2:])
def main(argv):
    """Print the GO biological processes of the UniProt ID in ``argv[0]``.

    Args:
        argv: argument list; ``argv[0]`` is the path of a file whose line
            read by ``files.read_line`` is the UniProt ID.
    """
    line = files.read_line(argv[0])
    handle = ExPASy.get_sprot_raw(line)
    try:
        record = SwissProt.read(handle)
    finally:
        handle.close()
    # Test the prefix, not a substring: "'P:' in term" also matched terms
    # such as "F:ATP:ADP antiporter activity".
    names = [
        ref[2][2:] for ref in record.cross_references
        if ref[0] == 'GO' and ref[2].startswith('P:')
    ]
    print('\n'.join(names))
def DBPR(id):
    """Return the GO biological-process names of a UniProt entry.

    Args:
        id: Swiss-Prot accession or entry name.

    Returns:
        list of process names (GO terms whose aspect prefix is 'P').
    """
    handle = ExPASy.get_sprot_raw(id)
    try:
        record = SwissProt.read(handle)  # use SwissProt.parse for multiple proteins
    finally:
        handle.close()
    GO = []
    for item in record.cross_references:
        if item[0] != 'GO':
            continue
        # partition keeps names that themselves contain ':' intact.
        aspect, _, name = item[2].partition(':')
        if aspect == 'P':
            GO.append(name)
    return GO
def find_function(prot):
    """Return the GO biological-process names of a protein.

    Each matching cross-reference tuple is also printed.

    Args:
        prot: Swiss-Prot accession or entry name.

    Returns:
        list of process names.
    """
    handle = ExPASy.get_sprot_raw(prot)
    try:
        record = SwissProt.read(handle)  # Use SwissProt.parse for multiple proteins
    finally:
        handle.close()
    functions = []
    for ref in record.cross_references:
        if ref[0] == 'GO' and ref[2].startswith('P'):
            print(ref)
            functions.append(ref[2][2:])
    return functions
def acession(self):
    """Fetch the Swiss-Prot record of every ID in ``self.ids``.

    IDs equal to 'ND' are not looked up; the string 'ND' is stored in their
    place. The result is kept in ``self.rec`` and returned.

    Returns:
        list parallel to ``self.ids`` holding records or 'ND'.
    """
    self.rec = []
    for ide in self.ids:
        if ide == 'ND':
            self.rec.append('ND')
            continue
        results = ExPASy.get_sprot_raw(ide)
        try:
            self.rec.append(SwissProt.read(results))
        finally:
            results.close()
    return self.rec
def BiologicalProcesses(UniProtID):
    """Return the GO biological-process names of an entry, newline-joined.

    Args:
        UniProtID: Swiss-Prot accession or entry name.

    Returns:
        str with one process name per line.
    """
    Handle = ExPASy.get_sprot_raw(UniProtID)
    try:
        Record = SwissProt.read(Handle)
    finally:
        Handle.close()
    Processes = []
    for reference in Record.cross_references:
        if "GO" not in reference:
            continue
        for field in reference:
            if field.startswith("P:"):
                # Drop only the "P:" prefix; cutting at the last ':' lost
                # the start of names that contain a colon.
                Processes.append(field[2:])
    return "\n".join(Processes)
def fetch(acc):
    '''Download one record from UniProt and parse it.

    Input:
        acc: accession code of the record

    Return:
        the parsed SwissProt record
    '''
    import io

    base_url = 'http://www.uniprot.org/uniprot/'
    response = urllib.request.urlopen(base_url + acc + '.txt')
    try:
        # urlopen yields bytes, while the Swiss-Prot parser reads text
        # lines, so decode the stream first.
        handle = io.TextIOWrapper(response, encoding='utf-8')
        return SwissProt.read(handle)
    finally:
        response.close()
def eachget(self, id_list):
    """Fetch the Swiss-Prot record of every ID that can be retrieved.

    IDs that fail to download or parse are reported on stdout and skipped.

    Args:
        id_list: iterable of Swiss-Prot accessions or entry names.

    Returns:
        list of the records that were retrieved, in input order.
    """
    a = []
    for eachid in id_list:
        try:
            handle = ExPASy.get_sprot_raw(eachid)
            try:
                a.append(SwissProt.read(handle))
            finally:
                handle.close()
        except Exception:
            # Best effort: report and move on. ``Exception`` rather than a
            # bare except so Ctrl-C / SystemExit still propagate.
            print('something wrong with this id:%s\n' % eachid)
    return a
def geneontology(query_proteins):
    """Collect the GO biological processes of several proteins.

    Args:
        query_proteins: iterable of protein names or ids.

    Returns:
        list of ``(protein, process_name)`` tuples, grouped by protein in
        input order.
    """
    hits = []
    for protein in query_proteins:
        with ExPASy.get_sprot_raw(protein) as source:
            entry = SwissProt.read(source)
        hits.extend(
            (protein, xref[2].split(":")[1])
            for xref in entry.cross_references
            if xref[0] == "GO" and xref[2].startswith("P")
        )
    return hits
def get_keywords(lookup):
    """Download and parse the Swiss-Prot record for ``lookup``.

    Exits the process with status 1 when the download fails or the data
    cannot be parsed as a Swiss-Prot record.

    Args:
        lookup: Swiss-Prot accession or entry name.
    """
    # NOTE(review): the visible code parses the record but returns nothing;
    # confirm whether keywords were meant to be returned.
    try:
        handle = ExPASy.get_sprot_raw(lookup)
    except Exception:
        print("Error in ExPASy")
        sys.exit(1)
    try:
        record = SwissProt.read(handle)
    except ValueError as error:
        print(error)
        sys.exit(1)
    finally:
        handle.close()
def get_bio_processes(protein):
    """Return the GO biological-process names of a protein.

    Args:
        protein: Swiss-Prot accession or entry name.

    Returns:
        list of process names.
    """
    handle = ExPASy.get_sprot_raw(protein)
    try:
        record = SwissProt.read(handle)
    finally:
        handle.close()
    return [
        # Split once so names containing ':' stay intact.
        ref[2].split(":", 1)[1] for ref in record.cross_references
        if ref[0] == 'GO' and ref[2].startswith('P')
    ]
def records(self):
    """Return a dictionary of ID and swissprot records from query.

    IDs that cannot be retrieved are reported on stdout and left out.
    """
    record_dict = {}
    for i in self.IDs():
        try:
            handle = ExPASy.get_sprot_raw(i)
            try:
                record_dict[i] = SwissProt.read(handle)
            finally:
                handle.close()
        # A tuple catches both types; ``except HTTPError, AssertionError``
        # caught only HTTPError and bound it to the name AssertionError.
        except (HTTPError, AssertionError):
            print("there was a problem finding uniprotID {} \n try "
                  "Records_fromfile-method".format(i))
    # The docstring promises the dictionary; the visible code never
    # returned it.
    return record_dict
def accessionSearch():
    """Look up the accession typed by the user and run the motif search.

    Reads the accession from ``entryAccession``, downloads its Swiss-Prot
    record and passes the sequence to ``motifFinder``. On any lookup or
    parse error a message is shown in ``lblResults`` instead.
    """
    try:
        sInput = entryAccession.get()  # get text field contents
        handle = ExPASy.get_sprot_raw(sInput)
        try:
            record = SwissProt.read(handle)
        finally:
            # Close the handle even when parsing fails.
            handle.close()
    except Exception:
        # ``Exception`` instead of a bare except so that SystemExit and
        # KeyboardInterrupt are not swallowed.
        lblResults.configure(
            text="invalid accession code!\n please try again...")
    else:
        # Otherwise, submit the sequence to the motifFinder function.
        motifFinder(record.sequence)
def main(protein_id):
    """Return the GO biological processes of an entry, one per line.

    Every cross-reference tuple is also printed while scanning.

    Args:
        protein_id: Swiss-Prot accession or entry name.

    Returns:
        str with the process names joined by newlines.
    """
    handle = ExPASy.get_sprot_raw(protein_id)
    try:
        record = SwissProt.read(handle)  # use SwissProt.parse for multiple proteins
    finally:
        handle.close()
    names = []
    for r in record.cross_references:
        print(r)
        if r[0] == "GO":
            aspect, _, name = r[2].partition(":")
            if aspect == 'P':
                names.append(name)
    # Equivalent to the original "name\n" accumulation followed by strip().
    return "\n".join(names).strip()
def getDataFromProt(protid):
    """Download a UniProt entry, cache it as ``<protid>.dat`` and summarise it.

    Args:
        protid: UniProt accession used in the URL and as the file name stem.

    Returns:
        Tuple ``(name, id, locale, status, fmol, bio, function, length, ec)``
        built from the values returned by ``getInfoTxt``.
    """
    url = "http://www.uniprot.org/uniprot/" + protid + ".txt"
    response = urlopen(url)
    try:
        txt = response.read()
    finally:
        response.close()

    filename = protid + ".dat"
    with open(filename, "w") as out:
        out.write(txt.decode('utf-8'))
    with open(filename) as handle:
        parsed = SwissProt.read(handle)

    (status, locale, fmol, bio, name, entry_id, function, length,
     ec) = getInfoTxt(parsed)
    return name, entry_id, locale, status, fmol, bio, function, length, ec
def run(user_input="""Q5SLP9"""):
    """Print and return the GO biological processes of a UniProt entry.

    Args:
        user_input: UniProt ID; surrounding whitespace is ignored.

    Returns:
        str with the process names joined by newlines. The list of names
        and the joined string are both printed.
    """
    uniprot_id = user_input.strip()
    handle = ExPASy.get_sprot_raw(uniprot_id)
    try:
        record = SwissProt.read(handle)
    finally:
        handle.close()
    process_names = [
        str(ref[2])[2:] for ref in record.cross_references
        if ref[0] == "GO" and str(ref[2]).startswith("P:")
    ]
    print(process_names)
    result = "\n".join(process_names)
    print(result)
    return result
def _shell_lookup(args):
    """Build the text printed when the script is run from the command line.

    Example:
        $ python unifetch.py -a A6XGL2 -ncis
        Name: A6XGL2_HUMAN
        Data class: Unreviewed
        TaxID: 9606
        Sequence: MALWMRLLPLLALLALWGPDPAAAFVNQHLCGSHLVEALYLVCGERGFFYTPKTRRE [ ... ]

    Returns 'Not found.' when the accession is missing from the database,
    the whole decompressed entry when no field flag is set, and otherwise
    one "Title: value" line per requested field.
    """
    with _gnu.open(args.database) as database:
        blob = database.get(args.accession, None)

    # Unknown accession.
    if blob is None:
        return 'Not found.'

    # Field title -> whether the user asked for it (output order).
    requested = {
        'Name': args.name,
        'Date': args.date,
        'Data class': args.dataclass,
        'Organism': args.organism,
        'Taxonomy': args.taxonomy,
        'TaxID': args.taxid,
        'Sequence': args.sequence,
    }

    # Nothing in particular requested: hand back the entire entry.
    if not any(requested.values()):
        return _gzip.decompress(blob).decode()

    # Otherwise parse the entry and pick out the relevant parts.
    record = _SwissProt.read(_io.BytesIO(_gzip.decompress(blob)))
    values = {}
    values['Name'] = record.entry_name
    values['Date'] = record.created[0]
    values['Data class'] = record.data_class
    values['Organism'] = record.organism
    species = get_species(record)
    lineage = record.organism_classification + ([species] if species else [])
    values['Taxonomy'] = ';'.join(lineage)
    values['TaxID'] = ';'.join(record.taxonomy_id)
    values['Sequence'] = record.sequence

    return '\n'.join(
        '{}: {}'.format(title, values[title])
        for title, wanted in requested.items() if wanted
    )
def download_entry(self, accession):
    """Download a Swiss-Prot entry and cache it if it is from ``self.organism``.

    Args:
        accession: Swiss-Prot accession or entry name.

    Returns:
        The parsed record, also stored in ``self.records[accession]``.

    Raises:
        KeyError: the entry cannot be retrieved or parsed, or its organism
            does not contain ``self.organism``.
    """
    try:
        handle = ExPASy.get_sprot_raw(accession)
        try:
            record = SwissProt.read(handle)
        finally:
            handle.close()
    except Exception:
        # ``Exception`` instead of a bare except so Ctrl-C still works.
        raise KeyError('{}'.format(accession))

    record_org = record.organism.strip().lower()
    if self.organism not in record_org:
        message = '{} ortholog of {} not found.'.format(self.organism,
                                                        accession)
        print(message)
        raise KeyError(message)
    self.records[accession] = record
    return record
def main():
    """Print the GO biological processes of the ID given in the input file."""
    # Grab our input id value.
    uniprot_id = get_uniprot_id_from_file(arguments['<input>'])

    # Get a handle on the data for the uniprot id and parse it.
    handle = ExPASy.get_sprot_raw(uniprot_id)
    try:
        record = SwissProt.read(handle)
    finally:
        handle.close()

    # Process out the stuff of interest, GO values in this case.
    for ref in record.cross_references:
        if ref[0] != 'GO':
            continue
        # partition never raises: ``pre, val = term.split(':')`` failed
        # with ValueError when the process name itself contained ':'.
        pre, _, val = ref[2].partition(':')
        if pre == 'P':
            print(val)
def main(fichier):
    """Print the GO biological processes of the ID on a file's first line.

    Args:
        fichier: path of a text file whose first line is a UniProt ID.
    """
    from Bio import ExPASy
    from Bio import SwissProt

    with open(fichier, 'r') as f:
        fline = f.readline().strip()
    handle = ExPASy.get_sprot_raw(fline)
    try:
        record = SwissProt.read(handle)
    finally:
        handle.close()
    go = []
    for i in record.cross_references:
        if i[0] == 'GO' and i[2].startswith('P'):
            # Slice off the two-character "P:" prefix. lstrip('P:') strips
            # every leading 'P' or ':' and so mangled names such as
            # "P:P-body assembly".
            go.append(i[2][2:])
    print('\n'.join(go))
def main():
    """Solve the DBPR exercise for the ID in problem_datasets/rosalind_dbpr.txt.

    Writes the GO biological-process names of the entry to
    output/rosalind_dbpr_out.txt and prints them with the gene name.
    """
    # Read the UniProt ID from a txt file.
    with open('problem_datasets/rosalind_dbpr.txt', 'r') as infile:
        uni_id = infile.read().strip()

    # Retrieve the data from UniProt.
    raw_data = ExPASy.get_sprot_raw(uni_id)
    try:
        record = SwissProt.read(raw_data)  # use SwissProt.parse for multiple proteins
    finally:
        raw_data.close()

    # Collect the relevant information. Only Gene Ontology references are
    # considered; the original tested the third field of every database.
    go = [
        ref[2][2:] for ref in record.cross_references
        if ref[0] == 'GO' and ref[2].startswith('P:')
    ]

    # Output answer.
    with open('output/rosalind_dbpr_out.txt', 'w') as outfile:
        outfile.write('\n'.join(go))

    # Optional: Print answer and gene ID/name.
    # Drops the first five characters of the first word of gene_name
    # (presumably the "Name=" prefix -- confirm).
    name = record.gene_name.split(' ')[0][5:]
    print('Gene:\n', name, ' (UniProt ID = ', uni_id,
          ')\n\nBiological Processes:\n', '\n'.join(go), sep='')
def snp_uniprot(uniprotname, selection='(all)', label=1, name='', quiet=0):
    '''
DESCRIPTION

    Selects all UniProt annotated nsSNPs (natural variants) in given
    structure. Does a sequence alignment of UniProt sequence and PDB
    sequence.

USAGE

    snp_uniprot uniprotname [, selection [, label [, name [, quiet]]]]

ARGUMENTS

    uniprotname = string: UniProt reference (like HBB_HUMAN or P68871)

    selection = string: atom selection

    label = 0 or 1: Label CA atoms of nsSNPs with mutation {default: 1}

    name = string: name of new selection {default: nsSNPs}

EXAMPLE

    fetch 3HBT
    snp_uniprot ACTG_HUMAN, chain A

SEE ALSO

    snp_ncbi
    '''
    from Bio import ExPASy
    from Bio import SwissProt

    handle = ExPASy.get_sprot_raw(uniprotname)
    try:
        record = SwissProt.read(handle)
    finally:
        # The record is fully parsed; release the connection before the
        # alignment/selection work in snp_common.
        handle.close()
    snp_common(record, selection, label, name, quiet)
records = KeyWList.parse(handle) codes = [] review = open("proteinas_uniprot.txt", "w") for record in records: review.write("\n"+record['ID']+"\n") review.write("\n"+record['DE']+"\n") codes.append(record['AC'][:-1])#remover ";" no final de cada código de acesso review.close() #análise individual das proteínas relevantes (baseado nos códigos desenvolvidos pelos grupos 10 e 7) f = open("analise_reviewed.txt","w") for code in codes: data = urllib.urlopen("http://www.uniprot.org/uniprot/" + code + ".txt") while True: try: record = SwissProt.read(data) for ref in record.references: f.write("\n\n****Informacao sobre a proteina %s****\n" %code) f.write("\n\nNome: %s\n" %record.entry_name) f.write("\nClasse: %s\n" %record.data_class) f.write("\nTipo de molecula: %s\n" %record.molecule_type) f.write("\nTamanho da sequencia: %s\n" %record.sequence_length) f.write("\nCodigo de Accesso: %s\n" %record.accessions) f.write("\nCriado: %s\n"% str(record.created)) f.write("\nAdaptacao da sequencia: %s\n" %str(record.sequence_update)) f.write("\nAdaptacao da anotacao: %s\n" %str(record.annotation_update)) f.write("\nDescricao: %s\n" %record.description) f.write("\nNome do gene: %s\n" %record.gene_name) f.write("\nOrganismo: %s\n" %record.organism) f.write("\nOrganelo: %s\n" %record.organelle) f.write("\nClassificacao do Organismo: %s\n" %record.organism_classification)
from Bio import ExPASy
from Bio import SwissProt

# Print the GO biological processes of one hard-coded UniProt entry.
id = "Q5SLP9"
handle = ExPASy.get_sprot_raw(id)
try:
    record = SwissProt.read(handle)
finally:
    handle.close()
for x in record.cross_references:
    # Restrict to Gene Ontology references; the original inspected the
    # third field of every database.
    if x[0] == 'GO' and x[2][0:2] == 'P:':
        print(x[2][2:])
#!/usr/bin/env python
import sys

from Bio import ExPASy
from Bio import SwissProt

if __name__ == '__main__':
    # The UniProt ID arrives on stdin.
    accession = sys.stdin.read().strip()
    entry = SwissProt.read(ExPASy.get_sprot_raw(accession))

    # Names of the GO terms whose aspect prefix is 'P'.
    process_names = []
    for xref in entry.cross_references:
        if xref[0] == 'GO' and xref[2].startswith('P'):
            process_names.append(xref[2].split(':')[1])
    print('\n'.join(process_names))
# http://rosalind.info/problems/dbpr/

from Bio import ExPASy, SwissProt

if __name__ == '__main__':
    # Read the UniProt ID of the protein.
    with open('rosalind_dbpr.txt') as infile:
        uniprot_id = infile.read().strip()

    # Download and parse its Swiss-Prot record.
    handle = ExPASy.get_sprot_raw(uniprot_id)
    try:
        record = SwissProt.read(handle)
    finally:
        handle.close()

    # Get the list of biological processes.
    processes = [r[2].split(':', 1)[1] for r in record.cross_references
                 if r[0] == 'GO' and r[2].startswith('P')]
    print('\n'.join(processes))
def protein_record(protein):
    """Return the SwissProt record of a protein with id protein."""
    handle = ExPASy.get_sprot_raw(protein)
    try:
        return SwissProt.read(handle)  # use SwissProt.parse for multiple proteins
    finally:
        handle.close()
""" BioPython + regular expression demo based on http://www.pasteur.fr/recherche/unites/sis/formation/python/ch11s04.html """ from Bio import SwissProt import re fd = open('ceru_human.sp') # file descriptor (handle) r = SwissProt.read(fd) # record from file print r.entry_name print r.sequence PS00079 = 'G.[FYW].[LIVMFYW].[CST].{8,8}G[LM]...[LIVMFYW]' # pattern for regexp p = re.compile(PS00079) # regular expression pattern object m = p.search(r.sequence) # matching string in sequence i = m.start() # index of start of match j = m.end() # index of end of match print i print j print r.sequence[i:j] # print a slice of the sequence
def get_swissrec(accession):
    """Download and parse the Swiss-Prot record for ``accession``.

    Returns:
        The parsed Swiss-Prot record.
    """
    handle = ExPASy.get_sprot_raw(accession)
    try:
        return SwissProt.read(handle)
    finally:
        handle.close()
#print "<br/>stringWithProteins:"+str(stringWithProteins) arrayWithProteins=stringWithProteins.split(","); #print "<br/>arrayWithProteins "+str(arrayWithProteins) #Recorremos el arrayWithProteins y las guardamos en la tabla proteins, si es que no estan. De cada proteina este o no este tenemos que guardar el id_protein para generar el string #con las id_proteinas que guardaremos en la tabla enzymes proteins mas adelante arrayWithIdProteins=[] for protein in arrayWithProteins: #obtenemos el proteinName para esta protein url="http://www.uniprot.org/uniprot/"+str(protein)+".txt" #print "<br/>"+url try: filehandle = urllib.urlopen(url) except: print "Location: "+str(redirectionKOcurated)+"error=UniprotConnectionProblem&idEvidence="+idEvidence+" \n\n" sys.exit() record = SwissProt.read(filehandle) #print dir(record) description=str(record.description) #En description podemos tener algo así: #RecName: Full=Aspartate aminotransferase, mitochondrial; Short=mAspAT; EC=2.6.1.1; AltName: Full=Fatty acid-binding protein; Short=FABP-1; AltName: Full=Glutamate oxaloacetate transaminase 2; AltName: Full=Plasma membrane-associated fatty acid-binding protein; Short=FABPpm; AltName: Full=Transaminase A; Flags: Fragment; #Nos quedamos con la primera parte. arrayNombres=description.split(";") proteinName=arrayNombres[0] #En proteinName ahora tenemos algo así: ##RecName: Full=Aspartate aminotransferase, mitochondrial #Tenemos que quitar la parte de RecName: Full= proteinName=proteinName.replace("RecName: Full=","") proteinName=proteinName.replace("SubName: Full=","") #Ya tenemos todos los datos para guardar la proteina filehandle.close() selectProtein="select id_protein, id_uniprot, protein_name from proteins where id_uniprot='"+str(protein)+"'"
print "*Using SequenceParser" test_handle = open(datafile) seq_record = SeqIO.read(test_handle, "swiss") test_handle.close() assert isinstance(seq_record, SeqRecord) print seq_record.id print seq_record.name print seq_record.description print repr(seq_record.seq) print "*Using RecordParser" test_handle = open(datafile) record = SwissProt.read(test_handle) test_handle.close() # test a couple of things on the record -- this is not exhaustive print record.entry_name print record.accessions print record.organism_classification print record.seqinfo print "***Features:" for feature in record.features: print feature print "***References:" for ref in record.references: print "authors:", ref.authors
PDB_subset_nochain = [x.split('_')[0] for x in PDB_subset] # parse the pdb headers for DBREF to uniprot pdb_to_uniprot = find_uniprot_in_pdb(PDB_subset_nochain, pdb_folder) #determine the uniprot references to fetch to_fetch = [] for entry in pdb_to_uniprot.keys(): for ref in pdb_to_uniprot[entry].values(): if ref not in to_fetch: to_fetch.append(ref) #fetch uniprot references as Record objects, then move them to a serializable dict uniprot_records = {} uniprot_failed = [] for ref in to_fetch: try: with ExPASy.get_sprot_raw(ref) as handle: uniprot_records[ref] = {'record' : SwissProt.read(handle)} except (HTTPError, ValueError): uniprot_failed.append(ref) #deprecated uniprot entries fail on urllib problems serialize(uniprot_records, uniprot_folder, uniprot_file) serialize(uniprot_failed, uniprot_folder, uniprot_file.replace('.', '_failed.')) ###### Let's get all the pdb xreferences from the uniprot entries we have, and put them in # a sensible data structure clean = uniprot_records.copy() for ref in list(uniprot_records.keys()): clean[ref]['xrefs'] = {} clean[ref]['seq'] = uniprot_records[ref]['record'].sequence for xref in uniprot_records[ref]['record'].cross_references: #xref format ('PDB', ....... ,'A=1-451') for what we want if xref[0] == 'PDB': # possible format (1): E=28-337, F=731-744 - this will raise ValueError
# input CSV file should be one line per protein, in the format: # [SwissProt ID],[Domain start residue],[Domain stop],[cDNA sequence] import re, csv, sys from Bio import ExPASy, SwissProt, SeqIO from Bio.Seq import Seq from Bio.Alphabet import IUPAC reader = csv.reader(open('test.csv')) extracted = [] j=0 for row in reader: input_prot = row[0] get_prot = ExPASy.get_sprot_raw(input_prot) prot_record = SwissProt.read(get_prot) get_prot.close() prot_seq = prot_record.sequence prot_gene = prot_record.gene_name prot_domain = prot_seq[int(row[1])-1:int(row[2])] cdna = Seq(row[3], IUPAC.unambiguous_dna) outputfile = open('cDNA_extracted.csv', 'w') writer = csv.writer(outputfile) i=0 # Steps through each possible frame of the input cDNA while i < 3: frame = cdna[i::] trans = frame.translate() orf_find = re.search(str(prot_domain), str(trans)) if orf_find:
def SWAT(id):
    """Return the sequence of a Swiss-Prot entry.

    Args:
        id: Swiss-Prot accession or entry name.
    """
    handle = ExPASy.get_sprot_raw(id)
    try:
        record = SwissProt.read(handle)  # use SwissProt.parse for multiple proteins
    finally:
        handle.close()
    return record.sequence
from Bio import ExPASy  # used below but missing from the original imports
from Bio import SeqIO
from Bio import SwissProt

# Read the file of interest.
record = SeqIO.read("sequence.gb", "genbank")

# Associate a Swiss-Prot reference with each feature (locus tag).
acess = {"lpg2594": "Q5ZSC5", "lpg2608": "Q5ZSB1", "lpg26158": "Q5ZSA4",
         "lpg2624": "Q5ZS95", "lpg2636": "Q5ZS83", "lpg2645": "Q5ZS74",
         "lpg2657": "Q5ZS62", "lpg2709": "Q5ZS10", "lpg2768": "Q5ZRV8",
         }

DESCRIPTION_PREFIX = "RecName: Full="

# Export the relevant information.
with open("My_Swissprot.txt", "w") as save_file:
    save_file.write("SWISSPROT REGIST" + "\n")
    save_file.write("\n")
    for f in record.features:
        # Only CDS features that have a Swiss-Prot entry in the table.
        if f.type == "CDS" and f.qualifiers["locus_tag"][0] in acess:
            locus_tag = f.qualifiers["locus_tag"][0]
            handle = ExPASy.get_sprot_raw(acess[locus_tag])
            try:
                swiss_record = SwissProt.read(handle)  # SwissProt.Record object
            finally:
                handle.close()

            # Remove the literal prefix. str.strip() treats its argument
            # as a set of characters and also ate leading/trailing letters
            # of the protein name itself.
            description = str(swiss_record.description)
            if description.startswith(DESCRIPTION_PREFIX):
                description = description[len(DESCRIPTION_PREFIX):]

            text1 = ("Gene name: " + locus_tag + "\n" +
                     "Entry name: " + swiss_record.entry_name + "\n")
            text2 = ("Sequence length: " + str(swiss_record.sequence_length) +
                     "\n" + "Organism: " + str(swiss_record.organism) + "\n")
            text3 = ("Organism Classification: " +
                     str(swiss_record.organism_classification) + "\n" +
                     "Taxonomic ID: " + str(swiss_record.taxonomy_id[0]) +
                     "\n")
            text4 = "Description: " + description + "\n"
            save_file.write(text1 + text2 + text3 + text4)
            save_file.write("\n")

# Done.
print("Registo exportado com sucesso!")
def getgo(id):
    """Print the GO biological-process names of a UniProt entry, one per line.

    Args:
        id: Swiss-Prot accession or entry name.
    """
    handle = ExPASy.get_sprot_raw(id)
    try:
        record = SwissProt.read(handle)
    finally:
        handle.close()
    go = [
        # Split once so process names containing ':' stay intact.
        r[2].split(":", 1)[1] for r in record.cross_references
        if r[0] == "GO" and r[2].startswith('P')
    ]
    print("\n".join(go))
#!/usr/bin/python

from Bio import ExPASy
from Bio import SwissProt

# Show which attributes a parsed Swiss-Prot record exposes.
handle = ExPASy.get_sprot_raw('B5ZC00')
try:
    record = SwissProt.read(handle)  # use SwissProt.parse for multiple proteins
finally:
    handle.close()

print(dir(record))