def get_uniprot_metadata_online(uniprot_ids):
    """Look up display names for UniProt ids through the online service.

    The ids are de-duplicated and retrieved in batches of ``BATCH_SIZE``
    via ``UniProt().retrieve``. For every accession listed in a returned
    record, the record's recommended name is stored, preferring the
    ``shortname`` element and falling back to ``fullname``.

    Progress is reported on stdout after each batch.

    :param uniprot_ids: iterable of hashable UniProt ids; duplicates are
        collapsed before querying
    :return: dict mapping each accession found in the results to
        ``{'display_name': <name>}``
    """
    uniprot_ids = list(set(uniprot_ids))
    print('get_uniprot_metadata', len(uniprot_ids))

    BATCH_SIZE = 200
    uniprot = UniProt()
    uniprot_lookup = {}

    cumulative_total = 0
    for id_batch in batch(uniprot_ids, BATCH_SIZE):
        batch_ids = list(id_batch)
        cumulative_total += len(batch_ids)
        print(cumulative_total, '/', len(uniprot_ids))

        # NOTE(review): records are assumed to be parsed-XML style objects
        # (elements exposing ``.find`` and ``.contents``) - confirm against
        # the installed bioservices version.
        res = uniprot.retrieve(batch_ids)
        for record in res:
            for accession in record['accession']:
                protein_id = accession.contents[0]
                # If a record carries several recommended-name elements,
                # the last usable one wins.
                for name in record['recommendedname']:
                    tag = name.find('shortname')
                    if tag is None:
                        tag = name.find('fullname')
                    if tag is None or not tag.contents:
                        # Neither a short nor a full name: skip this element
                        # instead of failing on a missing tag.
                        continue
                    uniprot_lookup[protein_id] = {'display_name': tag.contents[0]}

    return uniprot_lookup
def get_protein_info(uniprot_ids):
    """
    Retrieves EMBL accession numbers and taxonomy ids for list of proteins.

    Creates a dict to map each protein's uid to its EMBL accession number
    and tax id. Ids for which either value could not be extracted are
    reported on stdout after all records have been processed.

    :param uniprot_ids: List of Uniprot IDs, e.g., ['P0AAJ3', 'A0NAQ1'];
        must be indexable, and record ``i`` of the response is attributed
        to ``uniprot_ids[i]``
    :return: dictionary mapping each uid to ``[embl_accession, taxonomy_id]``
        (each entry is whatever ``get_match`` returned for that record)
    """
    from bioservices import UniProt

    missing_embl = []
    missing_taxid = []
    orthos_map = {}

    u = UniProt()
    # Depending on the environment the service hands back either bytes or
    # str records; decode only when needed so both cases work without
    # editing the code.
    uniprot_records = [
        record.decode("utf-8") if isinstance(record, bytes) else record
        for record in u.retrieve(uniprot_ids, frmt='txt')
    ]

    embl_pattern = re.compile(r"DR\s+EMBL;.*?;\s+(.*?);")
    taxid_pattern = re.compile(r"OX\s+NCBI_TaxID=(\d+)")

    for i, record in enumerate(uniprot_records):
        # NOTE(review): get_match is defined elsewhere; it presumably appends
        # uniprot_ids[i] to the given "missing" list when the pattern does
        # not match - confirm against its definition.
        embl_acc = get_match(embl_pattern, record, uniprot_ids, missing_embl, i)  # EMBL accession number for coding seq
        taxonomy_id = get_match(taxid_pattern, record, uniprot_ids, missing_taxid, i)  # tax_id of organism protein belongs to
        orthos_map[uniprot_ids[i]] = [embl_acc, taxonomy_id]  # map protein info to its uid

    if missing_embl:
        print('\n{} Protein(s) Missing EMBL Accession Number: '.format(len(missing_embl)) + ', '.join(missing_embl))
    if missing_taxid:
        print('\n{} Protein(s) Missing NCBI TaxID: '.format(len(missing_taxid)) + ', '.join(missing_taxid))

    return orthos_map
def get_fasta(self, id_):
    """Fetch a FASTA record from UniProt and keep a copy on the instance.

    Deprecated in favour of :meth:`load_fasta`; a notice is printed on
    every call. The retrieved value is copied into ``self._fasta`` and
    also handed back to the caller.

    :param str id_: a given uniprot identifier
    :returns: the FASTA contents as returned by the service
    """
    print("get_fasta is deprecated. Use load_fasta instead")
    from bioservices import UniProt

    fasta = UniProt(verbose=False).retrieve(id_, frmt="fasta")
    # Store a slice copy rather than the returned object itself.
    self._fasta = fasta[:]
    return fasta
def write_fasta_for_ids(uniprot_ids, output_file):
    """Download FASTA records for the given ids and write them to one file.

    Each id is retrieved individually. A progress line is printed after
    every 500th id. Results whose type is exactly ``int`` are dropped
    (presumably error codes returned by the service - confirm), and the
    remaining results are concatenated without a separator.

    :param uniprot_ids: sized iterable of UniProt ids
    :param output_file: path of the file to (over)write with the FASTA text
    """
    client = UniProt(verbose=False)

    retrieved = []
    for position, accession in enumerate(uniprot_ids, start=1):
        retrieved.append(client.retrieve(accession, 'fasta'))
        if position % 500 == 0:
            print("Retrieved sequence for {}/{} IDs".format(
                position, len(uniprot_ids)))

    fasta_text = ''.join(seq for seq in retrieved if type(seq) is not int)

    with open(output_file, 'w') as handle:
        handle.write(fasta_text)
def load_fasta(self, id_):
    """Fetches FASTA from uniprot and loads into attribute :attr:`fasta`

    This is a best-effort operation: if the retrieval fails, or the
    service returns an empty string, ``self._fasta`` is left untouched
    and no error is raised.

    :param str id_: a given uniprot identifier
    :returns: nothing

    .. note:: same as :meth:`get_fasta` but returns nothing
    """
    from bioservices import UniProt

    u = UniProt(verbose=False)
    try:
        res = u.retrieve(id_, frmt="fasta")
    except Exception:
        # Retrieval problems are deliberately ignored, but KeyboardInterrupt
        # and SystemExit are allowed to propagate.
        return

    # some entries in uniprot are valid but obsolete and return empty string
    if res == "":
        return

    try:
        # save a copy of the fasta into the attribute
        self._fasta = res[:]
    except TypeError:
        # Result cannot be sliced (presumably a numeric error code from the
        # service - confirm); keep the previous value.
        pass
def hitrate(proteins, indexes, subclass):
    """Score how often UniProt locations agree with ``subclass``.

    For each protein, its UniProt XML entry is fetched by ``prot.ac``.
    Proteins whose lookup fails or that have no ``subcellularlocation``
    entries are skipped. For the rest, the first matching peptide is
    compared against the expected position in ``indexes`` to derive an
    offset weight (1 when the start positions coincide, otherwise
    ``min(|length / (pos - start)|, 1)``), and
    ``determine_locations(loc, subclass)`` scaled by that weight is
    accumulated.

    :param proteins: iterable of protein hits exposing ``ac`` and
        ``matching_peptide``; ``None`` is treated as "no proteins"
    :param indexes: mapping from peptide to a ``(start, end)`` pair
    :param subclass: location class passed to ``determine_locations``; also
        stored in the ``dl_loc`` column of the returned frame
    :return: tuple ``(rate, df)`` where ``rate`` is the weighted match sum
        divided by the weight sum (0 when nothing was scored) and ``df`` is
        a single-row DataFrame whose cells hold the collected lists
    """
    columns = ['subsequence', 'sprot_start', 'sprot_end', 'sprot_loc',
               'dl_start', 'dl_end', 'dl_loc']
    match, total = 0, 0
    dl_peptides, dl_starts, dl_ends = [], [], []
    sprot_starts, sprot_ends, sprot_locs = [], [], []

    if proteins is not None:
        # Only build the service client when there is something to look up.
        u = UniProt()
        for prot in proteins:
            try:
                entry = u.retrieve(prot.ac, frmt='xml')
                locs = entry['subcellularlocation']
            except Exception:
                # Lookup failures skip the protein; interpreter-exit signals
                # (KeyboardInterrupt, SystemExit) still propagate.
                continue
            if not locs:
                continue

            pep_metadata = prot.matching_peptide[0]
            seq_range = pep_metadata.match_range[0]
            peptide = pep_metadata.peptide
            dl_peptides.append(peptide)

            start, end = indexes[peptide]
            dl_starts.append(start)
            dl_ends.append(end)

            pos = seq_range.start
            sprot_starts.append(pos)
            sprot_ends.append(seq_range.end)

            seq_len = seq_range.end - pos
            # Full weight when both starts agree; otherwise capped at 1.
            offset_weight = 1 if pos == start else min(abs(seq_len / (pos - start)), 1)

            # Second child node of the first location element holds the label.
            loc = list(locs[0].children)[1].string
            sprot_locs.append(loc)

            match += determine_locations(loc, subclass) * offset_weight
            total += offset_weight

    rate = 0 if total == 0 else match / total

    vals = [[dl_peptides, sprot_starts, sprot_ends, sprot_locs,
             dl_starts, dl_ends, subclass]]
    df = pd.DataFrame(vals, columns=columns)
    return (rate, df)
def get_single_uniprot_metadata_online(uniprot_id):
    """Retrieve the UniProt entry for a single id from the online service.

    :param uniprot_id: the UniProt id to look up
    :return: whatever ``UniProt().retrieve`` returns for that id
    """
    return UniProt().retrieve(uniprot_id)
Entry_selection_list = [] # empty list created called Entry_selection_list a = 2 # the numerical value 2 is assigned to for y in (Number_selections ): # iterates n times, whereby n = Number_selections a = a + 1 # for every iteration a increases by one Entry_selection_list.append( int(sys.argv[a]) ) # sets up sys.arg[] for n proteins, whereby n = Number_selections for w in Entry_selection_list: # iterates over members of list called Entry_selection_list entry, gene_names = res.split("\n")[w].split( "\t" ) # protein's entry number and gene names are saved in entry and gene_names, respectively seq = u.retrieve(str(entry), frmt="fasta") # gets the fasta sequence for protein print(seq) # outputs value of seq for each iteration entry1 = Entry_selection( int(sys.argv[3]) ) # selects the n entry, whereby n = value of sys.argv[3] in numerical form f = open("Clustalfastaseq.txt", "w+") # opens the file called Clustalfastaseq.txt f.write(entry1) # writes a fasta sequence that entry1 currently equals f.close() # closes the file length_list = int( len(Entry_selection_list)) # length of Entry_selection_list list for i in Entry_selection_list[ 1:length_list]: # iteraretes n-1 times, whereby n = length_list
unilines = d.split("\n") #print(unilines, "bitchHHHHHHHHHHHHHHHHHHHHHH") if len(unilines) > 1: uni_entry = unilines[1].split("\t")[0] uni_entries.append(uni_entry) else: uni_entries.append("NULL") #print("\t", i[0], i[1]) #print(uni_entries) for i in uni_entries: if (i == "NULL"): listPfam.append("NULL") listProsite.append("NULL") else: with open("uniprot_out.txt", "w") as outfile: p = u.retrieve(i, "txt") #newone = open("wow.txt", "w") #print(p) outfile.write(p) #newone.write(p) with open("uniprot_out.txt", "r") as infile: listuno = [] list2 = [] list3 = [] for j in infile.readlines(): listuno.append(j.strip()) #print(listuno) for j in listuno: if "Pfam;" in j: list2.append(j) #print(list2, "lISTT2222222222222222222")