def parse_uniprot(entries):
    """Return a DataFrame of UniProt records for the given identifiers."""
    ent = ['id:' + s for s in entries]
    # (a stray debug print of ent[2] was removed here; it raised IndexError
    # for fewer than three entries)
    u = UniProt(verbose=True)
    df = u.get_df(ent)
    return df
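# A hedged usage sketch for parse_uniprot above; the accessions are
# arbitrary examples and bioservices must be installed and imported.
from bioservices import UniProt

df = parse_uniprot(["P43403", "P29317"])
print(df.shape)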
def __init__(self, verbose=True):
    """.. rubric:: Constructor

    :param bool verbose: print informative messages

    .. doctest::

        >>> from bioservices import PSICQUIC
        >>> s = PSICQUIC()
    """
    self.services = REST(
        "PSICQUIC", verbose=verbose,
        url="https://www.ebi.ac.uk/Tools/webservices/psicquic",
        url_defined_later=True)  # this prevents an annoying warning
    self._registry = None
    try:
        self.uniprot = UniProt(verbose=False)
    except Exception:
        self.services.logging.warning(
            "UniProt service could not be initialised")
    self.buffer = {}
def kegg_to_symbol_through_uniprot(unknown_genes):
    # create string to call UniProt for mapping
    search_string = '\t'.join(unknown_genes)
    kegg_to_gene_name = dict()
    missing = set()
    uniprot = UniProt(verbose=True)
    # This is where it gets tricky. Check whether there is a UniProt mapping
    # for the species; if not, try from the KEGG side. Sometimes KEGG links
    # to a different UniProt entry, or UniProt links to a different KEGG entry.
    uni_dict = dict(uniprot.mapping("KEGG_ID", "ACC", query=search_string))
    for i in unknown_genes:
        if i in uni_dict:
            for n in uni_dict[i]:
                x = uniprot.search("accession:{}".format(n),
                                   columns='genes(PREFERRED),reviewed,id',
                                   limit=1)
                _, data = x.rstrip('\n').split('\n')
                name, review, entry = data.split('\t')
                if n != entry:
                    print(i, n, entry, x, "don't match")
                elif review == 'reviewed':
                    kegg_to_gene_name[i] = name
        else:
            missing.add(i)
    print("{} mappings not found from KEGG to gene name".format(len(missing)))
    print(missing)
    return kegg_to_gene_name
def __init__(self, input, parent=None):
    QThread.__init__(self, parent)
    self.seqs = input[0]
    self.nodes = None if not input[1] else input[1]
    self.u = UniProt(verbose=False)
    self.PDBLogger = logging.getLogger("PDBSearch")
    del input, parent
def search_uniprot(genes, columns):
    start_time = datetime.datetime.now()
    with yaspin(text="Performing UniProt search...", color="cyan") as sp:
        uniprot = UniProt(verbose=False)
        raw_data = ''
        headers = True
        for chunk in chunks(genes, 10):
            gene_search = "+OR+".join(list(chunk))
            new_data = uniprot.search(gene_search, frmt="tab",
                                      columns=f"entry name, {','.join(columns)}")
            # Drop the header line on every batch after the first. The
            # original kept only line [1], discarding the rest of the batch.
            try:
                if not headers:
                    new_data = "\n".join(new_data.split("\n")[1:])
                raw_data = raw_data + "\n" + new_data
                headers = False
            except IndexError:
                pass
        # pandas.compat.StringIO was removed from pandas; use io.StringIO
        # (requires "import io" at module level)
        data = pandas.read_csv(io.StringIO(raw_data), sep="\t")
        time_diff = (datetime.datetime.now() - start_time).total_seconds()
        sp.text = f"Performing UniProt Search => Task done in {time_diff} seconds."
        sp.ok("✔")
        return data
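# search_uniprot above relies on a `chunks` helper that is not shown in the
# snippet; a minimal sketch of such a batching generator (an assumption,
# not the original implementation):
def chunks(iterable, size):
    """Yield successive lists of at most `size` items from `iterable`."""
    buf = []
    for item in iterable:
        buf.append(item)
        if len(buf) == size:
            yield buf
            buf = []
    if buf:
        yield buf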
def test_extract_protein_interactions_kgml(self, kgml_file, expected_no_rel):
    # Arrange
    sut = KeggProteinInteractionsExtractor()
    with open(os.path.join(os.path.dirname(os.path.realpath(__file__)),
                           kgml_file), 'r') as myfile:
        kgml_string = myfile.read()
    # Mock KEGG ops
    mock_kegg = KEGG()
    sut.kegg = mock_kegg
    # No matter what the input is, return the ko numbers that map to hsa numbers
    mock_kegg.link = MagicMock(return_value="ko:K00922 hsa:5293\n"
                                            "ko:K00922 hsa:5291\n"
                                            "ko:K02649 hsa:5295")
    # No matter what the input is, return the hsa numbers that map to uniprot numbers
    mock_kegg.conv = MagicMock(return_value={"hsa:5293": "up:B0LPE5"})
    # Mock UniProt
    mock_uniprot = UniProt()
    sut.uniprot = mock_uniprot
    mock_uniprot.mapping = MagicMock(return_value={"B0LPE5": ["gene1", "gene2"]})
    # Act
    actual = sut.extract_protein_interactions_kgml(kgml_string)
    # Assert
    self.assertEqual(expected_no_rel, len(actual))
def get_uniprot_metadata_online(uniprot_ids):
    uniprot_ids = list(set(uniprot_ids))
    print('get_uniprot_metadata', len(uniprot_ids))
    BATCH_SIZE = 200
    uniprot = UniProt()
    uniprot_lookup = {}
    cumulative_total = 0
    for x in batch(uniprot_ids, BATCH_SIZE):
        batch_ids = [i for i in x]
        cumulative_total += len(batch_ids)
        print(cumulative_total, '/', len(uniprot_ids))
        res = uniprot.retrieve(batch_ids)
        for r in res:
            for key in r['accession']:
                protein_id = key.contents[0]
                # distinct loop variable: the original reused `x`, shadowing
                # the batch variable of the outer loop
                for rec in r['recommendedname']:
                    tag = rec.find('shortname')
                    if tag is None:
                        tag = rec.find('fullname')
                    label = tag.contents[0]
                    uniprot_lookup[protein_id] = {'display_name': label}
    return uniprot_lookup
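# get_uniprot_metadata_online above assumes a `batch` helper that is not
# part of the snippet; a minimal sketch of one (an assumption, not the
# original implementation):
def batch(items, size):
    """Yield slices of `items` of length at most `size`."""
    for start in range(0, len(items), size):
        yield items[start:start + size]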
def get_protein_info(uniprot_ids):
    """
    Retrieves EMBL accession numbers and taxonomy ids for a list of proteins.
    Creates a dict to map each protein's uid to its EMBL accession number
    and tax id.

    :param uniprot_ids: List of UniProt IDs, e.g., ['P0AAJ3', 'A0NAQ1']
    :return: dictionary mapping each uid to its info
    """
    from bioservices import UniProt
    missing_embl = []
    missing_taxid = []
    orthos_map = {}
    u = UniProt()
    uniprot_records = list(map(lambda x: x.decode("utf-8"),
                               u.retrieve(uniprot_ids, frmt='txt')))  # WSL CLI
    # uniprot_records = u.retrieve(uniprot_ids, frmt='txt')  # PyCharm
    embl_pattern = re.compile(r"DR\s+EMBL;.*?;\s+(.*?);")
    taxid_pattern = re.compile(r"OX\s+NCBI_TaxID=(\d+)")
    for i, record in enumerate(uniprot_records):
        embl_acc = get_match(embl_pattern, record, uniprot_ids,
                             missing_embl, i)  # EMBL accession number for coding seq
        taxonomy_id = get_match(taxid_pattern, record, uniprot_ids,
                                missing_taxid, i)  # tax_id of organism protein belongs to
        orthos_map[uniprot_ids[i]] = [embl_acc, taxonomy_id]  # map protein info to its uid
    if missing_embl:
        print('\n{} Protein(s) Missing EMBL Accession Number: '.format(len(missing_embl))
              + ', '.join(missing_embl))
    if missing_taxid:
        print('\n{} Protein(s) Missing NCBI TaxID: '.format(len(missing_taxid))
              + ', '.join(missing_taxid))
    return orthos_map
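# get_protein_info above calls a `get_match` helper that is not part of the
# snippet; a plausible sketch under the call signature used above
# (hypothetical, not the original implementation):
def get_match(pattern, record, uniprot_ids, missing, i):
    """Return the first group of `pattern` in `record`, or '' if absent,
    recording the corresponding UniProt ID in `missing`."""
    m = pattern.search(record)
    if m is None:
        missing.append(uniprot_ids[i])
        return ''
    return m.group(1)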
def main():
    """Main function."""
    args = parse_args()
    if args.log:
        logfile = args.log
        logging.basicConfig(filename=logfile, level=logging.DEBUG,
                            filemode='w', format='%(asctime)s %(message)s',
                            datefmt='%Y-%m-%d %H:%M:%S')
    else:
        logfile = sys.stdout
    # Interface to the UniProt service
    u = UniProt(verbose=False)
    con = sqlite3.connect("PDB_Chain_Uniprot.db")
    cur = con.cursor()
    cur.execute("SELECT SP_PRIMARY FROM PDB_Chain_Uniprot WHERE Swissprot_Id = '' LIMIT 1000")
    # cur.execute("SELECT SP_PRIMARY FROM PDB_Chain_Uniprot WHERE Swissprot_Id =?", (str(current_swissprot),))
    rows = cur.fetchall()
    for row in rows:
        # print(str(row[0]))
        res = u.search(str(row[0]), limit=1)
        # print(res)
        if res != "":
            for line in res.split("\n")[1:-1]:
                if line != "":
                    res_id, res_Entry_Name, res_status, res_protein_names, \
                        res_gene_names, res_organism, Length = line.split("\t")
                    cur.execute("UPDATE PDB_Chain_Uniprot SET Swissprot_Id=? WHERE SP_PRIMARY = ?",
                                (str(res_Entry_Name), str(row[0])))
    con.commit()
    con.close()
def get_id_from_bioservice(entity):
    from bioservices import UniProt
    u = UniProt(cache=True)  # query the database API
    Ids = None
    temp = []
    temp_cc = []
    # df = u.get_df(["Q9HCK8"])
    # print(df)
    res_reviewed = u.search(entity + '+reviewed:yes', frmt="tab",
                            columns="id, entry name, genes, comment(FUNCTION)",
                            limit=5)  # , protein names
    res_unreviewed = u.search(entity, frmt="tab",
                              columns="id, entry name, genes, genes(PREFERRED)",
                              limit=5)
    # print(res_reviewed)
    # print(res_unreviewed)
    if res_reviewed == 400:
        print('Invalid request\n')
        return Ids
    if res_reviewed:  # if results were returned
        results = res_reviewed.split('\n')[1:-1]  # drop the header line and the trailing ''
        for line in results:
            results = line.split('\t')
            temp.append(results[0])
            temp_cc.append(results[-1])
            # break
    return temp, temp_cc
def __init__(self, specie, taxid):
    self.query = ""
    self.specie = specie
    self.taxid = taxid
    self.dicotested = {}  # avoid testing the same ID several times
    self.unip = UniProt()
def uniprot_acc_to_taxonmy(self, accesion):
    """From one UniProt accession to its taxonomy"""
    from bioservices import UniProt
    u = UniProt()
    data = u.search(accesion, frmt="xml")
    from bs4 import BeautifulSoup
    soup = BeautifulSoup(data, "html.parser")
    return ' (' + ', '.join([t.text for t in soup.find_all('taxon')]) + ')'
def retrieve_label_from_uniprot_df(ID):
    uniprot = UniProt()
    df = uniprot.get_df(ID)
    label = df["Taxonomic lineage (PHYLUM)"][0]
    if type(label) == np.float64 and np.isnan(label):
        raise ValueError("Label was NaN")
    return label
def get_cnograph_intact(self, label="entry_name"):
    """Return cnograph made of the protein names found in the interactions
    of the annotations.

    .. plot::
        :include-source:
        :width: 50%

        from msdas import *
        a = annotations.Annotations(get_yeast_small_data(), "YEAST")
        a.get_uniprot_entries()
        a.set_annotations()
        n = network.NetworkFromUniProt(a.annotations)
        c = n.get_cnograph_intact()
        c.plotdot()

    """
    assert label in ["entry_id", "entry_name"]
    c = CNOGraph()
    interactions = self.annotations["Interacts with"]
    # add all nodes
    c.add_nodes_from(interactions.index)
    # some have no interactions, in which case the entry is filled with NaN;
    # let us drop those entries.
    interactions = interactions.dropna()
    indices = interactions.index
    for i, index in enumerate(indices):
        print("{}/{}".format(i + 1, len(indices)))
        these_interactions = interactions.loc[index].split(';')  # .ix is deprecated
        these_interactions = [x.strip() for x in these_interactions]
        for interaction in these_interactions:
            if interaction == "Itself":
                interaction = index
            c.add_reaction("{}={}".format(index, interaction))
    if label == "entry_id":
        c._signals = list(self.annotations.index)
    else:
        # bioservices is required because interacting species may not be
        # part of the list of measurements
        from bioservices import UniProt
        u = UniProt(verbose=self.verbose)
        mapping = u.multi_mapping(fr="ACC", to="ID", query=c.nodes())
        for k, v in mapping.items():  # iteritems() is Python 2 only
            if len(mapping[k]) > 1:
                print("ambiguous case {} with more than 1 mapping; "
                      "will take only the first".format(k))
            mapping[str(k)] = str(v[0].split("_")[0])
        c.relabel_nodes(mapping)
        measured = [x.split("_")[0] for x in self.annotations['Entry name']]
        c._signals = measured
    return c
def uniprot2genename(self, name):
    """Return the gene names of a UniProt identifier"""
    from bioservices import UniProt
    c = UniProt(cache=True)
    try:
        res = pd.read_csv(StringIO(c.search(name, limit=1)), sep='\t')
        return list(res['Gene names'].values)
    except Exception:
        print("Could not find %s" % name)
def retrieve_label_from_uniparc(ID):
    uniprot = UniProt()
    columns, values = uniprot.search(ID, database="uniparc", limit=1)[:-1].split("\n")
    name_idx = columns.split("\t").index("Organisms")
    name = values.split("\t")[name_idx].split("; ")[0]
    columns, values = uniprot.search(name, database="taxonomy", limit=1)[:-1].split("\n")
    lineage_idx = columns.split("\t").index("Lineage")
    label = values.split("\t")[lineage_idx].split("; ")[:2][-1]
    return label
def pI_calc(dataframe):
    df = dataframe
    u = UniProt()
    for index, row in df.iterrows():
        seqce = u.search(df.loc[index, "prot_acc"], frmt="tab",
                         columns="sequence").split('\n')
        p_i = ipc.predict_isoelectric_point(seqce[1])
        df.loc[index, "pI"] = p_i
    return df
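# pI_calc above assumes an `ipc` module exposing
# predict_isoelectric_point(sequence); that dependency is not shown in the
# snippet. A hedged one-row usage sketch (the accession is an arbitrary
# example):
import pandas as pd

proteins = pd.DataFrame({"prot_acc": ["P43403"]})
print(pI_calc(proteins))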
def get_fasta(self, id_):
    """Fetches FASTA from UniProt and loads it into attribute :attr:`fasta`

    :param str id_: a given UniProt identifier
    :returns: the FASTA contents
    """
    print("get_fasta is deprecated. Use load_fasta instead")
    from bioservices import UniProt
    u = UniProt(verbose=False)
    res = u.retrieve(id_, frmt="fasta")
    self._fasta = res[:]
    return res
def quick_getprotinfo(protlist):
    """Get protein information from the UniProt database
    (http://www.uniprot.org) based on the bioservices package.

    input: protlist, a list of protein identifiers.

    output: dict of protein information: Entry name; Gene names; Length;
    Organism; Protein names; Status.
    """
    u = UniProt(verbose=False)
    return u.quick_search(protlist)
def write_fasta_for_ids(uniprot_ids, output_file):
    u = UniProt(verbose=False)
    count = 1
    all_seqs = []
    for uni_id in uniprot_ids:
        all_seqs.append(u.retrieve(uni_id, 'fasta'))
        if count % 500 == 0:
            print("Retrieved sequence for {}/{} IDs".format(count, len(uniprot_ids)))
        count += 1
    # failed retrievals come back as integers (HTTP error codes); drop them
    all_fasta_seqs = [i for i in all_seqs if not isinstance(i, int)]
    final_fasta = ''.join(all_fasta_seqs)
    with open(output_file, 'w') as f:
        f.write(final_fasta)
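# A hedged usage sketch for write_fasta_for_ids; the accessions and output
# path are arbitrary examples.
write_fasta_for_ids(["P43403", "P29317"], "sequences.fasta")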
def getUniprotInfo(uni_id):
    u = UniProt()  # verbose=False
    frmt = "tab"
    columns = ','.join(columns_name)
    alldata = u.search(uni_id, frmt=frmt, columns=columns)
    dataline = alldata.split("\n")
    data = [l.split("\t") for l in dataline[1:]]
    header = dataline[0].split("\t")
    dic_data = []
    # the last element of `data` is the empty trailing line, hence len - 1
    for j in range(len(data) - 1):
        dic = {}
        for i, key in enumerate(columns_name):
            dic[key] = data[j][i]
        dic_data.append(dic)
    return dic_data, data, header
def find_gene(prot_id):
    u = UniProt(verbose=False)
    res = u.mapping("EMBL", "ACC", query=prot_id)
    for key, values in res.items():
        for value in values:
            res = u.search(value, frmt="tab", limit=3,
                           columns="genes", database='uniparc')
            genes = set(res[11:].split(';'))
            genes = [i for i in genes if 0 < len(i) and i != '\n']
            if len(genes) < 1:
                genes = 'none'
            return key, genes
    return prot_id, 'none'
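# A hedged usage sketch for find_gene, assuming a valid EMBL/GenBank
# protein identifier (the ID below is an arbitrary example):
key, genes = find_gene("AAC73112")
print(key, genes)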
class Peptides(object):
    """
    ::

        >>> p = Peptides()
        >>> p.get_fasta_sequence("Q8IYB3")
        >>> p.get_peptide_position("Q8IYB3", "VPKPEPIPEPKEPSPE")
        189

    Sometimes, peptides are provided with a pattern indicating the phospho
    site. e.g., ::

        >>>

    """
    def __init__(self, verbose=False):
        self.u = UniProt(verbose=verbose)
        self.sequences = {}

    def get_fasta_sequence(self, uniprot_name):
        seq = self.u.get_fasta_sequence(uniprot_name)
        return seq

    def get_phosphosite_position(self, uniprot_name, peptide):
        if uniprot_name not in self.sequences.keys():
            seq = self.get_fasta_sequence(uniprot_name)
            self.sequences[uniprot_name] = seq[:]
        else:
            seq = self.sequences[uniprot_name][:]
        # search for the peptide itself; the original searched for the
        # literal string "PQS" and ignored the `peptide` argument
        positions = [x.start() for x in re.finditer(peptide, seq)]
        return positions
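# A hedged usage sketch for the Peptides class above (accession and peptide
# taken from its own docstring; the call performs a network request):
p = Peptides()
positions = p.get_phosphosite_position("Q8IYB3", "VPKPEPIPEPKEPSPE")
print(positions)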
def test1():
    u = UniProt()
    # working bob
    bob = fetch_fasta_from_uniprot(u, 'P29317', True)
    print(bob)
    # TooManyError bob
    try:
        bob = fetch_fasta_from_uniprot(u, ['P29317', 'P13929'], True)
        print(bob)
    except TooManyError:
        # expected outcome
        print('toomanyerror handled')
    # InvalidIdError bob (malformed ID triggers a 400)
    try:
        bob = fetch_fasta_from_uniprot(u, 'P29317' + ',\\P13929', True)
        print(bob)
    except InvalidIdError:
        # expected outcome
        print('InvalidIdError 400 handled')
    # InvalidIdError bob (unknown ID triggers a 404)
    try:
        bob = fetch_fasta_from_uniprot(u, "IwantA404CodePlease", True)
        print(bob)
    except InvalidIdError:
        # expected outcome
        print('InvalidIdError 404 handled')
def getTaxonomyProtein(taxonomy, format="tab"):
    u = UniProt()  # verbose=False
    query = "taxonomy:" + taxonomy
    columns = ','.join(columns_name)
    # get all entry names as a data frame
    # entry_name = u.search(query, frmt=frmt, columns="entry name")
    # entry_name_1 = str(entry_name).split("\n")
    # enrty_name = entry_name_1[1:-1]
    # this is not enough information, so get everything using search
    alldata = u.search(query, frmt=format, columns=columns)
    dataline = alldata.split("\n")
    data = [l.split("\t") for l in dataline[1:]]
    header = dataline[0].split("\t")
    return alldata, data, header
def call_uniprotkb(query, logger):
    """Calls to UniProt. If no data is retrieved, a default 'blank' dataframe
    is returned.

    :param query: str, query for UniProt
    :param logger: logger object

    Returns dataframe of search results.
    """
    # Establish data to be retrieved from UniProt
    columnlist = (
        "organism-id,organism,id,entry name, protein names,length,mass,domains,domain,"
        "families,"
        "go-id,go(molecular function),go(biological process),"
        "sequence")
    # This dictionary will be used to populate "blank"/"empty" databases when
    # an error is thrown. Iterables are used as values to avoid problems with
    # "ValueError: If using all scalar values, you must pass an index"
    blank_data = {
        "NCBI Taxonomy ID": ["NA"],
        "Organism": ["NA"],
        "UniProtKB Entry ID": ["NA"],
        "UniProtKB Entry Name": ["NA"],
        "UniProtKB Protein Names": ["NA"],
        "EC number": ["NA"],
        "Length (Aa)": ["NA"],
        "Mass (Da)": ["NA"],
        "Domains": ["NA"],
        "Domain count": ["NA"],
        "UniProtKB Linked Protein Families": ["NA"],
        "Gene ontology IDs": ["NA"],
        "Gene ontology (molecular function)": ["NA"],
        "Gene ontology (biological process)": ["NA"],
        "Sequence": ["NA"],
    }
    logger.info(f"querying uniprot, query: {query}")  # f-prefix was missing
    try:
        # open connection to UniProt(), search and convert result into pandas df
        search_result = UniProt().search(
            query,
            columns=columnlist,
        )  # returns empty string for no result
        return pd.read_table(io.StringIO(search_result))
    except HTTPError:
        logger.warning((f"Network error occurred during query: {query}\n"
                        "Returning null value 'NA' for all UniProt data"))
        return pd.DataFrame(blank_data)
    except EmptyDataError:
        # No UniProt entries found for locus tag, return null data
        logger.warning((f"No data returned from UniProt during query: {query}\n"
                        "Returning null value 'NA' for all UniProt data"))
        return pd.DataFrame(blank_data)
def __init__(self, verbosity="INFO"):
    super(Mapper, self).__init__(level=verbosity)
    self.logging.info("Initialising the services")
    self.logging.info("... uniprots")
    self._uniprot_service = UniProt()
    self.logging.info("... KEGG")
    self._kegg_service = KeggParser(verbose=False)
    self.logging.info("... HGNC")
    self._hgnc_service = HGNC()
    self.logging.info("... UniChem")
    self._unichem_service = UniChem()
    self.logging.info("... BioDBNet")
    self._biodbnet = BioDBNet()
def __init__(self, input_file, gene_id_column, output_file):
    self._input_file = input_file
    self._gene_id_column = gene_id_column
    self._output_file = output_file
    self._tmp_folder = "tmp_data"
    self._uniprot = UniProt(verbose=False)
    self._quickgo = QuickGO(verbose=False)
    if not os.path.exists(self._tmp_folder):
        os.mkdir(self._tmp_folder)
def load_fasta(self, id_):
    """Fetches FASTA from UniProt and loads it into attribute :attr:`fasta`

    :param str id_: a given UniProt identifier
    :returns: nothing

    .. note:: same as :meth:`get_fasta` but returns nothing
    """
    # save FASTA into the fasta attribute
    from bioservices import UniProt
    u = UniProt(verbose=False)
    try:
        res = u.retrieve(id_, frmt="fasta")
        # some entries in UniProt are valid but obsolete and return an empty string
        if res == "":
            raise Exception
        self._fasta = res[:]
    except Exception:
        pass
def search_struc():
    u = UniProt()
    with open(os.path.join(resultspath, "myprot_list.csv"), "r") as infile:
        with open(os.path.join(resultspath, 'myprot_list_struc.csv'), 'w') as outfile:
            mywriter = csv.writer(outfile, delimiter=';')
            myreader = csv.reader(infile, delimiter=';')
            for row in myreader:
                if row[0] == "Family":
                    mywriter.writerow(row + ["Uniprot entry", "Struc"])
                    continue
                struccell = ""
                entry = ""
                uprot = row[3]
                print("\n\n", row[2])
                if uprot:
                    data = u.quick_search("id:%s" % uprot)
                    if data:
                        entry = data[uprot]['Entry name'].lower()
                        struc_res = obtain_struc_pdb(entry)
                        if struc_res:
                            struccell = struc_res
                            print(struccell)
                        else:
                            temp = requests.get(
                                'http://gpcrdb.org/services/structure/template/'
                                + entry).json()
                            if temp:
                                temp_res = obtain_struc_pdb(temp)
                                if temp_res:
                                    struccell = "[Model]: " + temp_res
                                    print(struccell)
                                else:
                                    print("-----No struc for template")
                            else:
                                print("-----No template")
                    else:
                        print("-----Uprot ID not found")
                else:
                    print("-----No uprot ID")
                mywriter.writerow(row + [entry, struccell])
def hitrate(proteins, indexes, subclass):
    columns = ['subsequence', 'sprot_start', 'sprot_end', 'sprot_loc',
               'dl_start', 'dl_end', 'dl_loc']
    u = UniProt()
    match, total = 0, 0
    dl_peptides, dl_starts, dl_ends = [], [], []
    sprot_starts, sprot_ends, sprot_locs = [], [], []
    if proteins is not None:
        for prot in proteins:
            locs = None
            try:
                entry = u.retrieve(prot.ac, frmt='xml')
                locs = entry['subcellularlocation']
            except Exception:
                continue
            if locs:
                pep_metadata = prot.matching_peptide[0]
                seq_range = pep_metadata.match_range[0]
                peptide = pep_metadata.peptide
                dl_peptides.append(peptide)
                start, end = indexes[peptide]
                dl_starts.append(start)
                dl_ends.append(end)
                pos = seq_range.start
                sprot_starts.append(pos)
                sprot_ends.append(seq_range.end)
                seq_len = seq_range.end - pos
                offset_weight = 1 if pos == start else min(abs(seq_len / (pos - start)), 1)
                loc = list(locs[0].children)[1].string
                sprot_locs.append(loc)
                match_weight = determine_locations(loc, subclass) * offset_weight
                match += match_weight
                # assert(match_weight <= 1), 'match_weight {}'.format(match_weight)
                total += offset_weight
    if total == 0:
        hitrate = 0
    else:
        hitrate = match / total
    vals = [[dl_peptides, sprot_starts, sprot_ends, sprot_locs,
             dl_starts, dl_ends, subclass]]
    df = pd.DataFrame(vals, columns=columns)
    return (hitrate, df)
def add_sequence_to_nodes(n: str, d: Dict[str, Any]):
    """
    Maps UniProt ACC to UniProt ID. Retrieves sequence from UniProt and adds
    it to the node as a feature.

    :param n: Graph node.
    :type n: str
    :param d: Graph attribute dictionary.
    :type d: Dict[str, Any]
    """
    h = HGNC(verbose=False)
    u = UniProt(verbose=False)
    d["uniprot_ids"] = h.fetch(
        "symbol", d["protein_id"])["response"]["docs"][0]["uniprot_ids"]
    # Todo these API calls should probably be batched
    # Todo mapping with bioservices to support other protein IDs?
    for uniprot_id in d["uniprot_ids"]:  # renamed to avoid shadowing builtin `id`
        d[f"sequence_{uniprot_id}"] = u.get_fasta_sequence(uniprot_id)
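# add_sequence_to_nodes above updates a node attribute dictionary in place;
# a hedged networkx usage sketch (the gene symbol is an arbitrary example):
import networkx as nx

g = nx.Graph()
g.add_node("ZAP70", protein_id="ZAP70")
add_sequence_to_nodes("ZAP70", g.nodes["ZAP70"])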
def __init__(self, verbose=True):
    """.. rubric:: Constructor

    :param bool verbose: print informative messages

    .. doctest::

        >>> from bioservices import PSICQUIC
        >>> s = PSICQUIC()
    """
    urlStr = 'http://www.ebi.ac.uk/Tools/webservices/psicquic'
    super(PSICQUIC, self).__init__("PSICQUIC", verbose=verbose, url=urlStr)
    self._registry = None
    try:
        self.uniprot = UniProt(verbose=False)
    except Exception:
        self.logging.warning("UniProt service could not be initialised")
    self.buffer = {}
def __init__(self, interactionlist=None):
    if interactionlist is None:
        interactionlist = ["phosphorylation"]
    self.interactionlist = interactionlist
    self.namespaces = {'df': 'http://psi.hupo.org/mi/mif'}
    # self._logger = logging.getLogger(__name__)
    self.u = UniProt(verbose=False)
    self._cache_kegg_entry_uniprots = {}
def __main__():
    ids = sys.argv[1]
    filename = sys.argv[2]
    # TODO: check the validity and format?
    try:
        from bioservices import UniProt
        u = UniProt(verbose=False)
        u.debugLevel = "ERROR"
    except ImportError:
        print("Could not import bioservices. Check that it is installed. "
              "Try 'pip install bioservices'")
    try:
        fasta = u.searchUniProtId(ids, "fasta")
    except Exception:
        print("An error occurred while fetching the FASTA file from UniProt")
    try:
        fh = open(filename, "w")
        fh.write(fasta)
        fh.close()
    except Exception:
        print("could not save the FASTA file")
def main():
    """Main function."""
    args = parse_args()
    if args.log:
        logfile = args.log
        logging.basicConfig(filename=logfile, level=logging.DEBUG,
                            filemode='w', format='%(asctime)s %(message)s',
                            datefmt='%Y-%m-%d %H:%M:%S')
    else:
        logfile = sys.stdout
    # Interface to the UniProt service
    u = UniProt(verbose=False)
    with open('../DataFilesVarstructure/pdb_chain_uniprot_1.csv', 'r') as csvinput:
        with open('../DataFilesVarstructure/pdb_chain_uniprot_2.csv', 'w') as csvoutput:
            writer = csv.writer(csvoutput)
            for row in csv.reader(csvinput):
                column_count = len(row)
                print(column_count)
                if column_count == 9:
                    if row[0] == "PDB":
                        writer.writerow(row + ["Entry_Name"])
                        print(row)
                    else:
                        res = u.search(str(row[2]), limit=1)
                        print(res)
                        if res != "":
                            for line in res.split("\n")[1:-1]:
                                if line != "":
                                    print(line)
                                    res_id, res_Entry_Name, res_status, \
                                        res_protein_names, res_gene_names, \
                                        res_organism, Length = line.split("\t")
                                    strList = list()
                                    strList.append(res_Entry_Name)
                                    writer.writerow(row + strList)
class Mapper(Logging):
    """Accepted code: uniprot

    ::

        m = Mapper()

        # HGNC
        df_hgnc = m.get_all_hgnc_into_df()
        df_hgnc.to_pickle("mapper_hgnc.dat")

        # KEGG
        df_kegg1 = m.get_all_kegg_into_df1()
        df_kegg2 = m.get_all_kegg_into_df2()

        uniq_keggid =

    """
    kegg_dblinks = ["IMGT", "Ensembl", "HGNC", "HPRD", "NCBI-GI", "OMIM",
                    "NCBI-GeneID", "UniProt", "Vega"]
    hgnc_dblink = ['EC', 'Ensembl', 'EntrezGene', 'GDB', 'GENATLAS',
                   'GeneCards', 'GeneTests', 'GoPubmed', 'H-InvDB', 'HCDM',
                   'HCOP', 'HGNC', 'HORDE', 'IMGT_GENE_DB', 'INTERFIL',
                   'IUPHAR', 'KZNF', 'MEROPS', 'Nucleotide', 'OMIM', 'PubMed',
                   'RefSeq', 'Rfam', 'Treefam', 'UniProt', 'Vega', 'miRNA',
                   'snoRNABase']

    def __init__(self, verbosity="INFO"):
        super(Mapper, self).__init__(level=verbosity)
        self.logging.info("Initialising the services")
        self.logging.info("... uniprots")
        self._uniprot_service = UniProt()
        self.logging.info("... KEGG")
        self._kegg_service = KeggParser(verbose=False)
        self.logging.info("... HGNC")
        self._hgnc_service = HGNC()
        self.logging.info("... UniChem")
        self._unichem_service = UniChem()
        self.logging.info("... BioDBNet")
        self._biodbnet = BioDBNet()

    def _uniprot2refseq(self, name):
        """There are 2 refseq aliases: REFSEQ_NT_ID and P_REFSEQ_AC.

        Here, we use the first one to agree with wikipedia
        http://en.wikipedia.org/wiki/Protein_Kinase_B
        """
        return self._uniprot_service.mapping(fr="ACC", to="REFSEQ_NT_ID",
                                             query="P31749")

    def _update_uniprot_xref(self, df,
                             xref=["HGNC_ID", "ENSEMBLE_ID", "P_ENTREZGENEID"]):
        """Update the dataframe using UniProt to map indices onto cross
        reference databases
        """
        for ref in xref:
            print("Processing %s " % ref)
            res = self._uniprot_service.multi_mapping("ACC", ref,
                                                      list(df.index),
                                                      timeout=10, ntrials=5)
            colname = "%s__uniprot_mapping" % ref  # the original dict key dropped the % ref
            if colname not in df.columns:
                thisdf = pd.DataFrame({colname: list(res.values())},
                                      index=list(res.keys()))
                df = df.join(thisdf)
            else:
                for index in df.index:
                    if index in res.keys():
                        df.loc[index, colname] = res[index]  # .ix is deprecated

    def get_data_from_biodbnet(self, df_hgnc):
        """keys are unique Gene names

        input is made of the df based on HGNC data web services

        UniProt accessions are sometimes duplicated. If so, this is actually
        the primary accession entry followed by all secondary ones, e.g.,

        ABHD11 >>>> Q8N723;Q8NFV2;Q8NFV3;Q6PJU0;Q8NFV4;H7BYM8;Q8N722;Q9HBS8
        ABHDB_HUMAN Alpha/beta hydrolase domain-containing protein 11

        corresponds actually to the primary one: Q8NFV4
        """
        b = biodbnet.BioDBNet()
        res2 = b.db2db("Gene Symbol",
                       ["HGNC ID", "UniProt Accession", "UniProt Entry Name",
                        "UniProt Protein Name", "KEGG Gene ID",
                        "Ensembl Gene ID"],
                       list(df_hgnc.keys())[0:2000])  # `res` was undefined here; df_hgnc assumed
        import pandas as pd
        from io import StringIO  # Python 3; was "import StringIO"
        c = pd.read_csv(StringIO(res2), delimiter="\t",
                        index_col="Gene Symbol")
        return c
class PSICQUIC(RESTService):
    """Interface to the `PSICQUIC <http://code.google.com/p/psicquic/>`_ service

    There are 2 interfaces to the PSICQUIC service (REST and WSDL) but we
    use the REST one only.

    This service provides a common interface to more than 25 other services
    related to proteins. So, we won't detail all the possibilities of this
    service. Here is an example that consists of looking for interactors of
    the protein ZAP70 within the IntAct database::

        >>> from bioservices import *
        >>> s = PSICQUIC()
        >>> res = s.query("intact", "zap70")
        >>> len(res)  # there are 11 interactions found
        11
        >>> # Let us look at the second one in particular:
        >>> for x in res[1].split("\t"):
        ...     print(x)
        uniprotkb:O95169
        uniprotkb:P43403
        intact:EBI-716238
        intact:EBI-1211276
        psi-mi:ndub8_human(display_long)|uniprotkb:NADH-ubiquinone oxidoreductase ASHI
        .
        .

    Here we have a list of entries. There are 15 of them (depending on the
    *output* parameter). The meaning of the entries is described on the
    PSICQUIC website: https://code.google.com/p/psicquic/wiki/MITAB25Format .
    In short:

    #. Unique identifier for interactor A
    #. Unique identifier for interactor B.
    #. Alternative identifier for interactor A, for example the official gene
    #. Alternative identifier for interactor B.
    #. Aliases for A, separated by "|
    #. Aliases for B.
    #. Interaction detection methods, taken from the corresponding PSI-MI
    #. First author surname(s) of the publication(s)
    #. Identifier of the publication
    #. NCBI Taxonomy identifier for interactor A.
    #. NCBI Taxonomy identifier for interactor B.
    #. Interaction types,
    #. Source databases and identifiers,
    #. Interaction identifier(s) i
    #. Confidence score. Denoted as scoreType:value.

    Another example with the reactome database::

        res = s.query("reactome", "Q9Y266")

    .. warning:: PSICQUIC gives access to 25 other services. We cannot
        create a dedicated parsing for all of them. So, the :meth:`query`
        method returns the raw data. Additional classes may provide
        dedicated parsing in the future.

    .. seealso:: :class:`bioservices.biogrid.BioGRID`
    """
    _formats = ["tab25", "tab26", "tab27", "xml25", "count", "biopax",
                "xgmml", "rdf-xml", "rdf-xml-abbrev", "rdf-n3", "rdf-turtle"]

    # note the typo in "genbank indentifier" from the bind DB
    _mapping_uniprot = {
        "genbank indentifier": "P_GI",
        'entrezgene/locuslink': "P_ENTREZGENEID",
        'uniprotkb': "ACC+ID",
        'rcsb pdb': "PDB_ID",
        'ensembl': "ENSEMBL_ID",
        'refseq': "P_REFSEQ_AC",
        'hgnc': 'HGNC_ID',
        "kegg": "KEGG_ID",
        "entrez gene/locuslink": "P_ENTREZGENEID",
        "chembl": "CHEMBL_ID",
        "ddbj/embl/genbank": "EMBL_ID",
        "dip": "DIP_ID",
        "ensemblgenomes": "ENSEMBLGENOME_ID",
        "omim": "MIM_ID",
        "chebi": None,
        "chembl": None,  # note: duplicated key; this later None entry wins
        # "intact": None
    }

    # unknown: hprd, omim, bind, bind complexid, mdl,

    def __init__(self, verbose=True):
        """.. rubric:: Constructor

        :param bool verbose: print informative messages

        .. doctest::

            >>> from bioservices import PSICQUIC
            >>> s = PSICQUIC()
        """
        urlStr = 'http://www.ebi.ac.uk/Tools/webservices/psicquic'
        super(PSICQUIC, self).__init__("PSICQUIC", verbose=verbose, url=urlStr)
        self._registry = None
        try:
            self.uniprot = UniProt(verbose=False)
        except Exception:
            self.logging.warning("UniProt service could not be initialised")
        self.buffer = {}

    def _get_formats(self):
        return PSICQUIC._formats
    formats = property(_get_formats, doc="Returns the possible output formats")

    def _get_active_db(self):
        names = self.registry_names[:]
        actives = self.registry_actives[:]
        names = [x.lower() for x, y in zip(names, actives) if y == "true"]
        return names
    activeDBs = property(_get_active_db, doc="returns the active DBs only")

    def read_registry(self):
        """Reads and returns the active registry"""
        url = self.url + '/registry/registry?action=ACTIVE&format=txt'
        res = self.request(url, format='txt')
        return res.split()

    def print_status(self):
        """Prints the services that are available

        :return: Nothing

        The output is tabulated. The columns are:

        * names
        * active
        * count
        * version
        * rest URL
        * soap URL
        * rest example
        * restricted

        .. seealso:: If you want the data as lists, see all attributes
            starting with registry, such as :meth:`registry_names`
        """
        url = self.url + '/registry/registry?action=STATUS&format=xml'
        res = self.request(url)
        names = self.registry_names
        counts = self.registry_counts
        versions = self.registry_versions
        actives = self.registry_actives
        resturls = self.registry_resturls
        soapurls = self.registry_soapurls
        restexs = self.registry_restexamples
        restricted = self.registry_restricted
        N = len(names)
        indices = sorted(range(0, N), key=lambda k: names[k])
        for i in range(0, N):
            print("%s\t %s\t %s\t %s\t %s %s %s %s\n" % (names[i],
                  actives[i], counts[i], versions[i], resturls[i],
                  soapurls[i], restexs[i], restricted[i]))

    # todo: a property for the version of PSICQUIC
    def _get_registry(self):
        if self._registry is None:
            url = self.url + '/registry/registry?action=STATUS&format=xml'
            res = self.request(url, format="xml")
            self._registry = res
        return self._registry
    registry = property(_get_registry, doc="returns the registry of psicquic")

    def _get_registry_names(self):
        res = self.registry
        return [x.findAll('name')[0].text for x in res.findAll("service")]
    registry_names = property(_get_registry_names,
                              doc="returns all services available (names)")

    def _get_registry_restricted(self):
        res = self.registry
        return [x.findAll('restricted')[0].text for x in res.findAll("service")]
    registry_restricted = property(_get_registry_restricted,
                                   doc="returns restricted status of services")

    def _get_registry_resturl(self):
        res = self.registry
        data = [x.findAll('resturl')[0].text for x in res.findAll("service")]
        return data
    registry_resturls = property(_get_registry_resturl,
                                 doc="returns URL of REST services")

    def _get_registry_restex(self):
        res = self.registry
        data = [x.findAll('restexample')[0].text for x in res.findAll("service")]
        return data
    registry_restexamples = property(_get_registry_restex,
                                     doc="returns REST example for each service")

    def _get_registry_soapurl(self):
        res = self.registry
        return [x.findAll('soapurl')[0].text for x in res.findAll("service")]
    registry_soapurls = property(_get_registry_soapurl,
                                 doc="returns URL of WSDL service")

    def _get_registry_active(self):
        res = self.registry
        return [x.findAll('active')[0].text for x in res.findAll("service")]
    registry_actives = property(_get_registry_active,
                                doc="returns active state of each service")

    def _get_registry_count(self):
        res = self.registry
        return [x.findAll('count')[0].text for x in res.findAll("service")]
    registry_counts = property(_get_registry_count,
                               doc="returns number of entries in each service")

    def _get_registry_version(self):
        res = self.registry
        names = [x.findAll('name')[0].text for x in res.findAll("service")]
        N = len(names)
        version = [0] * N
        for i in range(0, N):
            x = res.findAll("service")[i]
            if x.findAll("version"):
                version[i] = x.findAll("version")[0].text
            else:
                version[i] = None
        return version
    registry_versions = property(_get_registry_version,
                                 doc="returns version of each service")

    def query(self, service, query, output="tab25", version="current",
              firstResult=None, maxResults=None):
        """Send a query to a specific database

        :param str service: a registered service. See :attr:`registry_names`.
        :param str query: a valid query. Can be `*` or a protein name.
        :param str output: a valid format. See s._formats

        ::

            s.query("intact", "brca2", "tab27")
            s.query("intact", "zap70", "xml25")
            s.query("matrixdb", "*", "xml25")

        This is the programmatic approach to this website:

        http://www.ebi.ac.uk/Tools/webservices/psicquic/view/main.xhtml

        Another example consists in accessing the *string* database for
        fetching protein-protein interaction data of a particular model
        organism. Here we restrict the query to 100 results::

            s.query("string", "species:10090", firstResult=0,
                    maxResults=100, output="tab25")

            # spaces are automatically converted
            s.query("biogrid", "ZAP70 AND species:9606")

        .. warning:: AND must be in big caps. Some databases are more
            permissive than others (e.g., intact accepts "and"). species
            must be a valid ID number. Again, some DBs are more permissive
            and may accept the name (e.g., human).

        To obtain the number of interactions in intact for the human
        species::

            >>> len(p.query("intact", "species:9606"))

        """
        if service not in self.activeDBs:
            raise ValueError("database %s not in active databases" % service)
        params = {}
        if output is not None:
            self.checkParam(output, self.formats)
            params['format'] = output
        else:
            output = "none"
        names = [x.lower() for x in self.registry_names]
        try:
            index = names.index(service)
        except ValueError:
            print("The service you gave (%s) is not registered. "
                  "See self.registry_names" % service)
            raise ValueError
        # get the base url according to the service requested
        resturl = self.registry_resturls[index]
        if firstResult is not None:
            params['firstResult'] = firstResult
        if maxResults is not None:
            params['maxResults'] = maxResults
        postData = self.urlencode(params)
        url = resturl + 'query/' + query.replace(" ", "%20")
        if params:
            url += "?" + postData
        if "xml" in output:
            res = self.request(url, format="xml", baseUrl=False)
        else:
            res = self.request(url, format="txt", baseUrl=False)
            res = res.strip().split("\n")
        if output.startswith("tab"):
            res = self._convert_tab2dict(res)
        return res

    def _convert_tab2dict(self, data):
        """
        https://code.google.com/p/psicquic/wiki/MITAB26Format
        """
        results = []
        for line in data:
            results.append(line.split("\t"))
        return results

    def queryAll(self, query, databases=None, output="tab25",
                 version="current", firstResult=None, maxResults=None):
        """Same as query but runs on all active databases

        :param list databases: databases to query. Queries all active DBs
            if not provided.
        :return: dictionary where keys correspond to databases and values
            to the output of the query.

        ::

            res = s.queryAll("ZAP70 AND species:9606")
        """
        results = {}
        if databases is None:
            databases = [x.lower() for x in self.activeDBs]
        for x in databases:
            if x not in self.activeDBs:
                raise ValueError("database %s not in active databases" % x)
        for name in databases:
            self.logging.warning("Querying %s" % name)
            res = self.query(name, query, output=output, version=version,
                             firstResult=firstResult, maxResults=maxResults)
            if output.startswith("tab25"):
                results[name] = [x for x in res if x != [""]]
            else:
                import copy
                results[name] = copy.copy(res)
        for name in databases:
            self.logging.info("Found %s in %s" % (len(results[name]), name))
        return results

    def getInteractionCounter(self, query):
        """Returns a dictionary with database as key and results as values

        :param str query: a valid query
        :return: a dictionary with database as key and number of entries
            as value

        Considers only the active databases.
        """
        # get the active names only
        activeDBs = self.activeDBs[:]
        res = [(str(name), int(self.query(name, query, output="count")[0]))
               for name in activeDBs]
        return dict(res)

    def getName(self, data):
        idsA = [x[0] for x in data]
        idsB = [x[1] for x in data]
        return idsA, idsB

    def knownName(self, data):
        """Scan all entries (MITAB) and return a simplified version

        Each item in the input list is a MITAB entry. The output is made of
        2 lists corresponding to interactors A and B found in the MITAB
        entries.

        Elements in the input list take the following form::

            DB1:ID1|DB2:ID2
            DB3:ID3

        The | sign separates equivalent IDs from different databases. We
        want to keep only one. The first known database is kept. If in the
        list of DB:ID pairs no known database is found, then we keep the
        first one whatsoever.

        Known databases are those available in the UniProt mapping tools.

        chembl and chebi IDs are kept unchanged.
        """
        self.logging.info("converting data into known names")
        idsA = [x[0].replace("\"", "") for x in data]
        idsB = [x[1].replace("\"", "") for x in data]
        # extract the first and second ID but let us check if it is part of
        # a known uniprot mapping. Otherwise no conversion will be possible
        # and we set the ID to "unknown".
        # remove the " character that can be found in a few cases (e.g,
        # chebi:"CHEBI:29036")
        # idsA = [x.replace("chebi:CHEBI:", "chebi:") for x in idsA]
        # idsB = [x.replace("chebi:CHEBI:", "chebi:") for x in idsB]
        # special case: in mint, there is an entry that ends with a |
        # uniprotkb:P17844|
        idsA = [x.strip("|") for x in idsA]
        idsB = [x.strip("|") for x in idsB]
        # the first ID
        for i, entry in enumerate(idsA):
            try:
                dbs = [x.split(":")[0] for x in entry.split("|")]
                IDs = [x.split(":")[1] for x in entry.split("|")]
                valid_dbs = [(db, ID) for db, ID in zip(dbs, IDs)
                             if db in self._mapping_uniprot.keys()]
                # search for an existing DB
                if len(valid_dbs) >= 1:
                    idsA[i] = valid_dbs[0][0] + ":" + valid_dbs[0][1]
                else:
                    self.logging.debug("none of the DBs for this entry (%s) are available" % entry)
                    idsA[i] = "?" + dbs[0] + ":" + IDs[0]
            except Exception:
                self.logging.info("Could not extract name from %s" % entry)
                idsA[i] = "??:" + entry  # we add a : so that a split(":") will work
        # the second ID
        for i, entry in enumerate(idsB):
            try:
                dbs = [x.split(":")[0] for x in entry.split("|")]
                IDs = [x.split(":")[1] for x in entry.split("|")]
                valid_dbs = [(db, ID) for db, ID in zip(dbs, IDs)
                             if db in self._mapping_uniprot.keys()]
                # search for an existing DB
                if len(valid_dbs) >= 1:
                    idsB[i] = valid_dbs[0][0] + ":" + valid_dbs[0][1]
                else:
                    self.logging.debug("none of the DBs (%s) for this entry are available" % entry)
                    idsB[i] = "?" + dbs[0] + ":" + IDs[0]
            except Exception:
                self.logging.info("Could not extract name from %s" % entry)
                idsB[i] = "??:" + entry
        countA = len([x for x in idsA if x.startswith("?")])
        countB = len([x for x in idsB if x.startswith("?")])
        if countA + countB > 0:
            self.logging.warning("%s ids out of %s were not identified"
                                 % (countA + countB, len(idsA) * 2))
            print(set([x.split(":")[0] for x in idsA if x.startswith("?")]))
            print(set([x.split(":")[0] for x in idsB if x.startswith("?")]))
        self.logging.info("knownName done")
        return idsA, idsB

    def preCleaning(self, data):
        """Remove entries where IdA or IdB is set to "-" """
        ret = [x for x in data if x[0] != "-" and x[1] != "-"]
        return ret

    def postCleaningAll(self, data, keep_only="HUMAN", flatten=True,
                        verbose=True):
        """Even more cleaning by ignoring score, db and interaction

        len(set([(x[0], x[1]) for x in retnew]))
        """
        results = {}
        for k in data.keys():
            self.logging.info("Post cleaning %s" % k)
            ret = self.postCleaning(data[k], keep_only="HUMAN",
                                    verbose=verbose)
            if len(ret):
                results[k] = ret
        if flatten:
            results = [x for k in results.keys() for x in results[k]]
        return results

    def postCleaning(self, data, keep_only="HUMAN",
                     remove_db=["chebi", "chembl"], keep_self_loop=False,
                     verbose=True):
        """Remove entries with a None and keep only those matching the
        keep_only pattern
        """
        if verbose:
            print("Before removing anything: ", len(data))
        data = [x for x in data if x[0] is not None and x[1] is not None]
        if verbose:
            print("After removing the None: ", len(data))
        data = [x for x in data if x[0].startswith("!") is False
                and x[1].startswith("!") is False]
        if verbose:
            print("After removing the !: ", len(data))
        for db in remove_db:
            data = [x for x in data if x[0].startswith(db) is False]
            data = [x for x in data if x[1].startswith(db) is False]
            if verbose:
                print("After removing entries that match %s : " % db, len(data))
        data = [x for x in data if keep_only in x[0] and keep_only in x[1]]
        if verbose:
            print("After removing entries that don't match %s : " % keep_only, len(data))
        if keep_self_loop is False:
            data = [x for x in data if x[0] != x[1]]
            if verbose:
                print("After removing self loops: ", len(data))
        data = list(set(data))
        if verbose:
            print("After removing identical entries: ", len(data))
        return data

    def convertAll(self, data):
        results = {}
        for k in data.keys():
            self.logging.info("Analysing %s" % k)
            results[k] = self.convert(data[k], db=k)
        return results

    def convert(self, data, db=None):
        self.logging.debug("converting the database %s" % db)
        idsA, idsB = self.knownName(data)
        mapping = self.mappingOneDB(data)
        results = []
        for i, entry in enumerate(data):
            x = idsA[i].split(":", 1)[1]
            y = idsB[i].split(":", 1)[1]
            xp = mapping[x]
            yp = mapping[y]
            try:
                ref = entry[8]
            except Exception:
                ref = "?"
            try:
                score = entry[14]
            except Exception:
                score = "?"
            try:
                interaction = entry[11]
            except Exception:
                interaction = "?"
            results.append((xp, yp, score, interaction, ref, db))
        return results

    def mappingOneDB(self, data):
        query = {}
        self.logging.debug("converting IDs with proper DB name (knownName function)")
        entriesA, entriesB = self.knownName(data)
        # entriesA and B contain lists of single identifiers of the form
        # db:id; the db is known from _mapping_uniprot, otherwise it is
        # called "unknown". Get unique DBs to build the query dictionary.
        dbsA = [x.split(":")[0] for x in entriesA]
        dbsB = [x.split(":")[0] for x in entriesB]
        for x in set(dbsA):
            query[x] = set()
        for x in set(dbsB):
            query[x] = set()
        for k in list(query.keys()):  # list() so we can delete while iterating
            if k.startswith("?"):
                del query[k]
        # the data to store
        mapping = {}
        N = len(data)
        # scan all entries
        counter = 0
        for entryA, entryB in zip(entriesA, entriesB):
            counter += 1
            dbA, idA = entryA.split(":")
            try:
                dbB, idB = entryB.split(":")
            except Exception:
                print(entryB)
            if idA not in mapping.keys():
                if dbA.startswith("?"):
                    mapping[idA] = entryA
                else:
                    query[dbA].add(idA)
            if idB not in mapping.keys():
                if dbB.startswith("?"):
                    mapping[idB] = entryB
                else:
                    query[dbB].add(idB)
            for k in query.keys():
                if len(query[k]) > 2000 or counter == N:
                    this_query = list(query[k])
                    DBname = self._mapping_uniprot[k]
                    if DBname is not None:
                        self.logging.warning("Request sent to uniprot for %s database (%s/%s)"
                                             % (DBname, counter, N))
                        res = self.uniprot.mapping(fr=DBname, to="ID",
                                                   query=" ".join(this_query))
                        for x in this_query:
                            if x not in res:
                                # was not found
                                mapping[x] = "!" + k + ":" + x
                            else:
                                # we should be here since the queries are
                                # populated if not already in the mapping
                                # dictionary
                                if x == mapping.keys():
                                    raise ValueError(x)
                                index = res.index(x)
                                mapping[x] = res[index + 1]
                    else:
                        for x in this_query:
                            mapping[x] = k + ":" + x
                    query[k] = set()
        for k in query.keys():
            assert len(query[k]) == 0
        return mapping
import sys
import os
from bioservices import UniProt  # used below but missing from the original imports

path_fichier = "../uniprot/"
contigs = []

# Get the UniProt identifier of the representative sequences
with open("../question_2/resultatEBI.txt", "r") as ebi:
    for line in ebi:
        temp = line.split("\t")
        uniprot = temp[1][10:].strip(" \n\t\r")
        contigs.append(uniprot)

# Now run the searches
u = UniProt()
count = 0
for contig in contigs:
    nom_fichier = path_fichier + contig + ".xml"
    if os.path.isfile(nom_fichier):
        print("File already exists: " + contig)
    else:
        result = u.searchUniProtId(contig)
        with open(path_fichier + contig + ".xml", "w") as uni:
            uni.write(result.prettify())
        count += 1
        print("count = " + str(count) + " contig = " + contig)

print("Number of contigs processed: " + str(count))
## The first line is always a contig
def main():
    """Main function."""
    args = parse_args()
    if args.log:
        logfile = args.log
        logging.basicConfig(filename=logfile, level=logging.DEBUG,
                            filemode='w', format='%(asctime)s %(message)s',
                            datefmt='%Y-%m-%d %H:%M:%S')
    else:
        logfile = sys.stdout
    outputfile = open(args.out, "w")
    # Output header
    outputfile.write("chr\tpos\tid\tref\talt\tgene\tfeature\tfeature_type\t"
                     "consequence\tswissprotid\tuniprotid\tpdbid\t"
                     "protein_position\tamino_acid\n")
    vcf_row = {}
    # Interface to the UniProt service
    u = UniProt(verbose=False)
    vcf_reader = vcf.Reader(open(args.vcf, 'r'))
    # utility frame to store the mapping between ENSP, UniProt and PDB IDs
    ENSP_PDB_UNIPROT_mapping_DataFram = pd.DataFrame(columns=['ENSP', 'UniProtID', 'PDB'])
    for record in vcf_reader:
        # VEP fields
        curr_ENSP = ''
        if "CSQ" in record.INFO:
            csq = record.INFO['CSQ']
            # BELOW: THERE ARE A COUPLE OF OPTIONS TO PROCEED
            # For going through annotations for all transcripts
            for current_csq_element in csq:
                current_csq = current_csq_element.split('|')
                curr_ENSP = str(current_csq[26])
                if curr_ENSP != "":
                    # to get protein IDs given an ENSP ID
                    current_protein_list = u.search(curr_ENSP, frmt="list")
                    for curr_protein in current_protein_list.split("\n"):
                        if curr_protein != "":
                            # to get PDB IDs given a protein ID
                            mapping_Dictionary = u.mapping(fr="ID", to="PDB_ID",
                                                           query=str(curr_protein))
                            if mapping_Dictionary:
                                if curr_ENSP not in ENSP_PDB_UNIPROT_mapping_DataFram.index:
                                    ENSP_PDB_UNIPROT_mapping_DataFram.loc[curr_ENSP] = pd.Series(
                                        {'ENSP': curr_ENSP,
                                         'UniProtID': mapping_Dictionary.keys(),
                                         'PDB': mapping_Dictionary.values()})
    # print(ENSP_PDB_UNIPROT_mapping_DataFram)
    # writing the output file; the reader is a generator and was exhausted
    # by the first pass, so re-open it
    vcf_reader = vcf.Reader(open(args.vcf, 'r'))
    for record in vcf_reader:
        current_chr = record.CHROM
        current_id = record.ID
        current_pos = record.POS
        current_ref = record.REF
        current_alt = ','.join(str(v) for v in record.ALT)
        # VEP fields
        current_gene, current_feature = '', ''
        current_feature_type, current_consequence = '', ''
        current_swissport, current_ENSP, current_protein, current_pdbid, \
            current_protein_position, current_amino_acid = '', '', '', '', '', ''
        if "CSQ" in record.INFO:
            csq = record.INFO['CSQ']
            # BELOW: THERE ARE A COUPLE OF OPTIONS TO PROCEED
            # For going through annotations for all transcripts
            for current_csq_element in csq:
                current_csq = current_csq_element.split('|')
                current_consequence = current_csq[1]
                current_gene = current_csq[4]
                current_feature_type = current_csq[5]
                current_feature = current_csq[6]
                current_protein_position = current_csq[14]
                current_amino_acid = current_csq[15]
                current_ENSP = current_csq[26]
                current_swissport = current_csq[27]
                # only consider missense mutations
                # if current_swissport_in_my_list(current_swissport, swissprot_pdb_)
                if current_ENSP in ENSP_PDB_UNIPROT_mapping_DataFram.index:
                    current_protein = ENSP_PDB_UNIPROT_mapping_DataFram.loc[current_ENSP]['UniProtID']
                    for item in ENSP_PDB_UNIPROT_mapping_DataFram.loc[current_ENSP]['PDB']:
                        current_pdbid = item
                        break
                    out_str = [current_chr, str(current_pos), str(current_id),
                               current_ref, current_alt, current_gene,
                               current_feature, current_feature_type,
                               current_consequence, current_swissport,
                               current_ENSP, current_protein, current_pdbid,
                               current_protein_position, current_amino_acid]
                else:
                    current_protein = ""
                    current_pdbid = ""
                    out_str = [current_chr, str(current_pos), str(current_id),
                               current_ref, current_alt, current_gene,
                               current_feature, current_feature_type,
                               current_consequence, current_swissport,
                               current_ENSP, current_protein, current_pdbid,
                               current_protein_position, current_amino_acid]
                out_str = [x or 'None' for x in out_str]
                outputfile.write("\t".join(out_str))
                outputfile.write("\n")
        else:
            current_gene, current_feature = '', ''
            current_feature_type, current_consequence = '', ''
            current_swissport, current_ENSP, current_protein, current_pdbid, \
                current_protein_position, current_amino_acid = '', '', '', '', '', ''
            out_str = [current_chr, str(current_pos), str(current_id),
                       current_ref, current_alt, current_gene,
                       current_feature, current_feature_type,
                       current_consequence, current_swissport, current_ENSP,
                       current_protein, current_pdbid,
                       current_protein_position, current_amino_acid]
            out_str = [x or 'None' for x in out_str]
            outputfile.write("\t".join(out_str))
            outputfile.write("\n")
    outputfile.close()
    logging.info('Start.')
    logging.info('Command line: {}'.format(' '.join(sys.argv)))
def __init__(self, verbose=False):
    self.u = UniProt(verbose=verbose)
    self.sequences = {}
def _init_uniprot(self):
    if not hasattr(self, "_uniprot"):
        self._uniprot = UniProt(verbose=self.debugLevel)
def func():
    u = UniProt()
    res = u.search(query, frmt=fmt)
    # search() returns a str, so open the target in text mode
    # (the original 'wb' would require bytes)
    with open(target_fn, 'w') as fp:
        fp.write(res)
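# func above reads `query`, `fmt` and `target_fn` from the enclosing scope;
# a hedged sketch of how it might be driven (all values are arbitrary
# examples, not part of the original snippet):
query = "zap70 AND organism:9606"
fmt = "tab"
target_fn = "uniprot_result.txt"
func()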
from pylab import rcParams
from liverx import wd
from matplotlib.colors import rgb2hex
from statsmodels.stats.multitest import multipletests
from scipy.stats.distributions import hypergeom
from bioservices import KEGG, KEGGParser, QuickGO, UniProt
from pandas import DataFrame, read_csv

sns.set(style='ticks', palette='pastel', color_codes=True)

# ---- Import network
network = read_csv('%s/files/string_mouse_network_filtered_800.txt' % wd, sep='\t')
network_proteins = set(network['protein1']).intersection(network['protein2'])

# ---- Set-up UniProt
uniprot = UniProt(cache=True)

# ---- Set-up QuickGO bioservice
quickgo = QuickGO(cache=True)

# ---- Set-up KEGG bioservice
kegg, kegg_parser = KEGG(cache=True), KEGGParser()
kegg.organism = 'mmu'
print('[INFO] KEGG service configured')

kegg_pathways = {p: kegg.parse_kgml_pathway(p) for p in kegg.pathwayIds}
print('[INFO] KEGG pathways extracted: ', len(kegg_pathways))

# Convert KEGG pathways Gene Name to UniProt
k2u = kegg.conv('uniprot', 'mmu')
def get_more_source_dict_ids(source_dict, primary_key, **kwargs):
    """Script to add more ids to source dict nodes to facilitate pairing
    to a network.

    Arguments:
     source_dict: id_key: value
     primary_key: current type of ids used for the nodes. Currently can be
      'Entrez Gene (GeneID)' or any of the options in the BioServices
      UniProt mappings.

    kwargs:
     mapping_types: a list of mapping types to include
     verbose

    Returns:
     source_dict, also modified in place

    """
    continue_flag = True
    file_key = primary_key
    if primary_key not in available_mapping_source.keys():
        continue_flag = False
        print("Error, you must specify a valid primary_key descriptor "
              "to match to in the available database, exiting...")
    if 'mapping_types' in kwargs:
        mapping_types = kwargs['mapping_types']
    else:
        mapping_types = default_mapping_target_list
    try:
        from bioservices import UniProt
        u = UniProt(verbose=False)
    except Exception:
        print("No bioservices module installed or cannot connect, exiting...")
        print("e.g. if you are using pip, did you 'pip install bioservices'?")
        continue_flag = False
    if 'node_id_type' in kwargs:
        node_id_type = kwargs['node_id_type']
    else:
        node_id_type = "Entrez Gene (GeneID)"
    if 'verbose' in kwargs:
        verbose = kwargs['verbose']
    else:
        verbose = True
    # Maximum number of items to query at a time.
    # Note there is a length limit in bioservices 1.2.1 for the web-based
    # query string. Trial-and-error suggests the most IDs that can be
    # queried at once is between 100 and 1000.
    max_query_length = 500
    if continue_flag:
        the_query_id_list_list = [[]]
        i = 0
        j = 0
        for the_query_id in source_dict.keys():
            if (j + 1) % max_query_length == 0:
                the_query_id_list_list.append([])
                i += 1
                the_query_id_list_list[i] = []
                j = 0
            the_query_id_list_list[i].append(the_query_id)
            j += 1
        the_query_string_list = []
        for i, the_query_id_list in enumerate(the_query_id_list_list):
            query_string = ''
            for the_query_id in the_query_id_list:
                if len(query_string) > 0:
                    query_string = query_string + ' ' + the_query_id
                else:
                    query_string = the_query_id
            the_query_string_list.append(query_string)
        for the_key in source_dict.keys():
            if type(source_dict[the_key]) != dict:
                the_value = source_dict[the_key]
                source_dict[the_key] = {}
                source_dict[the_key]['value'] = the_value
        for the_target_type in mapping_types:
            the_result = {}
            for the_query_string in the_query_string_list:
                the_result.update(u.mapping(fr=available_mapping_source[file_key],
                                            to=available_mapping_target[the_target_type],
                                            query=the_query_string))
            if verbose:
                print("** Finished mapping for %s to %s. **" % (file_key, the_target_type))
            for the_query_id in source_dict.keys():
                if the_query_id in the_result.keys():
                    if len(the_result[the_query_id]) > 0:
                        source_dict[the_query_id][the_target_type] = the_result[the_query_id]
                    else:
                        source_dict[the_query_id][the_target_type] = []
                else:
                    source_dict[the_query_id][the_target_type] = []
    return source_dict
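# A hedged usage sketch for get_more_source_dict_ids; the key is an
# arbitrary Entrez GeneID and the mapping type name is assumed to be a valid
# key of the module-level `available_mapping_target` dictionary used above.
source_dict = {"7535": 1.0}  # e.g. ZAP70 by Entrez GeneID (arbitrary value)
source_dict = get_more_source_dict_ids(source_dict, "Entrez Gene (GeneID)",
                                       mapping_types=["UniProtKB ACC"],
                                       verbose=True)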
#
# By Guillaume Lahaie
# LAHG0407707
#
# Last modified: 17 December 2013
#
# Program that fetches the UniRef entry corresponding to a GenBank
# accession number

from bioservices import UniProt
import sys
import os
from BeautifulSoup import BeautifulSoup

UNIREF_PATH = "../uniref/"
u = UniProt()

with open("uniref_mapping.txt", "w") as r:
    with open("resultatNBCI.txt", "r") as f:
        for line in f:
            temp = line.split("-|-")
            print("Processing contig " + temp[0])
            accession = temp[2].strip(" \t\n\r")
            u.mapping(fr='EMBL_ID', to='NF100', query=accession)
            res = u.search(accession, format='xml', limit=10)
            if res == '':
                r.write(temp[0] + "\tNone\n")
                print("no result for this contig")
            else:
                contig = temp[0].strip(" \t\n\r")
                with open(UNIREF_PATH + "result" + contig + ".xml", "w") as xml:
                    xml.write(res)
# -*- coding: utf-8 -*-
"""
Created on Mon Mar 28 11:39:12 2016

@author: Pieter
"""
import ms2matcher.ms2matcher as ms
import os
import argparse
import pandas as pd
import numpy as np
from bioservices import UniProt
u = UniProt(verbose=False)

# Check provided arguments
parser = argparse.ArgumentParser(description='MS2 experimental to database matcher')
parser.add_argument("filepath", type=str,
                    help="The path to the folder containing the experimental spectra to process.",
                    metavar="filepath")
parser.add_argument("--toleranceMS1", "-t1", type=float, dest='ms1Tolerance', default=50,
                    help="The mass accuracy for MS1 (default = 50 ppm).",
                    metavar="t1")
parser.add_argument("--toleranceMS2", "-t2", type=float, dest='ms2Tolerance', default=0.1,
                    help="The mass accuracy for MS2 (default = 0.1 Da).",
                    metavar="t2")
parser.add_argument("--FDR", "-fdr", type=float, dest='desiredFDR', default=0.05,
                    help="The desired FDR.", metavar="fdr")
args = parser.parse_args()

# Initialise file path to experimental spectra .dta files
spectraFilePath = os.path.normpath(args.filepath)
# Initialise other file paths (respect folder hierarchy in package)
script_dir = os.path.dirname(__file__)  # <-- absolute dir the script is in
import re

from bioservices import UniProt


def convert_gene_ids_bt(xml_file_in, id_identity=None, id_formatter=None, translate_file=None):
    """
    Replace all found instances of old gene IDs with new IDs.
    N.B. It will only look at 'GENE ASSOCIATION' lines.
    'translate_file' should be a 2-column TSV file.
    """
    ## Create ID conversion dictionary for MetaCyc
    translate_file = translate_file or '/Users/wbryant/Dropbox/Bacteroides/BioCyc_-_Protein-Gene-relations/BioCyc_BT_-_Protein-Gene-relations.txt'
    trans_in = open(translate_file, 'r')
    id_dict = {}
    for line in trans_in:
        ids = line.split("\t")
        if len(ids) > 1 and len(ids[1]) > 0:
            id_dict[ids[0]] = ids[1].strip()
    trans_in.close()
    id_identity = id_identity or model_metacyc_identifier
    id_formatter = id_formatter or model_metacyc_gene_2_biocyc

    ## Create gene -> locus dictionary from NCBI file
    ncbi_gene_file = '/Users/wbryant/work/BTH/data/NCBI/gene_list.dat'
    ncbi_in = open(ncbi_gene_file, 'r')
    ncbi_id_dict = {}
    for line in ncbi_in:
        if re.search(r'[0-9]+\.[ ].+', line):
            ncbi_id = line.strip().split(" ")[-1]
        elif 'Other Aliases' in line:
            bt_ids = re.findall(r'BT\_[0-9]+', line)
            for bt_id in bt_ids:
                ncbi_id_dict[ncbi_id] = bt_id
    ncbi_in.close()

    ## Some specific UniProt IDs do not map - so put them here manually:
    uniprot_manual_dict = {}
    uniprot_manual_dict['Q8A1G3_BACTN'] = 'BT_3698'
    uniprot_manual_dict['G8JZS4_BACTN'] = 'BT_3703'
    uniprot_manual_dict['Q8A1G0_BACTN'] = 'BT_3704'
    uniprot_manual_dict['Q89YR9_BACTN'] = 'BT_4662'

    ## Run through lines of input file replacing relevant gene IDs with new gene IDs
    u = UniProt(verbose=False)
    xml_file_out = re.sub(r'\.xml', '_out.xml', xml_file_in)
    f_in = open(xml_file_in, 'r')
    f_out = open(xml_file_out, 'w')
    for line in f_in:
        if 'GENE ASSOCIATION' in line:
            ## Look for genes fitting id_identity, convert and replace
            line = re.sub(r'(\<[^\>]+\>[ \n]*$)', r' \g<1>', line)
            old_ids = re.findall(id_identity, line)
            if len(old_ids) > 0:
                for old_id in old_ids:
                    old_id_formatted = id_formatter(old_id)
                    try:
                        new_id = id_dict[old_id_formatted]
                    except KeyError:
                        new_id = old_id_formatted
                        print("ID '%s' not found ..." % new_id)
                    line = line.replace(old_id, new_id, 1)
            ## Remove extraneous gene surrounds
            line = re.sub(r'\(gene\:([^\)]+)_i\)', r'\g<1>', line)
            ## Look for UniProt genes and convert
            if 'uniprot' in line:
                uniprot_entries = re.findall(r'\(uniprot\:[^\)]+\)', line)
                for uniprot_entry in uniprot_entries:
                    ## Map IDs: web service first, then the manual table
                    uniprot_id = re.sub(r'\(uniprot\:([^\)]+)\)', r'\g<1>', uniprot_entry)
                    try:
                        new_entry = u.mapping(fr='ACC', to='KEGG_ID', query=uniprot_id)[uniprot_id][0]
                    except Exception:
                        print("Protein ID '%s' not found in mapping, trying local ..." % uniprot_id)
                        try:
                            new_entry = uniprot_manual_dict[uniprot_id]
                        except KeyError:
                            print("Protein ID '%s' not found in local ..." % uniprot_id)
                            new_entry = uniprot_id
                    new_id = re.sub(r'bth\:([^\)]+)', r'\g<1>', new_entry)
                    line = line.replace(uniprot_entry, new_id, 1)
            ## Get gene string
            line_groups = re.search(r'(.+GENE ASSOCIATION\:[ ]*)(.+)([ ]*\<.+)', line)
            gene_string = line_groups.group(2)
            if '_BACTN' in gene_string:
                print(gene_string)
            ## Look for NCBI IDs (like susG) and replace with BT IDs
            potential_ncbis = re.findall(r'[a-zA-Z0-9\_]+', gene_string)
            if '_BACTN' in gene_string:
                print(", ".join(potential_ncbis))
            for potential_ncbi in potential_ncbis:
                if potential_ncbi in ncbi_id_dict:
                    new_id = ncbi_id_dict[potential_ncbi]
                    gene_string = gene_string.replace(potential_ncbi, new_id, 1)
                elif potential_ncbi in uniprot_manual_dict:
                    new_id = uniprot_manual_dict[potential_ncbi]
                    gene_string = gene_string.replace(potential_ncbi, new_id, 1)
            ## Remove duplicates
            gene_list = gene_string.split(" or ")
            gene_list = list(set(gene_list))
            ## Reconstitute line
            line = line_groups.group(1)
            line += " or ".join(gene_list)
            line += line_groups.group(3)
            f_out.write(line)
        else:
            f_out.write(line)
    f_out.close()
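# The UniProt -> KEGG fallback chain above is the reusable core of the
# conversion. A standalone sketch under the same assumptions (the helper name
# `uniprot_to_kegg` and the fallback table are illustrative):
from bioservices import UniProt

def uniprot_to_kegg(uniprot_id, manual_fallback=None):
    """Map a UniProt accession to a KEGG gene id, falling back to a
    hand-curated dictionary when the web service has no mapping."""
    u = UniProt(verbose=False)
    manual_fallback = manual_fallback or {}
    try:
        return u.mapping(fr='ACC', to='KEGG_ID', query=uniprot_id)[uniprot_id][0]
    except (KeyError, IndexError):
        # No service-side mapping; try the curated table, else echo the input.
        return manual_fallback.get(uniprot_id, uniprot_id)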
def get_more_node_ids(the_network, **kwargs):
    """ Script to add more identifiers to model notes based on the node.id

    Arguments:
     the_network: a Network object, modified in place

    kwargs:
     node_id_type: current type of ids used for the nodes.
      Currently can be Entrez Gene (GeneID) or any of the options
      in the BioServices UniProt mappings
     mapping_types: a list of target mapping id types to include
     verbose:

    Returns:
     the_network

    TODO: determine the best source db/module for pairings from bioservices

    """
    continue_flag = True
    try:
        from bioservices import UniProt
        u = UniProt(verbose=False)
    except Exception:
        print("No bioservices module installed or cannot connect, exiting...")
        print("e.g. if you are using pip, did you 'pip install bioservices'?")
        continue_flag = False

    the_node_locations = the_network.get_node_locations()
    if len(the_node_locations) == 0:
        print('The network has no nodes, exiting...')
        continue_flag = False

    if 'node_id_type' in kwargs:
        node_id_type = kwargs['node_id_type']
    else:
        node_id_type = "Entrez Gene (GeneID)"

    if 'mapping_types' in kwargs:
        mapping_types = kwargs['mapping_types']
    else:
        mapping_types = default_mapping_target_list

    if 'verbose' in kwargs:
        verbose = kwargs['verbose']
    else:
        verbose = True

    # Maximum number of items to query at a time.
    # Note there is a length limit in bioservices 1.2.1
    # for the web-based query string. Trial-and-error suggests
    # the most ids that can be queried are between 100 and 1000.
    max_query_length = 500

    if continue_flag:
        model_node_ids = []
        for the_nodetype in the_network.nodetypes:
            model_node_ids += [x.id for x in the_nodetype.nodes]

        # Split the ids into chunks of at most max_query_length
        the_node_id_list_list = [[]]
        i = 0
        j = 0
        for the_node_id in model_node_ids:
            if (j + 1) % max_query_length == 0:
                the_node_id_list_list.append([])
                i += 1
                the_node_id_list_list[i] = []
                j = 0
            the_node_id_list_list[i].append(the_node_id)
            j += 1

        query_string_list = []
        for i, the_node_id_list in enumerate(the_node_id_list_list):
            query_string = ''
            for the_node_id in the_node_id_list:
                if len(query_string) > 0:
                    query_string = query_string + ' ' + the_node_id
                else:
                    query_string = the_node_id
            query_string_list.append(query_string)

        for the_target_type in mapping_types:
            the_result = {}
            for the_query_string in query_string_list:
                the_result.update(u.mapping(fr=available_mapping_source[node_id_type],
                                            to=available_mapping_target[the_target_type],
                                            query=the_query_string))
            if verbose:
                print("**Finished mapping for %s to %s.**" % (node_id_type, the_target_type))
            for the_nodetype in the_network.nodetypes:
                for the_node in the_nodetype.nodes:
                    if (the_node.id in the_result) and (len(the_result[the_node.id]) > 0):
                        the_node.notes[the_target_type] = the_result[the_node.id]
                    else:
                        the_node.notes[the_target_type] = []

    return the_network
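# The manual i/j bookkeeping above is easy to get wrong. A simpler batching
# helper achieving the same goal (a sketch, not part of the original module;
# chunk boundaries may differ slightly from the loop above):
def chunk_queries(ids, size=500):
    """Yield space-joined query strings of at most `size` ids each."""
    for start in range(0, len(ids), size):
        yield ' '.join(ids[start:start + size])

# e.g. merging batched mapping results:
# the_result = {}
# for query in chunk_queries(model_node_ids):
#     the_result.update(u.mapping(fr=..., to=..., query=query))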
class Annotations(MassSpecReader):
    """Create/store/read annotations from UniProt and figure out entry names

    The Annotations class allows one to populate the dataframe attribute
    :attr:`df` with the **Entry** and **Entry_name** columns (UniProt
    entries). These are not, strictly speaking, required columns, but they
    enable more tools if available.

    The class also creates a new dataframe called :attr:`annotations` that
    stores in particular the protein sequence and the GO terms. The former is
    used to check the peptide sequence and the latter to plot relevant
    histograms about GO terms.

    This class inherits from :class:`msdas.readers.MassSpecReader`.
    Consequently, input can be a MassSpecReader instance, or a filename, or
    even nothing (data can be read at a later stage). The dataframe must
    contain the Protein column.

    One reason to fetch the entries from UniProt is that the Protein column
    may contain typos or non-UniProt entries, so it is quite useful to fetch
    all entries from UniProt based on the protein names provided. This can be
    done with :meth:`get_uniprot_entries`. This method fills a dictionary
    called :attr:`_mapping` (note the underscore), which is used to populate a
    new column in the dataframe called **Entry**. If your initial dataframe
    contains the column "Entry" with all valid UniProt entries (e.g., P23300),
    then the :attr:`_mapping` attribute is populated during the initialisation
    and the call to :meth:`get_uniprot_entries` can be skipped. If called, it
    will also be faster but will overwrite the content of the Entry column.
    You can also fill/correct/complete the :attr:`_mapping` attribute before
    calling :meth:`get_uniprot_entries`.

    .. doctest::

        >>> from msdas import annotations
        >>> import pandas as pd
        >>> df = pd.DataFrame({'Protein': ['DIG1'],
        ...     'Sequence_Phospho': ['SS(Phospho)T'],
        ...     'Psite': ['S2']})
        >>> a = annotations.Annotations(df, "YEAST")
        >>> a._mapping
        {}
        >>> a.get_uniprot_entries()
        {'DIG1_YEAST': ['Q03063']}
        >>> a.df.Entry
        0    Q03063
        Name: Entry, dtype: object

    Then, call :meth:`set_annotations`, which will fetch all annotations from
    UniProt and store them in a new dataframe in the :attr:`annotations`
    attribute::

        a.set_annotations()
        a.annotations

    A new field called **Entry_name** is also added to the dataframe itself::

        a.df.Entry_name

    On a big data set, it may take a few minutes to fetch all information from
    UniProt, so we also provide tools to save and read back the relevant
    information (:meth:`read_annotations`, :meth:`to_pickle`,
    :meth:`to_csv`)::

        from msdas import *
        r = readers.MassSpecReader(get_yeast_raw_data())
        # this takes about 10 minutes depending on the connection
        # for 1600 unique protein names
        r.get_uniprot_entries()
        r.set_annotations()
        r.to_pickle(tag="test")  # creates a file called YEAST_annotations_test.pkl
        r.to_csv("data.csv")

    Next time, just type::

        from msdas import *
        a = annotations.Annotations("data.csv", "YEAST")
        a.read_annotations("YEAST_annotations_test.pkl")

    To check that the entries are correct, one can look for each peptide
    sequence in the FASTA sequence found in the annotations::

        a.check_entries_versus_sequence()

    This is a very good sanity check to verify that the entry names found
    correspond to the peptides provided. If not, the protein name was probably
    wrong or was a gene name that could not be mapped to the correct protein.
    If some entries or mappings are not found, you need to manually check the
    issues, update the :attr:`_mapping` attribute, and then update the UniProt
    entries and annotations::

        a._mapping[entry] = ['entry name']
        a.get_uniprot_entries()
        a.set_annotations()
        a.check_entries_versus_sequence()

    If you cannot find a mapping, we would recommend deleting the item from
    the dataframe :attr:`df`.

    """
    def __init__(self, data, organism=None, verbose=True, annotations=None,
                 **kargs):
        """.. rubric:: Constructor

        :param data: a MassSpecReader compatible input (e.g., CSV file, None,
            a MassSpecReader instance). See
            :class:`msdas.readers.MassSpecReader` documentation for details
        :param organism: valid uniprot identifier for the organism e.g., HUMAN
            YEAST.
        :param annotations: a pickled file containing the annotations saved
            using :meth:`to_pickle`.
        :param kargs: valid parameters recognised by
            :class:`msdas.readers.MassSpecReader`

        """
        super(Annotations, self).__init__(data=data, verbose=verbose, **kargs)
        if organism is None:
            raise ValueError("organism must be provided e.g. YEAST, HUMAN")
        self.organism = organism
        #: the dataframe where annotations from uniprot will be stored.
        self.annotations = None
        self._mapping = {}
        self.build_mapping_from_df()  # if Entry is provided
        self._init_uniprot()
        if annotations:
            self.read_pickle(annotations)

    def _init_uniprot(self):
        if not hasattr(self, "_uniprot"):
            self._uniprot = UniProt(verbose=self.debugLevel)

    def _update_species_to_find(self):
        entry_names = [x + "_" + self.organism for x in self.df.Protein]
        species_to_find = [k for k in entry_names if k not in self._mapping.keys()]
        self._species_to_find = list(set(species_to_find))

    def build_mapping_from_df(self):
        """Populate the _mapping dictionary using the Uniprot Entry column"""
        if "Entry" in self.df.columns:
            for index in self.df.index:
                k = self.df.Protein.ix[index]
                if not k.endswith("_" + self.organism):
                    k += "_" + self.organism
                v = self.df.Entry.ix[index]
                self._mapping[k] = [v]
        else:
            self.warning("Entry column not found in the dataframe. call get_uniprot_entries")

    def get_uniprot_entries(self, Nmax=50):
        """Search for the uniprot entries and entry names given the protein column.

        Protein names from the dataframe are first used to feed the UniProt
        mapping tool. Some protein names will not be found as UniProt entries
        because they are not UniProt entry names but gene names. We therefore
        also scan missing entries by looking for gene names. Once found, the
        proposed items that contain the gene name and organism are candidates
        for the entry names. There may be several solutions, which explains
        why the values in the :attr:`_mapping` dictionary are lists. If
        several candidates are found, a warning is raised.

        Results are stored in :attr:`_mapping` and in the dataframe itself.

        Let us show one example with 3 protein names that cover all cases:

        * DIG1 is a valid uniprot entry
        * ASC1 is not a uniprot entry. It is a gene name from which the entry
          may be retrieved automatically.
        * LEU1 is a gene name AND a uniprot entry. This is an ambiguous case.
        The default is to use the uniprot entry, but if you call
        :meth:`check_entries_versus_sequence` (after :meth:`set_annotations`)
        you will see that there is a mismatch, meaning that LEU1_YEAST
        provided in the protein column is actually not the protein name but
        the gene name::

            >>> import pandas as pd
            >>> from msdas import *
            >>> df = pd.DataFrame({'Protein': ['DIG1', 'LEU1', 'ASC1'],
            ...     'Sequence_Phospho': ['S(Phospho)APAQVTQHSK',
            ...                          'VEVTS(Phospho)EDEK',
            ...                          'DS(Phospho)VTIISAGNDK'],
            ...     'Psite': ['S142', 'S495', 'S166']})
            >>> a = Annotations(df, "YEAST")
            >>> a.get_uniprot_entries()
            >>> a._mapping
            {'ASC1_YEAST': ['P38011', 'P01120'],
             'DIG1_YEAST': ['Q03063'],
             'LEU1_YEAST': ['P06208-1', 'P06208']}

        Here, DIG1 has one unique entry. This is expected because DIG1 is in
        fact an entry name (unique by definition). ASC1 is a gene name. This
        method figures out that it corresponds to either P38011 or P01120.
        There are several entries because mapping from gene to protein is not
        unique. By default, the entry with the highest score appears first.
        There is no 100% guarantee that this mapping is correct, and
        :meth:`check_entries_versus_sequence` should be called to check that
        the peptide sequence is contained in this entry's sequence. The last
        case (LEU1) is even more problematic because it is a valid entry name
        even though the protein name provided is actually a gene name...
        again, call :meth:`check_entries_versus_sequence`.

            >>> a.set_annotations()
            >>> a.check_entries_versus_sequence()
            P06208-1 not found in the annotations index

        So, here we are told that amongst the 3 entries, P06208-1 is not
        found. This is the LEU1 case. If you were to use the UniProt batch
        tool, you would figure out, given the peptide sequence, that this is
        actually the LEUC_YEAST entry with accession P07264. So, you need to
        manually update the mapping:

            >>> a._mapping['LEU1_YEAST'] = ['P07264']
            >>> a.get_uniprot_entries()  # to update the main df with new entries
            >>> a.set_annotations()      # to retrieve the sequence of LEUC_YEAST
            >>> a.check_entries_versus_sequence()

        .. seealso:: :meth:`set_annotations`
        """
        # get the mapping using bioservices.uniprot
        # apply function is 3 times slower than list...
        # entry_names = self.df.Protein.apply(lambda x: x + "_" + self.organism)
        self._update_species_to_find()
        if len(self._species_to_find) > 0:
            self.logging.info("Fetching uniprot accession numbers for %s entries" % len(self.df.Protein))
            self.logging.info("Fetching uniprot accession numbers for %s unique entries" % len(self.df.Protein.unique()))
            mapping = self._uniprot.multi_mapping(fr="ID", to="ACC", Nmax=Nmax,
                                                  query=self._species_to_find)
            for k, v in mapping.items():
                if k not in self._mapping.keys():
                    self._mapping[k] = v

        # Some species may not be found (secondary accession number), e.g. if
        # _HUMAN is appended in the tcell case, so we may need to call the
        # mapping again but without the appended organism string.
        self._update_species_to_find()
        if len(self._species_to_find):
            self.logging.info("Some species were not found ({}). Using secondary accession:".format(len(self._species_to_find)))
            self.logging.info("Fetching uniprot without trailing species")
            self.logging.info("Fetching %s new ones " % len(self._species_to_find))
            mapping = self._uniprot.multi_mapping(fr="ID", to="ACC",
                query=[x.split("_")[0] for x in self._species_to_find], Nmax=Nmax)
            for k, v in mapping.items():
                self._mapping[k + "_" + self.organism] = v

        # Some are still not found. This could be because the provided protein
        # name is actually a gene name...
        def func(x, tag):
            if len(x) == 0:
                return False
            else:
                return tag in x[0].split()

        self._genes = {}
        self._update_species_to_find()
        if len(self._species_to_find):
            self.logging.info("Some species are still not found {}. Trying to use gene names".format(len(self._species_to_find)))
            self.logging.info("Fetching uniprot accession numbers for those without _species appended")
            self.logging.info("Fetching %s new ones " % len(self._species_to_find))
            for i, this in enumerate(self._species_to_find):
                if " " in this:
                    continue
                self.logging.info("Searching for entry {}/{} for gene names".format(i + 1, len(self._species_to_find)))
                df = self._uniprot.get_df(this.split("_")[0], organism=self.organism)
                l1 = df['Gene names'].apply(lambda x: func(x, this.split("_")[0]))
                l2 = df['Entry name'].apply(lambda x: x.endswith(self.organism))
                if sum(l1 & l2) >= 1:
                    k = list(df.ix[l2 & l1]['Entry name'])
                    v = list(df.ix[l2 & l1]['Entry'])
                    self.logging.debug(k, v, this)
                    if k in self._mapping.keys():
                        raise ValueError("%s already in the mapping dictionary" % k)
                    self._mapping[this] = v
                else:
                    print("skipping %s... sum=%s" % (this, sum(l1 & l2)))

        self._update_species_to_find()
        if len(self._species_to_find):
            self.logging.info("Some species could not be found at all: {}".format(self._species_to_find))

        self._append_uniprot_entries_to_df()

    def _append_uniprot_entries_to_df(self):
        if "Entry" in self.df.columns:
            self.logging.warning("Overwriting column called Entry in the dataframe")
        # get list of unique entry names.
        entry_names = self.df.Protein.apply(lambda x: x + "_" + self.organism)
        # add the uniprot entries into the dataframe; they must have the same
        # order as in the dataframe (entry_names)
        uniprot_entries = []
        for name in entry_names:
            # if not found, let us use an empty label. could use NA?
            uniprot_entry = self._mapping.get(name, "")
            if uniprot_entry == "":
                uniprot_entries.append("")
                print("!! ", name, " not found")
            else:
                if len(uniprot_entry) > 1:
                    self.logging.info("Found entry with several matches: %s %s . Only first one is selected (highest uniprot score)" % (name, uniprot_entry))
                uniprot_entries.append(uniprot_entry[0])
        # index=df.index is important to use the join afterwards
        thisdf = pd.DataFrame({'Entry': uniprot_entries}, index=self.df.index)
        if "Entry" in self.df.columns:
            del self.df['Entry']
        self.df = self.df.join(thisdf)

    def _append_uniprot_entry_names_to_df(self):
        if self.annotations is None:
            self.error("must call set_annotations first")
            return
        # let us add the Entry_name column as well
        entry_names = [self.annotations.ix[e]['Entry name']
                       if e in self.annotations.index else ""
                       for e in self.df.Entry]
        self.df['Entry_name'] = entry_names

    def plot_goid_histogram(self, drop_duplicates=True):
        """Histogram of the number of GO terms per peptide or protein

        :param drop_duplicates: ignore duplicated entries

        .. plot::
            :width: 80%
            :include-source:

            from msdas import *
            m = Annotations(get_yeast_small_data(), "YEAST", verbose=False)
            m.set_annotations()
            m.plot_goid_histogram()
        .. todo:: is this a functional process or not?

        """
        if self.annotations is None:
            raise AttributeError(self._error_messages['annotations'])
        if drop_duplicates:
            entries = self.df.Entry.drop_duplicates()
        counter = self.annotations.ix[entries]['Gene ontology IDs'].apply(lambda x: len(x))
        M = counter.max()
        # if we want the GO per peptide, then we need to look at the original
        # dataframe that contains several psites per peptide. UniProt_entry is
        # not a set, so values from counter may be duplicated, which is what
        # we want for this first figure
        duplicated_counter = [counter[x] for x in self.df.Entry]
        pylab.figure(1)
        pylab.clf()
        pylab.hist(duplicated_counter, bins=[x + .5 for x in range(0, M + 1)])
        pylab.title("Distribution of number of GO id terms per peptide")
        pylab.grid()

        # annotations contains the unique protein entries, so here we get the
        # number of GO terms per protein
        counter = self.annotations['Gene ontology IDs'].apply(lambda x: len(x))
        M = counter.max()
        pylab.figure(2)
        pylab.clf()
        pylab.hist(counter, bins=[x + .5 for x in range(0, M + 1)])
        pylab.title("Distribution of number of GO id terms per protein")
        pylab.grid()

    def set_annotations(self, Nmax=100):
        """Fetch all information from UniProt and set :attr:`annotations` as a
        pandas dataframe.

        Look into the dataframe Entry column and update the annotations
        dataframe to populate missing entries. The Entry column in
        :attr:`df` should have been populated by :meth:`get_uniprot_entries`
        with valid entries from UniProt.

        If you have thousands of entries, this takes a few minutes. You can
        save the annotations and read them back using
        :meth:`msdas.MassSpecReader.read_annotations` and :meth:`to_pickle`.

        """
        self.logging.info("Fetching information from uniprot. Takes some time")

        # could split if too long
        entries = [this for this in list(set(self.df.Entry)) if this]
        # no need to search again if already present in the attribute
        if self.annotations is not None:
            entries = [x for x in entries if x not in list(self.annotations.index)]

        if len(entries) == 0:
            self.warning("No new entries found. Your annotations dataframe is already up-to-date")
            self.annotations.drop_duplicates(subset="Entry name", inplace=True)
            self._append_uniprot_entry_names_to_df()
            return

        annotations = self._uniprot.get_df(entries, nChunk=Nmax)
        annotations = annotations[annotations.Entry.apply(lambda x: x in entries)]
        if len(annotations) == 0:
            raise ValueError("your list of proteins is empty")
        self.logging.info("Fetching {}".format(len(annotations)))
        annotations.set_index(["Entry"], inplace=True)
        if self.annotations is None:
            self.annotations = annotations
        else:
            self.annotations = self.annotations.append(annotations)

        self.logging.info("Annotations have been loaded. You can save the annotations"
            + " dataframe attribute using x.to_pickle('annotations.pkl')."
            + " Next time, you could just load it using \n\n"
            + " >>> m = readers.MassSpecReader(filename, mode='yeast')\n"
            + " >>> m.read_annotations('annotations.pkl')")

        # indices are the uniprot entries. Some may be identical with slightly
        # different columns, but the entry name should be unique. Here, we
        # keep the first instance of each entry
        self.annotations.drop_duplicates(subset="Entry name", inplace=True)

        self._append_uniprot_entry_names_to_df()

    def to_pickle(self, tag=None, overwrite=False):
        """Save annotations dataframe as a pickle

        :param tag: a tag to append to the name of the annotations file.
        :param overwrite: overwrite the file if it exists

        The filename will be organism_annotations_tag.pkl
        """
        filename = self.organism + "_annotations"
        if tag is not None and isinstance(tag, str):
            filename += "_" + tag
        filename += ".pkl"
        if not overwrite:
            if os.path.exists(filename):
                raise IOError("file %s already exists" % filename)
        self.annotations.to_pickle(filename)

    def read_pickle(self, filename):
        """Read annotations in pickled format as saved by :meth:`to_pickle`

        :param str filename: filename to read
        """
        try:
            self.annotations = pd.read_pickle(filename)
            # update the mapping dictionary (index is the accession,
            # value is the entry name)
            for k, v in self.annotations['Entry name'].items():
                if v not in self._mapping.keys():
                    self._mapping[v] = [k]
        except Exception:
            self.logging.error("Could not read your file. Expected a pkl "
                "containing a dataframe with Entry name and index being "
                "uniprot indices.")

    def hist_most_relevant_goids(self, N=10, tight_layout=True,
                                 wrap_length=40, drop_duplicates=True, **kargs):
        """Plot histogram of the GO identifiers found in all proteins.

        :param int N: restrict the histogram to terms that appear at least N times
        :param int wrap_length: wrap text on the y-axis by wrap_length (defaults to 40)
        :param drop_duplicates: drop the duplicated entries
        :param kargs: pandas.plot arguments accepted.

        .. plot::
            :include-source:
            :width: 80%

            from msdas import *
            m = Annotations(get_yeast_small_data(), "YEAST", verbose=False)
            m.set_annotations()
            m.hist_most_relevant_goids(N=5)

        .. todo:: this is made on the annotations dataframe. Should be done
            based on the entry names in the dataframe
        """
        if self.annotations is None:
            raise AttributeError(self._error_messages['annotations'])
        kargs['legend'] = kargs.get("legend", False)

        if drop_duplicates:
            entries = self.df.Entry.drop_duplicates()
        goids = [y for x in self.annotations.ix[entries]['Gene ontology (GO)'] for y in x]

        uniq_goids = set(goids)
        names = [x for x in uniq_goids]
        # let us wrap the strings by wrap_length characters max to avoid
        # long labels in the figure
        names = ["\n".join(textwrap.wrap(name, width=wrap_length)) for name in names]
        count = [goids.count(x) for x in uniq_goids]
        df = pd.DataFrame({'name': names, 'size': count}, index=range(0, len(uniq_goids)))
        if N:
            subdf = df[df['size'] > N].set_index("name")
            subdf.sort("size").plot(kind="barh", **kargs)
            if tight_layout:
                pylab.tight_layout()

    def check_entries_versus_sequence(self):
        """Check that peptide sequences are contained in the uniprot sequence

        This is a very good sanity check on the validity of the uniprot entry
        names found by the :meth:`get_uniprot_entries` method.

        If a peptide sequence is not found, it means that the protein name is
        not correct. See the AnnotationsYeast class, where
        :meth:`AnnotationsYeast.update_mapping` is used to update the
        incorrect mapping.

        .. seealso:: :meth:`find_sequence_blast`
        """
        self.logging.info("Comparing peptide sequences in the attribute df with sequences in the annotations")
        self.logging.info("row index, protein name, uniprot entry")
        if self.annotations is None:
call set_annotations") found = False for i in self.df.index: entry = self.df.ix[i].Entry if entry not in self.annotations.index: print("{} not found in the annotations index".format(entry)) continue if self.df.ix[i].Sequence not in self.annotations.ix[entry].Sequence: if found == False: print("Found unknown entries\nindex, protein name, uniprot entry ") found = True print(i, self.df.ix[i].Protein, self.df.ix[i].Entry) def find_sequence_blast(self, seq, email): """Utility to search for a sequence using BLAST via bioservices :param str seq: the sequence :param email: a valid email address .. note:: This is using NCIBlast web service via `BioServices <https://pypi.python.org/pypi/bioservices>`_. """ from bioservices import NCBIblast s = NCBIblast(verbose=self.level) jobid = s.run(program="blastp", sequence=seq, stype="protein", database="uniprotkb", email=email) return s.getResult(jobid, "out") def to_csv(self, filename): """Export the dataframe with data and annotations into a CSV file :meth:`set_annotations` and :meth:`get_uniprot_entries` must have been called. """ if "Entry" not in self.df.columns or "Entry_name" not in self.df.columns: raise ValueError("Entry or Entry_name missing in dataframe. You must call get_entries_uniprot and set_annotations methods") self.df.Identifier = self.df.Protein + "_" + self.df.Psite self.df.to_csv(filename, index=False, sep=",")
def get_more_node_ids(the_network, **kwargs):
    """ Script to add more identifiers to model notes based on the node.id

    Arguments:
     the_network: a Network object, modified in place

    kwargs:
     node_id_type: current type of ids used for the nodes.
      Currently can be 'Entrez Gene (GeneID)' or any of the options
      in the BioServices UniProt mappings
     mapping_types: a list of target mapping id types to include.
      Options can be viewed in core.parameters.py.
      Note "Symbol" is an additional option for the official
      gene nomenclature symbol.
     email: optional, for NCBI queries.
     verbose: [True (default), False]

    Returns:
     the_network

    TODO: determine the best source db/module for pairings from bioservices

    """
    continue_flag = True
    valid_mapping_targets = list(available_mapping_target.keys()) + ['Symbol']
    verbose = test_kwarg('verbose', kwargs, [True, False])

    try:
        from bioservices import UniProt
        # Don't want verbosity at this low of a level
        u = UniProt(verbose=False)
    except Exception:
        print("No bioservices module installed or cannot connect, exiting...")
        print("e.g. if you are using pip, did you 'pip install bioservices'?")
        continue_flag = False

    the_node_locations = the_network.get_node_locations()
    if len(the_node_locations) == 0:
        print('The network has no nodes, exiting...')
        continue_flag = False

    if 'node_id_type' in kwargs:
        node_id_type = kwargs['node_id_type']
        if node_id_type == 'Symbol':
            print("'Symbol' is a special case, not yet able to query with this option, exiting...")
            continue_flag = False
    else:
        print("No node id type specified, attempting to use 'Entrez Gene (GeneID)'")
        node_id_type = 'Entrez Gene (GeneID)'

    if 'mapping_types' in kwargs:
        mapping_types = [x for x in kwargs['mapping_types'] if x in valid_mapping_targets]
        if len(mapping_types) == 0:
            print('No valid mapping_types selected, exiting...')
            continue_flag = False
        elif 'Symbol' in mapping_types:
            if (('Entrez Gene (GeneID)' not in mapping_types)
                    & (node_id_type != 'Entrez Gene (GeneID)')):
                print("'Symbol' mapping type needs 'Entrez Gene (GeneID)', exiting...")
                continue_flag = False
    else:
        mapping_types = default_mapping_target_list

    if 'email' in kwargs:
        email = kwargs['email']
    else:
        email = ''

    # Maximum number of items to query at a time.
    # Note there is a length limit in bioservices 1.2.1
    # for the web-based query string. Trial-and-error suggests
    # the most ids that can be queried are between 100 and 1000.
    max_query_length = 500

    if continue_flag:
        model_nodes = []
        for the_nodetype in the_network.nodetypes:
            model_nodes += [x for x in the_nodetype.nodes]

        # Split the node ids into chunks of at most max_query_length
        the_node_id_list_list = [[]]
        i = 0
        j = 0
        for the_node in model_nodes:
            if (j + 1) % max_query_length == 0:
                the_node_id_list_list.append([])
                i += 1
                the_node_id_list_list[i] = []
                j = 0
            the_node_id_list_list[i].append(the_node.id)
            j += 1

        query_string_list = []
        for i, the_node_id_list in enumerate(the_node_id_list_list):
            query_string = ''
            for the_node_id in the_node_id_list:
                if len(query_string) > 0:
                    query_string = query_string + ' ' + the_node_id
                else:
                    query_string = the_node_id
            query_string_list.append(query_string)

        for the_target_type in mapping_types:
            if the_target_type != 'Symbol':
                the_result = {}
                for the_query_string in query_string_list:
                    the_result.update(u.mapping(fr=available_mapping_source[node_id_type],
                                                to=available_mapping_target[the_target_type],
                                                query=the_query_string))
                if verbose:
                    print("**Finished mapping for %s to %s.**" % (node_id_type, the_target_type))
                for the_node in model_nodes:
                    if (the_node.id in the_result) and (len(the_result[the_node.id]) > 0):
                        the_node.notes[the_target_type] = the_result[the_node.id]
                    else:
                        the_node.notes[the_target_type] = []

        # To avoid a loss of information, we should also make
        # sure queried IDs are returned in the appropriate
        # field in case they weren't available in the database.
        if node_id_type in mapping_types:
            # Not yet supported anyway, but can leave this here.
            if node_id_type != 'Symbol':
                for the_node in model_nodes:
                    if the_node.id not in the_node.notes[node_id_type]:
                        the_node.notes[node_id_type].append(the_node.id)

        if "Symbol" in mapping_types:
            if ((node_id_type == "Entrez Gene (GeneID)")
                    | ("Entrez Gene (GeneID)" in mapping_types)):
                the_entrez_to_query = []
                query_dict = {}
                for the_node in model_nodes:
                    query_dict[the_node.id] = {}
                    query_dict[the_node.id]["Entrez Gene (GeneID)"] = []
                    if node_id_type == "Entrez Gene (GeneID)":
                        query_dict[the_node.id]["Entrez Gene (GeneID)"].append(the_node.id)
                    if "Entrez Gene (GeneID)" in mapping_types:
                        the_entrez_list = the_node.notes["Entrez Gene (GeneID)"]
                        if len(the_entrez_list) > 0:
                            for the_entrez_id in the_entrez_list:
                                if the_entrez_id not in query_dict[the_node.id]["Entrez Gene (GeneID)"]:
                                    query_dict[the_node.id]["Entrez Gene (GeneID)"].append(the_entrez_id)
                    the_entrez_to_query += query_dict[the_node.id]["Entrez Gene (GeneID)"]
                the_entrez_to_query = list(set(the_entrez_to_query))
                the_symbol_dict = get_entrez_annotation(the_entrez_to_query,
                                                        email=email, verbose=verbose)
                for the_node in model_nodes:
                    the_node.notes["Symbol"] = []
                    for the_entrez_id in query_dict[the_node.id]["Entrez Gene (GeneID)"]:
                        the_symbol_id = the_symbol_dict[the_entrez_id]['NomenclatureSymbol']
                        if len(the_symbol_id) > 0:
                            the_node.notes["Symbol"].append(the_symbol_id)
                print("**Finished mapping for %s to %s.**" % (node_id_type, "Symbol"))
            elif verbose:
                print("'Entrez Gene (GeneID)' mappings are needed first in order to query symbols, skipping...")

    return the_network
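# A hypothetical invocation; `net` is a Network object from the surrounding
# package and the mapping type names are illustrative entries from
# available_mapping_target, so adjust them to your core.parameters:
net = get_more_node_ids(net,
                        node_id_type='Entrez Gene (GeneID)',
                        mapping_types=['UniProtKB ACC', 'Symbol'],
                        email='user@example.org',
                        verbose=True)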
def get_more_source_dict_ids(source_dict, primary_key_type, **kwargs):
    """ Script to add more ids to source dict nodes to facilitate pairing
    to a network

    Arguments:
     source_dict: id_key: value
     primary_key_type: current type of ids used for the top level dict key.
      Currently can be 'Entrez Gene (GeneID)' or any of the options
      in the BioServices UniProt mappings.

    kwargs:
     mapping_types: a list of mapping types to include.
      See core.parameters for the full list.
      Note 'Symbol' is a special case for querying that
      depends on Entrez ID availability.
     verbose: [False (default), True]
     email: optional, for NCBI if querying for 'Symbol'

    Returns:
     source_dict, also modified in place

    """
    continue_flag = True
    verbose = test_kwarg('verbose', kwargs, [False, True])
    valid_mapping_targets = list(available_mapping_target.keys()) + ['Symbol']

    if primary_key_type not in available_mapping_source.keys():
        if primary_key_type == 'Symbol':
            print("'Symbol' is a special case, not yet able to query with this as a primary key.")
        print("Error, you must specify a valid primary_key_type descriptor to match to in the available database, exiting...")
        continue_flag = False

    if 'mapping_types' in kwargs:
        mapping_types = [x for x in kwargs['mapping_types'] if x in valid_mapping_targets]
        if len(mapping_types) == 0:
            print('No valid mapping_types selected, exiting...')
            continue_flag = False
        elif 'Symbol' in mapping_types:
            if (('Entrez Gene (GeneID)' not in mapping_types)
                    & (primary_key_type != 'Entrez Gene (GeneID)')):
                print("'Symbol' mapping type needs 'Entrez Gene (GeneID)', exiting...")
                continue_flag = False
    else:
        mapping_types = default_mapping_target_list

    if 'email' in kwargs:
        email = kwargs['email']
    else:
        email = ''

    try:
        from bioservices import UniProt
        # Don't want verbosity at this low of a level
        u = UniProt(verbose=False)
    except ImportError:
        print("No BioServices module installed or cannot connect, exiting...")
        print("e.g. if you are using pip, did you 'pip install bioservices'?")
        continue_flag = False

    # Maximum number of items to query at a time.
    # Note there is a length limit in bioservices 1.2.1
    # for the web-based query string. Trial-and-error suggests
    # the most ids that can be queried are between 100 and 1000.
    max_query_length = 500

    if continue_flag:
        # Split the query ids into chunks of at most max_query_length
        the_query_id_list_list = [[]]
        i = 0
        j = 0
        for the_query_id in source_dict.keys():
            if (j + 1) % max_query_length == 0:
                the_query_id_list_list.append([])
                i += 1
                the_query_id_list_list[i] = []
                j = 0
            the_query_id_list_list[i].append(the_query_id)
            j += 1

        the_query_string_list = []
        for i, the_query_id_list in enumerate(the_query_id_list_list):
            query_string = ''
            for the_query_id in the_query_id_list:
                if len(query_string) > 0:
                    query_string = query_string + ' ' + the_query_id
                else:
                    query_string = the_query_id
            the_query_string_list.append(query_string)

        # Make sure each entry is a dict so mappings can be attached to it
        for the_key in source_dict.keys():
            if type(source_dict[the_key]) != dict:
                the_value = source_dict[the_key]
                source_dict[the_key] = {}
                source_dict[the_key]['value'] = the_value

        for the_target_type in mapping_types:
            if the_target_type != 'Symbol':
                the_result = {}
                for the_query_string in the_query_string_list:
                    the_result.update(u.mapping(fr=available_mapping_source[primary_key_type],
                                                to=available_mapping_target[the_target_type],
                                                query=the_query_string))
                if verbose:
                    print("**Finished mapping for %s to %s.**" % (primary_key_type, the_target_type))
                for the_query_id in source_dict.keys():
                    if (the_query_id in the_result) and (len(the_result[the_query_id]) > 0):
                        source_dict[the_query_id][the_target_type] = the_result[the_query_id]
                    else:
                        source_dict[the_query_id][the_target_type] = []

        # To avoid a loss of information, we should also make
        # sure queried IDs are returned in the appropriate
        # field in case they weren't available in the database.
        if primary_key_type in mapping_types:
            # Not yet supported but we can check to avoid breaking this
            if primary_key_type != 'Symbol':
                for the_source_dict_id in source_dict.keys():
                    if the_source_dict_id not in source_dict[the_source_dict_id][primary_key_type]:
                        source_dict[the_source_dict_id][primary_key_type].append(the_source_dict_id)

        if "Symbol" in mapping_types:
            if ((primary_key_type == "Entrez Gene (GeneID)")
                    | ("Entrez Gene (GeneID)" in mapping_types)):
                the_entrez_to_query = []
                # Make query_dict in case "Entrez Gene (GeneID)" was
                # a primary_key_type but not in mapping_types
                query_dict = {}
                for the_source_dict_id in source_dict.keys():
                    query_dict[the_source_dict_id] = {}
                    query_dict[the_source_dict_id]["Entrez Gene (GeneID)"] = []
                    if primary_key_type == "Entrez Gene (GeneID)":
                        query_dict[the_source_dict_id]["Entrez Gene (GeneID)"].append(the_source_dict_id)
                    if "Entrez Gene (GeneID)" in mapping_types:
                        the_entrez_list = source_dict[the_source_dict_id]["Entrez Gene (GeneID)"]
                        if len(the_entrez_list) > 0:
                            for the_entrez_id in the_entrez_list:
                                if the_entrez_id not in query_dict[the_source_dict_id]["Entrez Gene (GeneID)"]:
                                    query_dict[the_source_dict_id]["Entrez Gene (GeneID)"].append(the_entrez_id)
                    the_entrez_to_query += query_dict[the_source_dict_id]["Entrez Gene (GeneID)"]
                the_entrez_to_query = list(set(the_entrez_to_query))
                the_symbol_dict = get_entrez_annotation(the_entrez_to_query,
                                                        email=email, verbose=verbose)
                for the_source_dict_id in source_dict.keys():
                    source_dict[the_source_dict_id]["Symbol"] = []
                    for the_entrez_id in query_dict[the_source_dict_id]["Entrez Gene (GeneID)"]:
                        the_symbol_id = the_symbol_dict[the_entrez_id]['NomenclatureSymbol']
                        if len(the_symbol_id) > 0:
                            source_dict[the_source_dict_id]["Symbol"].append(the_symbol_id)
                print("**Finished mapping for %s to %s.**" % (primary_key_type, "Symbol"))

    return source_dict
import csv
import json
import os

from bioservices import QuickGO, UniProt


class GOTermAdder(object):

    def __init__(self, input_file, gene_id_column, output_file):
        self._input_file = input_file
        self._gene_id_column = gene_id_column
        self._output_file = output_file
        self._tmp_folder = "tmp_data"
        self._uniprot = UniProt(verbose=False)
        self._quickgo = QuickGO(verbose=False)
        if not os.path.exists(self._tmp_folder):
            os.mkdir(self._tmp_folder)

    def add_go_terms(self):
        with open(self._input_file) as input_fh, \
                open(self._output_file, "w") as output_fh:
            for row in csv.reader(input_fh, delimiter="\t"):
                if len(row[0]) == 0:
                    self._write_row(row, output_fh)
                    continue
                row = self._add_go_term_column(row)
                self._write_row(row, output_fh)

    def _write_row(self, row, output_fh):
        output_fh.write("\t".join(row) + "\n")

    def _add_go_term_column(self, row):
        gene_id = self._gene_id(row)
        uniprot_id = self._uniprot_id(gene_id)
        if uniprot_id is None:
            return row
        go_terms = self._go_terms(uniprot_id)
        go_term_names = [self._go_term_name(go_term) for go_term in go_terms]
        assert len(go_terms) == len(go_term_names)
        row.append(", ".join(
            ["%s (%s)" % (go_term, go_term_name)
             for go_term, go_term_name in zip(go_terms, go_term_names)]))
        return row

    def _uniprot_id(self, gene_id):
        # Cache the gene id -> UniProt id lookup in a small JSON file.
        file_path = self._tmp_file_path(gene_id)
        if os.path.exists(file_path):
            with open(file_path) as json_fh:
                return json.load(json_fh)["Uniprot"]
        uniprot_id = self._search_uniprot_id(gene_id)
        gene_data = {"Uniprot": uniprot_id}
        with open(file_path, "w") as json_fh:
            json.dump(gene_data, json_fh)
        return uniprot_id

    def _go_terms(self, uniprot_id):
        # Cache the UniProt id -> GO ids lookup as well.
        file_path = self._tmp_file_path(uniprot_id)
        if os.path.exists(file_path):
            with open(file_path) as json_fh:
                return json.load(json_fh)["GO-Terms"]
        uniprot_entry = self._uniprot.searchUniProtId(uniprot_id)
        go_ids = []
        for dbref in uniprot_entry.findAll("dbreference"):
            if dbref.attrs["type"] == "GO":
                go_ids.append(dbref.attrs["id"])
        go_term_data = {"GO-Terms": go_ids}
        with open(file_path, "w") as json_fh:
            json.dump(go_term_data, json_fh)
        return go_ids

    def _go_term_name(self, go_term):
        file_path = self._tmp_file_path(go_term)
        if os.path.exists(file_path):
            with open(file_path) as json_fh:
                return json.load(json_fh)["name"]
        go_term_info = self._quickgo.Term(go_term).soup
        go_term_name = go_term_info.term.find("name").text
        go_term_data = {"name": go_term_name}
        with open(file_path, "w") as json_fh:
            json.dump(go_term_data, json_fh)
        return go_term_name

    def _search_uniprot_id(self, gene_id):
        uniprot_id_search = self._uniprot.quick_search(gene_id)
        if len(uniprot_id_search) == 1:
            return list(uniprot_id_search.keys())[0]
        # Zero or several hits: no unambiguous UniProt id for this gene.
        return None

    def _tmp_file_path(self, gene_id):
        return "%s/%s.json" % (self._tmp_folder, gene_id)

    def _gene_id(self, row):
        return row[self._gene_id_column - 1].split("GeneID:")[1].split(";")[0]
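# A minimal invocation sketch; file names are illustrative and the input is
# expected to be a tab-separated table whose gene-id column contains entries
# like "...GeneID:947170;...":
adder = GOTermAdder("genes.tsv", 3, "genes_with_go_terms.tsv")
adder.add_go_terms()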