@classmethod
def create(cls, name, query, godag, verbose=True):
    if verbose:
        print("No data with name '{}' found. Downloading... (This may take a while)".format(name))
    # download current GO DAG
    r = requests.get('http://purl.obolibrary.org/obo/go/go-basic.obo', allow_redirects=True)
    with open(PATH + '/godag/' + name + '.obo', 'wb') as file:
        file.write(r.content)
    # set up UniProt API
    up = UniProt()
    up.settings.TIMEOUT = None
    # load GO DAG
    godag = GoDag(name=godag)
    # download
    result = up.search(query, columns='id, sequence, go(molecular function)')
    df = pd.read_csv(StringIO(result), delimiter='\t')
    # keep only GO terms present in the GO DAG
    df['Gene ontology (molecular function)'] = df['Gene ontology (molecular function)'].map(
        lambda labels: [l for l in re.findall(r'GO:\d{7}', str(labels)) if l in godag.GODAG])
    df = df[df['Gene ontology (molecular function)'].map(lambda l: len(l) > 0)]
    # annotate full GO lineage
    df['Gene ontology (molecular function)'] = df['Gene ontology (molecular function)'].map(
        lambda labels: godag.get_go_lineage_of(labels))
    # clean
    df = cls.clean(df)
    # save
    df['labels'] = df.apply(lambda row: '; '.join(row['labels']), axis=1)
    os.mkdir(PATH + '/datasets/' + name)
    df.to_csv(PATH + '/datasets/' + name + '/data.csv', index=False)
    with open(PATH + '/datasets/' + name + '/info.txt', 'w') as file:
        file.write('Downloaded: {}'.format(date.today()))
    if verbose:
        print('Raw data successfully downloaded.')
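# A minimal driver sketch for create() above (not from the original source:
# the owning class name `Dataset`, the dataset name, and the query string are
# assumptions for illustration).
Dataset.create(name='mf_human',
               query='reviewed:yes+AND+organism:9606',
               godag='go-basic')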
from Bio.SeqUtils.CheckSum import seguid
from bioservices.uniprot import UniProt


class ProteinIdentifier(object):

    def __init__(self):
        super().__init__()
        self.seguid = seguid
        self.uniprot = UniProt(verbose=False)

    def sequence_from_uniprot(self, uniprot_ac):
        """Return the protein sequence for a UniProt accession."""
        try:
            return self.uniprot.get_fasta_sequence(uniprot_ac)
        except ValueError:
            return None

    @staticmethod
    def protein_identifier_resolver():
        """Return the protein sequence of a given identifier."""
        pass  # TODO

    def encode(self, sequence):
        """Protein SEGUID checksum based on amino-acid sequence."""
        # instance method (not static): it uses the seguid callable stored in __init__
        return str(self.seguid(sequence))
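# Illustrative usage (not in the original; the accession is an example):
pid = ProteinIdentifier()
seq = pid.sequence_from_uniprot('P31946')
if seq is not None:
    print(pid.encode(seq))  # SEGUID checksum of the sequence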
def get_uniprot_df(accList):
    # bioservices wrapper around the UniProt REST API
    from bioservices.uniprot import UniProt
    u = UniProt(verbose=True)
    df = u.get_df(accList)
    return df
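# Example call (accessions illustrative); get_df returns one row per entry
# with UniProt's default columns, e.g. 'Entry' and 'Gene ontology IDs'.
df = get_uniprot_df(['Q9CQV8', 'P31946'])
print(df.head())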
import logging
import os
import sys
import traceback
from collections import defaultdict
from configparser import ConfigParser

import pandas as pd
from Bio import SeqIO


class UniProt(object):
    '''
    Aux info plugin.
    Takes dataframe, extracts entry_ids, adds info from uniprot.
    Returns modified dataframe.
    '''
    ASPECTMAP = {'C': 'cc', 'F': 'mf', 'P': 'bp'}

    def __init__(self, config):
        self.log = logging.getLogger(self.__class__.__name__)
        self.config = config
        self.uniprotapi = None
        self.outdir = os.path.expanduser(config.get('global', 'outdir'))
        self.taxid_mapfile = os.path.expanduser(config.get('global', 'taxid_mapfile'))
        self.sprotdatfile = os.path.expanduser(config.get('ontologyplugin', 'sprotdatfile'))
        self.cachedir = os.path.expanduser(config.get('ontologyplugin', 'cachedir'))
        excodes = config.get('ontologyplugin', 'excluded_evidence_codes', fallback='').split(',')
        excodes = [x.strip() for x in excodes]
        self.excluded_evidence_codes = excodes
        self.sprotdf = None
        self.udf = None
        self.tdf = pd.read_csv(self.taxid_mapfile, index_col=0)
        # Create easy lookup mappings from taxon data frame...
        itdf = self.tdf.set_index('taxonid')
        self.taxiddict = itdf.to_dict(orient='index')
        isdf = self.tdf.set_index('species')
        self.specdict = isdf.to_dict(orient='index')
        self.log.debug("UniProt plugin initialized.")

    def cafa_execute(self, dataframe, online=False):
        """
        Takes inbound dataframe of orthologs and adds in GO terms and evidence
        codes from uniprot/swissprot. For a given ortholog protein, one row is
        added for each GO term. Returns new dataframe with all info.
        """
        # inbound:
        #           cafaid         evalue  score  bias  db proteinacc protein species cafaprot cafaspec
        # 0  T100900000001  1.100000e-156  523.6   8.5  sp     Q9CQV8   1433B   MOUSE    1433B    MOUSE
        # 1  T100900000001  4.100000e-155  518.4   7.7  sp     P35213   1433B     RAT    1433B    MOUSE
        # 2  T100900000001  5.400000e-155  518.0   7.2  sp     A4K2U9   1433B   PONAB    1433B    MOUSE
        # 3  T100900000001  5.400000e-155  518.0   7.2  sp     P31946   1433B   HUMAN    1433B    MOUSE

        # Get all unique target accession numbers.
        entries = dataframe['proteinacc'].unique().tolist()

        # Look up GO terms in uniprot...
        if online:
            self.uniprotapi = UniProt()
            self.log.debug("Querying uniprot API for %d unique entries" % len(entries))
            self.udf = self.uniprotapi.get_df(entries)
            self.log.debug(f"\n{self.udf}")
            self.udf.to_csv("%s/uniprot.csv" % self.outdir)
            udfslim = self.udf[['Entry', 'Gene ontology IDs']]
            # df.tacc corresponds to udf.Entry; entry == proteinid; gene ontology id == goterm
            self.log.debug("Making new rows for each goterm.")
            newrowdict = {}
            ix = 0
            for row in udfslim.itertuples():
                (entry, golist) = row[1:]
                for goterm in golist:
                    #print("creating new row: %s : %s %s %s" % (ix, entry, gene, goterm))
                    newrow = [entry, goterm]
                    newrowdict[ix] = newrow
                    ix += 1
            godf = pd.DataFrame.from_dict(newrowdict, orient='index',
                                          columns=['entry', 'goterm'])
        else:
            self.log.debug("Using offline functionality...")
            godf = self.get_swissprot_df(usecache=True)
            self.log.debug(f"GO DataFrame:\n{godf}")
            #     proteinid proteinacc      goterm goaspect goevidence
            # 0  001R_FRG3G     Q6GZX4  GO:0046782       bp        IEA
            # 1  002L_FRG3G     Q6GZX3  GO:0033644       cc        IEA

        # For each GO term, add a row...
        newdfdict = {}
        ix = 0
        for row in dataframe.itertuples():
            self.log.debug("inbound row = %s" % str(row))
            #(query, evalue, score, bias, db, tacc, protein, species) = row[1:]
            (cafaid, evalue, score, bias, db, proteinacc, protein, species,
             cafaprot, cafaspec) = row[1:]
            self.log.debug(f"Searching for match for '{proteinacc}'")
            gomatch = godf[godf.proteinacc == proteinacc]
            self.log.debug(f"gomatch is:\n{gomatch}")
            for gr in gomatch.itertuples():
                (entry, proteinacc, protein, species, goterm, goaspect,
                 goevidence) = gr[1:]
                newrow = [cafaid, evalue, score, bias, db, proteinacc, protein,
                          species, cafaprot, cafaspec, goterm, goaspect, goevidence]
                newdfdict[ix] = newrow
                ix += 1

        newdf = pd.DataFrame.from_dict(
            newdfdict,
            orient='index',
            columns=['cafaid', 'evalue', 'score', 'bias', 'db', 'proteinacc',
                     'protein', 'species', 'cafaprot', 'cafaspec', 'goterm',
                     'goaspect', 'goevidence'])

        for xc in self.excluded_evidence_codes:
            self.log.debug(f"{len(newdf.index)} rows. Removing evidence code {xc}...")
            #newdf = newdf[newdf.goevidence != xc]
            newdf.drop(newdf.loc[newdf['goevidence'] == xc].index, inplace=True)
            self.log.debug(f"{len(newdf.index)} rows after.")

        self.log.debug(f"\n{str(newdf)}")
        return newdf
        # Output:
        #           cafaid         evalue  score  bias  db proteinacc protein species cafaprot cafaspec      goterm goaspect goevidence
        # 0  T100900000001  1.100000e-156  523.6   8.5  sp     Q9CQV8   1433B   MOUSE    1433B    MOUSE  GO:0005737       cc        ISO
        # 1  T100900000001  1.100000e-156  523.6   8.5  sp     Q9CQV8   1433B   MOUSE    1433B    MOUSE  GO:0005829       cc        ISO
        # 2  T100900000001  1.100000e-156  523.6   8.5  sp     Q9CQV8   1433B   MOUSE    1433B    MOUSE  GO:0042470       cc        IEA

    def _dat2upr(self):
        self.log.debug("opening swissprot dat file %s" % self.sprotdatfile)
        rgen = SeqIO.parse(self.sprotdatfile, "swiss")
        i = 0
        uprlist = []
        self.log.debug("Completed SeqIO.parse(). Handling records...")
        for record in rgen:
            upr = UniProtRecord(record)
            uprlist.append(upr)
            #print(record)
            i += 1
            if i % 10000 == 0:
                self.log.debug("Handled %d records..." % i)
                # break
        self.log.debug("parsed dat file of %d records" % len(uprlist))
        return uprlist

    def get_annotation_df(self):
        self.log.debug("opening swissprot dat file %s" % self.sprotdatfile)
        rgen = SeqIO.parse(self.sprotdatfile, "swiss")
        self.log.debug("rgen type is %s" % type(rgen))
        #self.log.debug("Created generator with %d records" % len(rgen))
        i = 0
        alltuples = []
        for record in rgen:
            #print(record)
            i += 1
            if i % 1000 == 0:
                self.log.debug("Handled %d records..." % i)
            goterms = []
            for xf in record.dbxrefs:
                if xf.startswith("GO:"):
                    gt = xf[3:]
                    goterms.append(gt)
            if len(goterms) > 0:
                proteinid = record.id
                protein = record.name
                taxonid = record.annotations['ncbi_taxid'][0]
                # fan out over goterms
                for gt in goterms:
                    t = (taxonid, proteinid, protein, gt)
                    alltuples.append(t)
            else:
                # ignore un-annotated entries.
                pass
            if i >= 1000:
                break
        self.log.debug(f"Generated {len(alltuples)} tuples")
        df = pd.DataFrame(alltuples,
                          columns=['taxonid', 'proteinid', 'protein', 'goterm'])
        return df

    ##########################################
    #
    #  Non-cafalib usage (NOT using API)
    #
    ##########################################

    def get_swissprot_df(self, usecache=True):
        """
        Get swissprot info as dataframe from files, without API, one row per GO term.
        Fields:
            proteinid protein taxonid goterm goaspect goevidence

            self.proteinid = record.id
            self.proteinacc = record. ?
            self.protein = record.name
            self.goterms = []
            for xf in record.dbxrefs:
                if xf.startswith("GO:"):
                    gt = xf[3:]
                    self.goterms.append(gt)
            self.accessions = record.annotations['accessions']
            self.taxonid = record.annotations['ncbi_taxid'][0]
        """
        cachepath = f"{self.cachedir}/sprotgolist.csv"
        if usecache:
            if os.path.exists(cachepath):
                self.sprotdf = pd.read_csv(cachepath, index_col=0)
                self.log.debug(f"Loaded dataframe from cache: {cachepath}")
        if self.sprotdf is not None:
            self.log.debug("Cache hit. Using DataFrame from cache...")
        else:
            self.log.debug("Getting dictionary list...")
            dlist = self._handle_swissprot_file()
            self.log.debug(f"Got dict list of {len(dlist)} entries. Creating dataframe...")
            self.sprotdf = pd.DataFrame(dlist)
            #self.sprotdf.set_index('proteinacc', inplace=True)
            self.log.debug(f"Made dataframe:\n{str(self.sprotdf)}")
            self.log.info(f"Saving dataframe to cache file: {cachepath}")
            self.sprotdf.to_csv(cachepath)
        return self.sprotdf

    def _handle_swissprot_file(self):
        '''
        Read uniprot_sprot.dat and return list of dicts of relevant fields.
        '''
        self.log.debug("Handling swissprot file...")
        filehandle = None
        try:
            self.log.info(f"Opening file {self.sprotdatfile}")
            filehandle = open(self.sprotdatfile, 'r')
            self.log.debug("File opened. Parsing...")
            dlist = self._parsefile(filehandle)
            filehandle.close()
        except FileNotFoundError:
            self.log.error("No such file %s" % self.sprotdatfile)
        finally:
            if filehandle is not None:
                filehandle.close()
        self.log.debug("Parsed data file.")
        return dlist

    def _parsefile(self, filehandle):
        """
        Parses sprot DAT file and fans out goterms to list of dicts.
        """
        allentries = []
        current = None
        sumreport = 1
        suminterval = 10000
        repthresh = sumreport * suminterval
        try:
            while True:
                line = filehandle.readline()
                if line == '':
                    break
                # NOTE: DAT line codes are two letters padded to five columns
                # (e.g. "ID   "); the field slices below rely on that width.
                if line.startswith("ID   "):
                    # ID   001R_FRG3G   Reviewed;   256 AA.
                    # <prot_name>_<prot_spec>
                    proteinid = line[5:16].strip()
                    current = defaultdict(dict)
                    current['proteinid'] = proteinid
                    (protein, species) = proteinid.split('_')
                    current['protein'] = protein
                    current['species'] = species
                    self.log.debug("Handling ID. New entry.")
                elif line.startswith("AC   "):
                    # AC   Q6GZX4;
                    # AC   Q91896; O57469;
                    self.log.debug("Handling AC.")
                    accession = line[5:11].strip()
                    current['proteinacc'] = accession
                elif line.startswith("OX   "):
                    # OX   NCBI_TaxID=654924;
                    self.log.debug("Handling OX.")
                    taxonid = ""
                    val = line[5:]
                    fields = val.split('=')
                    if fields[0] == 'NCBI_TaxID':
                        taxonid = fields[1].strip().replace(';', '')
                    current['taxonid'] = taxonid
                elif line.startswith("DR   GO;"):
                    # DR   GO; GO:0046782; P:regulation of viral transcription; IEA:InterPro.
                    # P biological process, C cellular component, F molecular function.
                    self.log.debug("Handling DR.")
                    fields = line.split(';')
                    goterm = fields[1].strip()
                    goinfo = fields[2]
                    aspcode = goinfo.split(':')[0].strip()
                    goaspect = UniProt.ASPECTMAP[aspcode]
                    goevsrc = fields[3]
                    (goevidence, evsrc) = goevsrc.split(':')
                    goevidence = goevidence.strip()
                    current['goterms'][goterm] = [goaspect, goevidence]
                elif line.startswith("SQ   SEQUENCE"):
                    self.log.debug("Handling SQ: XXX")
                    # line = filehandle.readline()
                elif line.startswith("GN   "):
                    # Examples:
                    #   GN   ABL1 {ECO:0000303|PubMed:21546455},
                    #   GN   Name=BRCA1; Synonyms=RNF53;
                    #   GN   ORFNames=T13E15.24/T13E15.23, T14P1.25/T14P1.24;
                    self.log.debug("Handling GN.")
                    val = line[5:]
                elif line.startswith("//"):
                    self.log.debug("End of entry.")
                    clist = self._handle_current(current)
                    current = None
                    allentries.extend(clist)
                    self.log.debug(f"All entries list now {len(allentries)} items...")
") if len(allentries) >= repthresh: self.log.info( f"Processed {len(allentries)} entries... ") sumreport += 1 repthresh = sumreport * suminterval except Exception as e: traceback.print_exc(file=sys.stdout) self.log.info(f"Parsed file with {len(allentries)} goterms") return allentries def _handle_current(self, currentinfo): """ takes dictionary: currentinfo = { 'proteinid' : 'x', 'protein' : 'xxx' , 'goterms' : { 'GO:0005634' : [ 'C' , 'HDA' ], 'GO:0005886' : [ 'C' ,'HDA'], } } returns list of dicts: [ { 'proteinid' : 'x', 'protein' : 'xxx' , 'goterm' : 'GO:0005634', 'goaspect':'cc', 'goevidence': 'HDA' }, { 'proteinid' : 'x', 'protein' : 'xxx' , 'goterm' : 'GO:0005886', 'goaspect':'cc', 'goevidence': 'HDA' }, ] """ self.log.debug(f'handling {currentinfo} ') newlist = [] gtdict = currentinfo['goterms'] for gt in gtdict.keys(): self.log.debug(f"Handling term {gt}") newdict = {} newdict['proteinid'] = currentinfo['proteinid'] newdict['proteinacc'] = currentinfo['proteinacc'] newdict['protein'] = currentinfo['protein'] newdict['species'] = currentinfo['species'] newdict['goterm'] = gt newdict['goaspect'] = currentinfo['goterms'][gt][0] newdict['goevidence'] = currentinfo['goterms'][gt][1] newlist.append(newdict) self.log.debug(f"Created fanout of length: {len(newlist)}") return newlist def _make_species_map(self): ''' Parses uniprot speclist.txt https://www.uniprot.org/docs/speclist.txt to local .CSV taxonid species lineanname commonname 72259 ABANI Abaeis nicippe Sleepy orange butterfly OXYMO E 475340: N=Oxytenis modestia C=Costa Rica leaf moth S=Dead-leaf moth ''' listfile = self.speciesmap self.log.debug("Opening species map file %s" % listfile) try: fh = open(listfile, 'r') except FileNotFoundError: self.log.error("No such file %s" % filename) species = None kingdom = None taxonid = None lineanname = None commonname = None columnnames = [ 'species', 'kingdom', 'taxonid', 'lineanname', 'commonname' ] datalist = [] # list of tuples try: for line in fh: #self.log.debug("handling line %s" % line) if 'N=' in line and not line.startswith('Code'): #self.log.debug("handling N= line. taxonid is %s" % taxonid) if species is not None: tup = (species, kingdom, taxonid, lineanname, commonname) #self.log.debug("Adding tuple: %s" % str(tup)) datalist.append(tup) # reset all varaiables species = kingdom = taxonid = lineanname = commonname = None species = line[:5] kingdom = line[6] taxonid = line[7:15].strip() lineanname = line[19:].strip() #self.log.debug("handling N= line. taxonid is %s" % taxonid) elif 'C=' in line: commonname = line[19:].strip() elif 'S=' in line: pass except Exception as e: traceback.print_exc(file=sys.stdout) finally: fh.close() self.log.debug("Parsed file with %d terms" % len(datalist)) df = pd.DataFrame(datalist, columns=columnnames) outfile = "%s/speclist.csv" % self.outdir self.log.debug("Writing dataframe to %s" % outfile) df.to_csv(outfile) print(str(df)) return df @classmethod def get_default_df(cls, usecache=True): cp = ConfigParser() cp.read(os.path.expanduser('~/git/cafa4/etc/cafa4.conf')) upg = UniProt(cp) df = upg.get_swissprot_df(usecache=usecache) return df @classmethod def calculate_prior(cls, dataframe, species=None, goaspect=None): """ @arg dataframe : standard internal dataframe, species : NCBI species code e.g. MOUSE | HUMAN goaspect : internal aspect code e.g. [cc | bp | mf ] proteinid proteinacc protein species goterm goaspect goevidence 11K_PAVHV P0DJZ0 11K PAVHV GO:0030430 cc IDA ... 
        Returns:
            dataframe with ranked list of goterms, within the specified
            species/aspect if supplied, otherwise globally:

                  goterm goaspect  count  prob
              GO:0045735       cc   3679  .142
              GO:0030433       bp   1256  .086
        """
        df = dataframe
        if species is not None:
            df = df[df.species == species]
        if goaspect is not None:
            df = df[df.goaspect == goaspect]
        totalterms = df.goterm.count()
        newdf = pd.DataFrame(df.goterm.value_counts()).reset_index()
        newdf.columns = ['goterm', 'counts']
        # 'prob' is the term frequency described in the docstring above
        # (assumed completion of the truncated original).
        newdf['prob'] = newdf.counts / totalterms
        return newdf
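# A hypothetical driver for the prior calculation above (the species and
# aspect codes are illustrative; get_default_df() reads ~/git/cafa4/etc/cafa4.conf):
if __name__ == '__main__':
    sprotdf = UniProt.get_default_df(usecache=True)
    prior = UniProt.calculate_prior(sprotdf, species='MOUSE', goaspect='cc')
    print(prior.head())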
# Map sphingolipid-metabolism proteins to Guide to Pharmacology targets and
# print the start of each top-ranked ligand's SMILES string.
from bioservices.uniprot import UniProt
import requests

u = UniProt()
res = u.search('sphingolipid_metabolism+AND+organism:9606', frmt='tab', columns='id')
identifiers = res.strip().split()[1:]

#hey = open('iuphar.txt','w')
for line in identifiers:
    url = "http://www.guidetopharmacology.org/services/targets?accession=%s&database=UniProt/XML" % line
    response = requests.get(url)
    r = response.text
    if len(r) != 0:
        t = r.find('targetId')
        a = r[t + 12:t + 16]
        url = "http://www.guidetopharmacology.org/services/targets/%s/rankOrder" % a
        response = requests.get(url)
        T = response.text
        if len(T) != 0:
            s = T.find('id=')
            oh = T[s + 3:s + 7]
            url = "http://www.guidetopharmacology.org/services/ligands/%s/structure" % oh
            response = requests.get(url)
            ligand = response.text
            key = ligand.find('"smiles" :') + len('"smiles" :')
            print(ligand[key:key + 30])
from pandas import Series, DataFrame
import pandas as pd
from bioservices.uniprot import UniProt

u = UniProt(verbose=False)

#filename = "~/YuLab/interlogs/HumanBinary_All.txt"
filename = "HumanBinary_All.txt"


def get_seq_ppi(filename):
    """Return the protein-protein interaction IDs parsed from the file
    named filename."""
    with open(filename) as file:
        #return get_pairs(file)
        p = get_Ps(file)
        pp = p[0]
        ppid = get_ppi(pp)
        return ppid


#def get_items
# def get_pairs(src):
#     pairs = [[line.split()[0], line.split()[1]] for line in src if line[0] != '']
#     for pp in pairs:
#         k = (pp[0], pp[1])
#     return k


def get_Ps(src):
import re
import pandas as pd
import seaborn as sns
from bioservices.uniprot import UniProt
from dna_features_viewer import GraphicFeature, GraphicRecord
# FeatureTrack and plot_tracks come from the project's genome-track plotting
# library (import not shown in the original snippet).


class GeneVis:

    def __init__(self):
        self.uniprot = UniProt()
        self.family_domains_columns = [
            'comment(DOMAIN)', 'comment(SIMILARITY)', 'families',
            'feature(COILED COIL)', 'feature(COMPOSITIONAL BIAS)',
            'feature(DOMAIN EXTENT)', 'feature(MOTIF)', 'feature(REGION)',
            'feature(REPEAT)', 'feature(ZINC FINGER)'
        ]

    def ensembl2Uniprot(self, ensembl_txid):
        d = self.uniprot.mapping(fr="ENSEMBL_TRS_ID", to="ACC", query=ensembl_txid)
        return d

    def search(self, uniprot_kw="Nid1_MOUSE"):
        results = self.uniprot.search(uniprot_kw,
                                      columns=','.join(self.family_domains_columns))
        self._results = results
        comm_pat = r'(\d+)[.]+(\d+);[\t ]+/note=([0-9a-zA-Z"\- ]+);'
        domain_pat = re.compile('DOMAIN ' + comm_pat)
        motif_pat = re.compile('MOTIF ' + comm_pat)
        repeat_pat = re.compile('REPEAT ' + comm_pat)
        region_pat = re.compile('REGION ' + comm_pat)
        temp = []
        for pat in [domain_pat, motif_pat, repeat_pat, region_pat]:
            # collect matches for conversion to a pandas df
            ext = pat.findall(results)
            if ext:
                temp += ext
        temp2 = pd.DataFrame.from_records(temp, columns=('start', 'end', 'group'))
        temp2.insert(0, column='chromosome', value='1')
        temp2.insert(3, column='strand', value=None)
        temp2['group'] = temp2['group'].str.strip('"')
        temp2['type'] = temp2['group'].str.replace(r" \d", "", regex=True)
        temp2.start = temp2.start.astype(int)
        temp2.end = temp2.end.astype(int)
        if temp2.empty:
            print(f"Warning: no features found for {uniprot_kw}")
        self.features = temp2
        self._max_length = int(
            self.uniprot.search(uniprot_kw, columns='length').split("\n")[1])

    def show(self, region=None, show_label=False, figsize=(12, None)):
        if self.features.empty:
            print("No features found. Run search() again.")
            return
        # Plot track.
        feat_track = FeatureTrack(data=self.features, hue='type',
                                  label='group' if show_label else None)
        if region is None:
            region = ('1', 0, self.features.end.max() + 1)
        fig, ax = plot_tracks([feat_track], region=region, figsize=figsize,
                              despine=True)
        #self.figure = fig
        return ax

    def add_mutation_feature(self, start, end, label, color="#FF1700"):
        self._features.append(
            GraphicFeature(start=start, end=end, strand=+1, color=color, label=label))
        self._max_length = max(self._max_length, end)

    def add_feature(self, palette='tab10'):
        if self.features.empty:
            print("No features found. Run search() again.")
            return
        self._features = []
        ft = self.features['type'].unique()
        colors = sns.color_palette(palette=palette, n_colors=len(ft)).as_hex()
        self.features['color'] = self.features['type'].map(
            {t: c for t, c in zip(ft, colors)})
        for i, row in self.features.iterrows():
            f = GraphicFeature(start=row.start, end=row.end, strand=+1,
                               color=row.color, label=row.group)
            self._features.append(f)

    def show_feature(self, figure_width=8, xlabel=""):
        if len(self._features) < 1:
            print("No features to show")
            return
        record = GraphicRecord(sequence_length=self._max_length,
                               features=self._features)
        ax, _ = record.plot(figure_width=figure_width)
        ax.set_xlabel(xlabel, fontweight="bold", fontsize=16)
        return ax
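# A minimal usage sketch for GeneVis (not from the original source; the
# keyword and axis label are illustrative):
gv = GeneVis()
gv.search("Nid1_MOUSE")   # populates gv.features from UniProt
gv.add_feature()          # builds GraphicFeature list with a seaborn palette
ax = gv.show_feature(xlabel="NID1 domain architecture")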
def test_testset(config):
    upg = UniProt(config)
    df = upg.get_annotation_df()
    return df
def test_speciesmap(config):
    upg = UniProt(config)
    upg._make_species_map()
def test_datparse(config):
    upg = UniProt(config)
    df = upg.get_swissprot_df()
    return df
def test_uniprot(config):
    upg = UniProt(config)
    entrylist = ['Q9CQV8', 'P35213', 'A4K2U9', 'P31946', 'Q4R572', 'P68250']
    out = upg._query_entries(entrylist)
    print(out)
def uniprot():
    u = UniProt(verbose=False, cache=False)
    u.debugLevel = "ERROR"
    return u
def test_swissprot(config):
    logging.debug("Running test_swissprot")
    upg = UniProt(config)
    out = upg.get_swissprot_df()
    print(str(out))
import json
import requests
import tempfile
import os.path as op

import ssbio.databases.pdb
import ssbio.databases.uniprot
import ssbio.utils
from bioservices.uniprot import UniProt

bs_unip = UniProt()


def get_pdbs_for_gene(bigg_model, bigg_gene, cache_dir=tempfile.gettempdir()):
    """Attempt to get a rank-ordered list of available PDB structures for a BiGG Model and its gene.

    Args:
        bigg_model: BiGG Model ID
        bigg_gene: BiGG Gene ID
        cache_dir: directory in which to cache downloaded gene info

    Returns:
        list: rank-ordered list of tuples of (pdb_id, chain_id)
    """
    my_structures = []

    # Download gene info
    gene = ssbio.utils.request_json(
        link='http://bigg.ucsd.edu/api/v2/models/{}/genes/{}'.format(bigg_model, bigg_gene),
        outfile='{}_{}.json'.format(bigg_model, bigg_gene),
        outdir=cache_dir,
        force_rerun_flag=False)

    uniprots = []
""" Since BindingDB has its own monomer IDs, all IDs are converted to PubChem CIDs. For that, the list of all the identifier mappings from monomer ID to PubChem IDs is downloaded from BindingDB webpage given below. That the file called monomer.txt is used in the code. Thus, used need to download the updated list, then use it in our code. (https://www.bindingdb.org/bind/chemsearch/marvin/SDFdownload.jsp?all_download=yes)""" from bioservices.uniprot import UniProt import requests from xml.etree import ElementTree u = UniProt() res = u.search('(sphingolipid+OR+sphingomyelin+OR+glycosphingolipid)+AND+organism:9606 ', frmt='tab',columns='id') identifiers = res.strip().split()[1:] #hey = open("bindingdb.txt","w") f=open("monomer.txt","r") lines=f.readlines() monoid = [] mono = [] sim= [] CID=[] cid = [] """ All the monomerIDs and their equivalent CIDs are saved as separate arrays""" for line in lines: monoid.append(line.split()[0]) for line in lines: cid.append(line.split()[1])
import os
import numpy as np
import pandas as pd
from progress.bar import Bar
from bioservices.uniprot import UniProt

hagaiNames = [
    "Microenvironment_ID", "Binding_motif", "Cofactor", "Metal",
    "cofactor_group", "EC", "Head", "Molecule", "Organism_scientific",
    "no_rank", "superkingdom", "phylum", "class_", "order", "family",
    "genus", "organism_taxid", "name", "chains", "Resolution",
    "Structure_method", "Keywords", "Journal_reference", "Release_date"
]

dfHag = pd.read_csv(os.path.join(hdir, 'hagai.csv'), header=0, index_col=False)
dfHag[['pdb', 'LIG', 'chain', 'resID', 'function']] = \
    dfHag.Microenvironment_ID.str.split('[._]', expand=True)
dfHag['resID'] = dfHag.resID.astype(int)
dfHag['Binding_motif'] = dfHag.Binding_motif.astype(int)
dfHag['hagai'] = 'yes'
dfHag['ec'] = np.nan

u = UniProt(verbose=True)
bar = Bar("Processing", max=len(dfHag.index), fill='*',
          suffix='%(percent).1f%% - %(eta)ds')

# Map PDB IDs to UniProt accessions, then pull the annotation dataframe.
pdb2acc = u.mapping('PDB_ID', 'ACC', query=dfHag.pdb)
# print(pdb2acc)
# exit()
df = u.get_df([accs[0] for accs in pdb2acc.values()])
df.to_csv(os.path.join(hdir, 'df_microPDBs.csv'))
# Returns dataframe with columns:
#   Unnamed: 0, Entry, Entry name, Gene names, Gene names (primary),
#   Gene names (synonym), Gene names (ordered locus), Gene names (ORF),
#   Organism, Organism ID
# -*- coding: utf-8 -*-
"""
Created on Thu Jun  9 11:46:12 2016

@author: ewj
"""
from Bio import SeqIO
import myUniprotIO
from UniprotUtils import get_feature_frame
from bioservices.uniprot import UniProt
import pandas as pd
from tqdm import tqdm
import os
import itertools

u = UniProt(verbose=True)


def evidence(feature, letter, out_file):
    # read query file
    x = myUniprotIO.UniprotIterator(open('query.xml', 'r'), return_raw_comments=True)
    # parse for wanted data (i.e. gene name, sequence id, position, etc.)
    L = []
    for rec, seqrec in tqdm(enumerate(x)):
        gene_name = seqrec.annotations.get('gene_name_primary', '_none_')
        t = (get_feature_frame(seqrec, stype=feature, filter_val=letter)
             .assign(rec=rec, id=seqrec.id, gene_name=gene_name)
             .rename(columns={'start': 'position'}))
        L.append(t)
import re
from collections import defaultdict
from bioservices.uniprot import UniProt

# Count hits per subject identifier in the BLAST report.
blast = open("staph_out.txt", 'r')
table = defaultdict(int)
for line in blast:
    match = re.search(r"\A> \w", line)
    if match:
        match = re.search(r"\w{,5}_\w{,5}\s", line[2:])
        if match:
            table[match.group()] += 1
blast.close()
print(len(table))

# Write hit counts and accumulate GO-term counts weighted by hit frequency.
blastOut = open("blast.tsv", 'w')
u = UniProt(verbose=False)
goTable = defaultdict(int)
i = 0
for item in table:
    blastOut.write(item + '\t' + str(table[item]) + '\n')
    i += 1
    if i % 25 == 0:
        print(i)
    value = u.search(item, columns="go")
    value = value.split(';')
    for val in value:
        match = re.search(r"GO:\d*", val)
        if match:
            goTable[match.group()] += table[item]
blastOut.close()
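# Hypothetical follow-up (not in the original): persist the weighted GO-term
# counts, most frequent first. The output file name is illustrative.
with open("go_counts.tsv", "w") as out:
    for go, count in sorted(goTable.items(), key=lambda kv: -kv[1]):
        out.write("%s\t%d\n" % (go, count))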
def uniprot():
    u = UniProt(verbose=False, cache=False)
    u.logging.level = "ERROR"
    return u