Example #1
 def create(cls, name, query, godag, verbose=True):
     if verbose:
         print(
             'No data with name \'{}\' found. Downloading... (This may take a while)'
             .format(name))
     # download current GO DAG
     r = requests.get('http://purl.obolibrary.org/obo/go/go-basic.obo',
                      allow_redirects=True)
     with open(PATH + '/godag/' + name + '.obo', 'wb') as file:
         file.write(r.content)
     # setup UniProt API
     up = UniProt()
     up.settings.TIMEOUT = None
     # load GO DAG
     godag = GoDag(name=godag)
     # download
     result = up.search(query,
                        columns='id, sequence, go(molecular function)')
     df = pd.read_csv(StringIO(result), delimiter='\t')
     # filter go terms present in godag
     df['Gene ontology (molecular function)'] = df[
         'Gene ontology (molecular function)'].map(lambda labels: [
              l for l in re.findall(r'GO:\d{7}', str(labels))
             if l in godag.GODAG
         ])
     df = df[df['Gene ontology (molecular function)'].map(
         lambda l: len(l) > 0)]
     # annotate full go
     df['Gene ontology (molecular function)'] = df[
         'Gene ontology (molecular function)'].map(
             lambda labels: godag.get_go_lineage_of(labels))
     # clean
     df = cls.clean(df)
     # save
     df['labels'] = df.apply(lambda row: '; '.join(row['labels']), axis=1)
     os.mkdir(PATH + '/datasets/' + name)
     df.to_csv(PATH + '/datasets/' + name + '/data.csv', index=False)
     with open(PATH + '/datasets/' + name + '/info.txt', 'w') as file:
         file.write('Downloaded: {}'.format(date.today()))
     if verbose: print('Raw data successfully downloaded.')
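The GO-term filtering step above hinges on re.findall with the pattern r'GO:\d{7}'; a minimal standalone check of that extraction (the sample label string is made up for illustration):

import re

labels = "cytosol [GO:0005829]; protein binding [GO:0005515]"
print(re.findall(r'GO:\d{7}', labels))
# ['GO:0005829', 'GO:0005515']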
Example #2
from Bio.SeqUtils.CheckSum import seguid  # assumed source of the seguid helper
from bioservices.uniprot import UniProt


class ProteinIdentifier(object):
    def __init__(self):
        super().__init__()
        self.seguid = seguid
        self.uniprot = UniProt(verbose=False)

    def sequence_from_uniprot(self, uniprot_ac):
        """Returns protein sequence from uniprot identifier"""
        try:
            return self.uniprot.get_fasta_sequence(uniprot_ac)
        except ValueError:
            return None

    @staticmethod
    def protein_identifier_resolver():
        """Returns protein sequence of a given identifier."""
        pass  # TODO

    def encode(self, sequence):
        """Protein seguid checksum based on amino-acid sequence"""
        return str(self.seguid(sequence))
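A short usage sketch (the accession is one that appears in the test entry list further down this page; seguid import as above):

pid = ProteinIdentifier()
seq = pid.sequence_from_uniprot('P31946')   # returns None on lookup failure
if seq is not None:
    print(pid.encode(seq))                  # SEGUID checksum of the sequence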
Example #3
def get_uniprot_df(accList):
    from bioservices.uniprot import UniProt  # local import of the UniProt client
 
    u = UniProt(verbose=True)
    df = u.get_df(accList)
    return df
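A typical call, reusing accessions that appear in the test entries later on this page:

df = get_uniprot_df(['Q9CQV8', 'P35213', 'P31946'])
print(df[['Entry', 'Gene ontology IDs']].head())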
Example #4
class UniProt(object):
    '''
    Aux info plugin.
    Takes a dataframe, extracts entry_ids, and adds info from UniProt.
    Returns the modified dataframe.
    '''

    ASPECTMAP = {'C': 'cc', 'F': 'mf', 'P': 'bp'}

    def __init__(self, config):
        self.log = logging.getLogger(self.__class__.__name__)
        self.config = config
        self.uniprotapi = None
        self.outdir = os.path.expanduser(config.get('global', 'outdir'))
        self.taxid_mapfile = os.path.expanduser(
            config.get('global', 'taxid_mapfile'))
        self.sprotdatfile = os.path.expanduser(
            config.get('ontologyplugin', 'sprotdatfile'))
        self.cachedir = os.path.expanduser(
            config.get('ontologyplugin', 'cachedir'))
        excodes = config.get('ontologyplugin',
                             'excluded_evidence_codes',
                             fallback='').split(',')
        excodes = [x.strip() for x in excodes if x.strip()]
        self.excluded_evidence_codes = excodes
        self.sprotdf = None
        self.udf = None
        self.tdf = pd.read_csv(self.taxid_mapfile, index_col=0)

        # Create easy lookup mappings from taxon data frame...
        itdf = self.tdf.set_index('taxonid')
        self.taxiddict = itdf.to_dict(orient='index')

        isdf = self.tdf.set_index('species')
        self.specdict = isdf.to_dict(orient='index')
        self.log.debug("UniProtGOlugin initialized.")

    def cafa_execute(self, dataframe, online=False):
        """
        Takes inbound dataframe of orthologs and adds in GO terms and evidence codes from 
        uniprot/swissprot.
        For a given ortholog protein, one row is added for each GO term.
        Returns new dataframe with all info.   
        
        """
        #
        # inbound:
        #            cafaid         evalue  score  bias  db proteinacc protein species cafaprot cafaspec
        # 0   T100900000001  1.100000e-156  523.6   8.5  sp    Q9CQV8   1433B   MOUSE    1433B    MOUSE
        # 1   T100900000001  4.100000e-155  518.4   7.7  sp    P35213   1433B     RAT    1433B    MOUSE
        # 2   T100900000001  5.400000e-155  518.0   7.2  sp    A4K2U9   1433B   PONAB    1433B    MOUSE
        # 3   T100900000001  5.400000e-155  518.0   7.2  sp    P31946   1433B   HUMAN    1433B    MOUSE

        # Get all unique target accession numbers.
        entries = dataframe['proteinacc'].unique().tolist()
        # Look up GOterms in uniprot...
        if online:
            self.uniprotapi = UniProt()
            self.log.debug("Querying uniprot API for %d unique entries" %
                           len(entries))
            self.udf = self.uniprotapi.get_df(entries)
            self.log.debug(f"\n{self.udf}")
            self.udf.to_csv("%s/uniprot.csv" % self.outdir)
            udfslim = self.udf[['Entry', 'Gene ontology IDs']]
            # df.tacc corresponds to udf.Entry  ...
            #  entry == proteinid
            #  gene ontology id = goterm
            #
            self.log.debug("Making new rows for each goterm.")
            newrowdict = {}
            ix = 0
            for row in udfslim.itertuples():
                (entry, golist) = row[1:]
                for goterm in golist:
                    #print("creating new row: %s : %s %s %s" % (ix, entry, gene, goterm))
                    newrow = [entry, goterm]
                    newrowdict[ix] = newrow
                    ix += 1

            godf = pd.DataFrame.from_dict(newrowdict,
                                          orient='index',
                                          columns=['entry', 'goterm'])

        else:
            self.log.debug("Using offline functionality...")
            godf = self.get_swissprot_df(usecache=True)
            self.log.debug(f"GO DataFrame:\n{godf}")
            #    proteinid   proteinacc    goterm      goaspect goevidence
            # 0  001R_FRG3G  Q6GZX4      GO:0046782    bp        IEA
            # 1  002L_FRG3G  Q6GZX3      GO:0033644    cc        IEA

        # For each go term add row...
        newdfdict = {}
        ix = 0
        for row in dataframe.itertuples():
            self.log.debug("inbound row = %s" % str(row))
            #(query, evalue, score, bias, db, tacc, protein, species) = row[1:]
            (cafaid, evalue, score, bias, db, proteinacc, protein, species,
             cafaprot, cafaspec) = row[1:]
            self.log.debug(f"Searching for match for '{proteinacc}'")
            gomatch = godf[godf.proteinacc == proteinacc]
            self.log.debug(f"gomatch is:\n {gomatch}")
            for gr in gomatch.itertuples():
                (entry, proteinacc, protein, species, goterm, goaspect,
                 goevidence) = gr[1:]
                newrow = [
                    cafaid, evalue, score, bias, db, proteinacc, protein,
                    species, cafaprot, cafaspec, goterm, goaspect, goevidence
                ]
                newdfdict[ix] = newrow
                ix += 1

        newdf = pd.DataFrame.from_dict(newdfdict,
                                       orient='index',
                                       columns=[
                                           'cafaid', 'evalue', 'score', 'bias',
                                           'db', 'proteinacc', 'protein',
                                           'species', 'cafaprot', 'cafaspec',
                                           'goterm', 'goaspect', 'goevidence'
                                       ])
        for xc in self.excluded_evidence_codes:
            self.log.debug(
                f"{len(newdf.index)} rows. Removing evidence code {xc}...")
            #newdf = newdf[newdf.goevidence != xc]
            newdf.drop(newdf.loc[newdf['goevidence'] == xc].index,
                       inplace=True)
            self.log.debug(f"{len(newdf.index)} rows after.")
            self.log.debug(f"\n{str(newdf)}")

        return newdf
        # Output:
        #             cafaid         evalue  score  bias  db proteinacc protein species cafaprot cafaspec      goterm goaspect goevidence
        # 0    T100900000001  1.100000e-156  523.6   8.5  sp     Q9CQV8   1433B   MOUSE    1433B    MOUSE  GO:0005737       cc        ISO
        # 1    T100900000001  1.100000e-156  523.6   8.5  sp     Q9CQV8   1433B   MOUSE    1433B    MOUSE  GO:0005829       cc        ISO
        # 2    T100900000001  1.100000e-156  523.6   8.5  sp     Q9CQV8   1433B   MOUSE    1433B    MOUSE  GO:0042470       cc        IEA
        #

    def _dat2upr(self):
        self.log.debug("opening swissprot dat file %s" % self.sprotdatfile)
        rgen = SeqIO.parse(self.sprotdatfile, "swiss")
        i = 0
        uprlist = []
        self.log.debug("Completed SeqIO.parse(). Handling records...")
        for record in rgen:
            upr = UniProtRecord(record)
            uprlist.append(upr)
            #print(record)
            i += 1
            if i % 10000 == 0:
                self.log.debug("Handled %d records..." % i)
            #    break
        self.log.debug("parsed dat file of %d records" % len(uprlist))
        return uprlist

    def get_annotation_df(self):
        self.log.debug("opening swissprot dat file %s" % self.sprotdatfile)
        rgen = SeqIO.parse(self.sprotdatfile, "swiss")
        self.log.debug("rgen type is %s" % type(rgen))
        #self.log.debug("Created generator with %d records" % len(rgen))
        i = 0
        alltuples = []
        for record in rgen:
            #print(record)
            i += 1
            if i % 1000 == 0:
                self.log.debug("Handled %d records..." % i)
            goterms = []
            for xf in record.dbxrefs:
                if xf.startswith("GO:"):
                    gt = xf[3:]
                    goterms.append(gt)
            if len(goterms) > 0:
                proteinid = record.id
                protein = record.name
                taxonid = record.annotations['ncbi_taxid'][0]
                for gt in goterms:
                    t = (taxonid, proteinid, protein, gt)
                    alltuples.append(t)
                # fan out over goterms
            else:
                # ignore un-annotated entries.
                pass

            if i >= 1000:
                break
        #self.log.debug("generated %d tuples" % len(alltuples))
        self.log.debug(f"Generated { len(alltuples) } tuples")
        df = pd.DataFrame(
            alltuples, columns=['taxonid', 'proteinid', 'protein', 'goterm'])

        return df


##########################################
#
#   Non-cafalib usage (NOT using API)
#
##########################################

    def get_swissprot_df(self, usecache=True):
        """
        Get swissprot info as dataframe from files, without API, one row per GOterm.
       
        Fields:
           proteinid protein taxonid goterm goaspect goevidence 
      
        self.proteinid = record.id
        self.proteinacc = record. ?
        self.protein = record.name
        self.goterms = []
        for xf in record.dbxrefs:
            if xf.startswith("GO:"):
                gt = xf[3:]
                self.goterms.append(gt)
        self.accessions = record.annotations['accessions']
        self.taxonid = record.annotations['ncbi_taxid'][0]
        
        """

        cachepath = f"{self.cachedir}/sprotgolist.csv"
        if usecache:
            if os.path.exists(cachepath):
                self.sprotdf = pd.read_csv(cachepath, index_col=0)
                self.log.debug(f"Loaded dataframe from cache: {cachepath}")
        if self.sprotdf is not None:
            self.log.debug("Cache hit. Using DataFrame from cache...")
        else:
            self.log.debug("Getting dictionary list...")
            dlist = self._handle_swissprot_file()
            self.log.debug(
                f"Got dict list of {len(dlist)} entries. Creating dataframe..."
            )
            self.sprotdf = pd.DataFrame(dlist)
            #self.sprotdf.set_index('proteinacc', inplace = True)
            self.log.debug(f"Made dataframe:\n {str(self.sprotdf)}")
            self.log.info(f"Saving dataframe to cache file: {cachepath}")
            self.sprotdf.to_csv(cachepath)
        return self.sprotdf

    def _handle_swissprot_file(self):
        '''
         Read uniprot_sprot.dat and return list of dicts of relevant fields.
    

        '''
        self.log.debug("Handling swissprot file...")
        filehandle = None
        dlist = []
        try:
            self.log.info(f"Opening file {self.sprotdatfile}")
            filehandle = open(self.sprotdatfile, 'r')
            self.log.debug("File opened. Parsing...")
            dlist = self._parsefile(filehandle)
        except FileNotFoundError:
            self.log.error("No such file %s" % self.sprotdatfile)
        finally:
            if filehandle is not None:
                filehandle.close()
        self.log.debug("Parsed data file.")
        return dlist

    def _parsefile(self, filehandle):
        """
        Parses sprot DAT file and fans out goterms to list of dicts. 
    
        """
        allentries = []
        current = None
        sumreport = 1
        suminterval = 10000
        repthresh = sumreport * suminterval
        try:
            while True:
                line = filehandle.readline()
                if line == '':
                    break
                if line.startswith("ID "):
                    # ID   001R_FRG3G              Reviewed;         256 AA.
                    #      <prot_name>_<prot_spec>
                    proteinid = line[5:16].strip()
                    current = defaultdict(dict)
                    current['proteinid'] = proteinid
                    (protein, species) = proteinid.split('_')
                    current['protein'] = protein
                    current['species'] = species
                    self.log.debug("Handling ID. New entry.")

                elif line.startswith("AC "):
                    # AC   Q6GZX4;
                    # AC   Q91896; O57469;
                    self.log.debug("Handling AC.")
                    accession = line[5:11].strip()
                    current['proteinacc'] = accession

                elif line.startswith("OX   "):
                    #OX   NCBI_TaxID=654924;
                    self.log.debug("Handling OX.")
                    taxonid = ""
                    val = line[5:]
                    fields = val.split('=')
                    if fields[0] == 'NCBI_TaxID':
                        taxonid = fields[1].strip().replace(';', '')
                    current['taxonid'] = taxonid

                elif line.startswith("DR   GO;"):
                    # DR   GO; GO:0046782; P:regulation of viral transcription; IEA:InterPro.
                    # P biological process, C cellular component, F molecular function.
                    self.log.debug("Handling DR.")
                    fields = line.split(';')
                    goterm = fields[1].strip()
                    goinfo = fields[2]
                    aspcode = goinfo.split(':')[0].strip()
                    goaspect = UniProt.ASPECTMAP[aspcode]
                    goevsrc = fields[3]
                    (goevidence, evsrc) = goevsrc.split(':')
                    goevidence = goevidence.strip()
                    current['goterms'][goterm] = [goaspect, goevidence]

                elif line.startswith("SQ   SEQUENCE"):
                    self.log.debug("Handling SQ:  XXX")
                    # line = filehandle.readline()

                elif line.startswith("GN   "):
                    # Examples:
                    #  GN   ABL1 {ECO:0000303|PubMed:21546455},
                    #  GN   Name=BRCA1; Synonyms=RNF53;
                    #  GN   ORFNames=T13E15.24/T13E15.23, T14P1.25/T14P1.24;
                    #

                    self.log.debug("Handling GN.")
                    val = line[5:]

                elif line.startswith("//"):
                    self.log.debug("End of entry.")
                    clist = self._handle_current(current)
                    current = None
                    allentries.extend(clist)
                    self.log.debug(
                        f"All entries list now {len(allentries)} items... ")
                    if len(allentries) >= repthresh:
                        self.log.info(
                            f"Processed {len(allentries)} entries... ")
                        sumreport += 1
                        repthresh = sumreport * suminterval

        except Exception as e:
            traceback.print_exc(file=sys.stdout)

        self.log.info(f"Parsed file with {len(allentries)} goterms")
        return allentries

    def _handle_current(self, currentinfo):
        """
        takes dictionary:
        currentinfo = { 'proteinid' : 'x', 'protein' : 'xxx' , 'goterms' :  { 'GO:0005634' : [ 'C' , 'HDA' ],
                                                                              'GO:0005886' : [ 'C' ,'HDA'],
                                                                              }                                                                                              
                        } 
        
        returns list of dicts:
                     [  { 'proteinid' : 'x', 'protein' : 'xxx' , 'goterm' : 'GO:0005634',
                                                                           'goaspect':'cc',
                                                                           'goevidence': 'HDA' },
                       { 'proteinid' : 'x', 'protein' : 'xxx' , 'goterm' : 'GO:0005886',
                                                                           'goaspect':'cc',
                                                                           'goevidence': 'HDA' },                                                                           
                      ]
        """
        self.log.debug(f'handling {currentinfo} ')
        newlist = []
        gtdict = currentinfo['goterms']
        for gt in gtdict.keys():
            self.log.debug(f"Handling term {gt}")
            newdict = {}
            newdict['proteinid'] = currentinfo['proteinid']
            newdict['proteinacc'] = currentinfo['proteinacc']
            newdict['protein'] = currentinfo['protein']
            newdict['species'] = currentinfo['species']
            newdict['goterm'] = gt
            newdict['goaspect'] = currentinfo['goterms'][gt][0]
            newdict['goevidence'] = currentinfo['goterms'][gt][1]
            newlist.append(newdict)

        self.log.debug(f"Created fanout of length: {len(newlist)}")
        return newlist

    def _make_species_map(self):
        '''
        Parses uniprot speclist.txt    https://www.uniprot.org/docs/speclist.txt
        to local .CSV
        
        taxonid   species   lineanname       commonname
        72259      ABANI    Abaeis nicippe   Sleepy orange butterfly
                                             
        OXYMO E  475340: N=Oxytenis modestia
                         C=Costa Rica leaf moth
                         S=Dead-leaf moth
        
        '''
        listfile = self.speciesmap
        self.log.debug("Opening species map file %s" % listfile)
        try:
            fh = open(listfile, 'r')
        except FileNotFoundError:
            self.log.error("No such file %s" % listfile)
            return None

        species = None
        kingdom = None
        taxonid = None
        lineanname = None
        commonname = None

        columnnames = [
            'species', 'kingdom', 'taxonid', 'lineanname', 'commonname'
        ]
        datalist = []
        # list of tuples

        try:
            for line in fh:
                #self.log.debug("handling line %s" % line)
                if 'N=' in line and not line.startswith('Code'):
                    #self.log.debug("handling N= line. taxonid is %s" % taxonid)
                    if species is not None:
                        tup = (species, kingdom, taxonid, lineanname,
                               commonname)
                        #self.log.debug("Adding tuple: %s" % str(tup))
                        datalist.append(tup)
                        # reset all variables
                        species = kingdom = taxonid = lineanname = commonname = None
                    species = line[:5]
                    kingdom = line[6]
                    taxonid = line[7:15].strip()
                    lineanname = line[19:].strip()
                    #self.log.debug("handling N= line. taxonid is %s" % taxonid)
                elif 'C=' in line:
                    commonname = line[19:].strip()
                elif 'S=' in line:
                    pass
        except Exception as e:
            traceback.print_exc(file=sys.stdout)
        finally:
            fh.close()

        self.log.debug("Parsed file with %d terms" % len(datalist))

        df = pd.DataFrame(datalist, columns=columnnames)
        outfile = "%s/speclist.csv" % self.outdir
        self.log.debug("Writing dataframe to %s" % outfile)
        df.to_csv(outfile)
        print(str(df))
        return df

    @classmethod
    def get_default_df(cls, usecache=True):
        cp = ConfigParser()
        cp.read(os.path.expanduser('~/git/cafa4/etc/cafa4.conf'))
        upg = UniProt(cp)
        df = upg.get_swissprot_df(usecache=usecache)

        return df

    @classmethod
    def calculate_prior(cls, dataframe, species=None, goaspect=None):
        """
        @arg 
           dataframe :  standard internal dataframe, 
           species  :  NCBI species code   e.g. MOUSE | HUMAN
           goaspect : internal aspect code   e.g. [cc | bp | mf ]
           
           proteinid proteinacc protein species      goterm goaspect goevidence
           11K_PAVHV     P0DJZ0     11K   PAVHV  GO:0030430       cc        IDA
           ...

        returns:
            dataframe w/ ranked list of goterms, within the specified species/aspect if supplied.
            otherwise globally 
            
            goterm      goaspect    count    prob
            GO:0045735  cc           3679    .142
            GO:0030433  bp           1256    .086

        """
        df = dataframe
        if species is not None:
            df = df[df.species == species]
        if goaspect is not None:
            df = df[df.goaspect == goaspect]

        totalterms = df.goterm.count()
        newdf = pd.DataFrame(df.goterm.value_counts()).reset_index()
        newdf.columns = ['goterm', 'counts']
        # finish the ranking described in the docstring: per-term probability
        newdf['prob'] = newdf.counts / totalterms
        return newdf
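Chaining the two classmethods above, per their docstrings (requires the cafa4 config file referenced in get_default_df):

df = UniProt.get_default_df(usecache=True)
priors = UniProt.calculate_prior(df, species='MOUSE', goaspect='cc')
print(priors.head())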
Example #5
from bioservices.uniprot import UniProt
import requests

u = UniProt()

res = u.search('sphingolipid_metabolism+AND+organism:9606',
               frmt='tab',
               columns='id')
identifiers = res.strip().split()[1:]
#hey = open('iuphar.txt','w')
for line in identifiers:
    url = "http://www.guidetopharmacology.org/services/targets?accession=%s&database=UniProt/XML" % line
    response = requests.get(url)
    r = response.text
    if len(r) != 0:
        t = r.find('targetId')
        a = r[t + 12:t + 16]
        url = "http://www.guidetopharmacology.org/services/targets/%s/rankOrder" % a
        response = requests.get(url)
        T = response.text
        if len(T) != 0:
            s = T.find('id=')
            oh = T[s + 3:s + 7]
            url = "http://www.guidetopharmacology.org/services/ligands/%s/structure" % oh
            response = requests.get(url)
            ligand = response.text
            key = ligand.find('"smiles" :') + len('"smiles" :')
            print(ligand[key:key + 30])
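The character arithmetic above is brittle. Since the payload being scraped is JSON-like (note the '"smiles" :' key it searches for), here is a sketch of the first lookup using requests' JSON decoding instead; the accession is a placeholder and the list-of-records shape is an assumption:

resp = requests.get("http://www.guidetopharmacology.org/services/targets",
                    params={"accession": "P21453", "database": "UniProt"})
targets = resp.json()                  # assumption: a JSON list of target records
if targets:
    print(targets[0]["targetId"])      # key name taken from the text search above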
Example #6
from pandas import Series, DataFrame
import pandas as pd

from bioservices.uniprot import UniProt

u = UniProt(verbose=False)

#filename= "~/YuLab/interlogs/HumanBinary_All.txt"
filename = "HumanBinary_All.txt"


def get_seq_ppi(filename):
    """Return all the items in the file named filename; if testfn
    then include only those items for which testfn is true"""
    with open(filename) as file:
        #return get_pairs(file)
        p = get_Ps(file)
        pp = p[0]
        ppid = get_ppi(pp)
        return ppid


#def get_items


# def get_pairs(src):
#     pairs = [[line.split()[0], line.split()[1]] for line in src if line[0] != '']
#     for pp in pairs:
#         k = (pp[0], pp[1])
#     return k
def get_Ps(src):
    # truncated in the source; a minimal sketch (assumption) following the
    # commented-out get_pairs above: one whitespace-separated ID pair per line
    return [[line.split()[0], line.split()[1]] for line in src if line.strip()]
Example #7
class GeneVis:
    def __init__(self, ):
        self.uniprot = UniProt()
        self.family_domains_columns = [
            'comment(DOMAIN)', 'comment(SIMILARITY)', 'families',
            'feature(COILED COIL)', 'feature(COMPOSITIONAL BIAS)',
            'feature(DOMAIN EXTENT)', 'feature(MOTIF)', 'feature(REGION)',
            'feature(REPEAT)', 'feature(ZINC FINGER)'
        ]

    def ensembl2Uniprot(self, ensembl_txid):
        return self.uniprot.mapping(fr="ENSEMBL_TRS_ID", to="ACC",
                                    query=ensembl_txid)

    def search(self, uniprot_kw="Nid1_MOUSE"):
        #for col in family_domains_columns:
        results = self.uniprot.search(uniprot_kw,
                                      columns=','.join(
                                          self.family_domains_columns))
        self._results = results
        comm_pat = r'(\d+)[.]+(\d+);[\t ]+/note=([0-9a-zA-Z"\- ]+);'
        domain_pat = re.compile('DOMAIN ' + comm_pat)
        motif_pat = re.compile('MOTIF ' + comm_pat)
        repeat_pat = re.compile('REPEAT ' + comm_pat)
        region_pat = re.compile('REGION ' + comm_pat)

        temp = []
        for pat in [domain_pat, motif_pat, repeat_pat, region_pat]:
            ## convert to pandas df
            ext = pat.findall(results)
            if ext:
                temp += ext

        temp2 = pd.DataFrame.from_records(temp,
                                          columns=('start', 'end', 'group'))
        temp2.insert(0, column='chromosome', value='1')
        temp2.insert(3, column='strand', value=None)
        temp2['group'] = temp2['group'].str.strip('"')
        temp2['type'] = temp2['group'].str.replace(r" \d", "", regex=True)

        temp2.start = temp2.start.astype(int)
        temp2.end = temp2.end.astype(int)
        if temp2.empty:
            print(f"Warning: NO features found for {uniprot_kw}")
        self.features = temp2
        self._max_length = int(
            self.uniprot.search(uniprot_kw, columns='length').split("\n")[1])

    def show(self, region=None, show_label=False, figsize=(12, None)):
        if self.features.empty:
            print("No features found. Run search() first.")
            return
        # Plot track.
        feat_track = FeatureTrack(data=self.features,
                                  hue='type',
                                  label='group' if show_label else None)
        if region is None:
            region = ('1', 0, self.features.end.max() + 1)
        fig, ax = plot_tracks([feat_track],
                              region=region,
                              figsize=figsize,
                              despine=True)
        #self.figure = fig
        return ax

    def add_mutation_feature(self, start, end, label, color="#FF1700"):
        self._features.append(
            GraphicFeature(start=start,
                           end=end,
                           strand=+1,
                           color=color,
                           label=label))
        self._max_length = max(self._max_length, end)

    def add_feature(self, palette='tab10'):
        if self.features.empty:
            print("No found genes. do search again")
            return
        self._features = []
        ft = self.features['type'].unique()
        colors = sns.color_palette(palette=palette, n_colors=len(ft)).as_hex()
        self.features['color'] = self.features['type'].map(
            {t: c
             for t, c in zip(ft, colors)})

        for i, row in self.features.iterrows():
            f = GraphicFeature(start=row.start,
                               end=row.end,
                               strand=+1,
                               color=row.color,
                               label=row.group)
            self._features.append(f)

    def show_feature(self, figure_width=8, xlabel=""):
        if len(self._features) < 1:
            print("No feautres to show")
            return
        record = GraphicRecord(sequence_length=self._max_length,
                               features=self._features)
        ax, _ = record.plot(figure_width=figure_width)
        ax.set_xlabel(xlabel, fontweight="bold", fontsize=16)
        return ax
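A typical round trip with this class, using the default entry name from search():

gv = GeneVis()
gv.search("Nid1_MOUSE")        # fills gv.features from the UniProt domain/motif columns
gv.show()                      # FeatureTrack plot of the parsed features
gv.add_feature(palette='tab10')
gv.show_feature(xlabel="Nid1_MOUSE")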
Example #8
def test_testset(config):
    upg = UniProt(config)
    df = upg.get_annotation_df()
    return df
Example #9
def test_speciesmap(config):
    upg = UniProt(config)
    upg._make_species_map()
Example #10
def test_datparse(config):
    upg = UniProt(config)
    df = upg.get_swissprot_df()
    return df
Example #11
def test_uniprot(config):
    upg = UniProt(config)
    entrylist = ['Q9CQV8', 'P35213', 'A4K2U9', 'P31946', 'Q4R572', 'P68250']
    out = upg._query_entries(entrylist)
    print(out)
Example #12
def uniprot():
    u = UniProt(verbose=False, cache=False)
    u.debugLevel = "ERROR"
    return u
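This reads like a pytest fixture (the @pytest.fixture decorator was presumably lost with the surrounding context); a hypothetical test consuming it:

def test_search_id_column(uniprot):
    # 'uniprot' is injected by pytest from the fixture above
    res = uniprot.search("Q9CQV8", columns="id")
    assert res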
Example #13
def test_swissprot(config):
    logging.debug("Running test_swissprot")
    upg = UniProt(config)
    out = upg.get_swissprot_df()
    print(str(out))
Example #14
import json
import requests
import tempfile
import ssbio.databases.pdb
import ssbio.utils
import os.path as op
from bioservices.uniprot import UniProt
import ssbio.databases.uniprot
bs_unip = UniProt()


def get_pdbs_for_gene(bigg_model, bigg_gene, cache_dir=tempfile.gettempdir()):
    """Attempt to get a rank-ordered list of available PDB structures for a BiGG Model and its gene.

    Args:
        bigg_model: BiGG Model ID
        bigg_gene: BiGG Gene ID
        cache_dir: directory for the cached gene info download (defaults to the system temp dir)
    Returns:
        list: rank-ordered list of tuples of (pdb_id, chain_id)

    """
    my_structures = []

    # Download gene info
    gene = ssbio.utils.request_json(link='http://bigg.ucsd.edu/api/v2/models/{}/genes/{}'.format(bigg_model, bigg_gene),
                                    outfile='{}_{}.json'.format(bigg_model, bigg_gene),
                                    outdir=cache_dir,
                                    force_rerun_flag=False)

    uniprots = []
""" Since BindingDB has its own monomer IDs, all IDs are converted to PubChem CIDs. For that, the list of all the identifier
mappings from monomer ID to PubChem IDs is downloaded from BindingDB webpage given below. That the file called monomer.txt
is used in the code. Thus, used need to download the updated list, then use it in our code.
(https://www.bindingdb.org/bind/chemsearch/marvin/SDFdownload.jsp?all_download=yes)"""

from bioservices.uniprot import UniProt
import requests
from xml.etree import ElementTree
u = UniProt()
 
res = u.search('(sphingolipid+OR+sphingomyelin+OR+glycosphingolipid)+AND+organism:9606',
               frmt='tab', columns='id')
identifiers = res.strip().split()[1:]
#hey = open("bindingdb.txt","w")
f = open("monomer.txt", "r")
lines = f.readlines()
f.close()

monoid = []
mono = []
sim = []
CID = []
cid = []

""" All the monomer IDs and their equivalent CIDs are saved as separate lists"""

for line in lines:
    monoid.append(line.split()[0])
    cid.append(line.split()[1])
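With the two parallel lists loaded, the monomer-to-CID lookup the docstring describes is a plain dict; a minimal sketch (the monomer ID shown is a made-up placeholder):

mono2cid = dict(zip(monoid, cid))
print(mono2cid.get("50000001"))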
Example #16
import os
import numpy as np
import pandas as pd
from bioservices.uniprot import UniProt
from progress.bar import Bar  # assumed source of the Bar progress meter

hagaiNames = [
    "Microenvironment_ID", "Binding_motif", "Cofactor", "Metal",
    "cofactor_group", "EC", "Head", "Molecule", "Organism_scientific",
    "no_rank", "superkingdom", "phylum", "class_", "order", "family", "genus",
    "organism_taxid", "name", "chains", "Resolution", "Structure_method",
    "Keywords", "Journal_reference", "Release_date"
]
dfHag = pd.read_csv(os.path.join(hdir, 'hagai.csv'), header=0, index_col=False)
dfHag[['pdb', 'LIG', 'chain', 'resID',
       'function']] = dfHag.Microenvironment_ID.str.split('[._]', expand=True)
dfHag['resID'] = dfHag.resID.astype(int)
dfHag['Binding_motif'] = dfHag.Binding_motif.astype(int)
dfHag['hagai'] = 'yes'
dfHag['ec'] = np.nan
u = UniProt(verbose=True)

bar = Bar("Processing",
          max=len(dfHag.index),
          fill='*',
          suffix='%(percent).1f%% - %(eta)ds')

pdb2acc = u.mapping('PDB_ID', 'ACC', query=dfHag.pdb.tolist())
# print(pdb2acc)
# exit()
df = u.get_df([ids[0] for ids in pdb2acc.values()])
df.to_csv(os.path.join(hdir, 'df_microPDBs.csv'))  #returns dataframe with
# Unnamed: 0
# Entry
# Entry name Gene names Gene names  (primary ) Gene names  (synonym ) Gene names  (ordered locus )
# Gene names  (ORF ) Organism Organism ID
Example #17
# -*- coding: utf-8 -*-
"""
Created on Thu Jun  9 11:46:12 2016

@author: ewj
"""
from Bio import SeqIO
import myUniprotIO
from UniprotUtils import get_feature_frame
from bioservices.uniprot import UniProt
u = UniProt(verbose=True)
import pandas as pd
from tqdm import tqdm
import os, itertools


def evidence(feature, letter, out_file):
    # read query file
    x = myUniprotIO.UniprotIterator(open('query.xml', 'r'), return_raw_comments=True)

    #parse for wanted data (i.e. gene name, sequence id, position, etc.)
    L = []
    for rec,seqrec in tqdm(enumerate(x)):
        gene_name = seqrec.annotations.get('gene_name_primary','_none_')
        
        t= (
            get_feature_frame(seqrec,stype=feature,filter_val=letter)
            .assign(rec=rec, id=seqrec.id, gene_name=gene_name)
            .rename(columns={'start':'position'})
            )    
        L.append(t)
    # truncated in the source; presumably the per-record frames are combined
    # and written to out_file, e.g.:
    df = pd.concat(L, ignore_index=True)
    df.to_csv(out_file)
    return df
Example #18
import re
from collections import defaultdict

from bioservices.uniprot import UniProt

blast = open("staph_out.txt", 'r')
table = defaultdict(int)

for line in blast:
    match = re.search("\A> \w", line)
    if (match):
        match = re.search("\w{,5}_\w{,5}\s", line[2:])
        if (match):
            table[match.group()] += 1
blast.close()

print(len(table))
blastOut = open("blast.tsv", 'w')
u = UniProt(verbose=False)
goTable = defaultdict(int)

i = 0
for item in table:
    blastOut.write(item + '\t' + str(table[item]) + '\n')
    i += 1
    if (i % 25 == 0):
        print(i)
    value = u.search(item, columns="go")
    value = value.split(';')
    for val in value:
        match = re.search("GO:\d*", val)
        if (match):
            goTable[match.group()] += (1 * table[item])
blastOut.close()
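A hypothetical follow-up that persists the GO tally in the same TSV style used for the BLAST counts above:

with open("go.tsv", 'w') as goOut:
    for go, n in sorted(goTable.items(), key=lambda kv: -kv[1]):
        goOut.write(go + '\t' + str(n) + '\n')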
Example #19
def uniprot():
    u = UniProt(verbose=False, cache=False)
    u.logging.level = "ERROR"
    return u