Esempio n. 1
0
def parse_uniprot(entries):
    """Fetch UniProt data for a list of accessions.

    Every accession is turned into an ``id:<accession>`` query term and the
    resulting list is handed to ``UniProt.get_df``.

    :param entries: iterable of accession strings
    :return: the object returned by ``UniProt.get_df`` (presumably a pandas
        DataFrame -- confirm against the bioservices version in use)
    """
    ent = ['id:' + s for s in entries]
    # Debug trace of the third query term; guarded so that inputs with fewer
    # than three entries no longer fail with IndexError.
    if len(ent) > 2:
        print(ent[2])
    u = UniProt(verbose=True)
    return u.get_df(ent)
Esempio n. 2
0
    def __init__(self, verbose=True):
        """.. rubric:: Constructor

        :param bool verbose: print informative messages

        .. doctest::

            >>> from bioservices import PSICQUIC
            >>> s = PSICQUIC()

        """
        self.services = REST(
            "PSICQUIC",
            verbose=verbose,
            url="https://www.ebi.ac.uk/Tools/webservices/psicquic",
            url_defined_later=True)  # this prevents an annoying warning

        self._registry = None

        # The UniProt helper is optional: a failure to create it is logged
        # instead of aborting construction.
        try:
            self.uniprot = UniProt(verbose=False)
        except Exception:
            # Keep the attribute defined so later code can test it for None
            # rather than hitting AttributeError.
            self.uniprot = None
            self.services.logging.warning(
                "UniProt service could not be initialised")
        self.buffer = {}
Esempio n. 3
0
def kegg_to_symbol_through_uniprot(unknown_genes):
    """Translate KEGG gene ids into gene names by going through UniProt.

    The KEGG ids are first mapped to UniProt accessions, then every accession
    is searched individually. A gene name is recorded only when the entry
    returned by the search carries the same id as the accession and its
    review column reads ``reviewed``.

    :param unknown_genes: iterable of KEGG gene identifiers
    :return: dict mapping KEGG id to gene name
    """
    # create string to call uniprot for mapping
    search_string = '\t'.join(unknown_genes)
    kegg_to_gene_name = dict()
    missing = set()
    uniprot = UniProt(verbose=True)
    # This is where it gets tricky. Checking to see if there is a uniprot
    # mapping for the species, if not, trying from KEGG side. Sometimes
    # kegg  links to a different uniprot, or uniprot links to a diff kegg.
    uni_dict = dict(uniprot.mapping("KEGG_ID", "ACC", query=search_string))
    for i in unknown_genes:
        if i not in uni_dict:
            missing.add(i)
            continue
        for n in uni_dict[i]:
            x = uniprot.search("accession:{}".format(n),
                               columns='genes(PREFERRED),reviewed,id',
                               limit=1)
            # A usable reply is exactly one header line plus one data line.
            # Anything else (empty text, non-text reply) used to crash the
            # tuple unpacking; it is now reported and skipped.
            lines = x.rstrip('\n').split('\n') if isinstance(x, str) else []
            if len(lines) != 2:
                print(i, n, x, "no usable search result")
                continue
            name, review, entry = lines[1].split('\t')
            if n != entry:
                print(i, n, entry, x, "dont match")
            elif review == 'reviewed':
                kegg_to_gene_name[i] = name

    print("{} mappings not found from kegg to"
          " gene name".format(len(missing)))
    print(missing)
    return kegg_to_gene_name
Esempio n. 4
0
 def __init__(self, input, parent=None):
     """Prepare the worker thread.

     :param input: indexable whose item 0 is stored as ``seqs`` and whose
         item 1 is stored as ``nodes`` (``None`` when that item is falsy)
     :param parent: forwarded to ``QThread.__init__``
     """
     QThread.__init__(self, parent)
     self.seqs = input[0]
     node_data = input[1]
     self.nodes = node_data if node_data else None
     self.u = UniProt(verbose=False)
     self.PDBLogger = logging.getLogger("PDBSearch")
     # drop the local references to the constructor arguments
     del input, parent
Esempio n. 5
0
def search_uniprot(genes, columns):
    """Search UniProt for *genes* in batches of ten and merge the replies.

    Each batch is joined with ``+OR+`` into one query. The header line of the
    first non-empty reply is kept once; the data lines of every reply are
    appended below it, and the combined text is parsed as a tab-separated
    table.

    :param genes: iterable of search terms
    :param columns: iterable of UniProt column names; ``entry name`` is
        always requested first
    :return: pandas DataFrame parsed from the merged replies
    """
    # Local import: pandas.compat.StringIO no longer exists in current pandas.
    import io

    start_time = datetime.datetime.now()

    with yaspin(text="Performing UniProt search...", color="cyan") as sp:
        uniprot = UniProt(verbose=False)
        header = None
        rows = []

        for chunk in chunks(genes, 10):
            gene_search = "+OR+".join(list(chunk))
            new_data = uniprot.search(gene_search, frmt="tab", columns=f"entry name, {','.join(columns)}")

            # Skip batches that produced no text at all.
            if not isinstance(new_data, str) or not new_data.strip():
                continue

            lines = new_data.rstrip("\n").split("\n")
            if header is None:
                header = lines[0]
            # Keep ALL data lines of the batch; the previous code kept only
            # the first one for every batch after the first.
            rows.extend(lines[1:])

        raw_data = "\n".join([header] + rows) if header is not None else ""
        data = pandas.read_csv(io.StringIO(raw_data), sep="\t")

        time_diff = (datetime.datetime.now() - start_time).total_seconds()

        sp.text = f"Performing UniProt Search => Task done in {time_diff} seconds."
        sp.ok("✔")

        return data
Esempio n. 6
0
def kegg_to_symbol_through_uniprot(unknown_genes):
    """Translate KEGG gene ids into gene names by going through UniProt.

    The KEGG ids are first mapped to UniProt accessions, then every accession
    is searched individually. A gene name is recorded only when the entry
    returned by the search carries the same id as the accession and its
    review column reads ``reviewed``.

    :param unknown_genes: iterable of KEGG gene identifiers
    :return: dict mapping KEGG id to gene name
    """
    # create string to call uniprot for mapping
    search_string = '\t'.join(unknown_genes)
    kegg_to_gene_name = dict()
    missing = set()
    uniprot = UniProt(verbose=True)
    # This is where it gets tricky. Checking to see if there is a uniprot
    # mapping for the species, if not, trying from KEGG side. Sometimes
    # kegg  links to a different uniprot, or uniprot links to a diff kegg.
    uni_dict = dict(uniprot.mapping("KEGG_ID", "ACC", query=search_string))
    for i in unknown_genes:
        if i not in uni_dict:
            missing.add(i)
            continue
        for n in uni_dict[i]:
            x = uniprot.search("accession:{}".format(n),
                               columns='genes(PREFERRED),reviewed,id',
                               limit=1)
            # A usable reply is exactly one header line plus one data line.
            # Anything else (empty text, non-text reply) used to crash the
            # tuple unpacking; it is now reported and skipped.
            lines = x.rstrip('\n').split('\n') if isinstance(x, str) else []
            if len(lines) != 2:
                print(i, n, x, "no usable search result")
                continue
            name, review, entry = lines[1].split('\t')
            if n != entry:
                print(i, n, entry, x, "dont match")
            elif review == 'reviewed':
                kegg_to_gene_name[i] = name

    print("{} mappings not found from kegg to"
          " gene name".format(len(missing)))
    print(missing)
    return kegg_to_gene_name
Esempio n. 7
0
    def test_extract_protein_interactions_kgml(self, kgml_file,
                                               expected_no_rel):
        """Check how many relations are extracted from a KGML fixture.

        The KEGG ``link``/``conv`` calls and the UniProt ``mapping`` call are
        replaced by canned answers so the extractor runs on fixed data.
        """
        # Arrange
        sut = KeggProteinInteractionsExtractor()
        fixture_dir = os.path.dirname(os.path.realpath(__file__))
        with open(os.path.join(fixture_dir, kgml_file), 'r') as fixture:
            kgml_string = fixture.read()

        # Canned KEGG answers: ko numbers mapped to hsa numbers, and one hsa
        # number mapped to a UniProt accession, whatever the input is.
        ko_to_hsa = ("ko:K00922	hsa:5293\n" +
                     "ko:K00922	hsa:5291\n" +
                     "ko:K02649	hsa:5295")
        hsa_to_uniprot = {"hsa:5293": "up:B0LPE5"}

        kegg_stub = KEGG()
        sut.kegg = kegg_stub
        kegg_stub.link = MagicMock(return_value=ko_to_hsa)
        kegg_stub.conv = MagicMock(return_value=hsa_to_uniprot)

        # Canned UniProt answer: accession mapped to gene names
        uniprot_stub = UniProt()
        sut.uniprot = uniprot_stub
        uniprot_stub.mapping = MagicMock(
            return_value={"B0LPE5": ["gene1", "gene2"]})

        # Act
        relations = sut.extract_protein_interactions_kgml(kgml_string)

        # Assert
        self.assertEqual(expected_no_rel, len(relations))
Esempio n. 8
0
def get_uniprot_metadata_online(uniprot_ids):
    """Retrieve a display name for every UniProt id, in batches of 200.

    For each retrieved record the ``shortname`` tag of a recommended name is
    preferred, falling back to ``fullname``; the tag's first content item is
    stored under ``display_name`` for every accession of the record.

    :param uniprot_ids: iterable of UniProt ids (duplicates are removed)
    :return: dict mapping protein id to ``{'display_name': label}``
    """
    uniprot_ids = list(set(uniprot_ids))
    print('get_uniprot_metadata', len(uniprot_ids))

    BATCH_SIZE = 200
    uniprot = UniProt()
    uniprot_lookup = {}

    cumulative_total = 0
    for chunk in batch(uniprot_ids, BATCH_SIZE):
        batch_ids = list(chunk)
        cumulative_total += len(batch_ids)
        print(cumulative_total, '/', len(uniprot_ids))

        res = uniprot.retrieve(batch_ids)
        for r in res:
            for key in r['accession']:
                protein_id = key.contents[0]
                # distinct name: the previous inner loop reused the outer
                # loop variable
                for rec_name in r['recommendedname']:
                    tag = rec_name.find('shortname')
                    if tag is None:
                        tag = rec_name.find('fullname')
                    if tag is None:
                        # neither tag present: nothing to record, and
                        # dereferencing None would raise AttributeError
                        continue
                    label = tag.contents[0]
                    uniprot_lookup[protein_id] = {'display_name': label}

    return uniprot_lookup
Esempio n. 9
0
def get_protein_info(uniprot_ids):
    """
    Retrieves EMBL accession numbers and taxonomy ids for list of proteins. Creates a dict to map each protein's
    uid to its EMBL accession number and tax id.
    :param uniprot_ids: List of Uniprot IDs, e.g., ['P0AAJ3', 'A0NAQ1']
    :return: dictionary mapping each uid to its info
    """
    from bioservices import UniProt

    missing_embl = []
    missing_taxid = []
    orthos_map = {}
    u = UniProt()

    # The service has been seen returning bytes in one environment and text
    # in another; decode only what actually needs decoding so both work.
    uniprot_records = [
        record.decode("utf-8") if isinstance(record, bytes) else record
        for record in u.retrieve(uniprot_ids, frmt='txt')
    ]

    embl_pattern = re.compile(r"DR\s+EMBL;.*?;\s+(.*?);")
    taxid_pattern = re.compile(r"OX\s+NCBI_TaxID=(\d+)")

    for i, record in enumerate(uniprot_records):
        embl_acc = get_match(embl_pattern, record, uniprot_ids, missing_embl, i)  # EMBL accession number for coding seq
        taxonomy_id = get_match(taxid_pattern, record, uniprot_ids, missing_taxid, i)  # tax_id of organism protein belongs to
        orthos_map[uniprot_ids[i]] = [embl_acc, taxonomy_id]  # map protein info to its uid

    if missing_embl:
        print('\n{} Protein(s) Missing EMBL Accession Number: '.format(len(missing_embl)) + ', '.join(missing_embl))

    if missing_taxid:
        print('\n{} Protein(s) Missing NCBI TaxID: '.format(len(missing_taxid)) + ', '.join(missing_taxid))

    return orthos_map
Esempio n. 10
0
def main():
    """Fill in missing ``Swissprot_Id`` values of the local SQLite table.

    Up to 1000 rows of ``PDB_Chain_Uniprot`` with an empty ``Swissprot_Id``
    are read; each ``SP_PRIMARY`` value is searched in UniProt and the second
    tab-separated field of every reply line is written back as the id.
    """
    args = parse_args()
    if args.log:
        logging.basicConfig(filename=args.log, level=logging.DEBUG,
                            filemode='w', format='%(asctime)s %(message)s',
                            datefmt='%Y-%m-%d %H:%M:%S')

    # Interface to the UniProt service
    u = UniProt(verbose=False)
    con = sqlite3.connect("PDB_Chain_Uniprot.db")
    try:
        cur = con.cursor()

        cur.execute("SELECT SP_PRIMARY FROM PDB_Chain_Uniprot WHERE Swissprot_Id = '' LIMIT 1000")
        rows = cur.fetchall()
        for row in rows:
            accession = str(row[0])
            res = u.search(accession, limit=1)
            # an empty or non-text reply carries nothing to store
            if not isinstance(res, str) or res == "":
                continue
            # first line is the header, last element is the trailing ''
            for line in res.split("\n")[1:-1]:
                fields = line.split("\t")
                # Only the second field is needed (assumed to be the entry
                # name in the default column order -- confirm). Indexing it
                # avoids failing when the number of columns is not seven.
                if len(fields) < 2:
                    continue
                cur.execute("UPDATE PDB_Chain_Uniprot SET Swissprot_Id=?  WHERE SP_PRIMARY = ?", (fields[1], accession,))
                # committed per row so progress survives an interruption
                con.commit()
    finally:
        # release the database even when a query or the service call fails
        con.close()
Esempio n. 11
0
def get_id_from_bioservice(entity):
    """Look up reviewed UniProt entries matching *entity*.

    The query ``<entity>+reviewed:yes`` is sent with the columns
    ``id, entry name, genes, comment(FUNCTION)`` and a limit of five rows.

    :param entity: search term
    :return: ``(ids, last_columns)`` -- two parallel lists holding the first
        and the last field of every result row; ``None`` when the service
        answers with the value 400
    """
    from bioservices import UniProt
    u = UniProt(cache=True)

    # Query the database API. The former second (unfiltered) query was never
    # used and only cost an extra round trip, so it is no longer sent.
    temp = []
    temp_cc = []
    res_reviewed = u.search(entity + '+reviewed:yes',
                            frmt="tab",
                            columns="id, entry name, genes, comment(FUNCTION)",
                            limit=5)

    if res_reviewed == 400:
        print('请求无效\n')
        return None

    if res_reviewed:  # a result came back
        # drop the header line and the trailing ''
        for line in res_reviewed.split('\n')[1:-1]:
            fields = line.split('\t')
            temp.append(fields[0])
            temp_cc.append(fields[-1])
    return temp, temp_cc
Esempio n. 12
0
    def __init__(self,specie,taxid):
        """Store the species and taxonomy id and create the UniProt client.

        :param specie: stored as ``self.specie``
        :param taxid: stored as ``self.taxid``
        """
        self.specie, self.taxid = specie, taxid
        self.query = ""
        # remembers ids already tested so none is tested several times
        self.dicotested = dict()
        self.unip = UniProt()
Esempio n. 13
0
 def uniprot_acc_to_taxonmy(self, accesion):
     """Return the taxonomy of one UniProt entry as ' (a, b, ...)'.

     The entry is searched in XML format and the text of every ``taxon``
     element found in the reply is joined with ', '.
     """
     from bioservices import UniProt
     xml_reply = UniProt().search(accesion, frmt="xml")
     from bs4 import BeautifulSoup
     taxon_nodes = BeautifulSoup(xml_reply, "html.parser").find_all('taxon')
     return ' ({})'.format(', '.join(node.text for node in taxon_nodes))
Esempio n. 14
0
 def uniprot_acc_to_taxonmy(self, accesion):
     """Return the taxonomy of one UniProt entry as ' (a, b, ...)'.

     The entry is searched in XML format and the text of every ``taxon``
     element found in the reply is joined with ', '.
     """
     from bioservices import UniProt
     xml_reply = UniProt().search(accesion, frmt="xml")
     from bs4 import BeautifulSoup
     taxon_nodes = BeautifulSoup(xml_reply, "html.parser").find_all('taxon')
     return ' ({})'.format(', '.join(node.text for node in taxon_nodes))
Esempio n. 15
0
def retrieve_label_from_uniprot_df(ID):
    """Return the phylum recorded for *ID* in the UniProt data frame.

    :param ID: identifier passed to ``UniProt.get_df``
    :return: item 0 of the ``Taxonomic lineage (PHYLUM)`` column
    :raises ValueError: when that value is a ``np.float64`` NaN
    """
    frame = UniProt().get_df(ID)
    phylum = frame["Taxonomic lineage (PHYLUM)"][0]

    value_is_nan = type(phylum) is np.float64 and np.isnan(phylum)
    if value_is_nan:
        raise ValueError("Label was NaN")
    return phylum
Esempio n. 16
0
    def get_cnograph_intact(self, label="entry_name"):
        """Return cnograph made of the protein names found in the interactions
        of the annotations.

        :param str label: ``"entry_id"`` keeps the annotation index as node
            names; ``"entry_name"`` relabels the nodes through a UniProt
            ACC-to-ID mapping, keeping the part before the first underscore.
        :return: the populated :class:`CNOGraph`

        .. plot::
            :include-source:
            :width: 50%

            from msdas import *
            a = annotations.Annotations(get_yeast_small_data(), "YEAST")
            a.get_uniprot_entries()
            a.set_annotations()
            n = network.NetworkFromUniProt(a.annotations)
            c = n.get_cnograph_intact()
            c.plotdot()


        """
        assert label in ["entry_id", "entry_name"]
        c = CNOGraph()
        interactions = self.annotations["Interacts with"]

        # add all nodes
        c.add_nodes_from(interactions.index)

        # some have no interactions in which case, it is filled with NaN. let us drop those
        # entries.
        interactions = interactions.dropna()
        indices = interactions.index
        for i, index in enumerate(indices):
            print("{}/{}".format(i+1, len(indices)))
            # label-based lookup; .loc replaces the .ix indexer that pandas
            # has removed
            these_interactions = interactions.loc[index].split(';')
            these_interactions = [x.strip() for x in these_interactions]
            for interaction in these_interactions:
                if interaction == "Itself":
                    interaction = index
                c.add_reaction("{}={}".format(index, interaction))

        if label == "entry_id":
            c._signals = list(self.annotations.index)
        else:
            # bioservices required because interacting species may not be part
            # of the list of measurements,
            from bioservices import UniProt
            u = UniProt(verbose=self.verbose)
            mapping = u.multi_mapping(fr="ACC", to="ID", query=c.nodes())
            # Build the relabelling in a separate dict: writing into
            # `mapping` while iterating over it is unsafe, and items()
            # works where iteritems() no longer exists.
            relabel = {}
            for k, v in mapping.items():
                if not v:
                    # nothing to relabel to for this accession
                    continue
                if len(v) > 1:
                    print("ambigous case {} with more than 1 mapping. will take only first".format(k))
                relabel[str(k)] = str(v[0].split("_")[0])
            c.relabel_nodes(relabel)

            measured = [x.split("_")[0] for x in self.annotations['Entry name']]
            c._signals = measured

        return c
Esempio n. 17
0
    def uniprot2genename(self, name):
        """Return the gene names of a UniProt identifier.

        :param name: term searched in UniProt (one result row is requested)
        :return: list of the values in the ``Gene names`` column, or ``None``
            when the lookup or the parsing fails (a message is printed)
        """
        from bioservices import UniProt
        c = UniProt(cache=True)

        try:
            res = pd.read_csv(StringIO(c.search(name, limit=1)), sep='\t')
            return list(res['Gene names'].values)
        except Exception:
            # Best-effort lookup: report and return None. Catching Exception
            # instead of everything lets Ctrl-C and SystemExit through.
            print("Could not find %s" % name)
            return None
Esempio n. 18
0
def retrieve_label_from_uniparc(ID):
    """Derive a lineage label for a UniParc entry.

    The first organism listed for *ID* in the ``uniparc`` database is looked
    up in the ``taxonomy`` database; the label is the second item of that
    hit's ``Lineage`` field (or the only item when there is just one).
    """
    service = UniProt()

    def single_hit(term, database):
        # The reply minus its last character must be exactly two lines:
        # the column names and one row of values.
        names, cells = service.search(term, database=database,
                                      limit=1)[:-1].split("\n")
        return names.split("\t"), cells.split("\t")

    names, cells = single_hit(ID, "uniparc")
    organism = cells[names.index("Organisms")].split("; ")[0]

    names, cells = single_hit(organism, "taxonomy")
    lineage = cells[names.index("Lineage")].split("; ")
    return lineage[:2][-1]
Esempio n. 19
0
def pI_calc(dataframe):
    """Add a ``pI`` column computed from each row's UniProt sequence.

    For every row, the value in ``prot_acc`` is searched in UniProt with only
    the ``sequence`` column requested; the second line of the reply is passed
    to ``ipc.predict_isoelectric_point``. The frame is modified in place and
    also returned.
    """
    service = UniProt()
    for row_label, _unused in dataframe.iterrows():
        reply = service.search(dataframe.loc[row_label, "prot_acc"],
                               frmt="tab",
                               columns="sequence")
        reply_lines = reply.split('\n')
        # line 0 is the column header, line 1 the sequence itself
        dataframe.loc[row_label, "pI"] = ipc.predict_isoelectric_point(
            reply_lines[1])

    return dataframe
Esempio n. 20
0
    def get_fasta(self, id_):
        """Deprecated: download a FASTA record and keep a copy on the instance.

        A deprecation notice is printed on every call.

        :param str id_: a given uniprot identifier
        :returns: the FASTA contents as returned by the service
        """
        print("get_fasta is deprecated. Use load_fasta instead")
        from bioservices import UniProt
        fasta_text = UniProt(verbose=False).retrieve(id_, frmt="fasta")
        # stored as a slice copy in the private ``_fasta`` attribute
        self._fasta = fasta_text[:]
        return fasta_text
Esempio n. 21
0
    def get_fasta(self, id_):
        """Deprecated: download a FASTA record and keep a copy on the instance.

        A deprecation notice is printed on every call.

        :param str id_: a given uniprot identifier
        :returns: the FASTA contents as returned by the service
        """
        print("get_fasta is deprecated. Use load_fasta instead")
        from bioservices import UniProt
        fasta_text = UniProt(verbose=False).retrieve(id_, frmt="fasta")
        # stored as a slice copy in the private ``_fasta`` attribute
        self._fasta = fasta_text[:]
        return fasta_text
Esempio n. 22
0
def quick_getprotinfo(protlist):
    """Look up protein information in UniProt (http://www.uniprot.org)
    through the bioservices package.

    :param protlist: proteins to look up, passed unchanged to
        ``UniProt.quick_search``
    :return: the result of ``UniProt.quick_search`` (presumably a dict of
        per-protein fields such as entry name, gene names, length, organism,
        protein names and status -- confirm against bioservices)
    """
    service = UniProt(verbose=False)
    protein_info = service.quick_search(protlist)
    return protein_info
Esempio n. 23
0
def write_fasta_for_ids(uniprot_ids, output_file):
    """Download the FASTA record of every id and write them to one file.

    Progress is printed after every 500 ids. Replies whose type is ``int``
    are left out of the output (presumably error codes -- confirm).

    :param uniprot_ids: sequence of UniProt ids
    :param output_file: path of the file to (over)write
    """
    service = UniProt(verbose=False)
    replies = []
    for position, uni_id in enumerate(uniprot_ids, start=1):
        replies.append(service.retrieve(uni_id, 'fasta'))
        if position % 500 == 0:
            print("Retrieved sequence for {}/{} IDs".format(
                position, len(uniprot_ids)))
    final_fasta = ''.join(
        [reply for reply in replies if type(reply) is not int])
    with open(output_file, 'w') as handle:
        handle.write(final_fasta)
Esempio n. 24
0
def getUniprotInfo(uni_id):
    """Search UniProt for *uni_id* and return the reply in three shapes.

    The columns requested are the module-level ``columns_name``.

    :param uni_id: search term
    :return: ``(dic_data, data, header)`` where ``dic_data`` is a list with
        one dict per non-blank data line (keyed by ``columns_name``),
        ``data`` is every line after the header split on tabs (including any
        trailing blank line), and ``header`` is the split header line
    """
    u = UniProt()  #verbose=False)
    columns = ','.join(columns_name)
    alldata = u.search(uni_id, frmt="tab", columns=columns)
    dataline = alldata.split("\n")
    data = [l.split("\t") for l in dataline[1:]]
    header = dataline[0].split("\t")
    # Skip blank lines explicitly instead of assuming that exactly the last
    # line is blank: a reply without a trailing newline used to lose its
    # final record.
    dic_data = [
        {key: fields[i] for i, key in enumerate(columns_name)}
        for line, fields in zip(dataline[1:], data)
        if line
    ]
    return dic_data, data, header
Esempio n. 25
0
def find_gene(prot_id):
    """Find gene names for an EMBL protein id via UniProt/UniParc.

    The id is mapped from ``EMBL`` to ``ACC``; the first mapped accession is
    searched in the ``uniparc`` database and the reply, minus its first 11
    characters (presumably the header line -- confirm), is split on ';'.

    :return: ``(key, genes)`` where ``genes`` is a list of unique non-empty
        names or the string ``'none'``; ``(prot_id, 'none')`` when the
        mapping yields no accession at all
    """
    service = UniProt(verbose=False)
    accession_map = service.mapping("EMBL", "ACC", query=prot_id)
    for key, accessions in accession_map.items():
        for accession in accessions:
            reply = service.search(accession, frmt="tab", limit=3, columns="genes", database='uniparc')

            unique_names = set(reply[11:].split(';'))
            genes = [name for name in unique_names if name and name != '\n']

            if not genes:
                genes = 'none'

            # only the first accession is ever examined
            return key, genes
    return prot_id, 'none'
Esempio n. 26
0
class Peptides(object):
    """Small helper around UniProt FASTA sequences.

    ::

        >>> p = Peptides()
        >>> p.get_fasta_sequence("Q8IYB3")
        >>> p.get_phosphosite_position("Q8IYB3", "VPKPEPIPEPKEPSPE")

    Sequences fetched by :meth:`get_phosphosite_position` are cached in
    :attr:`sequences`, keyed by UniProt name.

    """
    def __init__(self, verbose=False):
        # UniProt client used for every download
        self.u = UniProt(verbose=verbose)
        # cache: uniprot name -> sequence
        self.sequences = dict()

    def get_fasta_sequence(self, uniprot_name):
        """Return the FASTA sequence of *uniprot_name* from the service."""
        return self.u.get_fasta_sequence(uniprot_name)

    def get_phosphosite_position(self, uniprot_name, peptide):
        """Return the start offsets of every "PQS" match in the sequence.

        The sequence is taken from the cache when present, otherwise it is
        downloaded and cached. NOTE(review): *peptide* is not used by the
        current implementation.
        """
        if uniprot_name in self.sequences.keys():
            seq = self.sequences[uniprot_name][:]
        else:
            seq = self.get_fasta_sequence(uniprot_name)
            self.sequences[uniprot_name] = seq[:]
        return [match.start() for match in re.finditer("PQS", seq)]
Esempio n. 27
0
def test1():
    """Exercise ``fetch_fasta_from_uniprot`` on one good and three bad inputs.

    Each bad input is expected to raise the exception named in its comment;
    the outcome is reported with ``print`` rather than asserted.
    """
    u = UniProt()

    # working bob
    bob = fetch_fasta_from_uniprot(u, 'P29317', True)
    print(bob)

    # toomanyError bob: a list of two ids
    try:
        bob = fetch_fasta_from_uniprot(u, ['P29317', 'P13929'], True)
        print(bob)
    except TooManyError:
        # expected outcome
        print('toomanyerror handled')

    # InvalidIdError bob: malformed id string. The backslash is written as
    # an explicit escape; the bare "\P" form is an invalid escape sequence
    # that newer Python versions warn about. The runtime value is unchanged.
    try:
        bob = fetch_fasta_from_uniprot(u, 'P29317' + ',\\P13929', True)
        print(bob)
    except InvalidIdError:
        # expected outcome
        print('InvalidIdError 400 handled')

    # InvalidIdError bob: unknown id
    try:
        bob = fetch_fasta_from_uniprot(u, "IwantA404CodePlease", True)
        print(bob)
    except InvalidIdError:
        # expected outcome
        print('InvalidIdError 404 handled')
Esempio n. 28
0
def getTaxonomyProtein(taxonomy, format="tab"):
    """Search UniProt for every entry of a taxonomy.

    The query is ``taxonomy:<taxonomy>`` and the columns requested are the
    module-level ``columns_name``.

    :param taxonomy: taxonomy term appended to ``taxonomy:``
    :param format: result format forwarded to the search as ``frmt``
    :return: ``(alldata, data, header)`` -- the raw reply, every line after
        the first split on tabs, and the first line split on tabs
    """
    service = UniProt()  #verbose=False)
    taxonomy_query = "taxonomy:" + taxonomy
    wanted_columns = ','.join(columns_name)
    alldata = service.search(taxonomy_query, frmt=format,
                             columns=wanted_columns)
    reply_lines = alldata.split("\n")
    data = []
    for line in reply_lines[1:]:
        data.append(line.split("\t"))
    header = reply_lines[0].split("\t")
    return alldata, data, header
def call_uniprotkb(query, logger):
    """Calls to UniProt.

    If no data is retrieved a default 'blank' dataframe is returned.

    :param query: str, query for UniProt
    :param logger: logger object

    Returns dataframe of search results.
    """
    # Establish data to be retrieved from UniProt
    columnlist = (
        "organism-id,organism,id,entry name, protein names,length,mass,domains,domain,"
        "families,"
        "go-id,go(molecular function),go(biological process),"
        "sequence")

    # This dictionary will be used to populate "blank"/"empty" databases when
    # an error is thrown. Iterables are used as values to avoid problems with
    # "ValueError: If using all scalar values, you must pass an index"
    blank_data = {
        "NCBI Taxonomy ID": ["NA"],
        "Organism": ["NA"],
        "UniProtKB Entry ID": ["NA"],
        "UniProtKB Entry Name": ["NA"],
        "UniProtKB Protein Names": ["NA"],
        "EC number": ["NA"],
        "Length (Aa)": ["NA"],
        "Mass (Da)": ["NA"],
        "Domains": ["NA"],
        "Domain count": ["NA"],
        "UniProtKB Linked Protein Families": ["NA"],
        "Gene ontology IDs": ["NA"],
        "Gene ontology (molecular function)": ["NA"],
        "Gene ontology (biological process)": ["NA"],
        "Sequence": ["NA"],
    }

    # f-string so the actual query is logged; without the prefix the literal
    # text "{query}" was written to the log.
    logger.info(f"querying uniprot, query: {query}")
    try:
        # open connection to UniProt(), search and convert result into pandas df
        search_result = UniProt().search(
            query,
            columns=columnlist,
        )  # returns empty string for no result

        return pd.read_table(io.StringIO(search_result))

    except HTTPError:
        logger.warning((f"Network error occured during query: {query}\n"
                        "Returning null value 'NA' for all UniProt data"))
        return pd.DataFrame(blank_data)

    except EmptyDataError:
        # No UniProt entries found for the query, return null data
        logger.warning(
            (f"No data returned from UniProt during query: {query}\n"
             "Returning null value 'NA' for all UniProt data"))
        return pd.DataFrame(blank_data)
Esempio n. 30
0
    def __init__(self, verbosity="INFO"):
        """Create the mapper and one client per remote service it uses.

        :param verbosity: logging level forwarded to the parent constructor
        """
        super(Mapper, self).__init__(level=verbosity)
        self.logging.info("Initialising the services")

        # (progress message, attribute name, zero-argument factory). The
        # factories are lambdas so that each service class is only looked up
        # and instantiated when its turn comes.
        startup_plan = (
            ("... uniprots", "_uniprot_service", lambda: UniProt()),
            ("... KEGG", "_kegg_service", lambda: KeggParser(verbose=False)),
            ("... HGNC", "_hgnc_service", lambda: HGNC()),
            ("... UniChem", "_unichem_service", lambda: UniChem()),
            ("...BioDBNet", "_biodbnet", lambda: BioDBNet()),
        )
        for message, attribute, build in startup_plan:
            self.logging.info(message)
            setattr(self, attribute, build())
Esempio n. 31
0
 def __init__(self, input_file, gene_id_column, output_file):
     """Remember the file settings, create the service clients and make
     sure the ``tmp_data`` working folder exists.

     :param input_file: stored as ``_input_file``
     :param gene_id_column: stored as ``_gene_id_column``
     :param output_file: stored as ``_output_file``
     """
     self._input_file, self._gene_id_column, self._output_file = (
         input_file, gene_id_column, output_file)
     self._tmp_folder = "tmp_data"
     self._uniprot = UniProt(verbose=False)
     self._quickgo = QuickGO(verbose=False)
     if not os.path.exists(self._tmp_folder):
         os.mkdir(self._tmp_folder)
Esempio n. 32
0
    def load_fasta(self, id_):
        """Fetches FASTA from uniprot and loads into attribute :attr:`fasta`

        The download is best-effort: when it fails, or when the service
        returns an empty string, the previously stored FASTA is left
        untouched and nothing is raised.

        :param str id_: a given uniprot identifier
        :returns: nothing

        .. note:: same as :meth:`get_fasta` but returns nothing
        """
        # save fasta into attributes fasta
        from bioservices import UniProt
        u = UniProt(verbose=False)
        try:
            res = u.retrieve(id_, frmt="fasta")
            # some entries in uniprot are valid but obsolete and return an
            # empty string: keep whatever was loaded before
            if res == "":
                return
            self._fasta = res[:]
        except Exception:
            # Deliberately ignored (best-effort load). Catching Exception
            # rather than everything lets Ctrl-C and SystemExit through.
            return
Esempio n. 33
0
    def load_fasta(self, id_):
        """Fetches FASTA from uniprot and loads into attribute :attr:`fasta`

        The download is best-effort: when it fails, or when the service
        returns an empty string, the previously stored FASTA is left
        untouched and nothing is raised.

        :param str id_: a given uniprot identifier
        :returns: nothing

        .. note:: same as :meth:`get_fasta` but returns nothing
        """
        # save fasta into attributes fasta
        from bioservices import UniProt
        u = UniProt(verbose=False)
        try:
            res = u.retrieve(id_, frmt="fasta")
            # some entries in uniprot are valid but obsolete and return an
            # empty string: keep whatever was loaded before
            if res == "":
                return
            self._fasta = res[:]
        except Exception:
            # Deliberately ignored (best-effort load). Catching Exception
            # rather than everything lets Ctrl-C and SystemExit through.
            return
Esempio n. 34
0
def search_struc():
    """Annotate ``myprot_list.csv`` with a UniProt entry name and structure.

    Reads the ';'-separated input from ``resultspath`` and writes
    ``myprot_list_struc.csv`` with two extra columns. The accession is taken
    from column 3 of each row; when no structure is found for the entry, the
    template service at gpcrdb.org is asked and a hit is marked "[Model]: ".
    """
    u = UniProt()

    def resolve(uprot):
        # Returns (entry, struccell) for one accession and prints what was
        # (not) found along the way.
        if not uprot:
            print("-----No uprot ID")
            return "", ""

        data = u.quick_search("id:%s" % uprot)
        if not data:
            print("-----Uprot ID not found")
            return "", ""

        entry = data[uprot]['Entry name'].lower()
        struc_res = obtain_struc_pdb(entry)
        if struc_res:
            print(struc_res)
            return entry, struc_res

        temp = requests.get(
            'http://gpcrdb.org/services/structure/template/'
            + entry).json()
        if not temp:
            print("-----No template")
            return entry, ""

        temp_res = obtain_struc_pdb(temp)
        if not temp_res:
            print("-----No struc for template")
            return entry, ""

        model_cell = "[Model]: " + temp_res
        print(model_cell)
        return entry, model_cell

    with open(os.path.join(resultspath, "myprot_list.csv"), "r") as infile, \
            open(os.path.join(resultspath, 'myprot_list_struc.csv'),
                 'w') as outfile:
        mywriter = csv.writer(outfile, delimiter=';')
        myreader = csv.reader(infile, delimiter=';')
        for row in myreader:
            if row[0] == "Family":
                # header row: extend it with the two new column titles
                mywriter.writerow(row + ["Uniprot entry", "Struc"])
                continue
            uprot = row[3]
            print("\n\n", row[2])
            entry, struccell = resolve(uprot)
            mywriter.writerow(row + [entry, struccell])
Esempio n. 35
0
def hitrate(proteins, indexes, subclass):
    """Compute a weighted location hit rate for a set of matched proteins.

    For every protein whose UniProt XML record has a subcellular location,
    the first matching peptide is compared with the positions in *indexes*.
    Each protein contributes an offset weight (1 when the match starts at the
    indexed start, otherwise ``min(|length / offset|, 1)``) to the total, and
    that weight times ``determine_locations(loc, subclass)`` to the matches.

    :param proteins: iterable of protein objects, or ``None``
    :param indexes: mapping from peptide to ``(start, end)``
    :param subclass: passed to ``determine_locations`` and stored in the frame
    :return: ``(rate, df)`` -- ``rate`` is matches/total (0 when total is 0);
        ``df`` is a one-row DataFrame whose cells hold the collected lists
    """
    columns = ['subsequence', 'sprot_start', 'sprot_end', 'sprot_loc', 'dl_start', 'dl_end', 'dl_loc']
    u = UniProt()
    match, total = 0, 0
    dl_peptides, dl_starts, dl_ends, sprot_starts, sprot_ends, sprot_locs = [], [], [], [], [], []
    # identity test: "!= None" can be overloaded by array-like arguments
    if proteins is not None:
        for prot in proteins:
            locs = None
            try:
                entry = u.retrieve(prot.ac, frmt='xml')
                locs = entry['subcellularlocation']
            except Exception:
                # Proteins that cannot be retrieved or parsed are skipped;
                # Ctrl-C and SystemExit are no longer swallowed.
                continue
            if locs:
                pep_metadata = prot.matching_peptide[0]
                seq_range = pep_metadata.match_range[0]
                peptide = pep_metadata.peptide
                dl_peptides.append(peptide)
                start, end = indexes[peptide]
                dl_starts.append(start)
                dl_ends.append(end)

                pos = seq_range.start
                sprot_starts.append(pos)
                sprot_ends.append(seq_range.end)

                seq_len = seq_range.end - pos
                offset_weight = 1 if pos == start else min(abs(seq_len / (pos - start)), 1)

                loc = list((locs[0].children))[1].string
                sprot_locs.append(loc)
                match_weight = determine_locations(loc, subclass) * offset_weight
                match += match_weight
                total += offset_weight
    # local renamed so it no longer shadows the function's own name
    rate = 0 if total == 0 else match / total
    vals = [[dl_peptides, sprot_starts, sprot_ends, sprot_locs, dl_starts, dl_ends, subclass]]
    df = pd.DataFrame(vals, columns=columns)
    return (rate, df)
Esempio n. 36
0
def add_sequence_to_nodes(n: str, d: Dict[str, Any]):
    """
    Looks up the UniProt ids of the node's ``protein_id`` symbol through HGNC,
    stores them under ``uniprot_ids`` and adds one ``sequence_<id>`` feature
    per id, holding the FASTA sequence retrieved from UniProt.

    :param n: Graph node.
    :type n: str
    :param d: Graph attribute dictionary.
    :type d: Dict[str, Any]
    """
    hgnc_client = HGNC(verbose=False)
    uniprot_client = UniProt(verbose=False)

    hgnc_reply = hgnc_client.fetch("symbol", d["protein_id"])
    # only the first document of the HGNC response is used
    d["uniprot_ids"] = hgnc_reply["response"]["docs"][0]["uniprot_ids"]

    # Todo these API calls should probably be batched
    # Todo mapping with bioservices to support other protein IDs?

    for accession in d["uniprot_ids"]:
        feature_key = f"sequence_{accession}"
        d[feature_key] = uniprot_client.get_fasta_sequence(accession)
Esempio n. 37
0
    def __init__(self, verbose=True):
        """.. rubric:: Constructor

        :param bool verbose: print informative messages

        .. doctest::

            >>> from bioservices import PSICQUIC
            >>> s = PSICQUIC()

        """
        urlStr = 'http://www.ebi.ac.uk/Tools/webservices/psicquic'
        super(PSICQUIC, self).__init__("PSICQUIC", verbose=verbose, url=urlStr)
        self._registry = None

        # The UniProt helper is optional: a failure to create it is logged
        # instead of aborting construction.
        try:
            self.uniprot = UniProt(verbose=False)
        except Exception:
            # Keep the attribute defined so later code can test it for None
            # rather than hitting AttributeError.
            self.uniprot = None
            self.logging.warning("UniProt service could not be initialised")
        self.buffer = {}
Esempio n. 38
0
    def __init__(self, interactionlist=None):
        """Configure the extractor.

        :param interactionlist: interaction types to keep; defaults to
            ``["phosphorylation"]`` when omitted
        """
        self.interactionlist = (
            ["phosphorylation"] if interactionlist is None
            else interactionlist)

        # XML namespace prefix used for PSI-MI documents
        self.namespaces = dict(df='http://psi.hupo.org/mi/mif')
        self._logger = logging.getLogger(__name__)

        self.u = UniProt(verbose=False)
        self._cache_kegg_entry_uniprots = dict()
Esempio n. 39
0
def __main__():
    """Download the FASTA of the ids in ``sys.argv[1]`` into ``sys.argv[2]``.

    Every stage (import, download, save) reports its own failure and stops
    the run. Previously the later stages still ran after a failure and
    printed misleading follow-up errors about names that were never defined.
    """
    ids = sys.argv[1]
    filename = sys.argv[2]
    # TODO: check the validity and format ?
    try:
        from bioservices import UniProt
    except ImportError:
        print("Could not import bioservoces ? Check that it is installed. Try 'pip install bioservices'")
        return

    u = UniProt(verbose=False)
    u.debugLevel = "ERROR"

    try:
        fasta = u.searchUniProtId(ids, "fasta")
    except Exception:
        print("An error occured while fetching the FASTA file from uniprot")
        return

    try:
        # the context manager closes the file even when the write fails
        with open(filename, "w") as fh:
            fh.write(fasta)
    except (IOError, OSError, TypeError):
        # TypeError covers a non-text reply from the service
        print("could not save the FASTA file")
Esempio n. 40
0
def main():
    """Copy the PDB-chain CSV file, appending the UniProt entry name.

    ``pdb_chain_uniprot_1.csv`` is read and ``pdb_chain_uniprot_2.csv`` is
    written (both under ``../DataFilesVarstructure/``). Only rows with
    exactly nine columns are processed:

    * the header row (first cell ``"PDB"``) gets an ``Entry_Name`` column;
    * for any other row, the third cell is searched in UniProt
      (``limit=1``) and one output row is written per non-empty result
      line, with the second tab-separated field of that line appended.

    Column counts, rows and search results are echoed on standard output.
    """
    args = parse_args()
    if not args.log:
        logfile = sys.stdout
    else:
        logfile = args.log
        logging.basicConfig(filename=logfile, level=logging.DEBUG,
                            filemode='w', format='%(asctime)s %(message)s',
                            datefmt='%Y-%m-%d %H:%M:%S')

    # Interface to the UniProt service
    uniprot_client = UniProt(verbose=False)

    source_path = '../DataFilesVarstructure/pdb_chain_uniprot_1.csv'
    target_path = '../DataFilesVarstructure/pdb_chain_uniprot_2.csv'
    with open(source_path, 'r') as csvinput, open(target_path, 'w') as csvoutput:
        writer = csv.writer(csvoutput)

        for row in csv.reader(csvinput):
            column_count = len(row)
            print(column_count)
            if column_count != 9:
                continue

            if row[0] == "PDB":
                # header row
                writer.writerow(row + ["Entry_Name"])
                print(row)
                continue

            res = uniprot_client.search(str(row[2]), limit=1)
            print(res)
            if res == "":
                continue

            # first line is skipped (header) as well as the last one
            for hit in res.split("\n")[1:-1]:
                if hit == "":
                    continue
                print(hit)
                # exactly seven tab-separated fields are expected
                (_entry, entry_name, _status, _protein_names,
                 _gene_names, _organism, _length) = hit.split("\t")
                writer.writerow(row + [entry_name])
Esempio n. 41
0
    def __init__(self, verbosity="INFO"):
        """Create one client per web service, logging each step.

        :param str verbosity: forwarded to the parent constructor as
            ``level``.
        """
        super(Mapper, self).__init__(level=verbosity)
        announce = self.logging.info
        announce("Initialising the services")

        announce("... uniprots")
        self._uniprot_service = UniProt()

        announce("... KEGG")
        self._kegg_service = KeggParser(verbose=False)

        announce("... HGNC")
        self._hgnc_service = HGNC()

        announce("... UniChem")
        self._unichem_service = UniChem()

        announce("...BioDBNet")
        self._biodbnet = BioDBNet()
Esempio n. 42
0
    def __init__(self, verbose=True):
        """.. rubric:: Constructor

        :param bool verbose: print informative messages

        .. doctest::

            >>> from bioservices import PSICQUIC
            >>> s = PSICQUIC()

        ``self.uniprot`` holds a non-verbose UniProt client, or ``None``
        (with a logged warning) when that client could not be created.
        """
        urlStr = 'http://www.ebi.ac.uk/Tools/webservices/psicquic'
        super(PSICQUIC, self).__init__("PSICQUIC", verbose=verbose, url=urlStr)
        self._registry = None

        try:
            self.uniprot = UniProt(verbose=False)
        except Exception:
            # Deliberately non-fatal; a bare ``except`` would also have
            # swallowed KeyboardInterrupt and SystemExit.
            self.uniprot = None
            self.logging.warning("UniProt service could not be initialised")
        self.buffer = {}
Esempio n. 43
0
class Mapper(Logging):
    """Bundle of web-service clients used to map identifiers.

    The constructor creates clients for UniProt, KEGG, HGNC, UniChem and
    BioDBNet.

    Accepted code:

        uniprot

    Usage sketch (the ``get_all_*`` helpers are not defined in this
    excerpt)::

        m = Mapper()
        # HGNC
        df_hgnc = m.get_all_hgnc_into_df()
        df_hgnc.to_pickle("mapper_hgnc.dat")

        # KEGG
        df_kegg1 = m.get_all_kegg_into_df1()
        df_kegg2 = m.get_all_kegg_into_df2()

    """
    kegg_dblinks  = ["IMGT", "Ensembl", "HGNC", "HPRD", "NCBI-GI", "OMIM", "NCBI-GeneID", "UniProt", "Vega"]
    hgnc_dblink =  ['EC','Ensembl', 'EntrezGene', 'GDB', 'GENATLAS',
            'GeneCards', 'GeneTests', 'GoPubmed', 'H-InvDB', 'HCDM', 'HCOP',
            'HGNC', 'HORDE', 'IMGT_GENE_DB', 'INTERFIL', 'IUPHAR', 'KZNF',
            'MEROPS', 'Nucleotide', 'OMIM', 'PubMed', 'RefSeq', 'Rfam',
            'Treefam', 'UniProt', 'Vega', 'miRNA', 'snoRNABase']


    def __init__(self, verbosity="INFO"):
        """Create the service clients.

        :param str verbosity: forwarded to the parent constructor as
            ``level``.
        """
        super(Mapper, self).__init__(level=verbosity)
        self.logging.info("Initialising the services")
        self.logging.info("... uniprots")
        self._uniprot_service = UniProt()

        self.logging.info("... KEGG")
        self._kegg_service = KeggParser(verbose=False)

        self.logging.info("... HGNC")
        self._hgnc_service = HGNC()

        self.logging.info("... UniChem")
        self._unichem_service = UniChem()

        self.logging.info("...BioDBNet")
        self._biodbnet = BioDBNet()

    def _uniprot2refseq(self, name):
        """Map a UniProt accession onto REFSEQ_NT_ID.

        There are 2 refseq aliases: REFSEQ_NT_ID and P_REFSEQ_AC.

        Here, we use the first one to agree with wikipedia
        http://en.wikipedia.org/wiki/Protein_Kinase_B

        :param str name: the accession to map (e.g. ``"P31749"``).
        :return: the result of the UniProt ``mapping`` call.
        """
        # *name* used to be ignored in favour of a hard-coded "P31749".
        return self._uniprot_service.mapping(fr="ACC", to="REFSEQ_NT_ID", query=name)

    def _update_uniprot_xref(self, df,
            xref=("HGNC_ID", "ENSEMBL_ID", "P_ENTREZGENEID")):
        """Update the dataframe using Uniprot to map indices onto cross
        reference databases

        For each code in *xref*, the index of *df* is sent to
        ``multi_mapping`` and the answer is stored in the column
        ``"<code>__uniprot_mapping"`` (created when missing, updated in
        place otherwise).

        :param df: dataframe whose index is sent as "ACC" identifiers.
        :param xref: target database codes. The default used to contain
            "ENSEMBLE_ID"; it now reads "ENSEMBL_ID", the spelling used for
            Ensembl elsewhere in this file.
        :return: the updated dataframe. ``DataFrame.join`` builds a new
            object, so callers must use the returned value.
        """
        for ref in xref:
            print("Processing %s " % ref)
            column = "%s__uniprot_mapping" % ref
            # NOTE(review): res is used as a dict keyed by the entries of
            # the index -- confirm against multi_mapping's return type.
            res = self._uniprot_service.multi_mapping("ACC", ref,
                    list(df.index), timeout=10, ntrials=5)
            if column not in df.columns:
                thisdf = pd.DataFrame({column: list(res.values())},
                        index=list(res.keys()))
                df = df.join(thisdf)
            else:
                for index in df.index:
                    if index in res:
                        # .at writes into df itself; the former chained
                        # df.ix[index][column] assignment could be lost
                        # on a temporary copy.
                        df.at[index, column] = res[index]
        return df

    def get_data_from_biodbnet(self, df_hgnc):
        """keys are unique Gene names

        input is made of the df based on HGNC data web services

        uniprot accessions are duplicated sometimes. If so, this is actually
        the primary accession entry and all secondary ones.


        e.g. ,

        ABHD11 >>>> Q8N723;Q8NFV2;Q8NFV3;Q6PJU0;Q8NFV4;H7BYM8;Q8N722;Q9HBS8 ABHDB_HUMAN Alpha/beta hydrolase domain-containing protein 11
        corresponds actually to the primary one : Q8NFV4

        :param df_hgnc: dataframe built from the HGNC data.
        :return: dataframe parsed from the BioDBNet answer, indexed by
            "Gene Symbol".

        Only the first 2000 index entries are sent.
        """
        import pandas as pd
        try:  # Python 2
            from StringIO import StringIO
        except ImportError:  # Python 3
            from io import StringIO

        # The original code sliced an undefined name ``res``.
        # NOTE(review): the gene symbols are presumably the index of
        # *df_hgnc* ("keys are unique Gene names") -- confirm with callers.
        symbols = list(df_hgnc.index)[0:2000]

        # reuse the client created in __init__ instead of building a new one
        res2 = self._biodbnet.db2db("Gene Symbol", ["HGNC ID", "UniProt Accession", "UniProt Entry Name", "UniProt Protein Name", "KEGG Gene ID", "Ensembl Gene ID"],
                symbols)

        c = pd.read_csv(StringIO(res2), delimiter="\t", index_col="Gene Symbol")
        return c
Esempio n. 44
0
class PSICQUIC(RESTService):
    """Interface to the `PSICQUIC <http://code.google.com/p/psicquic/>`_ service

    There are 2 interfaces to the PSICQUIC service (REST and WSDL) but we use
    the REST one only.


    This service provides a common interface to more than 25 other services
    related to proteins. So, we won't detail all the possibilities of this
    service. Here is an example that consists of looking for interactors of
    the protein ZAP70 within the IntAct database::

        >>> from bioservices import *
        >>> s = PSICQUIC()
        >>> res = s.query("intact", "zap70")
        >>> len(res) # there are 11 interactions found
        11
        >>> # Let us look at the second one in particular. With a "tab"
        >>> # output, each interaction is already a list of columns:
        >>> for x in res[1]:
        ...     print(x)
        uniprotkb:O95169
        uniprotkb:P43403
        intact:EBI-716238
        intact:EBI-1211276
        psi-mi:ndub8_human(display_long)|uniprotkb:NADH-ubiquinone oxidoreductase ASHI
        .
        .

    Here we have a list of entries. There are 15 of them (depending on
    the *output* parameter). The meaning of the entries is described on the
    PSICQUIC website: https://code.google.com/p/psicquic/wiki/MITAB25Format .
    In short:


    #. Unique identifier for interactor A.
    #. Unique identifier for interactor B.
    #. Alternative identifier for interactor A.
    #. Alternative identifier for interactor B.
    #. Aliases for A, separated by "|".
    #. Aliases for B.
    #. Interaction detection methods (PSI-MI terms).
    #. First author surname(s) of the publication(s).
    #. Identifier of the publication.
    #. NCBI Taxonomy identifier for interactor A.
    #. NCBI Taxonomy identifier for interactor B.
    #. Interaction types.
    #. Source databases and identifiers.
    #. Interaction identifier(s).
    #. Confidence score. Denoted as scoreType:value.



    Another example with the reactome database::

        res = s.query("reactome", "Q9Y266")


    .. warning:: PSICQUIC gives access to 25 other services. We cannot create
        a dedicated parsing for all of them. So, the :meth:`query` method
        returns the raw data (only split into lines, and into columns for
        the "tab" formats). Additional classes may provide dedicated
        parsing in the future.

    .. seealso:: :class:`bioservices.biogrid.BioGRID`
    """

    _formats = ["tab25", "tab26", "tab27", "xml25", "count", "biopax", "xgmml",
        "rdf-xml", "rdf-xml-abbrev", "rdf-n3", "rdf-turtle"]


    # note the typo in "genbank indentifier from bind DB
    _mapping_uniprot = {"genbank indentifier": "P_GI",
        'entrezgene/locuslink':"P_ENTREZGENEID",
        'uniprotkb': "ACC+ID",
        'rcsb pdb':"PDB_ID",
        'ensembl':"ENSEMBL_ID",
        'refseq':"P_REFSEQ_AC",
        'hgnc':'HGNC_ID',
        "kegg": "KEGG_ID",
        "entrez gene/locuslink": "P_ENTREZGENEID",
        "chembl": "CHEMBL_ID",
        "ddbj/embl/genbank": "EMBL_ID",
        "dip": "DIP_ID",
        "ensemblgenomes": "ENSEMBLGENOME_ID",
        "omim":"MIM_ID",
        "chebi": None,
        "chembl": None,
#        "intact": None
    }

# unknown: hprd, omim, bind, bind complexid, mdl, 

    def __init__(self, verbose=True):
        """.. rubric:: Constructor

        :param bool verbose: print informative messages

        .. doctest::

            >>> from bioservices import PSICQUIC
            >>> s = PSICQUIC()

        ``self.uniprot`` holds the UniProt client used by
        :meth:`mappingOneDB`; it is ``None`` (and a warning is logged) when
        that client could not be created.
        """
        urlStr = 'http://www.ebi.ac.uk/Tools/webservices/psicquic'
        super(PSICQUIC, self).__init__("PSICQUIC", verbose=verbose, url=urlStr)
        # filled on first access to the ``registry`` property
        self._registry = None

        try:
            self.uniprot = UniProt(verbose=False)
        except Exception:
            # Non-fatal on purpose. ``Exception`` instead of a bare
            # ``except`` so that KeyboardInterrupt/SystemExit propagate.
            self.uniprot = None
            self.logging.warning("UniProt service could not be initialised")

        self.buffer = {}

    def _get_formats(self):
        return PSICQUIC._formats
    formats = property(_get_formats, doc="Returns the possible output formats")

    def _get_active_db(self):
        names = self.registry_names[:]
        actives = self.registry_actives[:]
        names = [x.lower() for x,y in zip(names, actives) if y=="true"]
        return names
    activeDBs = property(_get_active_db, doc="returns the active DBs only")


    def read_registry(self):
        """Request the ACTIVE registry as text and return it split on whitespace.

        :return: list of the whitespace-separated tokens of the answer
        """
        suffix = '/registry/registry?action=ACTIVE&format=txt'
        answer = self.request(self.url + suffix, format='txt')
        return answer.split()

    def print_status(self):
        """Prints the services that are available

        :return: Nothing

        The output is tabulated. The columns are:

        * names
        * active
        * count
        * version
        * rest URL
        * soap URL
        * rest example
        * restricted

        .. seealso:: If you want the data into lists, see all attributes
            starting with registry such as :meth:`registry_names`
        """
        url = self.url +  '/registry/registry?action=STATUS&format=xml'
        res = self.request(url)
        names = self.registry_names
        counts = self.registry_counts
        versions = self.registry_versions
        actives = self.registry_actives
        resturls = self.registry_resturls
        soapurls = self.registry_soapurls
        restexs = self.registry_restexamples
        restricted = self.registry_restricted
        N = len(names)

        indices = sorted(range(0,N), key=lambda k: names[k])

        for i in range(0,N):
            print("%s\t %s\t %s\t %s\t %s %s %s %s\n" % (names[i], actives[i], 
                counts[i], versions[i], resturls[i], soapurls[i], restexs[i], restricted[i]))


    # todo a property for the version of PISCQUIC

    def _get_registry(self):
        if self._registry == None:
            url = self.url +  '/registry/registry?action=STATUS&format=xml'
            res = self.request(url, format="xml")
            self._registry = res
        return self._registry
    registry = property(_get_registry, doc="returns the registry of psicquic")

    def _get_registry_names(self):
        res = self.registry
        return [x.findAll('name')[0].text for x in res.findAll("service")]
    registry_names = property(_get_registry_names, doc="returns all services available (names)")

    def _get_registry_restricted(self):
        res = self.registry
        return [x.findAll('restricted')[0].text for x in res.findAll("service")]
    registry_restricted = property(_get_registry_restricted, doc="returns restricted status of services" )

    def _get_registry_resturl(self):
        res = self.registry
        data = [x.findAll('resturl')[0].text for x in res.findAll("service")]
        return data
    registry_resturls = property(_get_registry_resturl, doc="returns URL of REST services")

    def _get_registry_restex(self):
        res = self.registry
        data = [x.findAll('restexample')[0].text for x in res.findAll("service")]
        return data
    registry_restexamples = property(_get_registry_restex, doc="retuns REST example for each service")

    def _get_registry_soapurl(self):
        res = self.registry
        return  [x.findAll('soapurl')[0].text for x in res.findAll("service")]
    registry_soapurls = property(_get_registry_soapurl, doc="returns URL of WSDL service")

    def _get_registry_active(self):
        res = self.registry
        return  [x.findAll('active')[0].text for x in res.findAll("service")]
    registry_actives = property(_get_registry_active, doc="returns active state of each service")

    def _get_registry_count(self):
        res = self.registry
        return  [x.findAll('count')[0].text for x in res.findAll("service")]
    registry_counts = property(_get_registry_count, doc="returns number of entries in each service")

    def _get_registry_version(self):
        res = self.registry
        names = [x.findAll('name')[0].text for x in res.findAll("service")]
        N = len(names)
        version = [0] * N
        for i in range(0,N):
            x = res.findAll("service")[i]
            if x.findAll("version"):
                version[i] = x.findAll("version")[0].text
            else:
                version[i] = None 
        return  version
    registry_versions = property(_get_registry_version, doc="returns version of each service")

    def query(self, service, query, output="tab25", version="current", firstResult=None, maxResults=None):
        """Send a query to a specific database

        :param str service: a registered and active service (lower case).
            See :attr:`registry_names` and :attr:`activeDBs`.
        :param str query: a valid query. Can be `*` or a protein name.
        :param str output: a valid format. See :attr:`formats`. If None, no
            format is sent with the request.
        :param version: currently unused.
        :param firstResult: if not None, sent as the ``firstResult``
            request parameter.
        :param maxResults: if not None, sent as the ``maxResults`` request
            parameter.
        :return: the answer as returned by ``request`` for the "xml"
            formats; a list of lines otherwise, each line being further
            split on tabs for the "tab" formats.
        :raises ValueError: if *service* is not an active or registered
            service.

        ::

            s.query("intact", "brca2", "tab27")
            s.query("intact", "zap70", "xml25")
            s.query("matrixdb", "*", "xml25")

        This is the programmatic approach to this website:

        http://www.ebi.ac.uk/Tools/webservices/psicquic/view/main.xhtml


        Another example consists in accessing the *string* database for
        fetching protein-protein interaction data of a particular model
        organism. Here we restrict the query to 100 results::

            s.query("string", "species:10090", firstResult=0, maxResults=100, output="tab25")

        Spaces are automatically converted::

            s.query("biogrid", "ZAP70 AND species:9606")

        .. warning:: AND must be in big caps. Some databases are more
            permissive than others (e.g., intact accepts "and"). species
            must be a valid ID number. Again, some DBs are more permissive
            and may accept the name (e.g., human)

        To obtain the number of interactions in intact for the human
        species::

            >>> len(s.query("intact", "species:9606"))


        """
        if service not in self.activeDBs:
            raise ValueError("database %s not in active databases" % service)

        params = {}
        if output is not None:
            self.checkParam(output, self.formats)
            params['format'] = output
        else:
            # placeholder so that the string tests below keep working
            output = "none"

        names = [x.lower() for x in self.registry_names]
        try:
            index = names.index(service)
        except ValueError:
            # the explanation is carried by the exception itself instead of
            # being printed before a message-less ValueError
            raise ValueError("The service you gave (%s) is not registered. See self.registry_names" % service)

        # get the base url according to the service requested
        resturl = self.registry_resturls[index]

        if firstResult is not None:
            params['firstResult'] = firstResult
        if maxResults is not None:
            params['maxResults'] = maxResults

        postData = self.urlencode(params)

        url = resturl  + 'query/' + query.replace(" ", "%20")
        if params:
            url += "?" + postData

        if "xml" in output:
            res = self.request(url, format="xml", baseUrl=False)
        else:
            res = self.request(url, format="txt",baseUrl=False)
            res = res.strip().split("\n")

        if output.startswith("tab"):
            res = self._convert_tab2dict(res)

        return res


    def _convert_tab2dict(self, data):
        """

        https://code.google.com/p/psicquic/wiki/MITAB26Format
        """
        results = []
        for line in data:
            results.append(line.split("\t"))

        return results


    def queryAll(self, query, databases=None, output="tab25", version="current", firstResult=None, maxResults=None):
        """Same as query but runs on all active database

        :param str query: a valid query (see :meth:`query`)
        :param list databases: database to query. Queries all active DB if not provided
        :param output, version, firstResult, maxResults: forwarded to
            :meth:`query`
        :return: dictionary where keys correspond to databases and values to the output of the query.
        :raises ValueError: if one of *databases* is not an active database

        For the "tab" formats, empty rows (``[""]``) are removed from the
        results.

        ::

            res = s.queryAll("ZAP70 AND species:9606")
        """
        import copy

        results = {}
        # the property rebuilds the list on each access: read it once
        active = self.activeDBs
        if databases is None:
            databases = [x.lower() for x in active]

        for x in databases:
            if x not in active:
                raise ValueError("database %s not in active databases" % x)

        # query() converts every "tab*" output into rows, so empty rows are
        # filtered for all of them (formerly for "tab25" only)
        tabular = output is not None and output.startswith("tab")

        for name in databases:
            self.logging.warning("Querying %s" % name)
            res = self.query(name, query, output=output, version=version, firstResult=firstResult, maxResults=maxResults)
            if tabular:
                results[name] = [x for x in res if x!=[""]]
            else:
                results[name] = copy.copy(res)
        for name in databases:
            self.logging.info("Found %s in %s" % (len(results[name]), name))
        return results



    def getInteractionCounter(self, query):
        """Count the entries matching *query* in every active database

        :param str query: a valid query
        :return: a dictionary with the database name as key and the number
            of entries (int) as value

        Only the active databases are considered; each one is queried with
        ``output="count"``.
        """
        counts = {}
        for name in self.activeDBs[:]:
            answer = self.query(name, query, output="count")
            counts[str(name)] = int(answer[0])
        return counts

    def getName(self, data):
        """Return the first and the second item of every entry as two lists."""
        first_column = list(map(lambda entry: entry[0], data))
        second_column = list(map(lambda entry: entry[1], data))
        return first_column, second_column

    def knownName(self, data):
        """Scan all entries (MITAB) and returns simplified version


        Each item in the input list of mitab entry
        The output is made of 2 lists corresponding to 
        interactor A and B found in the mitab entries.

        elements in the input list takes the following forms::

            DB1:ID1|DB2:ID2
            DB3:ID3

        The | sign separates equivalent IDs from different databases. 

        We want to keep only one. The first known databae is kept. If in the list of DB:ID pairs no known
        database is found, then we keep the first one whatsover.

        known databases are those available in the uniprot mapping tools. 

        chembl and chebi IDs are kept unchanged.


        """


        self.logging.info("converting data into known names")
        idsA = [x[0].replace("\"","") for x in data]
        idsB = [x[1].replace("\"", "") for x in data]
        # extract the first and second ID but let us check if it is part of a
        # known uniprot mapping.Otherwise no conversion will be possible.
        # If so, we set the ID to "unknown"
        # remove the " character that can be found in a few cases (e.g,
        # chebi:"CHEBI:29036")
        #idsA = [x.replace("chebi:CHEBI:","chebi:") for x in idsA]
        #idsB = [x.replace("chebi:CHEBI:", "chebi:") for x in idsB]

        # special case:
        # in mint, there is an entry that ends with a | uniprotkb:P17844|
        idsA = [x.strip("|") for x in idsA]
        idsB = [x.strip("|") for x in idsB]


        # the first ID
        for i, entry in enumerate(idsA):
            try:
                dbs = [x.split(":")[0] for x in entry.split("|")]
                IDs = [x.split(":")[1] for x in entry.split("|")]
                valid_dbs = [(db,ID) for db,ID in zip(dbs,IDs) if db in self._mapping_uniprot.keys()]
                # search for an existing DB
                if len(valid_dbs)>=1:
                    idsA[i] = valid_dbs[0][0] + ":" + valid_dbs[0][1]
                else:
                    self.logging.debug("none of the DB for this entry (%s) are available" % (entry))
                    idsA[i] = "?" + dbs[0] + ":" + IDs[0]
            except:
                self.logging.info("Could not extract name from %s" % entry)
                idsA[i] = "??:" + entry  # we add a : so that we are sure that a split(":") will work
        # the second ID
        for i, entry in enumerate(idsB):
            try:
                dbs = [x.split(":")[0] for x in entry.split("|")]
                IDs = [x.split(":")[1] for x in entry.split("|")]
                valid_dbs = [(db,ID) for db,ID in zip(dbs,IDs) if db in self._mapping_uniprot.keys()]
                # search for an existing DB
                if len(valid_dbs)>=1:
                    idsB[i] = valid_dbs[0][0] + ":" + valid_dbs[0][1]
                else:
                    self.logging.debug("none of the DB (%s) for this entry are available" % (entry))
                    idsB[i] = "?" + dbs[0] + ":" + IDs[0]
            except:
                self.logging.info("Could not extract name from %s" % entry)
                idsB[i] = "??:" + entry

        countA = len([x for x in idsA if x.startswith("?")])
        countB = len([x for x in idsB if x.startswith("?")])
        if countA+countB > 0:
            self.logging.warning("%s ids out of %s were not identified" % (countA+countB, len(idsA)*2))
            print (set([x.split(":")[0] for x in idsA if x.startswith("?")]))
            print (set([x.split(":")[0] for x in idsB if x.startswith("?")]))
        self.logging.info("knownName done")
        return idsA, idsB

    def preCleaning(self, data):
        """Drop the entries whose first or second item is "-"

        :param data: iterable of entries
        :return: list of the remaining entries, in the original order
        """
        kept = []
        for entry in data:
            if entry[0] == "-" or entry[1] == "-":
                continue
            kept.append(entry)
        return kept

    def postCleaningAll(self,data, keep_only="HUMAN", flatten=True, verbose=True):
        """
    
        even more cleaing by ignoring score, db and interaction
        len(set([(x[0],x[1]) for x in retnew]))
        """
        results = {}
        for k in data.keys():
            self.logging.info("Post cleaning %s" % k)
            ret = self.postCleaning(data[k], keep_only="HUMAN", verbose=verbose)
            if len(ret):
                results[k] = ret
        if flatten:
            results = [x for k in results.keys() for x in results[k]]
        return results

    def postCleaning(self, data, keep_only="HUMAN", remove_db=("chebi","chembl"),
        keep_self_loop=False, verbose=True):
        """Remove entries with a None and keep only those with the keep pattern

        The filters are applied in this order:

        #. entries whose first or second item is None are removed;
        #. entries whose first or second item starts with "!" are removed;
        #. entries whose first or second item starts with one of
           *remove_db* are removed;
        #. only entries with *keep_only* in both items are kept;
        #. entries with identical first and second items are removed,
           unless *keep_self_loop* is set;
        #. duplicated entries are removed (first occurrence kept, original
           order preserved).

        :param data: list of hashable entries (e.g. tuples)
        :param str keep_only: pattern that must be found in both items
        :param remove_db: prefixes of the entries to discard
        :param bool keep_self_loop: keep entries with identical items
        :param bool verbose: print the number of entries after each step
        :return: list of the remaining entries
        """
        if verbose:print("Before removing anything: ", len(data))

        data = [x for x in data if x[0] is not None and x[1] is not None]
        if verbose:print("After removing the None: ", len(data))

        data = [x for x in data if not x[0].startswith("!") and not x[1].startswith("!")]
        if verbose:print("After removing the !: ", len(data))

        for db in remove_db:
            data = [x for x in data if not x[0].startswith(db) and not x[1].startswith(db)]
            if verbose:print("After removing entries that match %s : " % db, len(data))

        data = [x for x in data if keep_only in x[0] and keep_only in x[1]]
        if verbose:print("After removing entries that don't match %s : " % keep_only, len(data))

        if not keep_self_loop:
            data = [x for x in data if x[0]!=x[1]]
            if verbose:print("After removing self loop : ", len(data))

        # order-preserving de-duplication: list(set(...)) returned the
        # entries in an arbitrary order
        seen = set()
        unique = []
        for x in data:
            if x not in seen:
                seen.add(x)
                unique.append(x)
        if verbose:print("After removing identical entries", len(unique))

        return unique


    def convertAll(self, data):
        results = {}
        for k in data.keys():
            self.logging.info("Analysing %s" % k)
            results[k] = self.convert(data[k], db=k)
        return results

    def convert(self, data, db=None):
        self.logging.debug("converting the database %s" % db)
        idsA, idsB = self.knownName(data)
        mapping = self.mappingOneDB(data)
        results = []
        for i, entry in enumerate(data):
            x = idsA[i].split(":",1)[1]
            y = idsB[i].split(":",1)[1]
            xp = mapping[x]
            yp = mapping[y]
            try:ref = entry[8]
            except:ref="?"
            try:score = entry[14]
            except:score = "?"
            try:interaction = entry[11]
            except:interaction="?"
            results.append((xp, yp, score, interaction, ref, db))
        return results


    def mappingOneDB(self, data):
        query = {}
        self.logging.debug("converting IDs with proper DB name (knownName function)")
        entriesA, entriesB = self.knownName(data) # idsA and B contains list of a single identifier of the form db:id
        # the db is known from _mapping.uniprot otherwise it is called "unknown"

        # get unique DBs to build the query dictionary
        dbsA = [x.split(":")[0] for x in entriesA]
        dbsB = [x.split(":")[0] for x in entriesB]
        for x in set(dbsA):
            query[x] = set()
        for x in set(dbsB):
            query[x] = set()
        for k in query.keys():
            if k.startswith("?"):
                del query[k]

        # the data to store
        mapping = {}
        N = len(data)

        # scan all entries
        counter = 0
        for entryA, entryB in zip(entriesA, entriesB):
            counter += 1
            dbA, idA = entryA.split(":")
            try:
                dbB, idB = entryB.split(":")
            except:
                print entryB
            if idA not in mapping.keys():
                if dbA.startswith("?"):
                    mapping[idA] = entryA
                else:
                    query[dbA].add(idA)
            if idB not in mapping.keys():
                if dbB.startswith("?"):
                    mapping[idB] = entryB
                else:
                    query[dbB].add(idB)

            for k in query.keys():
                if len(query[k])>2000 or counter == N:
                    this_query = list(query[k])
                    DBname = self._mapping_uniprot[k]

                    if DBname != None:
                        self.logging.warning("Request sent to uniprot for %s database (%s/%s)" % (DBname, counter, N))
                        res = self.uniprot.mapping(fr=DBname, to="ID", query=" ".join(this_query))
                        for x in this_query:
                            if x not in res: #was not found
                                mapping[x] = "!" + k+":"+x
                            else:
                                # we should be here since the queries are populated
                                # if not already in the mapping dictionary
                                if x == mapping.keys():
                                    raise ValueError(x)
                                index = res.index(x)
                                mapping[x] = res[index+1]
                    else:
                        for x in this_query:
                            mapping[x] = k + ":" + x
                    query[k] = set()

        for k in query.keys():
            assert len(query[k])==0
        return mapping
import sys
import os

# Directory in which one XML file per UniProt entry is stored
path_fichier = "../uniprot/"

contigs = []

# Collect the UniProt identifier of each representative sequence
with open("../question_2/resultatEBI.txt", "r") as ebi:
    for line in ebi:
        temp = line.split("\t")
        # The first 10 characters of the second column are skipped
        # (presumably a fixed prefix such as "uniprotkb:" -- TODO confirm).
        uniprot = temp[1][10:].strip(" \n\t\r")
        contigs.append(uniprot)

# Now run the searches
# NOTE(review): UniProt is not imported in this excerpt; it is expected to
# come from bioservices.
u = UniProt()
count = 0
for contig in contigs:
    nom_fichier = path_fichier+contig + ".xml"
    if os.path.isfile(nom_fichier):
        # already downloaded: nothing to fetch
        print("Fichier déjà existant: " + contig)
    else:
        result = u.searchUniProtId(contig)
        with open(nom_fichier, "w") as uni:
            uni.write(result.prettify())
    count += 1
    # print() with a single argument behaves the same on Python 2 and 3
    print("count = " + str(count) + "contig =  " + contig)

print("Nombre de contigs traités: " + str(count))

## The first line is always a contig
Esempio n. 46
0
def _write_row(outputfile, fields):
    """Write one tab-separated row, replacing empty values with 'None'."""
    outputfile.write("\t".join(x or 'None' for x in fields))
    outputfile.write("\n")


def _first_pdb_id(pdb_values):
    """Return the first PDB identifier found in ``pdb_values`` as a string.

    ``pdb_values`` holds the values of the dictionary returned by
    ``UniProt.mapping``.
    NOTE(review): each value is presumably a list of PDB ids; a plain
    (non-list) value is also accepted -- confirm against the installed
    bioservices version.
    """
    for item in pdb_values:
        if isinstance(item, (list, tuple)):
            return str(item[0]) if item else ''
        return str(item)
    return ''


def main():
    """ Main function.

    Read the VCF file given by ``args.vcf`` and write one tab-separated row
    per VEP ``CSQ`` annotation (or one row per record without ``CSQ``) to
    ``args.out``.

    The VCF file is read twice:

    1. a first pass collects, for every ENSP identifier found in the ``CSQ``
       annotations, the UniProt / PDB mapping returned by the UniProt
       service;
    2. a second pass writes the output rows, using that mapping.

    The VCF reader is an iterator, so it is re-created for the second pass;
    re-using the exhausted reader would write no rows at all.
    """
    args = parse_args()
    if args.log:
        logging.basicConfig(filename=args.log, level=logging.DEBUG,
                            filemode='w', format='%(asctime)s %(message)s',
                            datefmt='%Y-%m-%d %H:%M:%S')

    logging.info('Start.')
    logging.info('Command line: {}'.format(' '.join(sys.argv)))

    #Interface to the UniProt service
    u = UniProt(verbose=False)

    with open(args.out, "w") as outputfile:
        # Output header: one name per field written by the rows below
        # (the ENSP column was previously missing from the header).
        outputfile.write("chr\tpos\tid\tref\talt\tgene\tfeature\tfeature_type\tconsequence\tswissprotid\tensp\tuniprotid\tpdbid\tprotein_position\tamino_acid\n")

        # ---- Pass 1: ENSP id -> (UniProt ids, PDB mapping values)
        ensp_to_structures = {}
        queried_ensp = set()  # avoid querying the service twice for one ENSP
        with open(args.vcf, 'r') as vcf_handle:
            for record in vcf.Reader(vcf_handle):
                if "CSQ" not in record.INFO:
                    continue
                # Go through the annotations of all transcripts
                for current_csq_element in record.INFO['CSQ']:
                    curr_ENSP = str(current_csq_element.split('|')[26])
                    if curr_ENSP == "" or curr_ENSP in queried_ensp:
                        continue
                    queried_ensp.add(curr_ENSP)
                    # to get Protein ID given ENSP ID
                    current_protein_list = u.search(curr_ENSP, frmt="list")
                    for curr_protein in current_protein_list.split("\n"):
                        if curr_protein == "":
                            continue
                        # to get PDB ID given protein id
                        mapping_dictionary = u.mapping(fr="ID", to="PDB_ID", query=str(curr_protein))
                        if mapping_dictionary:
                            # The first protein with a PDB mapping wins.
                            ensp_to_structures[curr_ENSP] = (
                                list(mapping_dictionary.keys()),
                                list(mapping_dictionary.values()))
                            break

        # ---- Pass 2: write one row per annotation
        with open(args.vcf, 'r') as vcf_handle:
            for record in vcf.Reader(vcf_handle):
                location = [record.CHROM, str(record.POS), str(record.ID),
                            record.REF, ','.join(str(v) for v in record.ALT)]

                if "CSQ" not in record.INFO:
                    # No VEP annotation: all annotation fields are empty.
                    _write_row(outputfile, location + [''] * 10)
                    continue

                # Go through the annotations of all transcripts
                for current_csq_element in record.INFO['CSQ']:
                    current_csq = current_csq_element.split('|')
                    current_ENSP = current_csq[26]
                    uniprot_ids, pdb_values = ensp_to_structures.get(current_ENSP, ((), ()))
                    _write_row(outputfile, location + [
                        current_csq[4],   # gene
                        current_csq[6],   # feature
                        current_csq[5],   # feature_type
                        current_csq[1],   # consequence
                        current_csq[27],  # swissprotid
                        current_ENSP,
                        ','.join(str(i) for i in uniprot_ids),
                        _first_pdb_id(pdb_values),
                        current_csq[14],  # protein_position
                        current_csq[15],  # amino_acid
                    ])
Esempio n. 47
0
 def __init__(self, verbose=False):
     """Create the UniProt client and an empty ``sequences`` dictionary.

     :param bool verbose: forwarded to the ``UniProt`` constructor.
     """
     self.u = UniProt(verbose=verbose)
     self.sequences = dict()
Esempio n. 48
0
 def _init_uniprot(self):
     """Create ``self._uniprot`` unless the attribute already exists."""
     if not hasattr(self, "_uniprot"):
         # verbosity of the client follows this object's debugLevel
         self._uniprot = UniProt(verbose=self.debugLevel)
Esempio n. 49
0
 def func():
     """Run the UniProt search for ``query`` and write the result to ``target_fn``.

     ``query``, ``fmt`` and ``target_fn`` are taken from the enclosing scope.
     """
     payload = UniProt().search(query, frmt=fmt)
     with open(target_fn, 'wb') as handle:
         handle.write(payload)
Esempio n. 50
0
from pylab import rcParams
from liverx import wd
from matplotlib.colors import rgb2hex
from statsmodels.stats.multitest import multipletests
from scipy.stats.distributions import hypergeom
from bioservices import KEGG, KEGGParser, QuickGO, UniProt
from pandas import DataFrame, read_csv

# Plot styling.
# NOTE(review): `sns` is not imported in the visible import block --
# presumably `import seaborn as sns` appears earlier in the original file.
sns.set(style='ticks', palette='pastel', color_codes=True)

# ---- Import network
# Tab-separated file read from the `files` directory under `wd`.
network = read_csv('%s/files/string_mouse_network_filtered_800.txt' % wd, sep='\t')
# Proteins that occur in both the 'protein1' and the 'protein2' columns.
network_proteins = set(network['protein1']).intersection(network['protein2'])

# ---- Set-up UniProt
uniprot = UniProt(cache=True)

# ---- Set-up QuickGO bioservice
quickgo = QuickGO(cache=True)

# ---- Set-up KEGG bioservice
kegg, kegg_parser = KEGG(cache=True), KEGGParser()

# Restrict the KEGG service to the 'mmu' organism code.
kegg.organism = 'mmu'
print '[INFO] KEGG service configured'

# One parsed KGML pathway per pathway id reported by the service.
kegg_pathways = {p: kegg.parse_kgml_pathway(p) for p in kegg.pathwayIds}
print '[INFO] KEGG pathways extracted: ', len(kegg_pathways)

# Convert KEGG pathways Gene Name to UniProt
k2u = kegg.conv('uniprot', 'mmu')
Esempio n. 51
0
def get_more_source_dict_ids(source_dict, primary_key, **kwargs):
    """ Script to add more ids to source dict nodes
    to facilitate pairing to a network

    Arguments:
     source_dict: id_key: value.  Values that are not plain dicts are
      wrapped as {'value': original_value} before ids are added.

     primary_key: current type of ids used for the nodes.
      Must be a key of available_mapping_source, e.g.
      'Entrez Gene (GeneID)' or any of the options
      in the BioServices UniProt mappings.

    kwargs:
     mapping_types: a list of mapping types to include
      (default: default_mapping_target_list)
     verbose: print a message after each target type (default: True)
     node_id_type: accepted for backward compatibility, not used

    Returns:
     source_dict, also modified in place.  For every target type each
     entry gets a list of mapped ids (empty when nothing was found).
     When primary_key is invalid or bioservices is unavailable a message
     is printed and source_dict is returned unchanged.

    """
    can_continue = True

    if primary_key not in available_mapping_source:
        can_continue = False
        print("Error, you must specify a valid primary_key descriptor to match to in the available database, exiting...")

    if 'mapping_types' in kwargs:
        mapping_types = kwargs['mapping_types']
    else:
        mapping_types = default_mapping_target_list

    try:
        from bioservices import UniProt
        u = UniProt(verbose=False)
    except Exception:
        print("No bioservices module installed or cannot connect, exiting...")
        print("e.g. if you are using pip, did you 'pip install bioservices'?")
        can_continue = False

    verbose = kwargs.get('verbose', True)

    # Maximum number of items to
    # query at a time
    # Note there is a length limit in bioservices 1.2.1
    # for the web-based query string.
    # Trial-and-error suggests the most
    # id's that can be queried are
    # between 100 and 1000
    max_query_length = 500

    if not can_continue:
        return source_dict

    # Snapshot of the keys: entries are re-assigned below.
    the_query_ids = list(source_dict.keys())

    # One space-separated query string per batch of ids; no batch (and
    # therefore no request) when source_dict is empty.
    the_query_string_list = [
        ' '.join(the_query_ids[start:start + max_query_length])
        for start in range(0, len(the_query_ids), max_query_length)
    ]

    # Make every entry a dict so that the mapped ids can be stored in it.
    for the_key in the_query_ids:
        the_value = source_dict[the_key]
        if type(the_value) is not dict:
            source_dict[the_key] = {'value': the_value}

    the_source_code = available_mapping_source[primary_key]
    for the_target_type in mapping_types:
        the_target_code = available_mapping_target[the_target_type]
        the_result = {}
        for the_query_string in the_query_string_list:
            the_result.update(u.mapping(fr=the_source_code, to=the_target_code, query=the_query_string))
        if verbose:
            print("** Finished mapping for %s to %s. **" % (primary_key, the_target_type))
        for the_query_id in the_query_ids:
            # Missing or empty results are stored as an empty list.
            source_dict[the_query_id][the_target_type] = the_result.get(the_query_id) or []

    return source_dict
#
# By Guillaume Lahaie
# LAHG0407707
#
# Last modified: 17 December 2013
#
# Program that retrieves the UniRef entry corresponding to a GenBank accession number

from bioservices import UniProt
import sys
import os
from BeautifulSoup import BeautifulSoup

# Directory prefix under which one "result<contig>.xml" file is written per hit.
UNIREF_PATH = "../uniref/"

u = UniProt()
# uniref_mapping.txt receives one "<contig>\tNone" line per contig without result.
with open("uniref_mapping.txt", "w") as r:
    with open("resultatNBCI.txt", "r") as f:
        for line in f:
            # Fields are separated by "-|-": the first one is the contig,
            # the third one the accession.
            temp = line.split("-|-")
            print("Traitement du contig " + temp[0])
            accession = temp[2].strip(" \t\n\r")
            # NOTE(review): the result of this mapping request is discarded;
            # kept because removing it would change the requests sent.
            u.mapping(fr='EMBL_ID', to='NF100', query=accession)
            # NOTE(review): the keyword is spelled `format` here; confirm it
            # matches the installed bioservices version.
            res = u.search(accession, format='xml', limit=10)
            # Compare by value: `is ''` tests object identity, which is not
            # guaranteed to hold for an empty string.
            if res == '':
                r.write(temp[0] + "\tNone\n")
                print("aucun résultat pour ce contig")
            else:
                contig = temp[0].strip(" \t\n\r")
                with open(UNIREF_PATH+"result"+contig+".xml", "w") as xml:
                    xml.write(res)
Esempio n. 53
0
# -*- coding: utf-8 -*-
"""
Created on Mon Mar 28 11:39:12 2016

@author: Pieter
"""
import ms2matcher.ms2matcher as ms
import os
import argparse
import pandas as pd
import numpy as np
from bioservices import UniProt
u = UniProt(verbose=False)

# Parse the command-line arguments
parser = argparse.ArgumentParser(description='MS2 experimental to database matcher')
parser.add_argument("filepath",type=str,
                    help="The path to the folder containing the experimental spectra to process.", metavar="filepath")
parser.add_argument("--toleranceMS1","-t1",type=float,dest='ms1Tolerance',default=50,
                    help="The mass accuracy for MS1 (default = 50 ppm).", metavar="t1")
# The help text previously said "MS1" (copy-paste error); this option is the MS2 tolerance.
parser.add_argument("--toleranceMS2","-t2",type=float,dest='ms2Tolerance',default=0.1,
                    help="The mass accuracy for MS2 (default = 0.1 Da).", metavar="t2")
parser.add_argument("--FDR","-fdr",type=float,dest='desiredFDR',default=0.05,
                    help="The desired FDR.", metavar="fdr")
args = parser.parse_args()

# Initialise file path to experimental spectra .dta files
spectraFilePath = os.path.normpath(args.filepath)

# Initialise other file paths (respect folder hierarchy in package)
# Directory part of __file__; it is relative when the script was started
# through a relative path.
script_dir = os.path.dirname(__file__)
Esempio n. 54
0
def convert_gene_ids_bt(xml_file_in, id_identity = None, id_formatter = None, translate_file = None, ncbi_gene_file = None):
    """
    Replace all found instances of old gene IDs to new IDs.
    N.B. It will only look at 'GENE ASSOCIATION' lines.
    'translate_file' should be 2 column tsv file.

    The result is written next to the input, to the input file name with
    '.xml' replaced by '_out.xml' (or with '_out.xml' appended when the
    name contains no '.xml').  Lines without 'GENE ASSOCIATION' are copied
    unchanged.

    Arguments:
     xml_file_in: path of the XML file to convert.
     id_identity: regular expression finding the old gene IDs
      (default: model_metacyc_identifier).
     id_formatter: callable turning a found ID into the key used in the
      translation table (default: model_metacyc_gene_2_biocyc).
     translate_file: 2 column tsv file, old ID -> new ID.
     ncbi_gene_file: NCBI gene list used to map gene names to BT locus
      IDs (default: the path previously hard-coded in this function).

    Returns:
     None
    """

    ## Create ID conversion dictionary for MetaCyc

    translate_file = translate_file or '/Users/wbryant/Dropbox/Bacteroides/BioCyc_-_Protein-Gene-relations/BioCyc_BT_-_Protein-Gene-relations.txt'
    id_dict = {}
    with open(translate_file,'r') as trans_in:
        for line in trans_in:
            ids = line.split("\t")
            # Lines without a second column (e.g. blank lines) are skipped
            if len(ids) > 1 and len(ids[1]) > 0:
                id_dict[ids[0]] = ids[1].strip()

    id_identity = id_identity or model_metacyc_identifier
    id_formatter = id_formatter or model_metacyc_gene_2_biocyc


    ## Create gene -> locus dictionary from NCBI file

    ncbi_gene_file = ncbi_gene_file or '/Users/wbryant/work/BTH/data/NCBI/gene_list.dat'
    ncbi_id_dict = {}
    ncbi_id = None
    with open(ncbi_gene_file,'r') as ncbi_in:
        for line in ncbi_in:
            if re.search(r'[0-9]+\.[ ].+',line):
                # Numbered entry line: the gene name is its last word
                ncbi_id = line.strip().split(" ")[-1]
            elif 'Other Aliases' in line and ncbi_id is not None:
                # The last BT_ alias on the line wins
                for bt_id in re.findall(r'BT\_[0-9]+',line):
                    ncbi_id_dict[ncbi_id] = bt_id


    ## Some specific UniProt IDs do not map - so put them here manually:

    uniprot_manual_dict = {
        'Q8A1G3_BACTN': 'BT_3698',
        'G8JZS4_BACTN': 'BT_3703',
        'Q8A1G0_BACTN': 'BT_3704',
        'Q89YR9_BACTN': 'BT_4662',
    }


    ### Run through lines of input file replacing relevant gene IDs with new gene IDs

    u = UniProt(verbose=False)
    xml_file_out = re.sub(r'\.xml','_out.xml',xml_file_in)
    if xml_file_out == xml_file_in:
        # No '.xml' in the name: never open the input file for writing
        xml_file_out = xml_file_in + '_out.xml'

    with open(xml_file_in,'r') as f_in, open(xml_file_out,'w') as f_out:
        for line in f_in:
            if 'GENE ASSOCIATION' not in line:
                f_out.write(line)
                continue

            ## Look for genes fitting id_identity, convert and replace

            # Insert a space in front of the last tag of the line
            line = re.sub(r'(\<[^\>]+\>[ \n]*$)',r' \g<1>',line)

            for old_id in re.findall(id_identity, line):
                old_id_formatted = id_formatter(old_id)
                try:
                    new_id = id_dict[old_id_formatted]
                except KeyError:
                    new_id = old_id_formatted
                    print("ID '%s' not found ..." % new_id)
                line = line.replace(old_id,new_id,1)

            ## Remove extraneous gene surrounds
            line = re.sub(r'\(gene\:([^\)]+)_i\)',r'\g<1>',line)


            ## Look for UniProt genes and convert
            if 'uniprot' in line:
                for uniprot_entry in re.findall(r'\(uniprot\:[^\)]+\)',line):
                    ## Map IDs
                    uniprot_id = re.sub(r'\(uniprot\:([^\)]+)\)',r'\g<1>',uniprot_entry)

                    try:
                        new_entry = u.mapping(fr='ACC',to='KEGG_ID',query=uniprot_id)[uniprot_id][0]
                    except Exception:
                        # Best effort: any failure of the remote mapping
                        # falls back to the manual table
                        print("Protein ID '%s' not found in mapping, trying local ..." % uniprot_id)
                        try:
                            new_entry = uniprot_manual_dict[uniprot_id]
                        except KeyError:
                            print("Protein ID '%s' not found in local ..." % uniprot_id)
                            new_entry = uniprot_id

                    new_id = re.sub(r'bth\:([^\)]+)',r'\g<1>',new_entry)
                    line = line.replace(uniprot_entry,new_id,1)


            ## Get gene string
            line_groups = re.search(r'(.+GENE ASSOCIATION\:[ ]*)(.+)([ ]*\<.+)',line)
            if line_groups is None:
                # Line does not have the expected
                # '...GENE ASSOCIATION: genes <tag>' layout: keep it as is
                f_out.write(line)
                continue
            gene_string = line_groups.group(2)

            if '_BACTN' in gene_string:
                print(gene_string)

            ## Look for NCBI IDs (like susG) and replace with BT IDs
            potential_ncbis = re.findall(r'[a-zA-Z0-9\_]+',gene_string)
            if '_BACTN' in gene_string:
                print(", ".join(potential_ncbis))
            for potential_ncbi in potential_ncbis:
                if potential_ncbi in ncbi_id_dict:
                    new_id = ncbi_id_dict[potential_ncbi]
                    gene_string = gene_string.replace(potential_ncbi,new_id,1)
                elif potential_ncbi in uniprot_manual_dict:
                    new_id = uniprot_manual_dict[potential_ncbi]
                    gene_string = gene_string.replace(potential_ncbi,new_id,1)


            ## Remove duplicates, keeping the first occurrence of each gene
            ## so that the output order is deterministic
            gene_list = []
            seen_genes = set()
            for gene in gene_string.split(" or "):
                if gene not in seen_genes:
                    seen_genes.add(gene)
                    gene_list.append(gene)


            ## Reconstitute line
            new_line = line_groups.group(1)
            new_line += " or ".join(gene_list)
            new_line += line_groups.group(3)
            # The pattern cannot match the line terminator: re-attach
            # whatever follows the match so the newline is not lost
            new_line += line[line_groups.end():]

            f_out.write(new_line)
Esempio n. 55
0
def get_more_node_ids(the_network, **kwargs):
    """ Script to add more identifiers to model notes
    based on the node.id

    Arguments:
     the_network: a Network object, modified in place

    kwargs:
     node_id_type: current type of ids used for the nodes.
      Currently can be Entrez Gene (GeneID) or any of the
      options in the BioServices UniProt mappings
      (default: "Entrez Gene (GeneID)")
     mapping_types: a list of target mapping id types to include
      (default: default_mapping_target_list)
     verbose: print a message after each target type (default: True)

    Returns:
     the_network.  For every target type, node.notes[target type] is set
     to the list of mapped ids (empty when nothing was found).  When
     bioservices is unavailable or the network has no nodes a message is
     printed and the network is returned unchanged.

    TODO: determine the best source db/module
    for pairings from bioservices

    """
    can_continue = True

    try:
        from bioservices import UniProt
        u = UniProt(verbose=False)
    except Exception:
        print("No bioservices module installed or cannot connect, exiting...")
        print("e.g. if you are using pip, did you 'pip install bioservices'?")
        can_continue = False

    if len(the_network.get_node_locations()) == 0:
        print('The network has no nodes, exiting...')
        can_continue = False

    node_id_type = kwargs.get('node_id_type', "Entrez Gene (GeneID)")

    if 'mapping_types' in kwargs:
        mapping_types = kwargs['mapping_types']
    else:
        mapping_types = default_mapping_target_list

    verbose = kwargs.get('verbose', True)

    # Maximum number of items to
    # query at a time
    # Note there is a length limit in bioservices 1.2.1
    # for the web-based query string.
    # Trial-and-error suggests the most
    # id's that can be queried are
    # between 100 and 1000
    max_query_length = 500

    if not can_continue:
        return the_network

    model_node_ids = []
    for the_nodetype in the_network.nodetypes:
        model_node_ids.extend(x.id for x in the_nodetype.nodes)

    # One space-separated query string per batch of ids; no batch (and
    # therefore no request) when there is no node id.
    query_string_list = [
        ' '.join(model_node_ids[start:start + max_query_length])
        for start in range(0, len(model_node_ids), max_query_length)
    ]

    the_source_code = available_mapping_source[node_id_type]
    for the_target_type in mapping_types:
        the_target_code = available_mapping_target[the_target_type]
        the_result = {}
        for the_query_string in query_string_list:
            the_result.update(u.mapping(fr=the_source_code, to=the_target_code, query=the_query_string))
        if verbose:
            print("**Finished mapping for %s to %s.**" % (node_id_type, the_target_type))
        for the_nodetype in the_network.nodetypes:
            for the_node in the_nodetype.nodes:
                # Missing or empty results are stored as an empty list.
                the_node.notes[the_target_type] = the_result.get(the_node.id) or []

    return the_network
Esempio n. 56
0
class Annotations(MassSpecReader):
    """Create/store/read annotations from uniprot and figure out entry names

    The Annotations classes allows one to populate the dataframe attribute
    :attr:`df` with the **Entry** and **Entry_name**  columns (UniProt entries).
    These columns are not strictly required, but more tools are available when
    they are present. The class also creates a new dataframe called :attr:`annotations`
    that stores in particular the protein sequence and the GO terms. The former
    being used to check the peptide sequence and the latter to plot relevant
    histogram about GO terms.

    This class inherits from :class:`msdas.readers.MassSpecReader`. Consequently,
    input can be a MassSpecReader instance, or a filename or even nothing (data
    can be read at a later stage). The dataframe must contain the Protein column.

    One reason to fetch the entries from UniProt is that the protein column name
    may contain typos or non-uniprot entries, therefore it is quite useful to
    fetch all entries from uniprot based on the protein name provided. This can
    be done thanks to the :meth:`get_uniprot_entries`. This method fills a dictionary called
    :attr:`_mapping` (note the underscore), which is used to populate a new column
    in the dataframe called **Entry**.

    If your initial dataframe contains the columns "Entry" with all valid UniProt
    entries (e.g., P23300) then the :attr:`_mapping` attribute is populated during the
    initialisation and the call to :meth:`get_uniprot_entries` can be skipped.
    If called, it will also be faster but will overwrite the content of the Entry column.
    You can also fill/correct/complete the :attr:`_mapping` attribute before calling
    :meth:`get_uniprot_entries`

    .. doctest::

        >>> from msdas import annotations
        >>> import pandas as pd
        >>> df = pd.DataFrame({
            'Protein':['DIG1'],
            'Sequence_Phospho':['SS(Phospho)T'],
            'Psite':['S2']})
        >>> a = annotations.Annotations(df, "YEAST")
        >>> a._mapping
        {}
        >>> a.get_uniprot_entries()
        {'DIG1_YEAST': ['Q03063']}
        >>> a.df.Entry
        0    Q03063
        Name: Entry, dtype: object

    Then, call :meth:`set_annotations`, which will fetch all annotations from
    uniprot and store them in a new dataframe in the :attr:`annotations` attribute

    ::

        a.set_annotations()
        a.annotations

    A new field called **Entry_name** is also added to the dataframe itself::

        a.df.Entry_name

    On a big data set, it may take a few minutes to fetch all information from uniprot.
    So, we also provide tools to save and read back the relevant information (
    :meth:`read_annotations`, :meth:`to_pickle`, :meth:`to_csv`  ) ::

        from msdas import *
        r = readers.MassSpecReader(get_yeast_raw_data())
        # this takes about 10 minutes depending on the connection for 1600 unique protein names
        r.get_uniprot_entries()
        r.set_annotations()
        r.to_pickle(tag="test") # creates a file called YEAST_annotations_test.pkl
        r.to_csv("data.csv")

    Next time, just type::

        from msdas import *
        a = annotations.Annotations("data.csv", "YEAST")
        a.read_annotations("YEAST_annotations_test.pkl")

    To check that the entries are correct, one thing that can be done is to
    look for the peptide sequence into the FASTA sequence found in the annotations::

        a.check_entries_versus_sequence()

    This is a very good sanity check to verify that the entry names found
    correspond to the peptide provided. If not, the protein name was probably wrong
    or was a gene name that could not be mapped correctly to the correct protein.

    If some entries are not found or mapping was not found, you need to
    manually check the issues and update the :attr:`_mapping` attribute,
    update the uniprot entries and annotations::

        a._mapping[entry] = ['entry name']
        a.get_uniprot_entries()
        a.set_annotations()
        a.check_entries_versus_sequence()

    if you cannot find a mapping, we would recommend to delete the item from
    the dataframe :attr:`df`.
    """
    def __init__(self, data, organism=None, verbose=True, annotations=None, **kargs):
        """.. rubric:: Constructor

        :param data: a MassSpecReader compatible input (e.g., CSV file, None, a
            MassSpecReader instance). See :class:`msdas.readers.MassSpecReader`
            documentation for details
        :param organism: valid uniprot identifier for the organism e.g., HUMAN
            YEAST.
        :param verbose: forwarded to the parent constructor.
        :param annotations: a pickled file containing the annotations saved
            using :meth:`to_pickle`.
        :param kargs: valid parameter recognised by :class:`msdas.readers.MassSpecReader`
        :raises ValueError: if *organism* is None.

        """
        super(Annotations, self).__init__(data=data, verbose=verbose, **kargs)
        # Checked only after the parent constructor has run.
        if organism is None:
            raise ValueError("organism must be provided e.g. YEAST, HUMAN")

        self.organism = organism

        #: the dataframe where annotations from uniprot will be stored.
        self.annotations = None

        # Must exist before build_mapping_from_df, which fills it.
        self._mapping = {}
        self.build_mapping_from_df()# if Entry is provided

        self._init_uniprot()

        # Only read when a truthy value is given.
        if annotations:
            self.read_pickle(annotations)

    def _init_uniprot(self):
        """Create ``self._uniprot`` unless the attribute already exists."""
        if not hasattr(self, "_uniprot"):
            # verbosity of the client follows this object's debugLevel
            self._uniprot = UniProt(verbose=self.debugLevel)

    def _update_species_to_find(self):
        """Store in ``_species_to_find`` the entry names not yet in ``_mapping``.

        An entry name is a value of the Protein column followed by ``"_"``
        and the organism. The resulting list holds each missing entry name
        once, in order of first appearance in the dataframe.
        """
        suffix = "_" + self.organism
        pending = []
        seen = set()
        for protein in self.df.Protein:
            entry_name = protein + suffix
            # Direct dict membership test instead of scanning .keys()
            if entry_name in self._mapping or entry_name in seen:
                continue
            seen.add(entry_name)
            pending.append(entry_name)
        self._species_to_find = pending

    def build_mapping_from_df(self):
        """Populate the _mapping dictionary using the Uniprot Entry column

        Each row adds ``_mapping[name] = [entry]`` where *name* is the value
        of the Protein column, with ``"_"`` and the organism appended unless
        already present, and *entry* the value of the Entry column. A
        warning is emitted and nothing is changed when the dataframe has no
        Entry column.
        """
        if "Entry" not in self.df.columns:
            self.warning("Entry column not found in the dataframe. call get_uniprot_entries")
            return

        suffix = "_" + self.organism
        # Walk the two columns together rather than indexing row by row
        # with the .ix indexer, which recent pandas versions no longer have.
        for protein, entry in zip(self.df.Protein, self.df.Entry):
            name = protein if protein.endswith(suffix) else protein + suffix
            self._mapping[name] = [entry]

    def get_uniprot_entries(self, Nmax=50):
        """Search for the uniprot entries and entry names given protein column.

        Protein names from the dataframe are first used to feed uniprot mapping tool.
        Some protein names won't be found as a uniprot entry because there are
        not uniprot entry name but gene names.  We therefore also scan
        missing entries by looking for gene names. Once found, the proposed
        items that contain the gene names and organism are candidates for the
        entry names. There may be several solutions though, which explain why
        the values in the :attr:`_mapping` dictionary  are made of lists. If several
        candidates are found, a warning is raised.

        Results are stored in :attr:`_mapping` and in the dataframe itself.

        Let us show one example with 3 protein names that cover all cases:

            * DIG1, is a valid uniprot entry
            * ASC1 is not a uniprot entry. It is a gene name from which the entry
              may be retrieved automatically.
            * LEU1 is a gene name AND a uniprot entry. This is an ambiguous
              case. The default is to use the uniprot entry but if you call
              :meth:`check_entries_versus_sequence` (after :meth:`set_annotations`)
              you will see that there is a mismatch meaning that LEU1_YEAST provided
              in the protein column is actually not the protein name but the gene name


        ::

            >>> import pandas as pd
            >>> from msdas import *
            >>> df = pd.DataFrame({'Protein':['DIG1', 'LEU1', 'ASC1'],
                'Sequence_Phospho':['S(Phospho)APAQVTQHSK', 'VEVTS(Phospho)EDEK',
                                    'DS(Phospho)VTIISAGNDK'],
                'Psite':['S142','S495', 'S166']})
            >>> a = Annotations(df, "YEAST")
            >>> a.get_uniprot_entries()
            >>> a._mapping
            {'ASC1_YEAST': ['P38011', 'P01120'],
             'DIG1_YEAST': ['Q03063'],
             'LEU1_YEAST': ['P06208-1', 'P06208']}

        Here, DIG1 has one unique entry. This is expected because DIG1 is in fact
        an entry name (unique by definition). ASC1 is a gene name. This method
        figures out that it correspond to either P38011 or P01120. There are
        several entries because mapping from gene to protein is not unique.
        By default, the entry with highest score appears first. There is no 100% guarantee
        that this mapping is correct and :meth:`check_entries_versus_sequence`
        should be called to check that the peptide sequence is contained in
        this entry sequence. The last case (LEU1) is even more problematic because
        it is a valid entry name even though the protein name provided is actually
        a gene name... again call :meth:`check_entries_versus_sequence`.


            >>> a.set_annotations()
            >>> a.check_entries_versus_sequence()
            P06208-1 not found in the annotations index

        So, here we are told that amongst the 3 entries, P06208-1 is not found.
        This the LEU1 case. If you were to use batch tool, you would figure out
        given the peptide sequence that this is actually LEUC_YEAST entry with
        uniprot entry LEUC_YEAST/P07264.

        So, you need to manually update the mapping:

            >>> a._mapping['LEU1_YEAST'] = ['P07264']
            >>> a.get_uniprot_entries() # to update the main df with new entries
            >>> a.set_annotations() # to retrieve the sequence of LEUC_YEAST
            >>> a.check_entries_versus_sequence()

        .. seealso:: :meth:`set_annotations`

        """
        # get the mapping using bioservices.uniprot

        # apply function is 3 times slower than list...
        # entry_names = self.df.Protein.apply(lambda x: x + "_" + self.organism)
        self._update_species_to_find()
        if len(self._species_to_find)>0:
            self.logging.info("Fetching uniprot accession numbers for %s entries" % len(self.df.Protein))
            self.logging.info("Fetching uniprot accession numbers for %s unique entries" % len(self.df.Protein.unique()))
            mapping = self._uniprot.multi_mapping(fr="ID", to="ACC",Nmax=Nmax,
                                              query=self._species_to_find)
            for k,v in mapping.iteritems():
                if k not in self._mapping.keys():
                    self._mapping[k] = v

        # some species may not be found (secondary accession number) if _human
        # appended in tcell case. so we may need to call again the mapping but
        # without the appended organism string.
        self._update_species_to_find()
        if len(self._species_to_find):
            self.logging.info("Some species were not found ({}). Using secondary accession:".format(len(self._species_to_find)))
            self.logging.info("Fetching uniprot without trailing species")
            self.logging.info("Fetching  %s new ones " % len(self._species_to_find))
            mapping = self._uniprot.multi_mapping(fr="ID", to="ACC",
                    query=[x.split("_")[0] for x in self._species_to_find], Nmax=Nmax)
            for k,v in mapping.iteritems():
                self._mapping[k+ "_" + self.organism] = v

        # Some are not yet found. this could be because the provided protein name is actually a
        # gene name...
        def func(x, tag):
            if len(x)==0:
                return False
            else:
                return tag in x[0].split()

        self._genes = {}
        self._update_species_to_find()
        if len(self._species_to_find):

            self.logging.info("Some species are still not found {}. Trying to use gene names".format(len(self._species_to_find)))
            self.logging.info("Fetching uniprot accession numbers for those without _species appended")
            self.logging.info("Fetching  %s new ones " % len(self._species_to_find))
            for i,this in enumerate(self._species_to_find):
                if " " in this:
                    continue
                self.logging.info("Searching for entry {}/{} for gene names".format(i+1,len(self._species_to_find)))
                df = self._uniprot.get_df(this.split("_")[0], organism=self.organism)
                l1 = df['Gene names'].apply(lambda x : func(x,this.split("_")[0] ))
                l2 = df['Entry name'].apply(lambda x: x.endswith(self.organism))

                if sum(l1&l2) >= 1:
                    k = list(df.ix[l2&l1]['Entry name'])
                    v = list(df.ix[l2&l1]['Entry'])
                    self.logging.debug(k, v, this)
                    if k in self._mapping.keys():
                        raise ValueError("!!!!!!!%s Already in the dictionary " % k)
                    #self._mapping[k] = v
                    self._mapping[this] = v
                else:
                    print("skipping %s... sum=%s" % (this, sum(l1&l2)))

        self._update_species_to_find()
        if len(self._species_to_find):
            self.logging.info("Some species were not found. Using gene names")
            self.logging.info("Fetching uniprot accession numbers for those without _species appended")
            self.logging.info("Fetching  %s new ones " % len(self._species_to_find))

        self._append_uniprot_entries_to_df()

    def _append_uniprot_entries_to_df(self):
        """Add (or replace) the ``Entry`` column of :attr:`df`.

        Each value of the ``Protein`` column is suffixed with ``_`` and
        :attr:`organism`, then looked up in :attr:`_mapping`. The first item
        of the mapped value is stored in ``Entry``. Names that are absent from
        the mapping get an empty string and are reported on standard output.
        """
        has_entry_column = "Entry" in self.df.columns
        if has_entry_column:
            self.logging.warning("Overwritting column called Entry in the dataframe")

        suffix = "_" + self.organism
        selected = []
        # walk the proteins in dataframe order so that the new column lines
        # up with the rows of df
        for protein in self.df.Protein:
            key = protein + suffix
            candidates = self._mapping.get(key, "")
            if candidates == "":
                # not found: the empty string is used as a label
                selected.append("")
                print("!! ", key, " not found")
                continue
            if len(candidates) > 1:
                self.logging.info("Found entry with several matches: %s %s . Only first one is selected (highest uniprot score)" % (key,
                                  candidates))
            selected.append(candidates[0])

        # sharing df.index is what makes the join below align row by row
        entry_column = pd.DataFrame({'Entry': selected}, index=self.df.index)
        if has_entry_column:
            del self.df['Entry']
        self.df = self.df.join(entry_column)

    def _append_uniprot_entry_names_to_df(self):
        """Add the ``Entry_name`` column to :attr:`df`.

        For every value of the ``Entry`` column, the ``Entry name`` found in
        :attr:`annotations` (indexed by entry) is stored; entries that are not
        in the annotations index get an empty string.

        If :attr:`annotations` is not set, an error is reported through
        :meth:`error` and the dataframe is left untouched.
        """
        # "is None" works on every Python version, unlike types.NoneType
        if self.annotations is None:
            self.error("must call set_annotations first")
            return

        names = self.annotations['Entry name']
        known = self.annotations.index
        # label-based lookup (.loc) replaces the removed .ix indexer
        self.df['Entry_name'] = [names.loc[e] if e in known else ""
                                 for e in self.df.Entry]

    def plot_goid_histogram(self, drop_duplicates=True):
        """Histogram of the number of GO terms per peptide and per protein

        Two figures are created: figure 1 shows the number of GO identifiers
        for each row of :attr:`df` (a protein is counted once per row), and
        figure 2 shows the same quantity for each row of :attr:`annotations`.

        :param drop_duplicates: if True, the GO identifiers are counted once
            per unique entry and then expanded to every row of :attr:`df`;
            otherwise they are counted row by row. Both give the same
            per-peptide values.
        :raises AttributeError: if :attr:`annotations` is not set

        .. plot::
            :width: 80%
            :include-source:

            from msdas import *
            m = Annotations(get_yeast_small_data(), "YEAST", verbose=False)
            m.set_annotations()
            m.plot_goid_histogram()

        .. todo:: is this functional process or not

        """
        # the unset state of annotations is None (see set_annotations), so a
        # test against False alone can never detect missing annotations
        if self.annotations is None or self.annotations is False:
            raise AttributeError(self._error_messages['annotations'])

        go_ids = self.annotations['Gene ontology IDs']

        # if we want the GO per peptides, then we need to look at the original
        # dataframe that contains several psites per peptide: the counts are
        # repeated once per row of df
        if drop_duplicates:
            entries = self.df.Entry.drop_duplicates()
            counter = go_ids.loc[entries].apply(len)
            duplicated_counter = [counter.loc[x] for x in self.df.Entry]
        else:
            counter = go_ids.loc[self.df.Entry].apply(len)
            duplicated_counter = list(counter)

        M = counter.max()
        pylab.figure(1)
        pylab.clf()
        pylab.hist(duplicated_counter, bins=[x+.5 for x in range(0, M+1)])
        pylab.title("Distribution of number of GO id terms per peptide")
        pylab.grid()

        # annotations contains the unique protein entry, so here we get the
        # number of GO terms per protein
        counter = go_ids.apply(len)
        M = counter.max()
        pylab.figure(2)
        pylab.clf()
        pylab.hist(counter, bins=[x+.5 for x in range(0, M+1)])
        pylab.title("Distribution of number of GO id terms per protein")
        pylab.grid()

    def set_annotations(self, Nmax=100):
        """Fetch information from uniprot and set :attr:`annotations`
        as a pandas dataframe.


        Look into the dataframe Entry column and update the annotations dataframe
        to populate missing entries. The Entry column in the :attr:`df` should have been
        populated by :meth:`get_uniprot_entries` with valid entries from Uniprot.

        If you have thousand of entries, this is taking a few minutes. You can
        save the annotations and read them back using :meth:`msdas.MassSpecReader.read_annotations`
        and :meth:`to_pickle`.

        :param int Nmax: forwarded to the uniprot ``get_df`` call as ``nChunk``
        :raises ValueError: if there is nothing to fetch and no annotations
            exist yet, or if uniprot returns none of the requested entries

        """
        self.logging.info("Fectching information from uniprot. Takes some time")

        #could split if too long
        entries = [this for this in set(self.df.Entry) if this]

        # not need to search again if already present in the attribute
        if self.annotations is not None:
            known = set(self.annotations.index)
            entries = [x for x in entries if x not in known]

        if len(entries) == 0:
            if self.annotations is None:
                # nothing to fetch and nothing loaded: there is no dataframe
                # to tidy up, so fail with the same error as an empty result
                raise ValueError("your list of protein is empty")
            self.warning("No new entries found. Your annotations dataframe is already up-to-date")
            self.annotations.drop_duplicates(subset="Entry name", inplace=True)
            self._append_uniprot_entry_names_to_df()
            return

        annotations = self._uniprot.get_df(entries, nChunk=Nmax)
        # keep only the rows that were actually requested
        annotations = annotations[annotations.Entry.isin(entries)]

        if len(annotations) == 0:
            raise ValueError("your list of protein is empty")
        self.logging.info("Fectching {}".format(len(annotations)))
        annotations = annotations.set_index("Entry")
        if self.annotations is None:
            self.annotations = annotations
        else:
            # pd.concat replaces DataFrame.append, removed in pandas 2.0
            self.annotations = pd.concat([self.annotations, annotations])
        self.logging.info("Annotations have been loaded. You can save the annotations" +
            " dataframe attribute using x.to_pickle('annotations.pkl') " +
            " Next time, you could just load if using \n\n" +
            "     >>> m = readers.MassSpecReader(filename, mode='yeast')\n" +
            "     >>>  m.read_annotations('annotations.pkl')")

        #indices are the uniprot entry. Some may be identical with slightly different columns
        # but the entry name should be unique. Here, we keep the first instance of each entry
        self.annotations.drop_duplicates(subset="Entry name", inplace=True)
        self._append_uniprot_entry_names_to_df()

    def to_pickle(self, tag=None, overwrite=False):
        """Pickle the annotations dataframe into the current directory

        :param tag: optional string appended to the file name; values that
            are not strings are ignored.
        :param overwrite: set to True to replace an existing file
        :raises IOError: if the file exists and *overwrite* is False

        The file is named organism_annotations_tag.pkl (or
        organism_annotations.pkl without a tag).
        """
        stem = self.organism + "_annotations"
        if isinstance(tag, str):
            stem = stem + "_" + tag
        filename = stem + ".pkl"

        # the equality test (not plain truthiness) is kept on purpose so that
        # exactly the same values of overwrite as before trigger the check
        if overwrite == False and os.path.exists(filename):
            raise IOError("file %s already exists" % filename)
        self.annotations.to_pickle(filename)

    def read_pickle(self, filename):
        """Read annotations in pickled format as saved by :meth:`to_pickle`

        The dataframe is stored in :attr:`annotations`. Every entry name it
        contains that is not yet a key of :attr:`_mapping` is added to the
        mapping, pointing to a one-item list holding its index value.
        Mappings that already exist are left as they are.

        If anything goes wrong an error is logged and no exception is raised.

        :param str filename: filename to read
        """
        # NOTE: unpickling can execute arbitrary code; only read files that
        # come from a trusted source.
        try:
            self.annotations = pd.read_pickle(filename)

            # update the mapping dictionary. The mapping is keyed by entry
            # name, so the entry name (not the index value) must be tested.
            for accession, entry_name in self.annotations['Entry name'].items():
                if entry_name not in self._mapping:
                    self._mapping[entry_name] = [accession]
        except Exception:
            self.logging.error("Could not read your file. Expected a pkl \
            containing a dataframe with Entry name and index being uniprot \
            indices. ")

    def hist_most_relevant_goids(self, N=10, tight_layout=True, wrap_length=40,
                                 drop_duplicates=True, **kargs):
        """Plot histogram of the GO identifiers found in all proteins.

        :param int N: restrict histogram to terms that appear more than N
            times. A false value (0 or None) keeps all the terms.
        :param bool tight_layout: call ``pylab.tight_layout`` after plotting
        :param int wrap_length:  wrap text on the y-axis by wrap_length (defaults to 40)
        :param drop_duplicates: drop the duplicated entries, so that each
            entry of :attr:`df` contributes its terms only once
        :param kargs: pandas.plot arguments accepted. ``legend`` defaults to
            False.
        :raises AttributeError: if :attr:`annotations` is not set

        .. plot::
            :include-source:
            :width: 80%

            from msdas import *
            m = Annotations(get_yeast_small_data(), "YEAST", verbose=False)
            m.set_annotations()
            m.hist_most_relevant_goids(N=5)


        .. todo:: this is made on the annotations dataframe. Should be
            done based on the entry names in the dataframe

        """
        from collections import Counter

        # the unset state of annotations is None (see set_annotations), so a
        # test against False alone can never detect missing annotations
        if self.annotations is None or self.annotations is False:
            raise AttributeError(self._error_messages['annotations'])
        kargs.setdefault('legend', False)

        entries = self.df.Entry
        if drop_duplicates:
            entries = entries.drop_duplicates()

        # one pass over all the terms instead of one list.count() per term
        go_terms = self.annotations['Gene ontology (GO)']
        counts = Counter(y for x in go_terms.loc[entries] for y in x)

        # let us wrap the string by wrap_length characters max to avoid long
        # labels in the figure
        names = ["\n".join(textwrap.wrap(name, width=wrap_length)) for name in counts]

        df = pd.DataFrame({'name': names, 'size': list(counts.values())},
                          index=range(0, len(counts)))
        if N:
            df = df[df['size'] > N]
        subdf = df.set_index("name")
        subdf.sort_values("size").plot(kind="barh", **kargs)
        if tight_layout:
            pylab.tight_layout()

    def check_entries_versus_sequence(self):
        """Check that peptide sequence are contained in uniprot sequence

        This is a very good sanity check on the validity of the uniprot entry names
        found by :meth:`get_uniprot_entries` method

        If a peptide sequence is not found, it means that the protein name is
        not correct.

        Nothing is returned. For each row of :attr:`df`, a line is printed if
        its entry is missing from the annotations index, or if its peptide
        sequence is not part of the sequence stored for that entry.

        See AnnotationsYeast class where the :meth:`AnnotationsYeast.update_mapping` is used to
        update the incorrect mapping.

        :raises Exception: if :attr:`annotations` is not set

        .. seealso:: :meth:`find_sequence_blast`

        """

        self.logging.info("Comparing peptide sequence in the attribute df with sequences in the annotations")
        self.logging.info("row index, protein name, uniprot entry")
        # "is None" works on every Python version, unlike types.NoneType
        if self.annotations is None:
            raise Exception("annotations not set. call set_annotations")

        known = self.annotations.index
        reference = self.annotations['Sequence']
        found = False
        # walk the needed columns together instead of fetching each row
        # several times through the removed .ix indexer
        rows = zip(self.df.index, self.df['Protein'], self.df['Entry'],
                   self.df['Sequence'])
        for i, protein, entry, peptide in rows:
            if entry not in known:
                print("{} not found in the annotations index".format(entry))
                continue
            if peptide not in reference.loc[entry]:
                if not found:
                    # header printed only once, before the first mismatch
                    print("Found unknown entries\nindex, protein name, uniprot entry ")
                    found = True
                print(i, protein, entry)

    def find_sequence_blast(self, seq, email):
        """Search for a protein sequence with BLAST through bioservices

        :param str seq: the sequence
        :param email: a valid email address
        :return: the ``"out"`` result of the submitted job

        .. note:: This is using the NCBIblast web service via
            `BioServices <https://pypi.python.org/pypi/bioservices>`_.
        """
        from bioservices import NCBIblast

        blast = NCBIblast(verbose=self.level)
        # blastp of the sequence against the uniprotkb database
        job_options = dict(program="blastp", sequence=seq, stype="protein",
                           database="uniprotkb", email=email)
        job = blast.run(**job_options)
        return blast.getResult(job, "out")

    def to_csv(self, filename):
        """Export the dataframe with data and annotations into a CSV file

        :meth:`set_annotations` and :meth:`get_uniprot_entries` must have been called.

        An ``Identifier`` column (``Protein`` and ``Psite`` joined by an
        underscore) is added to, or refreshed in, :attr:`df` before saving.

        :param str filename: name of the comma-separated file to write
        :raises ValueError: if the Entry or Entry_name column is missing

        """
        required = ("Entry", "Entry_name")
        if any(column not in self.df.columns for column in required):
            raise ValueError("Entry or Entry_name missing in dataframe. You must call get_uniprot_entries and set_annotations methods")
        # item assignment is required here: attribute assignment does not
        # create a column when Identifier is not already one
        self.df['Identifier'] = self.df.Protein + "_" + self.df.Psite
        self.df.to_csv(filename, index=False, sep=",")
Esempio n. 57
0
def get_more_node_ids(the_network, **kwargs):
    """ Script to add more identifiers to model notes
    based on the node.id

    Arguments:
     the_network: a Network object, modified in place

    kwargs:
     node_id_type: current type of ids used for the nodes.
      Currently can be 'Entrez Gene (GeneID)' or any of the
      options in the BioServices UniProt mappings
     mapping_types: a list of target mapping id types to include.
      Options can be viewed in core.parameters.py
      Note "Symbol" is an additional option for the
      official gene nomenclature symbol.
     email: optional, for NCBI queries.
     verbose: [True (default), False]

    Returns:
     the_network

    Each requested mapping type is stored in the notes of every node as a
    list (empty when nothing was found). When a precondition fails, a
    message is printed and the network is returned unchanged.

    TODO: determine the best source db/module
    for pairings from bioservices

    """
    entrez_type = 'Entrez Gene (GeneID)'
    continue_flag = True
    # list(...) so that the concatenation also works with dictionary views
    valid_mapping_targets = list(available_mapping_target.keys()) + ['Symbol']
    verbose = test_kwarg('verbose', kwargs, [True, False])

    try:
        from bioservices import UniProt
        # Don't want verbosity at this low of a level
        u = UniProt(verbose = False)
    except Exception:
        # covers a missing module as well as a failure to set up the service
        print("No bioservices module installed or cannot connect, exiting...")
        print("e.g. if you are using pip, did you 'pip install bioservices'?")
        continue_flag = False

    the_node_locations = the_network.get_node_locations()
    if len(the_node_locations) == 0:
        print('The network has no nodes, exiting...')
        continue_flag = False

    if 'node_id_type' in kwargs:
        node_id_type = kwargs['node_id_type']
        if node_id_type == 'Symbol':
            print("'Symbol' is a special case, not yet able to query with this option, exiting...")
            continue_flag = False
    else:
        print("No node id type specified, attempting to use 'Entrez Gene (GeneID)'")
        node_id_type = entrez_type

    if 'mapping_types' in kwargs:
        mapping_types = [x for x in kwargs['mapping_types'] if x in valid_mapping_targets]
        if len(mapping_types) == 0:
            print('No valid mapping_types selected, exiting...')
            continue_flag = False
        elif 'Symbol' in mapping_types:
            if entrez_type not in mapping_types and node_id_type != entrez_type:
                print("'Symbol' mapping type needs 'Entrez Gene (GeneID)', exiting...")
                continue_flag = False
    else:
        mapping_types = default_mapping_target_list

    email = kwargs.get('email', '')

    # Maximum number of items to
    # query at a time
    # Note there is a length limit in bioservices 1.2.1
    # for the web-based query string.
    # Trial-and-error suggests the most
    # id's that can be queried are
    # between 100 and 1000
    max_query_length = 500

    if continue_flag:
        model_nodes = []
        for the_nodetype in the_network.nodetypes:
            model_nodes.extend(the_nodetype.nodes)

        # One space-separated query string per group of at most
        # max_query_length ids; no query at all when there are no nodes.
        the_node_ids = [the_node.id for the_node in model_nodes]
        query_string_list = [
            ' '.join(the_node_ids[start:start + max_query_length])
            for start in range(0, len(the_node_ids), max_query_length)]

        for the_target_type in mapping_types:
            if the_target_type == 'Symbol':
                # handled separately below, through the Entrez ids
                continue
            the_result = {}
            for the_query_string in query_string_list:
                the_result.update(u.mapping(fr = available_mapping_source[node_id_type], to = available_mapping_target[the_target_type], query = the_query_string))
            if verbose:
                print("**Finished mapping for %s to %s.**" % (node_id_type, the_target_type))
            for the_node in model_nodes:
                # missing or empty results both become a fresh empty list
                the_node.notes[the_target_type] = the_result.get(the_node.id) or []

        # To avoid a loss of information, we should also make
        # sure queried IDs are returned in the appropriate
        # field in case they weren't available in the database.
        # ('Symbol' is not yet supported as a node id type anyway.)
        if node_id_type in mapping_types and node_id_type != 'Symbol':
            for the_node in model_nodes:
                if the_node.id not in the_node.notes[node_id_type]:
                    the_node.notes[node_id_type].append(the_node.id)

        if "Symbol" in mapping_types:
            if node_id_type == entrez_type or entrez_type in mapping_types:
                the_entrez_to_query = []
                query_dict = {}
                for the_node in model_nodes:
                    # Entrez ids known for this node: its own id and/or the
                    # ids mapped above, without duplicates
                    the_entrez_ids = []
                    if node_id_type == entrez_type:
                        the_entrez_ids.append(the_node.id)
                    if entrez_type in mapping_types:
                        for the_entrez_id in the_node.notes[entrez_type]:
                            if the_entrez_id not in the_entrez_ids:
                                the_entrez_ids.append(the_entrez_id)
                    query_dict[the_node.id] = {entrez_type: the_entrez_ids}
                    the_entrez_to_query += the_entrez_ids
                the_entrez_to_query = list(set(the_entrez_to_query))
                the_symbol_dict = get_entrez_annotation(the_entrez_to_query, email = email, verbose = verbose)
                for the_node in model_nodes:
                    the_node.notes["Symbol"] = []
                    for the_entrez_id in query_dict[the_node.id][entrez_type]:
                        the_symbol_id = the_symbol_dict[the_entrez_id]['NomenclatureSymbol']
                        if len(the_symbol_id) > 0:
                            the_node.notes["Symbol"].append(the_symbol_id)
                print("**Finished mapping for %s to %s.**" % (node_id_type, "Symbol"))
            elif verbose:
                print("'Entrez Gene (GeneID)' mappings are needed first in order to query symbols, skipping...")

    return the_network
Esempio n. 58
0
def get_more_source_dict_ids(source_dict, primary_key_type, **kwargs):
    """ Script to add more ids to source dict nodes
    to facilitate pairing to a network

    Arguments:
     source_dict: id_key: value

     primary_key_type: current type of ids used for the top level dict key.
      Currently can be 'Entrez Gene (GeneID)' or any of the options
      in the BioServices UniProt mappings.

    kwargs:
     mapping_types: a list of mapping types to include.
      See core.parameters for the full list.  Note
      'Symbol' is a special case for querying that depends
       on Entrez ID availability.
     verbose: [False (default), True]
     email: optional, for NCBI if querying for 'Symbol'

    Returns:
     source_dict, also modified in place

    Values of source_dict that are not plain dicts are first moved under a
    'value' key of a new dict. Each requested mapping type is then stored in
    that dict as a list (empty when nothing was found). When a precondition
    fails, a message is printed and source_dict is returned unchanged.

    """
    entrez_type = 'Entrez Gene (GeneID)'
    continue_flag = True
    verbose = test_kwarg('verbose', kwargs, [False, True])
    # list(...) so that the concatenation also works with dictionary views
    valid_mapping_targets = list(available_mapping_target.keys()) + ['Symbol']

    if primary_key_type not in available_mapping_source:
        if primary_key_type == 'Symbol':
            print("'Symbol' is a special case, not yet able to query with this as a primary key.")
        # reported for every invalid key type, not only for 'Symbol'
        print("Error, you must specify a valid primary_key_type descriptor to match to in the available database, exiting...")
        continue_flag = False

    if 'mapping_types' in kwargs:
        mapping_types = [x for x in kwargs['mapping_types'] if x in valid_mapping_targets]
        if len(mapping_types) == 0:
            print('No valid mapping_types selected, exiting...')
            continue_flag = False
        elif 'Symbol' in mapping_types:
            if entrez_type not in mapping_types and primary_key_type != entrez_type:
                print("'Symbol' mapping type needs 'Entrez Gene (GeneID)', exiting...")
                continue_flag = False
    else:
        mapping_types = default_mapping_target_list

    email = kwargs.get('email', '')

    try:
        from bioservices import UniProt
        # Don't want verbosity at this low of a level
        u = UniProt(verbose = False)
    except ImportError:
        print("No BioServices module installed or cannot connect, exiting...")
        print("e.g. if you are using pip, did you 'pip install bioservices'?")
        continue_flag = False

    # Maximum number of items to
    # query at a time
    # Note there is a length limit in bioservices 1.2.1
    # for the web-based query string.
    # Trial-and-error suggests the most
    # id's that can be queried are
    # between 100 and 1000
    max_query_length = 500

    if continue_flag:

        # One space-separated query string per group of at most
        # max_query_length ids; no query at all when source_dict is empty.
        the_query_ids = list(source_dict)
        the_query_string_list = [
            ' '.join(the_query_ids[start:start + max_query_length])
            for start in range(0, len(the_query_ids), max_query_length)]

        # Make every value a dict so that the mappings can be stored in it;
        # the original value is kept under 'value'.
        for the_key in the_query_ids:
            if type(source_dict[the_key]) != dict:
                source_dict[the_key] = {'value': source_dict[the_key]}

        for the_target_type in mapping_types:
            if the_target_type == 'Symbol':
                # handled separately below, through the Entrez ids
                continue
            the_result = {}
            for the_query_string in the_query_string_list:
                the_result.update(u.mapping(fr = available_mapping_source[primary_key_type], to = available_mapping_target[the_target_type], query = the_query_string))
            if verbose:
                print("** Finished mapping for %s to %s. **" % (primary_key_type, the_target_type))
            for the_query_id in source_dict:
                # missing or empty results both become a fresh empty list
                source_dict[the_query_id][the_target_type] = the_result.get(the_query_id) or []

        # To avoid a loss of information, we should also make
        # sure queried IDs are returned in the appropriate
        # field in case they weren't available in the database.
        # ('Symbol' is not yet supported as a primary key, but the check
        # avoids breaking this later.)
        if primary_key_type in mapping_types and primary_key_type != 'Symbol':
            for the_source_dict_id in source_dict:
                if the_source_dict_id not in source_dict[the_source_dict_id][primary_key_type]:
                    source_dict[the_source_dict_id][primary_key_type].append(the_source_dict_id)

        if "Symbol" in mapping_types:
            if primary_key_type == entrez_type or entrez_type in mapping_types:
                the_entrez_to_query = []
                # Make query_dict in case "Entrez Gene (GeneID)" was
                # a primary_key_type but not in mapping_types
                query_dict = {}
                for the_source_dict_id in source_dict:
                    # Entrez ids known for this key: the key itself and/or
                    # the ids mapped above, without duplicates
                    the_entrez_ids = []
                    if primary_key_type == entrez_type:
                        the_entrez_ids.append(the_source_dict_id)
                    if entrez_type in mapping_types:
                        for the_entrez_id in source_dict[the_source_dict_id][entrez_type]:
                            if the_entrez_id not in the_entrez_ids:
                                the_entrez_ids.append(the_entrez_id)
                    query_dict[the_source_dict_id] = {entrez_type: the_entrez_ids}
                    the_entrez_to_query += the_entrez_ids
                the_entrez_to_query = list(set(the_entrez_to_query))
                the_symbol_dict = get_entrez_annotation(the_entrez_to_query, email = email, verbose = verbose)
                for the_source_dict_id in source_dict:
                    source_dict[the_source_dict_id]["Symbol"] = []
                    for the_entrez_id in query_dict[the_source_dict_id][entrez_type]:
                        the_symbol_id = the_symbol_dict[the_entrez_id]['NomenclatureSymbol']
                        if len(the_symbol_id) > 0:
                            source_dict[the_source_dict_id]["Symbol"].append(the_symbol_id)
                print("**Finished mapping for %s to %s.**" % (primary_key_type, "Symbol"))

    return source_dict
Esempio n. 59
0
class GOTermAdder(object):
    """Append a GO-term column to the rows of a tab-separated file.

    For each row the gene ID is extracted from the configured column,
    resolved to a UniProt ID, and the GO terms referenced by that UniProt
    entry are appended as one extra column formatted as
    ``"<GO id> (<GO name>), <GO id> (<GO name>), ..."``.

    Every lookup result (gene ID -> UniProt ID, UniProt ID -> GO IDs,
    GO ID -> name) is cached as a small JSON file in the ``tmp_data``
    folder, so a repeated run reads the cache instead of calling the
    UniProt / QuickGO clients again.
    """

    def __init__(self, input_file, gene_id_column, output_file):
        """Store the configuration and make sure the cache folder exists.

        :param input_file: path of the tab-separated file to read.
        :param gene_id_column: 1-based index of the column holding the
            ``GeneID:<id>;`` tag.
        :param output_file: path of the tab-separated file to write.
        """
        self._input_file = input_file
        self._gene_id_column = gene_id_column
        self._output_file = output_file
        self._tmp_folder = "tmp_data"
        self._uniprot = UniProt(verbose=False)
        self._quickgo = QuickGO(verbose=False)
        if not os.path.exists(self._tmp_folder):
            os.mkdir(self._tmp_folder)

    def add_go_terms(self):
        """Copy the input file to the output file, adding the GO column.

        Blank lines and rows whose first column is empty are copied through
        unchanged; all other rows are passed to ``_add_go_term_column``.
        """
        with open(self._output_file, "w") as output_fh, \
                open(self._input_file) as input_fh:
            for row in csv.reader(input_fh, delimiter="\t"):
                # csv.reader yields [] for a blank line, so test the row
                # itself before looking at its first column.
                if row and row[0]:
                    row = self._add_go_term_column(row)
                self._write_row(row, output_fh)

    def _write_row(self, row, output_fh):
        """Write *row* to *output_fh* as one tab-separated line."""
        output_fh.write("\t".join(row) + "\n")

    def _add_go_term_column(self, row):
        """Append the formatted GO-term column to *row* and return it.

        The row is modified in place.  It is returned without the extra
        column when no UniProt ID is known for its gene.
        """
        gene_id = self._gene_id(row)
        uniprot_id = self._uniprot_id(gene_id)
        if uniprot_id is None:
            return row
        go_ids = self._go_terms(uniprot_id)
        row.append(", ".join(
            "%s (%s)" % (go_id, self._go_term_name(go_id))
            for go_id in go_ids))
        return row

    def _uniprot_id(self, gene_id):
        """Return the UniProt ID for *gene_id*, or ``None`` if unresolved.

        The result (including ``None``) is cached under the key
        ``"Uniprot"`` in the gene's JSON cache file.
        """
        file_path = self._tmp_file_path(gene_id)
        if os.path.exists(file_path):
            with open(file_path) as json_fh:
                return json.load(json_fh)["Uniprot"]
        uniprot_id = self._search_uniprot_id(gene_id)
        with open(file_path, "w") as json_fh:
            json.dump({"Uniprot": uniprot_id}, json_fh)
        # Return the fresh result too; otherwise a cache miss would look
        # like "no UniProt ID" and the row would not be annotated.
        return uniprot_id

    def _go_terms(self, uniprot_id):
        """Return the list of GO IDs referenced by the UniProt entry.

        The list is cached under the key ``"GO-Terms"``; an empty list is
        cached as well so that entries without GO references are not
        fetched again.
        """
        file_path = self._tmp_file_path(uniprot_id)
        if os.path.exists(file_path):
            with open(file_path) as json_fh:
                return json.load(json_fh)["GO-Terms"]
        uniprot_entry = self._uniprot.searchUniProtId(uniprot_id)
        go_ids = [
            dbref.attrs["id"]
            for dbref in uniprot_entry.findAll("dbreference")
            if dbref.attrs["type"] == "GO"]
        # Write the cache once, after all references have been collected.
        with open(file_path, "w") as json_fh:
            json.dump({"GO-Terms": go_ids}, json_fh)
        return go_ids

    def _go_term_name(self, go_term):
        """Return the name of GO term *go_term*, cached under ``"name"``."""
        file_path = self._tmp_file_path(go_term)
        if os.path.exists(file_path):
            with open(file_path) as json_fh:
                return json.load(json_fh)["name"]
        go_term_info = self._quickgo.Term(go_term).soup
        go_term_name = go_term_info.term.find("name").text
        with open(file_path, "w") as json_fh:
            json.dump({"name": go_term_name}, json_fh)
        return go_term_name

    def _search_uniprot_id(self, gene_id):
        """Return the single UniProt ID matching *gene_id*, else ``None``.

        ``None`` is returned both when the search finds nothing and when it
        is ambiguous (more than one hit).
        """
        # NOTE(review): assumes quick_search returns a mapping keyed by
        # UniProt ID -- confirm against the installed bioservices version.
        matches = self._uniprot.quick_search(gene_id)
        if len(matches) == 1:
            # dict.keys() is not indexable on Python 3; take the only key
            # through the iterator instead.
            return next(iter(matches))
        return None

    def _tmp_file_path(self, gene_id):
        """Return the JSON cache file path for the given identifier."""
        return "%s/%s.json" % (self._tmp_folder, gene_id)

    def _gene_id(self, row):
        """Extract the ID following ``GeneID:`` (up to ``;``) from *row*.

        :raises IndexError: if the configured column is missing or does not
            contain a ``GeneID:`` tag.
        """
        return row[self._gene_id_column-1].split("GeneID:")[1].split(";")[0]