Ejemplo n.º 1
0
 def prepare_client(self):
     """
     Create the biothings_client instance used by this class.

     The client is stored under ``self._state["client"]``.  When ``self.url``
     is truthy it is forwarded to ``biothings_client.get_client`` as ``url``;
     otherwise ``get_client`` is called with the client name only.

     :raises NotImplementedError: if ``self.client_name`` is falsy
     :return: None
     """
     if not self.client_name:
         raise NotImplementedError("Define client_name in subclass")

     # Only pass ``url`` through when one has been configured.
     extra_kwargs = {"url": self.url} if self.url else {}
     self._state["client"] = biothings_client.get_client(
         self.client_name, **extra_kwargs)

     self.logger.info(
         "Registering biothings_client {}".format(self.client_name))
Ejemplo n.º 2
0
    def build_entrezgenes(result_list):
        """
        Build a dictionary of Entrez gene IDs keyed by UniProt accession.

        :param result_list: iterable of records; each record is read as
            ``r['interactor_a']['uniprot']`` and ``r['interactor_b']['uniprot']``
        :return: dict mapping each queried accession to the ``entrezgene``
            value of its result (results lacking either key are skipped)
        """
        # Collect every distinct UniProt accession from both interactors
        accessions = set()
        for record in result_list:
            accessions.update((record['interactor_a']['uniprot'],
                               record['interactor_b']['uniprot']))

        # Query MyGene.info; results are consumed lazily from a generator
        gene_client = biothings_client.get_client('gene')
        hits = gene_client.querymany(list(accessions),
                                     scopes='uniprot',
                                     species='human',
                                     as_generator=True)

        # Keep only hits that carry both the query term and an Entrez ID
        return {
            hit['query']: hit['entrezgene']
            for hit in hits
            if 'query' in hit and 'entrezgene' in hit
        }
Ejemplo n.º 3
0
 def setUp(self):
     """Create the 'variant' client and the two ID lists used as fixtures."""
     self.mv = biothings_client.get_client("variant")

     # Variant IDs written in the 'chr1:g.<pos><ref>><alt>' form
     self.query_list1 = [
         'chr1:g.866422C>T', 'chr1:g.876664G>A', 'chr1:g.69635G>C',
         'chr1:g.69869T>A', 'chr1:g.881918G>A', 'chr1:g.865625G>A',
         'chr1:g.69892T>C', 'chr1:g.879381C>T', 'chr1:g.878330C>G',
     ]

     # Identifiers written in the 'rs<number>' form
     self.query_list2 = [
         'rs374802787', 'rs1433078', 'rs1433115', 'rs377266517',
         'rs587640013', 'rs137857980', 'rs199710579', 'rs186823979',
         'rs2276240', 'rs372452565',
     ]
def get_uniprot2geneid_dict(uniprot_list):
    """Return the UniProt to NCBI gene ID dictionary from BioThings.

    ``uniprot_list`` is sent to the 'gene' client with ``scopes='uniprot'``
    and the result is requested as a dataframe.  Keys of the returned
    dictionary are ``'UniProt:<index label>'`` and the values are added as
    ``'NCBIGene:<_id>'`` through ``utils.add_elem_dictionary2``.
    """
    gene_client = get_client('gene')
    result_df = gene_client.querymany(uniprot_list,
                                      scopes='uniprot',
                                      fields='entrezgene',
                                      as_dataframe=True)

    # Only rows that actually carry an ``_id`` take part in the mapping
    id_frame = result_df[['_id']].dropna()

    p2g_dict = {}
    for label in id_frame.index:
        uniprot = 'UniProt:' + str(label)
        cell = id_frame.at[label, '_id']
        # A non-string cell is expanded with ``.tolist()``
        # (presumably a Series produced by a repeated index label -- verify).
        gene_ids = [cell] if isinstance(cell, str) else cell.tolist()
        for gene_id in gene_ids:
            p2g_dict = utils.add_elem_dictionary2(
                p2g_dict, uniprot, 'NCBIGene:' + str(gene_id))

    return p2g_dict
Ejemplo n.º 5
0
def batch_query_mondo_from_doid(doid_list):
    """Convert a list of DOIDs to MONDO ids.

    Keyword arguments:
    doid_list: a list of doids

    Returns a dict keyed by the queried DOID.  The value is the ``_id`` of
    the returned document, or the DOID itself when the document has no
    ``_id`` (in which case the document is also printed).
    """
    print('total doids: {}'.format(len(doid_list)))
    unique_ids = list(set(doid_list))
    print('unique doids: {}'.format(len(unique_ids)))

    # the 'disease' client is queried in batches of at most 1000 ids
    client = get_client('disease')
    batch_size = 1000
    mapping_dict = {}
    for start in range(0, len(unique_ids), batch_size):
        # slicing past the end simply yields the shorter final batch
        chunk = unique_ids[start:start + batch_size]
        docs = client.querymany(','.join(chunk),
                                scopes="mondo.xrefs.doid",
                                fields="_id")
        for doc in docs:
            if '_id' not in doc:
                print('can not convert', doc)
            mapping_dict[doc['query']] = doc.get('_id', doc['query'])
    return mapping_dict
Ejemplo n.º 6
0
 def setUp(self):
     """Create the 'variant' client and the two ID lists used as fixtures.

     The service URL is read from the ``V_CLIENT_HOST`` environment
     variable, with ``http://myvariant.info/v1`` as the fallback.
     """
     host = os.environ.get("V_CLIENT_HOST", "http://myvariant.info/v1")
     self.mv = get_client("variant", url=host)

     # Variant IDs written in the 'chr1:g.<pos><ref>><alt>' form
     self.query_list1 = [
         'chr1:g.866422C>T', 'chr1:g.876664G>A', 'chr1:g.69635G>C',
         'chr1:g.69869T>A', 'chr1:g.881918G>A', 'chr1:g.865625G>A',
         'chr1:g.69892T>C', 'chr1:g.879381C>T', 'chr1:g.878330C>G',
     ]

     # Identifiers written in the 'rs<number>' form
     self.query_list2 = [
         'rs374802787', 'rs1433078', 'rs1433115', 'rs377266517',
         'rs587640013', 'rs137857980', 'rs199710579', 'rs186823979',
         'rs2276240', 'rs372452565',
     ]
Ejemplo n.º 7
0
    def __init__(self, taxon):
        """Set up the client, ontology code and metadata, then load associations.

        :param taxon: 'mouse' selects ontology 'mp', 'human' selects 'hp';
            for any other value ``self.ont`` is left unset by this method
        """
        GenericSimilarity.__init__(self)
        self.mg = get_client('gene')
        self.taxon = taxon

        # Pick the ontology code that belongs to the taxon
        for taxon_name, ontology in (('mouse', 'mp'), ('human', 'hp')):
            if self.taxon == taxon_name:
                self.ont = ontology

        # Input and output share the same shape; each gets its own dict
        gene_set = {
            'complexity': 'set',
            'id_type': 'HGNC',
            'data_type': 'gene',
        }
        self.meta = {
            'input_type': dict(gene_set),
            'output_type': dict(gene_set),
            'source': 'Monarch Biolink',
            'predicate': ['blm:has phenotype'],
        }

        # Load the associated Biolink (Monarch)
        # phenotype ontology and annotation associations
        self.load_associations(taxon)
def convert_nm_ids_to_flybase(df1):
    """Add a leading ``flybase_id`` column looked up from the frame's index.

    Rows of ``df1`` whose values are all 0 are dropped.  The remaining index
    labels are sent to the 'gene' biothings client (``scopes="refseq"``,
    ``species="fruitfly"``) and the ``ensembl.gene`` column of the result is
    stored as ``flybase_id``, which is then moved to the first position.

    :param df1: dataframe indexed by the identifiers to convert
    :return: a new dataframe (``df1`` itself is not modified)
    """
    # Remove all rows summing to 0.  ``.copy()`` gives an independent frame,
    # so adding a column below does not trigger SettingWithCopyWarning.
    df11 = df1.loc[~(df1 == 0).all(axis=1)].copy()

    # A single client is enough; the previous extra mygene.MyGeneInfo()
    # instance was overwritten before it was ever used.
    mg = get_client('gene')

    # Calling mygene to map the index IDs to ``ensembl.gene`` values
    print("Calling mygene.")
    refseq_list = df11.index.tolist()
    df_geneIDs = mg.querymany(
        refseq_list,
        scopes="refseq",
        fields=["ensembl.gene", "uniprot", "symbol", "reporter"],
        species="fruitfly",
        as_dataframe=True)
    new_index_list = df_geneIDs["ensembl.gene"].tolist()

    # Plotting loss of gene IDs per replicate
    plot_NaN("df_merged", df_geneIDs)

    # NOTE(review): this assignment needs exactly one result row per queried
    # ID; confirm the query never returns duplicate hits for one ID.
    df11['flybase_id'] = new_index_list

    # Move the last column (the one just added) to the front
    cols = list(df11.columns)
    df11 = df11[[cols[-1]] + cols[:-1]]
    return df11
Ejemplo n.º 9
0
    def __init__(self, taxon):
        """Set up the client, the 'go' ontology code and metadata, then load associations.

        :param taxon: stored on the instance and passed to ``load_associations``
        """
        GenericSimilarity.__init__(self)
        self.mg = get_client('gene')
        self.input_object, self.taxon, self.ont = '', taxon, 'go'

        # Input and output share the same shape; each gets its own dict
        gene_set = {
            'complexity': 'set',
            'id_type': 'HGNC',
            'data_type': 'gene',
        }
        predicates = [
            'blm:macromolecular machine to biological process association',
            # TODO: also requires a blm curie?
            'macromolecular machine to molecular activity association',
        ]
        self.meta = {
            'input_type': dict(gene_set),
            'output_type': dict(gene_set),
            'source': 'Monarch Biolink',
            'predicate': predicates,
        }

        # Load the functional catalog of
        # GO ontology and annotation associations
        self.load_associations(taxon)
Ejemplo n.º 10
0
def gene_to_uniprot_from_mygene(id):
    """
    Query MyGeneInfo with a gene and get its corresponding UniProt ID(s).

    For every hit the Swiss-Prot accession(s) are used when present;
    otherwise the TrEMBL accession(s) are used.  Hits without any UniProt
    information are skipped.  Each accession is returned with a
    ``UniProtKB:`` prefix (added only when it is not already there).

    :param id: query term passed to ``mg.query``
    :return: list of prefixed accessions; empty when nothing was found or
        when a ConnectionError occurred (the error is logged)
    """
    def _as_list(value):
        """Normalize a missing, scalar or list field value to a list."""
        if value is None:
            return []
        if isinstance(value, str):
            # A single accession arrives as a bare string; iterating it
            # directly would yield individual characters.
            return [value]
        return list(value)

    uniprot_ids = []
    mg = get_client('gene')
    try:
        results = mg.query(id, fields='uniprot')
    except ConnectionError:
        logging.error("ConnectionError while querying MyGeneInfo with {}".format(id))
        return uniprot_ids

    for hit in results.get('hits') or []:
        uniprot = hit.get('uniprot')
        if not isinstance(uniprot, dict):
            # The hit carries no usable ``uniprot`` field
            continue
        # Prefer Swiss-Prot; fall back to TrEMBL
        accessions = (_as_list(uniprot.get('Swiss-Prot'))
                      or _as_list(uniprot.get('TrEMBL')))
        for accession in accessions:
            if not accession.startswith('UniProtKB'):
                accession = "UniProtKB:{}".format(accession)
            uniprot_ids.append(accession)

    return uniprot_ids
Ejemplo n.º 11
0
 def __init__(self):
     """Create the Biolink API wrapper, the 'gene' client and default metadata."""
     endpoint = Config().get_biolink_api_endpoint()
     self.blw = BioLinkApiWrapper(endpoint)
     self.mg = get_client('gene')
     self.meta = dict(taxon='human', limit=None)
Ejemplo n.º 12
0
class MyVariantInfo(get_client('variant', instance=False)):
    """Client for the MyVariant.info web services.

    The base class is whatever ``get_client('variant', instance=False)``
    returns; nothing is added or overridden here.

    Example:

        >>> mv = MyVariantInfo()

    """
Ejemplo n.º 13
0
 def setUp(self):
     """Create the 'gene' client and the ID list used as a fixture.

     The service URL is read from the ``G_CLIENT_HOST`` environment
     variable, with ``http://mygene.info/v3`` as the fallback.
     """
     host = os.environ.get('G_CLIENT_HOST', 'http://mygene.info/v3')
     self.mg = get_client("gene", url=host)
     self.query_list1 = [
         '1007_s_at',
         '1053_at',
         '117_at',
         '121_at',
         '1255_g_at',
         '1294_at',
         '1316_at',
         '1320_at',
         '1405_i_at',
         '1431_at',
     ]
 def __init__(self, taxon):
     """Set up the 'gene' client and the ontology code for ``taxon``.

     :param taxon: 'mouse' selects ontology 'mp', 'human' selects 'hp';
         for any other value ``self.ont`` is left unset by this method
     """
     GenericSimilarity.__init__(self)
     self.mg = get_client('gene')
     self.taxon = taxon

     # Pick the ontology code that belongs to the taxon
     for taxon_name, ontology in (('mouse', 'mp'), ('human', 'hp')):
         if self.taxon == taxon_name:
             self.ont = ontology
Ejemplo n.º 15
0
 def setUp(self):
     """Create the 'chem' client and the ID list used as a fixture."""
     self.mc = biothings_client.get_client("chem")
     self.query_list1 = [
         "QCYGXOCMWHSXSU-UHFFFAOYSA-N",
         "ADFOMBKCPIMCOO-BTVCFUMJSA-N",
         "DNUTZBZXLPWRJG-UHFFFAOYSA-N",
         "DROLRDZYPMOKLM-BIVLZKPYSA-N",
         "KPBZROQVTHLCDU-GOSISDBHSA-N",
         "UTUUIUQHGDRVPU-UHFFFAOYSA-K",
         "WZWDUEKBAIXVCC-IGHBBLSQSA-N",
         "IAJIIJBMBCZPSW-BDAKNGLRSA-N",
         "NASIOHFAYPRIAC-JTQLQIEISA-N",
         "VGWIQFDQAFSSKA-UHFFFAOYSA-N",
     ]
Ejemplo n.º 16
0
    def __init__(self, taxon):
        """Set up the 'gene' client and the 'go' ontology code, then load associations.

        :param taxon: stored on the instance and passed to ``load_associations``
        """
        GenericSimilarity.__init__(self)
        self.mg = get_client('gene')
        self.input_object, self.taxon, self.ont = '', taxon, 'go'

        # Load the functional catalog of
        # GO ontology and annotation associations
        self.load_associations(taxon)
Ejemplo n.º 17
0
    def _get_client(self):
        """
        Get Client - return a client appropriate for IDLookup

        The 'gene' client is created on first use (whenever ``self.client``
        is falsy) and kept on the instance afterwards.

        This method must be defined in the child class.  It is an artifact
        of multithreading.
        :return: the client stored in ``self.client``
        """
        if self.client:
            return self.client
        self.client = biothings_client.get_client('gene')
        return self.client
Ejemplo n.º 18
0
    def __init__(self, taxon):
        """Set up the 'gene' client and ontology code, then load associations.

        :param taxon: 'mouse' selects ontology 'mp', 'human' selects 'hp';
            for any other value ``self.ont`` is left unset by this method
        """
        GenericSimilarity.__init__(self)
        self.mg = get_client('gene')
        self.taxon = taxon

        # Pick the ontology code that belongs to the taxon
        for taxon_name, ontology in (('mouse', 'mp'), ('human', 'hp')):
            if self.taxon == taxon_name:
                self.ont = ontology

        # Load the associated Biolink (Monarch)
        # phenotype ontology and annotation associations
        self.load_associations(taxon)
Ejemplo n.º 19
0
    def __init__(self, biothings_type: str, search_scope: Union[List[str],
                                                                str],
                 value_fields: Union[List[str], str]):
        """Create the biothings client and normalize scope/field arguments.

        :param biothings_type: name passed to ``biothings_client.get_client``
        :param search_scope: one scope or a list of scopes; always stored
            as a list in ``self.search_scope``
        :param value_fields: one field or a list of fields; always stored
            as a list in ``self.value_fields``
        """
        super().__init__()
        self.client: biothings_client.BiothingClient = biothings_client.get_client(
            biothings_type)
        # isinstance (rather than an exact type comparison) so that list
        # subclasses are used as-is instead of being wrapped in a second list.
        self.search_scope = (search_scope
                             if isinstance(search_scope, list)
                             else [search_scope])
        self.value_fields = (value_fields
                             if isinstance(value_fields, list)
                             else [value_fields])
Ejemplo n.º 20
0
 def mygene(self,
            query=[],
            fr="accession_prot",
            to=["uniprot", "name"],
            as_dataframe=False,
            returnall=False):
     """Run ``querymany`` on the 'gene' biothings client.

     ``fr`` and every entry of ``to`` are first translated with
     ``self.__fieldTrans(..., dict_type="mygene")``; the translated values
     are used as ``scopes`` and ``fields`` of the query.

     :param query: query terms forwarded to ``querymany``
     :param fr: name of the input field
     :param to: names of the output fields
     :param as_dataframe: forwarded to ``querymany``
     :param returnall: forwarded to ``querymany``
     :return: whatever ``querymany`` returns
     """
     # NOTE: the list defaults are shared between calls; this method only
     # reads them, never mutates them.
     from biothings_client import get_client

     gene_client = get_client('gene')
     scope_field = self.__fieldTrans(field=fr, dict_type="mygene")
     output_fields = []
     for field_name in to:
         output_fields.append(
             self.__fieldTrans(field=field_name, dict_type="mygene"))
     return gene_client.querymany(query,
                                  scopes=scope_field,
                                  fields=output_fields,
                                  as_dataframe=as_dataframe,
                                  returnall=returnall)
Ejemplo n.º 21
0
def batch_query_entrez_from_locus_tag(locus_tag_list):
    """Convert a list of locus tags to Entrez ids.

    Keyword arguments:
    locus_tag_list: a list of locus tags

    Returns a dict keyed by the queried locus tag.  The value is the ``_id``
    of the returned document, or the locus tag itself when the document has
    no ``_id`` (in which case the document is also printed).  An empty input
    yields an empty dict without contacting the service.
    """
    # De-duplicate (keeping first-seen order) so each tag is queried once;
    # previously the de-duplicated list was built but never used.
    id_list = list(dict.fromkeys(locus_tag_list))
    if not id_list:
        return {}

    mapping_dict = {}
    # initiate the 'gene' biothings python client
    client = get_client('gene')
    params = ','.join(id_list)
    res = client.querymany(params, scopes="locus_tag", fields="_id")
    for _doc in res:
        if '_id' not in _doc:
            print('can not convert', _doc)
        mapping_dict[_doc['query']] = _doc.get('_id', _doc['query'])
    return mapping_dict
Ejemplo n.º 22
0
def get_ensembl_ids(entrez_ids):
    """Look up ``ensembl.gene`` for Entrez gene ids (species 'human').

    :param entrez_ids: query terms forwarded to ``querymany``
    :return: ``(translate, drop_list)`` where ``translate`` maps
        ``int(query)`` to the ``gene`` value of the result's ``ensembl``
        entry (the first entry when it is a list) and ``drop_list`` holds
        ``int(query)`` for every result without an ``ensembl`` entry
    """
    gene_client = get_client('gene')
    hits = gene_client.querymany(entrez_ids,
                                 scopes='entrezgene',
                                 fields='ensembl.gene',
                                 species='human')
    translate = {}
    drop_list = []
    for hit in hits:
        entrez_id = int(hit['query'])
        if "ensembl" not in hit:
            drop_list.append(entrez_id)
            continue
        ensembl = hit['ensembl']
        if isinstance(ensembl, list):
            # several entries: keep the first one
            ensembl = ensembl[0]
        translate[entrez_id] = ensembl['gene']
    return translate, drop_list
Ejemplo n.º 23
0
def import_ortholog(csv_file, pattern, nthread):
    """Build or extend the per-pattern index CSV for ``csv_file``.

    The index file lives at ``<path>/csv/<pattern>/index_<name>_<pattern>.csv``
    where ``<path>`` is ``<csv dir>/data`` when that directory exists and the
    parent of the CSV directory otherwise.  Accessions already listed in the
    index (column ``uniprotID``) are excluded from the new work; the rest is
    grouped by ``acc``, split into ``nthread`` parts and handed to
    ``fill_csv``, whose return values are started and joined.

    :param csv_file: path of the input CSV (read through ``import_csv``)
    :param pattern: pattern name used in paths and passed to ``fill_csv``
    :param nthread: number of parts the grouped data is split into
    :return: path of the index file
    """
    print("Parsing csv")

    csv_dir = os.path.dirname(csv_file)
    data_dir = "%s/data" % csv_dir
    path = data_dir if os.path.exists(data_dir) else os.path.dirname(csv_dir)
    file_name = os.path.basename(csv_file)
    os.makedirs("%s/csv/%s" % (path, pattern), exist_ok=True)
    # file_name[:-4] drops the last four characters (assumes a 3-letter extension)
    index_file = '%s/csv/%s/index_%s_%s.csv' % (path, pattern, file_name[:-4], pattern)
    df = import_csv(csv_file)
    mt = get_client("taxon")

    print("Extracting %s phosphorylation site" % pattern)

    # Accessions that an existing, non-empty index file already contains
    uniprot_id_list = []
    index_has_content = (os.path.exists(index_file)
                         and os.path.getsize(index_file) > 0)
    if index_has_content:
        index_df = pd.read_csv(index_file, sep=';')
        uniprot_id_list = index_df["uniprotID"].value_counts().keys().tolist()

    print("Preparing queries")
    uniprot_to_convert = set(df["acc"].tolist()) - set(uniprot_id_list)
    resp = uniprotid_to_geneid(uniprot_to_convert)

    sub_df = df[df["acc"].isin(list(uniprot_to_convert))]
    # NOTE(review): the result of reset_index() is discarded, so sub_df keeps
    # its original index -- confirm whether an assignment was intended.
    sub_df.reset_index()
    with open(index_file, 'a+', newline='') as g:
        writer = csv.writer(g, delimiter=";")
        # Write the header only when the file is still empty
        g.seek(0)
        if not g.read(1):
            writer.writerow(['uniprotID', 'geneID', 'taxID', 'metazoan', 'code',
                             'seq_in_window', 'pos_sites', 'clusterID', 'sequence'])

        group_acc_seq = sub_df.groupby(["acc"], observed=True)
        workers = [
            fill_csv(chunk, uniprot_id_list, pattern, mt, path, writer, resp)
            for chunk in np.array_split(group_acc_seq, nthread)
        ]
        # Start every worker before waiting for any of them
        for worker in workers:
            worker.start()
        for worker in workers:
            worker.join()
    return index_file
Ejemplo n.º 24
0
def batch_query_hgvs_from_rsid(rsid_list):
    """Map rsids to the ``_id`` returned by the 'variant' client.

    The unique rsids are sent to ``getvariants`` in batches of at most 1000.

    :param rsid_list: a list of rsids
    :return: dict keyed by the ``query`` value of each returned document;
        the value is the document's ``_id``, or the query itself when the
        document has no ``_id`` (in which case the document is also printed)
    """
    unique_rsids = list(set(rsid_list))
    variant_client = get_client('variant')
    batch_size = 1000
    hgvs_rsid_dict = {}
    for start in range(0, len(unique_rsids), batch_size):
        # slicing past the end simply yields the shorter final batch
        chunk = unique_rsids[start:start + batch_size]
        docs = variant_client.getvariants(','.join(chunk), fields="_id")
        for doc in docs:
            if '_id' not in doc:
                print('can not convert', doc)
            hgvs_rsid_dict[doc['query']] = doc.get('_id', doc['query'])
    return hgvs_rsid_dict
Ejemplo n.º 25
0
def convert_genes(genes, conv_from, conv_to, species):
    """Convert gene identifiers with the 'gene' biothings client.

    Results are cached in the pickle file ``data/enst-tmp.pickle``: genes
    already present in the cache are not queried again, the cache is
    written back every 1000 positions (only when that position needed a
    query) and once more at the end.

    :param genes: identifiers to convert (iterated twice)
    :param conv_from: value passed as ``scopes`` of the query
    :param conv_to: field to read from the first hit; it is requested
        together with ``refseq``
    :param species: value passed as ``species`` of the query
    :return: ``(new_ids, mapping, no_results)`` -- the converted value per
        input gene (``None`` when not found), the full gene -> value cache,
        and the genes whose value is ``None``
    """
    tmp_fname = 'data/enst-tmp.pickle'

    def _save(cache):
        """Write the cache to disk, closing the file deterministically."""
        with open(tmp_fname, 'wb') as handle:
            pickle.dump(cache, handle)

    def _lookup(hit, field):
        """Return ``field`` from ``hit``, following dotted paths; None if absent."""
        if field in hit:
            return hit[field]
        value = hit
        for part in field.split('.'):
            if not isinstance(value, dict) or part not in value:
                return None
            value = value[part]
        return value

    out = {}
    if os.path.isfile(tmp_fname):
        # NOTE: pickle must only be used on trusted files; this is the
        # function's own cache file.
        with open(tmp_fname, 'rb') as handle:
            out = pickle.load(handle)

    mg = None  # created lazily: not needed when everything is cached
    fields = ','.join([conv_to, 'refseq'])
    for i, gene in enumerate(genes, start=1):
        if gene in out:
            continue
        if mg is None:
            mg = get_client('gene')
        query = mg.query(gene,
                         scopes=conv_from,
                         fields=fields,
                         species=species)
        out[gene] = None
        hits = query['hits'] if 'hits' in query else []
        if len(hits) > 0:
            # Read the requested target field (previously hard-coded to
            # 'symbol', which ignored ``conv_to``).
            out[gene] = _lookup(hits[0], conv_to)
        if i % 1000 == 0:
            _save(out)
            print(i, gene, out[gene])
    _save(out)

    mapping = out
    new_ids = [mapping[x] for x in genes]
    no_results = [x for x in genes if mapping[x] is None]
    return new_ids, mapping, no_results
def split_fasta(file):
    """Split a FASTA file into metazoan and non-metazoan records.

    Output files are ``<path>/metazoa/<stem>_metazoa.fasta`` and
    ``<path>/non_metazoa/<stem>_non_metazoa.fasta`` where ``<path>`` is the
    grandparent directory of ``file``.  Both output directories are always
    created.  Records are only processed when neither output file exists
    yet; each record is appended to one of the two files according to
    ``is_metazoan``.

    :param file: path of the input FASTA file
    :return: dict with the two output paths under the keys ``"metazoa"``
        and ``"nonmetazoa"``
    """
    file_name = os.path.basename(file)
    stem = os.path.splitext(file_name)[0]
    path = os.path.dirname(os.path.dirname(file))
    os.makedirs("%s/metazoa" % path, exist_ok=True)
    os.makedirs("%s/non_metazoa" % path, exist_ok=True)
    path2metazoa = "%s/metazoa/%s_metazoa.fasta" % (path, stem)
    path2nonmetazoa = "%s/non_metazoa/%s_non_metazoa.fasta" % (path, stem)

    if not os.path.exists(path2nonmetazoa) and not os.path.exists(
            path2metazoa):
        # One client for the whole file instead of one per record
        mt = get_client("taxon")
        with open(file) as fasta_in:
            for record in SeqIO.parse(fasta_in, "fasta"):
                # The text before the first ":" of the record id is the taxon id.
                # NOTE(review): without a ":" find() returns -1 and the slice
                # drops the last character -- confirm ids always contain one.
                position = str(record.id).find(":")
                taxID = record.id[:position]
                metazoa = is_metazoan(float(taxID), mt)
                f_out = path2metazoa if metazoa else path2nonmetazoa
                # Opened per record so that an output file is only created
                # when at least one record belongs to it.
                with open(f_out, 'a') as fasta_out:
                    SeqIO.write([record], fasta_out, "fasta")
    return {"metazoa": path2metazoa, "nonmetazoa": path2nonmetazoa}
Ejemplo n.º 27
0
 def __init__(self):
     """Create the Biolink wrapper, the 'gene' client and the module metadata."""
     endpoint = Config().get_biolink_api_endpoint()
     self.blw = BioLinkWrapper(endpoint)
     self.mg = get_client('gene')
     self.input_object = ''

     # Shape of what goes in (one disease id) and what comes out (gene set)
     input_spec = {
         'complexity': 'single',
         'id_type': ['MONDO', 'DO', 'OMIM'],
     }
     output_spec = {
         'complexity': 'set',
         'id_type': 'HGNC',
     }
     self.meta = {
         'data_type': 'disease',
         'input_type': input_spec,
         'output_type': output_spec,
         'taxon': 'human',
         'limit': None,
         'source': 'Monarch Biolink',
         'predicate': 'blm:gene associated with condition',
     }
Ejemplo n.º 28
0
def get_database_dict(database: pandas.DataFrame, gene_symbols: set) -> dict:
    """Return a ``query`` -> ``entrezgene`` dict, fetching missing symbols.

    Symbols of ``gene_symbols`` that are not yet in the ``query`` column of
    ``database`` are looked up with the 'gene' biothings client
    (``scopes='symbol'``, ``species='human'``); the results are appended to
    ``database`` and the combined frame is written to ``args.database``.

    :param database: previously stored results (may lack a ``query`` column)
    :param gene_symbols: gene symbols that are needed
    :return: dict mapping each ``query`` value to its ``entrezgene`` value
    """
    try:
        known_symbols = set(database['query'].to_list())
    except KeyError:
        # No 'query' column: nothing is known yet
        known_symbols = set()

    missing_symbols = gene_symbols.difference(known_symbols)
    if missing_symbols:
        gene_client = biothings_client.get_client("gene")
        hits = gene_client.querymany(missing_symbols,
                                     scopes='symbol',
                                     species='human',
                                     fields="entrezgene, ensembl")
        fetched = pandas.json_normalize(hits)
        database = pandas.concat([database, fetched])
        database.to_csv(args.database)

    by_query = database[['query', 'entrezgene']].set_index('query')
    return by_query.to_dict()['entrezgene']
Ejemplo n.º 29
0
def uniprot_to_gene_from_mygene(id):
    """
    Query MyGeneInfo with a UniProtKB id and get its corresponding HGNC gene.

    A leading ``UniProtKB:`` style prefix is stripped before querying.  Only
    the first hit is considered; its ``HGNC`` value is returned with an
    ``HGNC:`` prefix (added only when it is not already there).

    :param id: UniProtKB identifier, with or without prefix
    :return: one-element list holding the HGNC id, or ``[None]`` when there
        is no hit, the first hit has no ``HGNC`` value, or a ConnectionError
        occurred (the error is logged)
    """
    gene_id = None
    if id.startswith('UniProtKB'):
        # [-1] keeps the id unchanged when the prefix has no ':' separator
        id = id.split(':', 1)[-1]

    mg = get_client('gene')
    try:
        results = mg.query(id, fields='HGNC')
    except ConnectionError:
        logging.error("ConnectionError while querying MyGeneInfo with {}".format(id))
        return [gene_id]

    hits = results.get('hits') or []
    if hits:
        # The top hit may not carry an HGNC value at all
        hgnc = hits[0].get('HGNC')
        if hgnc is not None:
            gene_id = str(hgnc)
            if not gene_id.startswith('HGNC'):
                gene_id = 'HGNC:{}'.format(gene_id)

    return [gene_id]
Ejemplo n.º 30
0
    def __init__(self, iquery, fields, annotfunc, transcript=False):
        """
        Store the query settings, create the matching client and run the query.

        :param iquery: query input information
        :param fields: annotation output fields (stored as ``self.ofields``)
        :param annotfunc: client type; must be one of 'gene', 'variant',
            'chem', 'disease' or 'taxon'
        :param transcript: not used by this method
        :raises ClientAttributionError: if ``annotfunc`` is not a supported
            client type
        """
        # TODO: check here whether iquery is a list or a str, to decide
        # whether this is a request made of several lists

        self._variant = set(('gene', 'variant', 'chem', 'disease', 'taxon'))

        self.annotfunc = annotfunc
        self.iquery = iquery
        self.ofields = fields

        is_supported = self.annotfunc in self._variant
        if not is_supported:
            supported_names = '; '.join(self._variant)
            raise ClientAttributionError(
                "The function \"{}\" can't find in \"{}\"".format(
                    self.annotfunc, supported_names))

        self._myclient = get_client(self.annotfunc)
        self.query = self.__query()