コード例 #1
0
ファイル: Downloader.py プロジェクト: turoger/scheduled-bots
 def query(self):
     """Run the configured MyGene query and return ``(cursor, total)``.

     Two requests are issued with the same query string, fields and
     ``entrezonly`` flag: the first only supplies the ``'total'`` hit
     count, the second is made with ``fetch_all=True`` and its result is
     returned as the cursor.
     """
     client = MyGeneInfo(url=self.base_url)
     shared_kwargs = dict(fields=self.fields, entrezonly=self.entrezonly)

     # Plain query: only its 'total' entry is used.
     total = client.query(self.q, **shared_kwargs)['total']

     # Same query again, this time asking for every hit.
     cursor = client.query(self.q, fetch_all=True, **shared_kwargs)
     return cursor, total
コード例 #2
0
    def load_gene_set(self):
        """Annotate every input gene and append the result to ``self.gene_set``.

        For each entry of ``self.input_object['input']`` a dict with the keys
        ``'input_id'``, ``'sim_input_curie'`` and ``'input_symbol'`` is
        appended:

        * ids containing ``'MGI'`` keep their id and get a similarity curie
          with ``'MGI'`` replaced by ``'MGI:MGI'``;
        * ids containing ``'HGNC'`` are looked up through MyGene and get a
          ``'UniProtKB:<Swiss-Prot>'`` similarity curie when the first hit
          carries one; otherwise the similarity curie stays as it was and the
          lookup error is printed.
        """
        # A single client serves the whole batch; building one per gene was
        # pure overhead.
        mg = MyGeneInfo()
        for gene in self.input_object['input']:
            hit_id = gene['hit_id']
            gene_curie = ''
            sim_input_curie = ''
            if 'MGI' in hit_id:
                gene_curie = hit_id
                sim_input_curie = hit_id.replace('MGI', 'MGI:MGI')
            if 'HGNC' in hit_id:
                # The query is made with the lower-cased 'hgnc' prefix.
                mg_hit = mg.query(
                    hit_id.replace('HGNC', 'hgnc'),
                    scopes='HGNC',
                    species=self.input_object['parameters']['taxon'],
                    fields='uniprot, symbol, HGNC',
                    entrezonly=True)
                gene_curie = hit_id
                try:
                    sim_input_curie = 'UniProtKB:{}'.format(
                        mg_hit['hits'][0]['uniprot']['Swiss-Prot'])
                except (KeyError, IndexError, TypeError) as e:
                    # No hit, or the hit has no Swiss-Prot accession:
                    # best-effort, report and keep going.
                    print(gene, e)

            self.gene_set.append({
                'input_id': gene_curie,
                'sim_input_curie': sim_input_curie,
                'input_symbol': gene['hit_symbol']
            })
コード例 #3
0
def get_summary(symbol):
    """Return ``(entrez_id, summary_text)`` for a human gene symbol.

    The Entrez id is taken from the first MyGene hit for ``symbol:<symbol>``
    with ``species='human'``. The NCBI gene page for that id is then fetched
    and the text between the ``<dt>Summary</dt>`` and ``<dt>Orthologs</dt>``
    markers is returned with HTML tags stripped.

    Returns:
        ``("Not found", "No entries found. (Entrez ID not found)")`` when the
        MyGene lookup fails; otherwise the Entrez id paired with either the
        extracted text or a "No entries found." message naming the marker
        that was missing.
    """
    user_agent = ('Mozilla/5.0 (Windows; U; Windows NT 5.1; it; rv:1.8.1.11) '
                  'Gecko/20071127 Firefox/2.0.0.11')
    client = MyGeneInfo()
    try:
        hits = client.query('symbol:%s' % symbol, species='human')['hits']
        entrez_id = hits[0]['entrezgene']
    except Exception as e:
        logging.info("Error with query: " + str(e))
        return "Not found", "No entries found. (Entrez ID not found)"

    # NOTE(review): `get` is defined outside this snippet. If it is
    # requests.get, the second positional argument is `params`, not the
    # User-Agent header -- confirm the intended call.
    page = get('http://www.ncbi.nlm.nih.gov/gene/' + str(entrez_id),
               user_agent).text

    start_marker = '<dt>Summary</dt>'
    start = page.find(start_marker)
    if start == -1:
        return entrez_id, "No entries found. (match_start = -1)"

    # Everything after the opening marker; the closing marker is searched
    # for in this remainder only.
    remainder = page[start + len(start_marker):]
    end = remainder.find('<dt>Orthologs</dt>')
    if end == -1:
        return entrez_id, "No entries found. (match_end = -1)"

    # Strip the HTML tags from the extracted fragment.
    return entrez_id, re.sub('<[^<]+?>', '', remainder[:end])
コード例 #4
0
def get_pert_agent(noble_coder, pert_text, title):
    """
    Identify the perturbation agent of a GSE and map it to a gene symbol.

    Args:
        noble_coder: the execution path of Noble Coder
        pert_text: the perturbation text
        title: the title of the GSE

    Returns:
        None when Noble Coder finds nothing in either text; otherwise the
        symbol of the first MyGene hit for the cleaned agent name, or the
        cleaned agent name itself when MyGene returns no hits.
    """
    # The perturbation text is tried first; the title is only a fallback.
    agent = run_noble_coder(pert_text, noble_coder)
    if agent is None:
        agent = run_noble_coder(title, noble_coder)
    if agent is None:
        return agent

    # Clean the name before querying: special characters become spaces and
    # the word "Superfamily" is dropped.
    for char in SPECIAL_CHARS:
        agent = agent.replace(char, " ")
    agent = agent.replace("Superfamily", "")

    hits = MyGeneInfo().query(agent)["hits"]
    return hits[0]["symbol"] if hits else agent
コード例 #5
0
    def load_gene_set(self, input_gene_set):
        """Annotate the genes of ``input_gene_set`` for similarity search.

        Args:
            input_gene_set: object whose ``get_input_curie_set()`` yields
                dicts with at least ``'hit_id'`` and ``'hit_symbol'``.

        Returns:
            A list with one dict per input gene, holding ``'input_id'``,
            ``'sim_input_curie'`` and ``'input_symbol'``. Ids containing
            ``'MGI'`` get a curie with ``'MGI'`` replaced by ``'MGI:MGI'``;
            ids containing ``'HGNC'`` get ``'UniProtKB:<Swiss-Prot>'`` from
            the first MyGene hit when available (lookup errors are printed
            and the curie is left as it was).
        """
        # A single client serves the whole batch; building one per gene was
        # pure overhead.
        mg = MyGeneInfo()
        annotated_gene_set = []
        for gene in input_gene_set.get_input_curie_set():
            hit_id = gene['hit_id']
            gene_curie = ''
            sim_input_curie = ''
            if 'MGI' in hit_id:
                gene_curie = hit_id
                sim_input_curie = hit_id.replace('MGI', 'MGI:MGI')
            if 'HGNC' in hit_id:
                # The query is made with the lower-cased 'hgnc' prefix.
                mg_hit = mg.query(hit_id.replace('HGNC', 'hgnc'),
                                  scopes='HGNC',
                                  species=self.taxon,
                                  fields='uniprot, symbol, HGNC',
                                  entrezonly=True)
                gene_curie = hit_id
                try:
                    sim_input_curie = 'UniProtKB:{}'.format(
                        mg_hit['hits'][0]['uniprot']['Swiss-Prot'])
                except (KeyError, IndexError, TypeError) as e:
                    # No hit, or the hit has no Swiss-Prot accession:
                    # best-effort, report and keep going.
                    print(__name__ + ".load_gene_set() Exception: ", gene, e)

            annotated_gene_set.append({
                'input_id': gene_curie,
                'sim_input_curie': sim_input_curie,
                'input_symbol': gene['hit_symbol']
            })

        return annotated_gene_set
コード例 #6
0
def pathway_enrichment(gene_names, pipe_section=1, dbs=None, total_genes=20531, p_cutoff=0.05, cache_path='../data/cache/'):
    """Compute pathway enrichment for a list of genes using MyGene annotations.

    Args:
        gene_names: iterable of '|'-separated gene labels; the field at
            position ``pipe_section`` is used as the gene id.
        pipe_section: index of the '|'-separated field holding the gene id.
        dbs: optional collection of pathway database keys to keep; ``None``
            keeps every database found in the annotations.
        total_genes: first population argument passed to ``hypergeom.sf``.
        p_cutoff: pathways with a p-value above this are dropped.
        cache_path: directory for the MyGene request cache; created if
            missing.

    Returns:
        A DataFrame sorted by ``p_value`` with the columns ``id``, ``name``,
        ``db``, ``genes`` (list of query ids), ``sup`` (number of selected
        genes in the pathway), ``size`` (total hits MyGene reports for the
        pathway), ``p_value`` and ``ratio`` (``sup / size``), or ``None``
        when the MyGene result has no ``pathway`` column.
    """
    # Create the cache directory *before* pointing the client at a cache
    # file inside it, so the cache backend never sees a missing directory.
    if not os.path.exists(cache_path):
        os.makedirs(cache_path)
    mg = MyGeneInfo()
    mg.set_caching(cache_db=os.path.join(cache_path, 'mygene_cache'), verbose=False)

    gene_ids = [g.split('|')[pipe_section] for g in gene_names]
    gene_info = mg.getgenes(geneids=gene_ids, fields='pathway', as_dataframe=True, df_index=False)
    try:
        pathways = gene_info['pathway']
    except (KeyError, TypeError) as e:
        print(e)
        print('No pathways found with the selected genes:')
        print(gene_names)
        return None

    # One row per (pathway, gene) pair.
    rows = []
    # Series.items() replaces iteritems(), which was removed in pandas 2.0.
    for idx, p in pathways.items():
        # Genes without pathway annotation show up as NaN; skip them.
        if p is np.nan or p != p:
            continue
        gene = str(gene_info['query'][idx])
        for key, p_dict in dict(p).items():
            if dbs is not None and key not in dbs:
                continue
            # A database entry is either one pathway dict or a list of them.
            entries = p_dict if isinstance(p_dict, list) else [p_dict]
            for k in entries:
                rows.append([k['id'], k['name'], key, gene])

    p_df = pd.DataFrame(rows, columns=['id', 'name', 'db', 'genes'])
    # Collapse to one row per pathway with the list of genes that hit it.
    # agg(list) + reset_index() yields exactly id/name/db/genes regardless
    # of how a given pandas version treats as_index=False on apply().
    p_df = p_df.groupby(['id', 'name', 'db'])['genes'].agg(list).reset_index()

    pathway_size = []
    for idx, p_row in p_df.iterrows():
        if idx % 50 == 0:
            print('querying {}/{}'.format(idx, p_df.shape[0]))
        # size=0: only the 'total' count of the response is needed.
        p_size = mg.query('pathway.{}.id:{}'.format(p_row['db'], p_row['id']), size=0, verbose=False)['total']
        pathway_size.append(p_size)

    # DataFrame.as_matrix() was removed in pandas 1.0; iterate the column.
    p_df['sup'] = [len(x) for x in p_df['genes']]
    p_df['size'] = pathway_size

    nb_selected_genes = len(gene_names)
    # Survival function at sup - 1, i.e. P(X >= sup).
    p_df['p_value'] = hypergeom.sf(p_df['sup'].values - 1, total_genes,
                                   p_df['size'].values, nb_selected_genes)

    # .copy() so the 'ratio' assignment below does not write into a view of
    # the unfiltered frame (SettingWithCopyWarning).
    p_df = p_df[p_df['p_value'] <= p_cutoff].copy()

    p_df['ratio'] = p_df['sup'] / p_df['size']
    p_df = p_df.sort_values(by=['p_value']).reset_index(drop=True)

    return p_df
コード例 #7
0
ファイル: Downloader.py プロジェクト: turoger/scheduled-bots
    def get_mg_cursor(self, taxid, filter_f=None):
        """Return ``(cursor, total)`` over the MyGene docs of one taxid.

        Args:
            taxid: taxonomy id; passed to MyGene as ``species=str(taxid)``.
            filter_f: optional predicate applied to each doc through the
                builtin ``filter``; docs for which it is falsy are dropped.

        Returns:
            The ``fetch_all=True`` query result (wrapped in ``filter`` when
            ``filter_f`` is given) and the ``'total'`` count reported by a
            plain query. The total is taken before any filtering.
        """
        client = MyGeneInfo(url=self.base_url)
        shared_kwargs = dict(fields=self.fields,
                             species=str(taxid),
                             entrezonly=self.entrezonly)

        # Plain query: only its 'total' entry is used.
        total = client.query(self.q, **shared_kwargs)['total']

        # Same query again, this time asking for every hit.
        cursor = client.query(self.q, fetch_all=True, **shared_kwargs)
        if filter_f:
            cursor = filter(filter_f, cursor)

        return cursor, total
コード例 #8
0
ファイル: name2uid.py プロジェクト: subkar/msda
def get_uid(name):
    """Look up the Swiss-Prot accession for a human gene symbol or alias.

    Args:
        name: gene symbol or alias to query MyGene with.

    Returns:
        The Swiss-Prot value of the hit whose symbol equals ``name``. If no
        hit has exactly that symbol, the whole ``{symbol: accession}``
        mapping built from the hits is returned instead. Hits that carry a
        symbol but no Swiss-Prot accession map to ``'unable to retrieve'``.
    """
    mg = MyGeneInfo()
    res = mg.query(name, scopes='symbol, alias',
                   fields='uniprot, symbol', species='human')

    # Build the mapping hit by hit. The former parallel symbol/uid lists
    # went out of step whenever a hit lacked Swiss-Prot (the placeholder was
    # appended to one list only), pairing symbols with the wrong accession.
    uid_by_symbol = {}
    for hit in res['hits']:
        symbol = hit.get('symbol')
        if symbol is None:
            continue
        try:
            uid_by_symbol[symbol] = hit['uniprot']['Swiss-Prot']
        except (KeyError, TypeError):
            # Never overwrite an accession already found for this symbol.
            uid_by_symbol.setdefault(symbol, 'unable to retrieve')

    if name in uid_by_symbol:
        return uid_by_symbol[name]
    return uid_by_symbol
コード例 #9
0
def convert_gene_api(query):
    """Map a free-text gene query to a human Entrez gene id via MyGene.

    Args:
        query: the query string sent to MyGene.

    Returns:
        A one-entry dict ``{query: entrezgene}``. The value is NaN when the
        query fails, returns no hits, or no hit has taxid 9606 together
        with an ``'entrezgene'`` field. When several hits qualify, the last
        one wins.
    """
    mg = MyGeneInfo()
    out_format = 'entrezgene'
    out = float('nan')
    try:
        res = mg.query(query)
    except Exception:
        # Best-effort lookup: a failed request counts as "no hits". Unlike a
        # bare `except:`, this lets KeyboardInterrupt/SystemExit through.
        res = {'hits': []}

    for h in res.get('hits', []):
        # 9606 is the NCBI taxonomy id of Homo sapiens; .get() tolerates
        # hits that come back without a taxid.
        if h.get('taxid') == 9606 and out_format in h:
            out = h[out_format]

    return {query: out}
コード例 #10
0
ファイル: name2uid.py プロジェクト: smkartik/msda
def get_uid(name):
    """Look up the Swiss-Prot accession for a human gene symbol or alias.

    Args:
        name: gene symbol or alias to query MyGene with.

    Returns:
        The Swiss-Prot value of the hit whose symbol equals ``name``. If no
        hit has exactly that symbol, the whole ``{symbol: accession}``
        mapping built from the hits is returned instead. Hits that carry a
        symbol but no Swiss-Prot accession map to ``'unable to retrieve'``.
    """
    mg = MyGeneInfo()
    res = mg.query(name,
                   scopes='symbol, alias',
                   fields='uniprot, symbol',
                   species='human')

    # Build the mapping hit by hit. The former parallel symbol/uid lists
    # went out of step whenever a hit lacked Swiss-Prot (the placeholder was
    # appended to one list only), pairing symbols with the wrong accession.
    uid_by_symbol = {}
    for hit in res['hits']:
        symbol = hit.get('symbol')
        if symbol is None:
            continue
        try:
            uid_by_symbol[symbol] = hit['uniprot']['Swiss-Prot']
        except (KeyError, TypeError):
            # Never overwrite an accession already found for this symbol.
            uid_by_symbol.setdefault(symbol, 'unable to retrieve')

    if name in uid_by_symbol:
        return uid_by_symbol[name]
    return uid_by_symbol
コード例 #11
0
class FunctionalSimilarity(GenericSimilarity):
    """Functional (GO-based) gene similarity module driven by an input object.

    Typical call order, as far as this class shows: ``load_input_object``,
    ``load_associations``, ``load_gene_set``, ``compute_similarity``.
    ``retrieve_associations`` and ``compute_jaccard`` are not defined here
    and are presumably inherited from ``GenericSimilarity``.
    """

    def __init__(self, associations: AssociationSet = None):
        # NOTE(review): `associations` is accepted but never used here.
        GenericSimilarity.__init__(self)
        self.mg = MyGeneInfo()  # shared MyGene client for all lookups
        self.gene_set = []  # filled by load_gene_set()
        self.input_object = ''  # replaced by load_input_object()
        self.ont = 'go'
        self.group = ''  # 'mouse' / 'human', set by load_input_object()
        self.meta = {
            'input_type': {
                'complexity': 'set',
                'id_type': 'HGNC',
                'data_type': 'gene',
            },
            'output_type': {
                'complexity': 'set',
                'id_type': 'HGNC',
                'data_type': 'gene',
            },
            'source':
            'Monarch Biolink',
            'predicate': [
                'blm:macromolecular machine to biological process association',
                'macromolecular machine to molecular activity association'
            ]
        }

    def metadata(self):
        """Print the module's metadata dict."""
        print("""Mod1A Functional Similarity metadata:""")
        pprint(self.meta)

    def load_input_object(self, input_object):
        """Store ``input_object`` and derive ``self.group`` from its taxon.

        ``self.group`` is only changed when the taxon is ``'mouse'`` or
        ``'human'``.
        """
        self.input_object = input_object
        taxon = self.input_object['parameters']['taxon']
        if taxon in ('mouse', 'human'):
            self.group = taxon

    def load_associations(self):
        """Retrieve the associations for the configured ontology and group."""
        self.retrieve_associations(ont=self.ont, group=self.group)

    def load_gene_set(self):
        """Annotate every input gene and append the result to ``self.gene_set``.

        Ids containing ``'MGI'`` get a similarity curie with ``'MGI'``
        replaced by ``'MGI:MGI'``. Ids containing ``'HGNC'`` are looked up
        through MyGene and get ``'UniProtKB:<Swiss-Prot>'`` from the first
        hit when available; lookup errors are printed and the curie is left
        as it was.
        """
        for gene in self.input_object['input']:
            hit_id = gene['hit_id']
            gene_curie = ''
            sim_input_curie = ''
            if 'MGI' in hit_id:
                gene_curie = hit_id
                sim_input_curie = hit_id.replace('MGI', 'MGI:MGI')
            if 'HGNC' in hit_id:
                # Reuse the instance-wide client instead of building a new
                # one per gene; the query uses the lower-cased 'hgnc' prefix.
                mg_hit = self.mg.query(
                    hit_id.replace('HGNC', 'hgnc'),
                    scopes='HGNC',
                    species=self.input_object['parameters']['taxon'],
                    fields='uniprot, symbol, HGNC',
                    entrezonly=True)
                gene_curie = hit_id
                try:
                    sim_input_curie = 'UniProtKB:{}'.format(
                        mg_hit['hits'][0]['uniprot']['Swiss-Prot'])
                except (KeyError, IndexError, TypeError) as e:
                    # No hit, or the hit has no Swiss-Prot accession.
                    print(gene, e)

            self.gene_set.append({
                'input_id': gene_curie,
                'sim_input_curie': sim_input_curie,
                'input_symbol': gene['hit_symbol']
            })

    def compute_similarity(self):
        """Run the Jaccard computation and rewrite result ids to HGNC curies.

        Returns:
            The list produced by ``compute_jaccard`` for ``self.gene_set``
            and the ``threshold`` parameter, with ``'hit_id'`` (human taxon
            only) and ``'input_id'`` replaced by ``symbol2hgnc`` lookups.
        """
        group = self.input_object['parameters']['taxon']
        lower_bound = float(self.input_object['parameters']['threshold'])
        results = self.compute_jaccard(self.gene_set, lower_bound)
        for result in results:
            if group == 'human':
                result['hit_id'] = self.symbol2hgnc(result['hit_symbol'])
            # The input id is rewritten as soon as one gene's similarity
            # curie differs from it. A single lookup gives the same outcome
            # as repeating the identical query once per gene.
            if any(gene['sim_input_curie'] != result['input_id']
                   for gene in self.gene_set):
                result['input_id'] = self.symbol2hgnc(result['input_symbol'])
        return results

    def symbol2hgnc(self, symbol):
        """Return ``'HGNC:<id>'`` for a human gene symbol, or ``None``.

        ``None`` is returned unless MyGene reports exactly one hit and that
        hit carries an ``'HGNC'`` field.
        """
        mg_hit = self.mg.query('symbol:{}'.format(symbol),
                               fields='HGNC,symbol,taxon',
                               species='human',
                               entrezonly=True)
        if mg_hit['total'] != 1:
            return None
        hgnc_id = mg_hit['hits'][0].get('HGNC')
        if hgnc_id is None:
            return None
        return 'HGNC:{}'.format(hgnc_id)
コード例 #12
0
class FunctionalSimilarity(GenericSimilarity):
    """Functional (GO-based) gene similarity module bound to one taxon.

    ``load_associations`` and ``compute_jaccard`` are not defined here and
    are presumably inherited from ``GenericSimilarity``.
    """

    def __init__(self, taxon):
        GenericSimilarity.__init__(self)
        self.mg = MyGeneInfo()  # shared MyGene client for all lookups
        self.input_object = ''
        self.taxon = taxon
        self.ont = 'go'
        self.meta = {
            'input_type': {
                'complexity': 'set',
                'id_type': 'HGNC',
                'data_type': 'gene',
            },
            'output_type': {
                'complexity': 'set',
                'id_type': 'HGNC',
                'data_type': 'gene',
            },
            'source':
            'Monarch Biolink',
            'predicate': [
                'blm:macromolecular machine to biological process association',
                'macromolecular machine to molecular activity association'
            ]
        }

        # Load the functional catalog of
        # GO ontology and annotation associations
        self.load_associations(taxon)

    def metadata(self):
        """Print the module's metadata dict."""
        print("""Mod1A Functional Similarity metadata:""")
        pprint(self.meta)

    def load_gene_set(self, input_gene_set):
        """Annotate the genes of ``input_gene_set`` for similarity search.

        Args:
            input_gene_set: object whose ``get_input_curie_set()`` yields
                dicts with at least ``'hit_id'`` and ``'hit_symbol'``.

        Returns:
            A list with one dict per input gene, holding ``'input_id'``,
            ``'sim_input_curie'`` and ``'input_symbol'``. Ids containing
            ``'MGI'`` get a curie with ``'MGI'`` replaced by ``'MGI:MGI'``;
            ids containing ``'HGNC'`` get ``'UniProtKB:<Swiss-Prot>'`` from
            the first MyGene hit when available (lookup errors are printed
            and the curie is left as it was).
        """
        annotated_gene_set = []
        for gene in input_gene_set.get_input_curie_set():
            hit_id = gene['hit_id']
            gene_curie = ''
            sim_input_curie = ''
            if 'MGI' in hit_id:
                gene_curie = hit_id
                sim_input_curie = hit_id.replace('MGI', 'MGI:MGI')
            if 'HGNC' in hit_id:
                # Reuse the instance-wide client instead of building a new
                # one per gene; the query uses the lower-cased 'hgnc' prefix.
                mg_hit = self.mg.query(hit_id.replace('HGNC', 'hgnc'),
                                       scopes='HGNC',
                                       species=self.taxon,
                                       fields='uniprot, symbol, HGNC',
                                       entrezonly=True)
                gene_curie = hit_id
                try:
                    sim_input_curie = 'UniProtKB:{}'.format(
                        mg_hit['hits'][0]['uniprot']['Swiss-Prot'])
                except (KeyError, IndexError, TypeError) as e:
                    # No hit, or the hit has no Swiss-Prot accession.
                    print(__name__ + ".load_gene_set() Exception: ", gene, e)

            annotated_gene_set.append({
                'input_id': gene_curie,
                'sim_input_curie': sim_input_curie,
                'input_symbol': gene['hit_symbol']
            })

        return annotated_gene_set

    def compute_similarity(self, annotated_gene_set, threshold):
        """Run the Jaccard computation and rewrite result ids to HGNC curies.

        Args:
            annotated_gene_set: list as returned by ``load_gene_set``.
            threshold: lower bound for the similarity; converted to float.

        Returns:
            The list produced by ``compute_jaccard``, with ``'hit_id'``
            (human taxon only) and ``'input_id'`` replaced by
            ``symbol2hgnc`` lookups.
        """
        lower_bound = float(threshold)
        results = self.compute_jaccard(annotated_gene_set, lower_bound)
        for result in results:
            if self.taxon == 'human':
                result['hit_id'] = self.symbol2hgnc(result['hit_symbol'])
            # The input id is rewritten as soon as one gene's similarity
            # curie differs from it. A single lookup gives the same outcome
            # as repeating the identical query once per gene.
            if any(gene['sim_input_curie'] != result['input_id']
                   for gene in annotated_gene_set):
                result['input_id'] = self.symbol2hgnc(result['input_symbol'])
        return results

    def symbol2hgnc(self, symbol):
        """Return ``'HGNC:<id>'`` for a human gene symbol, or ``None``.

        ``None`` is returned unless MyGene reports exactly one hit and that
        hit carries an ``'HGNC'`` field.
        """
        mg_hit = self.mg.query('symbol:{}'.format(symbol),
                               fields='HGNC,symbol,taxon',
                               species='human',
                               entrezonly=True)
        if mg_hit['total'] != 1:
            return None
        hgnc_id = mg_hit['hits'][0].get('HGNC')
        if hgnc_id is None:
            return None
        return 'HGNC:{}'.format(hgnc_id)