Example #1
0
def get_summary(symbol):
    """Return the NCBI Gene "Summary" text for a human gene symbol.

    The symbol is resolved to an Entrez gene ID through MyGeneInfo, the NCBI
    gene page for that ID is fetched with ``get`` and the text between the
    ``<dt>Summary</dt>`` and ``<dt>Orthologs</dt>`` markers is returned with
    HTML tags removed.

    Args:
        symbol: gene symbol to look up.

    Returns:
        A ``(entrez_id, text)`` tuple. When the Entrez lookup fails the tuple
        is ``("Not found", <message>)``; when one of the page markers is
        missing ``text`` is a "No entries found" message naming the marker.
    """
    # NOTE(review): the user-agent string is handed to ``get`` as its second
    # positional argument; confirm that ``get`` treats it as a header.
    user_agent = ('Mozilla/5.0 (Windows; U; Windows NT 5.1; it; rv:1.8.1.11) '
                  'Gecko/20071127 Firefox/2.0.0.11')
    client = MyGeneInfo()
    try:
        hits = client.query('symbol:%s' % symbol, species='human')['hits']
        entrez_id = hits[0]['entrezgene']
    except Exception as e:
        logging.info("Error with query: " + str(e))
        return "Not found", "No entries found. (Entrez ID not found)"

    page = get('http://www.ncbi.nlm.nih.gov/gene/' + str(entrez_id),
               user_agent).text

    start_marker = '<dt>Summary</dt>'
    start = page.find(start_marker)
    if start == -1:
        return entrez_id, "No entries found. (match_start = -1)"

    # Everything after the opening marker, up to the next section heading.
    section = page[start + len(start_marker):]
    end = section.find('<dt>Orthologs</dt>')
    if end == -1:
        return entrez_id, "No entries found. (match_end = -1)"

    # takes out the HTML tags
    return entrez_id, re.sub('<[^<]+?>', '', section[:end])
    def __init__(self, taxon):
        """Initialise the module for ``taxon`` and load its associations.

        Sets up the MyGeneInfo client, the ontology name (``'go'``) and the
        ``meta`` description of the module's input/output, then calls
        ``self.load_associations(taxon)``.

        Args:
            taxon: taxon identifier stored on the instance and passed to
                ``load_associations``.
        """
        GenericSimilarity.__init__(self)
        self.mg = MyGeneInfo()
        self.input_object = ''
        self.taxon = taxon
        self.ont = 'go'

        # Input and output are described identically; each gets its own copy.
        gene_set_spec = {
            'complexity': 'set',
            'id_type': 'HGNC',
            'data_type': 'gene',
        }
        # NOTE(review): only the first predicate carries the 'blm:' prefix.
        predicates = [
            'blm:macromolecular machine to biological process association',
            'macromolecular machine to molecular activity association',
        ]
        self.meta = {
            'input_type': dict(gene_set_spec),
            'output_type': dict(gene_set_spec),
            'source': 'Monarch Biolink',
            'predicate': predicates,
        }

        # Load the functional catalog of
        # GO ontology and annotation associations
        self.load_associations(taxon)
    def load_gene_set(self, input_gene_set):
        """Annotate every input gene with the CURIE used for similarity search.

        Genes whose ``hit_id`` contains ``MGI`` get an ``MGI:MGI...`` CURIE.
        Genes whose ``hit_id`` contains ``HGNC`` are looked up through
        MyGeneInfo and get a ``UniProtKB:<Swiss-Prot id>`` CURIE; when the
        lookup yields no Swiss-Prot ID the problem is printed and the CURIE
        is left as it was (empty unless the MGI branch set it).

        Args:
            input_gene_set: object whose ``get_input_curie_set()`` yields
                dicts with ``hit_id`` and ``hit_symbol`` keys.

        Returns:
            A list of dicts with keys ``input_id``, ``sim_input_curie`` and
            ``input_symbol``, one per input gene.
        """
        # One client for the whole set instead of a new one per gene.
        mg = MyGeneInfo()
        annotated_gene_set = []
        for gene in input_gene_set.get_input_curie_set():
            hit_id = gene['hit_id']
            gene_curie = ''
            sim_input_curie = ''
            if 'MGI' in hit_id:
                gene_curie = hit_id
                sim_input_curie = hit_id.replace('MGI', 'MGI:MGI')
            if 'HGNC' in hit_id:
                mg_hit = mg.query(hit_id.replace('HGNC', 'hgnc'),
                                  scopes='HGNC',
                                  species=self.taxon,
                                  fields='uniprot, symbol, HGNC',
                                  entrezonly=True)
                gene_curie = hit_id
                try:
                    swiss_prot = mg_hit['hits'][0]['uniprot']['Swiss-Prot']
                    # The field can hold several IDs; use the first rather
                    # than formatting the whole list into the CURIE.
                    if isinstance(swiss_prot, (list, tuple)):
                        swiss_prot = swiss_prot[0]
                    sim_input_curie = 'UniProtKB:{}'.format(swiss_prot)
                except (KeyError, IndexError, TypeError) as e:
                    print(__name__ + ".load_gene_set() Exception: ", gene, e)

            annotated_gene_set.append({
                'input_id': gene_curie,
                'sim_input_curie': sim_input_curie,
                'input_symbol': gene['hit_symbol']
            })

        return annotated_gene_set
    def load_gene_set(self):
        """Annotate the genes of ``self.input_object`` into ``self.gene_set``.

        Genes whose ``hit_id`` contains ``MGI`` get an ``MGI:MGI...`` CURIE.
        Genes whose ``hit_id`` contains ``HGNC`` are looked up through
        MyGeneInfo (species taken from
        ``self.input_object['parameters']['taxon']``) and get a
        ``UniProtKB:<Swiss-Prot id>`` CURIE; when no Swiss-Prot ID is
        available the problem is printed and the CURIE is left as it was.

        Each gene is appended to ``self.gene_set`` as a dict with keys
        ``input_id``, ``sim_input_curie`` and ``input_symbol``.
        """
        # One client for the whole set instead of a new one per gene.
        mg = MyGeneInfo()
        for gene in self.input_object['input']:
            hit_id = gene['hit_id']
            gene_curie = ''
            sim_input_curie = ''
            if 'MGI' in hit_id:
                gene_curie = hit_id
                sim_input_curie = hit_id.replace('MGI', 'MGI:MGI')
            if 'HGNC' in hit_id:
                mg_hit = mg.query(
                    hit_id.replace('HGNC', 'hgnc'),
                    scopes='HGNC',
                    species=self.input_object['parameters']['taxon'],
                    fields='uniprot, symbol, HGNC',
                    entrezonly=True)
                gene_curie = hit_id
                try:
                    swiss_prot = mg_hit['hits'][0]['uniprot']['Swiss-Prot']
                    # The field can hold several IDs; use the first rather
                    # than formatting the whole list into the CURIE.
                    if isinstance(swiss_prot, (list, tuple)):
                        swiss_prot = swiss_prot[0]
                    sim_input_curie = 'UniProtKB:{}'.format(swiss_prot)
                except (KeyError, IndexError, TypeError) as e:
                    print(gene, e)

            self.gene_set.append({
                'input_id': gene_curie,
                'sim_input_curie': sim_input_curie,
                'input_symbol': gene['hit_symbol']
            })
Example #5
0
def get_pert_agent(noble_coder, pert_text, title):
    """
    Extract perturbation agent
    Args:
        noble_coder: the execution path of Noble Coder
        pert_text: the perturbation text
        title: the title of the GSE

    Returns:
        the perturbation agent: the symbol of the first MyGeneInfo hit when
        that hit has one, otherwise the cleaned-up text produced by
        ``run_noble_coder``, or None when neither the perturbation text nor
        the title yields an agent
    """

    # Try to identify perturbation agent from perturbation text first, if unsuccessful, try with title
    pert_agent = run_noble_coder(pert_text, noble_coder)
    if pert_agent is None:
        pert_agent = run_noble_coder(title, noble_coder)
    if pert_agent is None:
        return None

    # Normalise the text before using it as a gene query
    for special_char in SPECIAL_CHARS:
        pert_agent = pert_agent.replace(special_char, " ")
    pert_agent = pert_agent.replace("Superfamily", "")

    # Extract gene symbol
    response = MyGeneInfo().query(pert_agent)
    hits = response.get("hits")
    if hits:
        # A hit is not guaranteed to carry a symbol; keep the cleaned text
        # in that case instead of raising KeyError.
        symbol = hits[0].get("symbol")
        if symbol:
            pert_agent = symbol

    return pert_agent
 def __init__(self, associations: AssociationSet = None):
     """Initialise an empty functional-similarity module.

     Creates the MyGeneInfo client, an empty gene set and the ``meta``
     description of the module's input and output.

     Args:
         associations: accepted but not used by this initialiser.
     """
     GenericSimilarity.__init__(self)
     self.mg = MyGeneInfo()
     self.gene_set = []
     self.input_object = ''
     self.ont = 'go'
     self.group = ''

     # Input and output are described identically; each gets its own copy.
     gene_set_spec = {
         'complexity': 'set',
         'id_type': 'HGNC',
         'data_type': 'gene',
     }
     # NOTE(review): only the first predicate carries the 'blm:' prefix.
     predicates = [
         'blm:macromolecular machine to biological process association',
         'macromolecular machine to molecular activity association',
     ]
     self.meta = {
         'input_type': dict(gene_set_spec),
         'output_type': dict(gene_set_spec),
         'source': 'Monarch Biolink',
         'predicate': predicates,
     }
def ensemble_to_symbol(ens):
    """Look up gene symbols for the given gene IDs through MyGeneInfo.

    Args:
        ens: gene IDs passed to ``MyGeneInfo.getgenes`` as ``geneids``.

    Returns:
        A ``(gene_symbol, gene_id)`` pair of arrays with one entry per
        distinct query: the symbols, and ``"<symbol>|<query>"`` labels in
        which a missing value is written as ``?``.
    """
    client = MyGeneInfo()
    annotations = client.getgenes(geneids=ens,
                                  fields='symbol',
                                  as_dataframe=True,
                                  df_index=False)
    # Keep a single row per queried ID.
    unique_rows = annotations.drop_duplicates('query').reset_index()

    symbols = unique_rows['symbol']
    labels = symbols.str.cat([unique_rows['query']], sep='|', na_rep='?')

    return symbols.values, labels.values
Example #8
0
 def query(self):
     """Run ``self.q`` against the configured MyGeneInfo endpoint.

     Returns:
         A ``(results, total)`` pair: the result of the ``fetch_all=True``
         query and the ``total`` reported by a plain query.
     """
     client = MyGeneInfo(url=self.base_url)
     # A plain query first, only to learn the total number of hits.
     summary = client.query(self.q,
                            fields=self.fields,
                            entrezonly=self.entrezonly)
     total = summary['total']
     # Then the same query again in fetch-all mode.
     cursor = client.query(self.q,
                           fields=self.fields,
                           fetch_all=True,
                           entrezonly=self.entrezonly)
     return cursor, total
Example #9
0
    def _batch_query(self, ids):
        """
        Query Entrez gene IDs through MyGeneInfo in batches of
        ``self.BATCH_SIZE``.

        Yields one dict per batch, of the form {id-1: result-1, id-2: ... },
        holding only the hits that were found and whose ``taxid`` equals
        ``self.TAXID`` (not-found IDs are left out).
        """
        # The client is created on first use and kept on the instance.
        if not hasattr(self, 'mg'):
            self.mg = MyGeneInfo()

        for id_batch in grouped(ids, self.BATCH_SIZE):
            hits = self.mg.querymany(id_batch,
                                     scopes='entrezgene',
                                     fields='all',
                                     verbose=self.VERBOSE)
            yield {
                hit['query']: hit
                for hit in hits
                if 'notfound' not in hit and hit['taxid'] == self.TAXID
            }
Example #10
0
def get_uid(name):
    """Look up the Swiss-Prot ID for a human gene name through MyGeneInfo.

    The name is searched as a symbol or alias. Every hit that has a symbol
    is recorded as ``symbol -> Swiss-Prot ID``, or
    ``symbol -> 'unable to retrieve'`` when the hit has no Swiss-Prot ID.
    Building the mapping hit by hit keeps each symbol paired with its own
    ID even when some hits are incomplete.

    Args:
        name: gene symbol or alias to search for.

    Returns:
        The entry for ``name`` when one of the hits has exactly that symbol,
        otherwise the whole ``symbol -> ID`` mapping.
    """
    mg = MyGeneInfo()
    res = mg.query(name, scopes='symbol, alias',
                   fields='uniprot, symbol', species='human')
    uid_by_symbol = {}
    for hit in res['hits']:
        symbol = hit.get('symbol')
        if symbol is None:
            # Nothing to file this hit under.
            continue
        try:
            uid_by_symbol[symbol] = hit['uniprot']['Swiss-Prot']
        except (KeyError, TypeError):
            uid_by_symbol[symbol] = 'unable to retrieve'

    if name in uid_by_symbol:
        return uid_by_symbol[name]
    return uid_by_symbol
Example #11
0
    def get_mg_cursor(self, taxid, filter_f=None):
        """Query all docs matching ``self.q`` for one taxid.

        Args:
            taxid: taxonomy ID, passed to the query as ``species``.
            filter_f: optional predicate (returns True or False for each
                doc) applied to the fetched docs with ``filter``.

        Returns:
            A ``(docs, total)`` pair: the (optionally filtered) result of
            the ``fetch_all=True`` query and the ``total`` reported by a
            plain query. ``total`` is taken before any filtering.
        """
        client = MyGeneInfo(url=self.base_url)
        species = str(taxid)

        # A plain query first, only to learn the total number of hits.
        summary = client.query(self.q,
                               fields=self.fields,
                               species=species,
                               entrezonly=self.entrezonly)
        total = summary['total']

        # Then the same query again in fetch-all mode.
        docs = client.query(self.q,
                            fields=self.fields,
                            species=species,
                            fetch_all=True,
                            entrezonly=self.entrezonly)

        return (filter(filter_f, docs) if filter_f else docs), total
Example #12
0
def convert_gene_api(query):
    """Map a gene query to a human Entrez gene ID through MyGeneInfo.

    Args:
        query: query string passed to ``MyGeneInfo.query``.

    Returns:
        A one-entry dict ``{query: entrez_id}``. The value is NaN when the
        query fails or when no hit with taxid 9606 carries an
        ``entrezgene`` field.
    """
    out_format = 'entrezgene'
    mg = MyGeneInfo()
    try:
        res = mg.query(query)
    except Exception:
        # Best-effort lookup: a failed query counts as "no hits".
        # KeyboardInterrupt/SystemExit are deliberately not swallowed.
        res = {'hits': []}

    out = float('nan')
    # NOTE(review): when several human hits match, the last one wins.
    for h in res.get('hits', []):
        if h.get('taxid') == 9606 and out_format in h:
            out = h[out_format]

    return {query: out}
Example #13
0
def get_uid(name):
    """Look up the Swiss-Prot ID for a human gene name through MyGeneInfo.

    The name is searched as a symbol or alias. Every hit that has a symbol
    is recorded as ``symbol -> Swiss-Prot ID``, or
    ``symbol -> 'unable to retrieve'`` when the hit has no Swiss-Prot ID.
    Building the mapping hit by hit keeps each symbol paired with its own
    ID even when some hits are incomplete.

    Args:
        name: gene symbol or alias to search for.

    Returns:
        The entry for ``name`` when one of the hits has exactly that symbol,
        otherwise the whole ``symbol -> ID`` mapping.
    """
    mg = MyGeneInfo()
    res = mg.query(name,
                   scopes='symbol, alias',
                   fields='uniprot, symbol',
                   species='human')
    uid_by_symbol = {}
    for hit in res['hits']:
        symbol = hit.get('symbol')
        if symbol is None:
            # Nothing to file this hit under.
            continue
        try:
            uid_by_symbol[symbol] = hit['uniprot']['Swiss-Prot']
        except (KeyError, TypeError):
            uid_by_symbol[symbol] = 'unable to retrieve'

    if name in uid_by_symbol:
        return uid_by_symbol[name]
    return uid_by_symbol
def pathway_enrichment(gene_names, pipe_section=1, dbs=None, total_genes=20531, p_cutoff=0.05, cache_path='../data/cache/'):
    """Find pathways over-represented among ``gene_names``.

    Pathway annotations of the genes are fetched through MyGeneInfo, the
    total size of each pathway is queried, and a hypergeometric survival
    function gives the p-value of seeing at least the observed number of
    selected genes in the pathway.

    Args:
        gene_names: iterable of ``'|'``-separated gene labels.
        pipe_section: index of the ``'|'``-separated field that holds the
            gene ID sent to MyGeneInfo.
        dbs: optional collection of pathway database names to keep; None
            keeps all of them.
        total_genes: population size used for the hypergeometric test.
        p_cutoff: pathways with a p-value above this are dropped.
        cache_path: directory for the MyGeneInfo cache database; created
            when missing.

    Returns:
        A DataFrame sorted by ``p_value`` with columns ``id``, ``name``,
        ``db``, ``genes``, ``sup``, ``size``, ``p_value`` and ``ratio``
        (empty when no pathway passes), or None when the MyGeneInfo result
        has no ``pathway`` column.
    """
    # The cache directory has to exist before the cache database is set up
    # inside it.
    if not os.path.exists(cache_path):
        os.makedirs(cache_path)
    mg = MyGeneInfo()
    mg.set_caching(cache_db=os.path.join(cache_path, 'mygene_cache'), verbose=False)

    gene_ids = [g.split('|')[pipe_section] for g in gene_names]
    gene_info = mg.getgenes(geneids=gene_ids, fields='pathway', as_dataframe=True, df_index=False)
    try:
        pathways = gene_info['pathway']
    except Exception as e:
        print(e)
        print('No pathways found with the selected genes:')
        print(gene_names)
        return None

    # One row per (pathway, gene) membership.
    rows = []
    for idx, p in pathways.items():
        if p is np.nan or p != p:
            # gene without pathway annotation
            continue
        gene = str(gene_info['query'][idx])
        for db_name, entries in dict(p).items():
            if dbs is not None and db_name not in dbs:
                continue
            # A database lists either a single pathway or a list of them.
            if not isinstance(entries, list):
                entries = [entries]
            for entry in entries:
                rows.append([entry['id'], entry['name'], db_name, gene])

    columns = ['id', 'name', 'db', 'genes']
    if not rows:
        # Nothing to test; return an empty table with the usual columns.
        return pd.DataFrame(columns=columns + ['sup', 'size', 'p_value', 'ratio'])

    p_df = pd.DataFrame(rows, columns=columns)
    # Collect the member genes of every pathway into one list per pathway.
    p_df = p_df.groupby(['id', 'name', 'db'])['genes'].apply(list).reset_index(name='genes')

    pathway_size = []
    for idx, p_row in p_df.iterrows():
        if idx % 50 == 0:
            print('querying {}/{}'.format(idx, p_df.shape[0]))
        # size=0 returns no documents, only the number of matching genes.
        p_size = mg.query('pathway.{}.id:{}'.format(p_row['db'], p_row['id']), size=0, verbose=False)['total']
        pathway_size.append(p_size)

    p_df['sup'] = [len(genes) for genes in p_df['genes']]
    p_df['size'] = pathway_size

    nb_selected_genes = len(gene_names)
    # sf(k - 1) is the probability of observing at least k genes.
    p_df['p_value'] = [
        hypergeom.sf(sup - 1, total_genes, size, nb_selected_genes)
        for sup, size in zip(p_df['sup'], p_df['size'])
    ]

    # Copy so that the new column is not written onto a slice of p_df.
    p_df = p_df[p_df['p_value'] <= p_cutoff].copy()

    p_df['ratio'] = p_df['sup'] / p_df['size']
    p_df = p_df.sort_values(by=['p_value']).reset_index(drop=True)

    return p_df
Example #15
0
def download_targets_for_diseases(data_dir: str):
    """Download the target file of every disease that is not on disk yet.

    For each pair from ``disease_efo_ids`` and ``DISEASE_ABBREVIATIONS`` the
    output path is ``<data_dir>/<abbreviation>/<opentargets_file_name>``.
    Diseases whose file already exists are skipped; the others are written
    by ``download_targets_for_disease``.

    :param data_dir: Directory holding one sub-directory per disease
    """
    my_gene_info = MyGeneInfo()
    open_targets_client = OpenTargetsClient()
    for disease_efo_id, disease_abbreviation in zip(disease_efo_ids,
                                                    DISEASE_ABBREVIATIONS):
        path = os.path.join(data_dir, disease_abbreviation,
                            opentargets_file_name)
        if os.path.exists(path):
            continue
        try:
            with open(path, 'w+') as file:
                download_targets_for_disease(
                    disease_efo_id=disease_efo_id,
                    my_gene_info=my_gene_info,
                    open_targets_client=open_targets_client,
                    file=file,
                )
        except BaseException:
            # A half-written file would make the exists() check above skip
            # this disease on every later run, so remove it before
            # re-raising (this includes interruption by the user).
            if os.path.exists(path):
                os.remove(path)
            raise
Example #16
0
 def __init__(self):
     """Create the BioLink and MyGeneInfo clients and the ``meta`` record.

     ``meta`` describes the module: a single disease ID (MONDO, DO or OMIM)
     as input and a set of HGNC IDs as output.
     """
     self.blw = BioLinkWrapper()
     self.mg = MyGeneInfo()
     self.input_object = ''

     input_spec = {
         'complexity': 'single',
         'id_type': ['MONDO', 'DO', 'OMIM'],
     }
     output_spec = {
         'complexity': 'set',
         'id_type': 'HGNC'
     }
     self.meta = {
         'data_type': 'disease',
         'input_type': input_spec,
         'output_type': output_spec,
         'taxon': 'human',
         'limit': None,
         'source': 'Monarch Biolink',
         'predicate': 'blm:gene associated with condition'
     }
Example #17
0
class MygeneAnnotator(WebAnnotatorWithCache):
    """
    Annotates genes, identified by one or more Entrez gene IDs, with the
    records returned by MyGeneInfo.
    """
    SOURCE_NAME = 'mygene'
    ANNOTATIONS_ARE_JSON = True
    VERBOSE = False
    # Human taxonomy ID: hits from any other species are discarded. It can
    # be replaced at runtime to annotate with another species.
    TAXID = 9606
    BATCH_SIZE = 1000

    def _batch_query(self, ids):
        """
        Query Entrez gene IDs through MyGeneInfo in batches of
        ``self.BATCH_SIZE``.

        Yields one dict per batch, of the form {id-1: result-1, id-2: ... },
        holding only the hits that were found and whose ``taxid`` equals
        ``self.TAXID`` (not-found IDs are left out).
        """
        # The client is created on first use and kept on the instance.
        if not hasattr(self, 'mg'):
            self.mg = MyGeneInfo()

        for id_batch in grouped(ids, self.BATCH_SIZE):
            hits = self.mg.querymany(id_batch,
                                     scopes='entrezgene',
                                     fields='all',
                                     verbose=self.VERBOSE)
            yield {
                hit['query']: hit
                for hit in hits
                if 'notfound' not in hit and hit['taxid'] == self.TAXID
            }

    @staticmethod
    def _parse_annotation(raw_annotation):
        """
        Return a copy of ``raw_annotation`` in which the ``uniprot`` entry
        is replaced by a ``swissprot`` entry holding its ``Swiss-Prot``
        value. When that value is missing or empty, ``uniprot`` is simply
        removed.
        """
        annotation = dict(raw_annotation.items())

        if 'uniprot' in annotation:
            swissprot_id = annotation['uniprot'].get('Swiss-Prot')
            if swissprot_id:
                annotation['swissprot'] = swissprot_id
            annotation.pop('uniprot')

        return annotation
Example #18
0
def download_targets_for_disease(
        disease_efo_id: str,
        open_targets_client: Optional[OpenTargetsClient] = None,
        my_gene_info: Optional[MyGeneInfo] = None,
        file: Optional[TextIO] = None,
) -> None:
    """Write a disease's ``known_drug`` targets as tab-separated Entrez IDs.

    The targets associated with the disease are fetched from Open Targets,
    their IDs are mapped to Entrez gene IDs through MyGeneInfo, and one
    ``<efo id>\\t<entrez id>`` line is printed per mapped target after an
    ``efo\\tncbigene`` header. Targets without an Entrez ID are left out.

    :param disease_efo_id: A disease's EFO identifier
    :param open_targets_client: An OpenTargetsClient; created when None
    :param my_gene_info: A MyGeneInfo client; created when None
    :param file: Place to output targets for disease
    """
    ot_client = (OpenTargetsClient() if open_targets_client is None
                 else open_targets_client)
    all_associations = ot_client.get_associations_for_disease(
        disease_efo_id,
        fields=['associationscore.datatypes', 'target.id'],
    )
    known_drug_associations = all_associations.filter(datatype='known_drug')

    ensembl_list = []
    for association in known_drug_associations:
        ensembl_list.append(association['target']['id'])

    gene_client = MyGeneInfo() if my_gene_info is None else my_gene_info
    id_mappings = gene_client.getgenes(ensembl_list, fields="entrezgene")

    print('efo', 'ncbigene', file=file, sep='\t')
    for entrez_gene_id in (mapping.get('entrezgene') for mapping in id_mappings):
        if entrez_gene_id is None:
            continue
        print(disease_efo_id, entrez_gene_id, file=file, sep='\t')
Example #19
0
    group_by,
    desc,
    arrange,
    slice_head,
    tibble,
    left_join,
    mutate,
    is_na,
    across,
    if_else,
    filter,
    pull,
    select,
)

# Module-level MyGeneInfo client instance.
mygene = MyGeneInfo()


class QueryGenesNotFound(Exception):
    """Exception type for the case where queried genes cannot be found."""


def gene_name_conversion(
    genes,
    species,
    infmt,
    outfmt,
    notfound,
):
    """Convert gene names using MyGeneInfo
class FunctionalSimilarity(GenericSimilarity):
    """Functional similarity of genes, driven by an ``input_object`` dict.

    Typical call order, as far as this class shows: ``load_input_object``,
    ``load_associations``, ``load_gene_set``, ``compute_similarity``.
    """

    def __init__(self, associations: AssociationSet = None):
        """Initialise an empty module.

        Args:
            associations: accepted but not used by this initialiser.
        """
        GenericSimilarity.__init__(self)
        self.mg = MyGeneInfo()
        self.gene_set = []
        self.input_object = ''
        self.ont = 'go'
        self.group = ''
        # NOTE(review): only the first predicate carries the 'blm:' prefix.
        self.meta = {
            'input_type': {
                'complexity': 'set',
                'id_type': 'HGNC',
                'data_type': 'gene',
            },
            'output_type': {
                'complexity': 'set',
                'id_type': 'HGNC',
                'data_type': 'gene',
            },
            'source':
            'Monarch Biolink',
            'predicate': [
                'blm:macromolecular machine to biological process association',
                'macromolecular machine to molecular activity association'
            ]
        }

    def metadata(self):
        """Print the module's ``meta`` description."""
        print("""Mod1A Functional Similarity metadata:""")
        pprint(self.meta)

    def load_input_object(self, input_object):
        """Store ``input_object`` and derive ``self.group`` from its taxon.

        ``self.group`` is only updated for the taxa 'mouse' and 'human'.
        """
        self.input_object = input_object
        taxon = self.input_object['parameters']['taxon']
        if taxon in ('mouse', 'human'):
            self.group = taxon

    def load_associations(self):
        """Retrieve the associations for the current ontology and group."""
        self.retrieve_associations(ont=self.ont, group=self.group)

    def load_gene_set(self):
        """Annotate the genes of ``self.input_object`` into ``self.gene_set``.

        Genes whose ``hit_id`` contains ``MGI`` get an ``MGI:MGI...`` CURIE.
        Genes whose ``hit_id`` contains ``HGNC`` are looked up through
        MyGeneInfo and get a ``UniProtKB:<Swiss-Prot id>`` CURIE; when no
        Swiss-Prot ID is available the problem is printed and the CURIE is
        left as it was.
        """
        for gene in self.input_object['input']:
            hit_id = gene['hit_id']
            gene_curie = ''
            sim_input_curie = ''
            if 'MGI' in hit_id:
                gene_curie = hit_id
                sim_input_curie = hit_id.replace('MGI', 'MGI:MGI')
            if 'HGNC' in hit_id:
                # Reuse the instance's client instead of building one per gene.
                mg_hit = self.mg.query(
                    hit_id.replace('HGNC', 'hgnc'),
                    scopes='HGNC',
                    species=self.input_object['parameters']['taxon'],
                    fields='uniprot, symbol, HGNC',
                    entrezonly=True)
                gene_curie = hit_id
                try:
                    swiss_prot = mg_hit['hits'][0]['uniprot']['Swiss-Prot']
                    # The field can hold several IDs; use the first rather
                    # than formatting the whole list into the CURIE.
                    if isinstance(swiss_prot, (list, tuple)):
                        swiss_prot = swiss_prot[0]
                    sim_input_curie = 'UniProtKB:{}'.format(swiss_prot)
                except (KeyError, IndexError, TypeError) as e:
                    print(gene, e)

            self.gene_set.append({
                'input_id': gene_curie,
                'sim_input_curie': sim_input_curie,
                'input_symbol': gene['hit_symbol']
            })

    def compute_similarity(self):
        """Compute Jaccard similarity for ``self.gene_set``.

        The threshold and taxon are read from
        ``self.input_object['parameters']``. For the 'human' taxon each
        result's ``hit_id`` is replaced by the HGNC ID of its symbol. A
        result's ``input_id`` is replaced by the HGNC ID of its input symbol
        whenever it differs from the ``sim_input_curie`` of any gene in the
        set.

        Returns:
            The results of ``compute_jaccard`` with the IDs rewritten.
        """
        group = self.input_object['parameters']['taxon']
        lower_bound = float(self.input_object['parameters']['threshold'])
        results = self.compute_jaccard(self.gene_set, lower_bound)
        for result in results:
            if group == 'human':
                result['hit_id'] = self.symbol2hgnc(result['hit_symbol'])
            # One lookup per result is enough: repeating it for every
            # mismatching gene would assign the same value each time.
            if any(gene['sim_input_curie'] != result['input_id']
                   for gene in self.gene_set):
                result['input_id'] = self.symbol2hgnc(
                    result['input_symbol'])
        return results

    def symbol2hgnc(self, symbol):
        """Return the ``HGNC:<id>`` CURIE for a human gene symbol.

        Returns None unless the query matches exactly one gene and that
        gene has an HGNC ID.
        """
        mg_hit = self.mg.query('symbol:{}'.format(symbol),
                               fields='HGNC,symbol,taxon',
                               species='human',
                               entrezonly=True)
        if mg_hit['total'] == 1:
            hgnc_id = mg_hit['hits'][0].get('HGNC')
            if hgnc_id is not None:
                return 'HGNC:{}'.format(hgnc_id)
        return None
Example #21
0
 def get_mg_gene(self, entrezgene):
     """Fetch one gene record from the configured MyGeneInfo endpoint.

     Returns:
         A ``(record, 1)`` pair: the ``getgene`` result for ``entrezgene``
         restricted to ``self.fields``, and the constant 1.
     """
     client = MyGeneInfo(url=self.base_url)
     return client.getgene(entrezgene, fields=self.fields), 1
class FunctionalSimilarity(GenericSimilarity):
    """Functional similarity of genes for a taxon fixed at construction."""

    def __init__(self, taxon):
        """Initialise the module for ``taxon`` and load its associations.

        Args:
            taxon: taxon identifier stored on the instance and passed to
                ``load_associations``.
        """
        GenericSimilarity.__init__(self)
        self.mg = MyGeneInfo()
        self.input_object = ''
        self.taxon = taxon
        self.ont = 'go'
        # NOTE(review): only the first predicate carries the 'blm:' prefix.
        self.meta = {
            'input_type': {
                'complexity': 'set',
                'id_type': 'HGNC',
                'data_type': 'gene',
            },
            'output_type': {
                'complexity': 'set',
                'id_type': 'HGNC',
                'data_type': 'gene',
            },
            'source':
            'Monarch Biolink',
            'predicate': [
                'blm:macromolecular machine to biological process association',
                'macromolecular machine to molecular activity association'
            ]
        }

        # Load the functional catalog of
        # GO ontology and annotation associations
        self.load_associations(taxon)

    def metadata(self):
        """Print the module's ``meta`` description."""
        print("""Mod1A Functional Similarity metadata:""")
        pprint(self.meta)

    def load_gene_set(self, input_gene_set):
        """Annotate every input gene with the CURIE used for similarity.

        Genes whose ``hit_id`` contains ``MGI`` get an ``MGI:MGI...`` CURIE.
        Genes whose ``hit_id`` contains ``HGNC`` are looked up through
        MyGeneInfo and get a ``UniProtKB:<Swiss-Prot id>`` CURIE; when no
        Swiss-Prot ID is available the problem is printed and the CURIE is
        left as it was.

        Args:
            input_gene_set: object whose ``get_input_curie_set()`` yields
                dicts with ``hit_id`` and ``hit_symbol`` keys.

        Returns:
            A list of dicts with keys ``input_id``, ``sim_input_curie`` and
            ``input_symbol``, one per input gene.
        """
        annotated_gene_set = []
        for gene in input_gene_set.get_input_curie_set():
            hit_id = gene['hit_id']
            gene_curie = ''
            sim_input_curie = ''
            if 'MGI' in hit_id:
                gene_curie = hit_id
                sim_input_curie = hit_id.replace('MGI', 'MGI:MGI')
            if 'HGNC' in hit_id:
                # Reuse the instance's client instead of building one per gene.
                mg_hit = self.mg.query(hit_id.replace('HGNC', 'hgnc'),
                                       scopes='HGNC',
                                       species=self.taxon,
                                       fields='uniprot, symbol, HGNC',
                                       entrezonly=True)
                gene_curie = hit_id
                try:
                    swiss_prot = mg_hit['hits'][0]['uniprot']['Swiss-Prot']
                    # The field can hold several IDs; use the first rather
                    # than formatting the whole list into the CURIE.
                    if isinstance(swiss_prot, (list, tuple)):
                        swiss_prot = swiss_prot[0]
                    sim_input_curie = 'UniProtKB:{}'.format(swiss_prot)
                except (KeyError, IndexError, TypeError) as e:
                    print(__name__ + ".load_gene_set() Exception: ", gene, e)

            annotated_gene_set.append({
                'input_id': gene_curie,
                'sim_input_curie': sim_input_curie,
                'input_symbol': gene['hit_symbol']
            })

        return annotated_gene_set

    def compute_similarity(self, annotated_gene_set, threshold):
        """Compute Jaccard similarity for an annotated gene set.

        For the 'human' taxon each result's ``hit_id`` is replaced by the
        HGNC ID of its symbol. A result's ``input_id`` is replaced by the
        HGNC ID of its input symbol whenever it differs from the
        ``sim_input_curie`` of any gene in the set.

        Args:
            annotated_gene_set: list of gene dicts as produced by
                ``load_gene_set``.
            threshold: lower similarity bound; converted with ``float``.

        Returns:
            The results of ``compute_jaccard`` with the IDs rewritten.
        """
        lower_bound = float(threshold)
        results = self.compute_jaccard(annotated_gene_set, lower_bound)
        for result in results:
            if self.taxon == 'human':
                result['hit_id'] = self.symbol2hgnc(result['hit_symbol'])
            # One lookup per result is enough: repeating it for every
            # mismatching gene would assign the same value each time.
            if any(gene['sim_input_curie'] != result['input_id']
                   for gene in annotated_gene_set):
                result['input_id'] = self.symbol2hgnc(
                    result['input_symbol'])
        return results

    def symbol2hgnc(self, symbol):
        """Return the ``HGNC:<id>`` CURIE for a human gene symbol.

        Returns None unless the query matches exactly one gene and that
        gene has an HGNC ID.
        """
        mg_hit = self.mg.query('symbol:{}'.format(symbol),
                               fields='HGNC,symbol,taxon',
                               species='human',
                               entrezonly=True)
        if mg_hit['total'] == 1:
            hgnc_id = mg_hit['hits'][0].get('HGNC')
            if hgnc_id is not None:
                return 'HGNC:{}'.format(hgnc_id)
        return None
from mygene import MyGeneInfo
from pprint import pprint
import csv
import sys

# MyGeneInfo client used by gene_name() below.
mg = MyGeneInfo()
# NOTE(review): not referenced in the visible part of this script - confirm it is still needed.
dict_symbol = {}

def gene_name(ensg_id):
	"""Return the gene symbol for ``ensg_id``, or ``ensg_id`` itself.

	The ID is returned unchanged when the lookup gives no single record or
	when the record has no ``symbol`` field. When the lookup returns a list
	the ID and the list are printed for inspection.
	"""
	gene = mg.getgene(ensg_id, fields='symbol')
	if isinstance(gene, dict):
		# A record without a symbol falls back to the ID instead of raising.
		return gene.get('symbol', ensg_id)
	if isinstance(gene, list):
		print(ensg_id)
		pprint(gene)
	return ensg_id

'''
print(gene_name('ENSG00000273842'))
'''

with open('sample.txt','r') as f:
	rows = csv.reader(f, delimiter='\t')
	with open('gene_table.csv','w') as f_write:
		f_csv = csv.writer(f_write)
		i = 0
		for row in rows:
			i += 1
			name = gene_name(row[0][:15])