Exemple #1
0
def convert_ID_getAllIsoformsNM(
        ensemble_ID,
        boolSave,
        path,
        preffix='NM'
):
    """Collect the RNA accessions of a gene that start with ``preffix``.

    The gene is looked up once through ``mygene``; the result list starts
    with the gene symbol and is followed by every RNA accession whose text
    starts with ``preffix``.

    Args:
        ensemble_ID: gene identifier handed to ``MyGeneInfo.getgene``.
        boolSave: ``False`` returns the list; ``True`` forwards it to
            ``batch_retrieve_seq`` instead. Any other value does nothing,
            as before.
        path: passed through to ``batch_retrieve_seq`` when saving.
        preffix: accession prefix to keep (default ``'NM'``). Prefixes of
            any length are supported.

    Returns:
        The list ``[symbol, accession, ...]`` when ``boolSave == False``,
        otherwise ``None``.
    """
    print('Converting IDs')
    mg = mygene.MyGeneInfo()

    # One request serves both the accession list and the symbol.
    gene_info = mg.getgene(ensemble_ID, fields='all', as_dataframe=True)
    rna_accessions = gene_info['accession']['rna']

    list_transcript = [gene_info['symbol']]
    list_transcript.extend(
        accession for accession in rna_accessions
        if accession.startswith(preffix))
    print('ID converted')

    # Explicit comparisons are kept on purpose: callers that pass something
    # other than a real boolean keep getting the old "do nothing" result.
    if boolSave == False:
        return list_transcript
    if boolSave == True:
        # The first element is always the symbol, so "nothing matched"
        # means the list holds exactly one entry.
        if len(list_transcript) > 1:
            batch_retrieve_seq(list_transcript, 'True', path)
        else:
            print('No RefSeq Ids with the Prefix %s found' % preffix)
Exemple #2
0
def geneid_converter(listofids, input_identifier=None, output_identifier=None):
    '''
    Map a list of gene identifiers to other identifier types with mygene.

    Identifiers listed by the original author as available:
    [u'accession', u'alias', u'biocarta', u'chr', u'end', u'ensemblgene', u'ensemblprotein', u'ensembltranscript',
    u'entrezgene', u'exons', u'flybase', u'generif', u'go', u'hgnc', u'homologene', u'hprd', u'humancyc', u'interpro',
    u'ipi', u'kegg', u'mgi', u'mim', u'mirbase', u'mousecyc', u'name', u'netpath', u'pdb', u'pfam', u'pharmgkb', u'pid',
    u'pir', u'prosite', u'ratmap', u'reactome', u'reagent', u'refseq', u'reporter', u'retired', u'rgd', u'smpdb',
    u'start', u'strand', u'summary', u'symbol', u'tair', u'taxid', u'type_of_gene', u'unigene', u'uniprot',
    u'wikipathways', u'wormbase', u'xenbase', u'yeastcyc', u'zfin']

    :param listofids: ids to be mapped, e.g. ['1', '10', '10001']
    :param input_identifier: identifier type of the input; "entrezgene" when omitted
    :param output_identifier: identifier types to return; ["symbol", "ensembl.gene"] when omitted
    :return: the DataFrame produced by ``querymany`` (species is fixed to "human")
    '''
    scopes = "entrezgene" if input_identifier is None else input_identifier
    if output_identifier is None:
        fields = ["symbol", "ensembl.gene"]
    else:
        fields = output_identifier

    client = mygene.MyGeneInfo()
    return client.querymany(listofids,
                            scopes=scopes,
                            fields=fields,
                            species="human",
                            as_dataframe=True)
Exemple #3
0
def convert_ensembl_to_symbol(gene_list, species="human"):
    """Look up gene symbols for a list of Ensembl gene IDs via mygene.

    parameters
    ----------
    gene_list: `list`
        Ensembl gene IDs to convert
    species: `str` | default = 'human'
        species the supplied Ensembl IDs belong to

    returns
    -------
    list
        One entry per record returned by ``querymany``: the record's
        ``"symbol"`` value, or ``None`` when the record has no such key.

    """
    records = mygene.MyGeneInfo().querymany(gene_list,
                                            scopes="ensembl.gene",
                                            fields="symbol",
                                            species=species)
    return [record.get("symbol") for record in records]
Exemple #4
0
def protein_interaction_reference(
        path="~/Downloads/9606.protein.links.detailed.v11.0.txt"):
    """Load a STRING protein-links table and replace protein IDs by symbols.

    Reference file (per the original author's note):
    https://stringdb-static.org/download/protein.physical.links.detailed.v11.0/9606.protein.physical.links.detailed.v11.0.txt.gz

    Args:
        path: space-separated links file with ``protein1`` and ``protein2``
            columns. Defaults to the location the function used to
            hard-code.

    Returns:
        The loaded DataFrame with ``protein1`` / ``protein2`` mapped to gene
        symbols; IDs without a symbol become NaN.
    """
    HuRI = pd.read_csv(path, sep=' ')
    protein_columns = ('protein1', 'protein2')

    # Keep only the part after the last dot of each ID.
    for column in protein_columns:
        HuRI[column] = HuRI[column].str.rsplit('.', n=1).str[-1].str.strip()

    # Query the IDs of BOTH columns; previously the lookup table was built
    # from protein1 only, so IDs that occur solely in protein2 were lost.
    unique_ids = pd.concat(
        [HuRI[column] for column in protein_columns]).dropna().unique().tolist()

    mg = mygene.MyGeneInfo()
    hits = mg.querymany(unique_ids,
                        scopes='ensembl.protein',
                        fields='symbol',
                        species='human',
                        as_dataframe=True)
    symbol_by_id = pd.Series(hits.symbol.values, index=hits.index).to_dict()

    for column in protein_columns:
        HuRI[column] = HuRI[column].map(symbol_by_id)
    return HuRI
Exemple #5
0
def convert(file):
    """Convert the IDs in the first column of a CSV file to mouse symbols.

    Rows whose first cell is ``'Symbols'`` (the header) and empty rows are
    skipped; the remaining first cells are upper-cased and queried with
    scope ``'ensemblgene'``.

    Args:
        file: path of the CSV file to read.

    Returns:
        A list holding, in result order, the symbol of the first hit of each
        query; queries reported as not found are returned unchanged.
        An empty list when the file contains no IDs.
    """
    query = []
    # Read from the path that was passed in (the body used to open an
    # unrelated name, ``csv_path``).
    with open(file) as csvfile:
        for row in csv.reader(csvfile):
            if row and row[0] != 'Symbols':
                query.append(row[0].upper())

    if not query:
        # Nothing to look up: skip the client and the network round trip.
        return []

    mg = mygene.MyGeneInfo()
    query_results = mg.querymany(query,
                                 scopes='ensemblgene',
                                 fields='symbol',
                                 species='mouse')

    converted_symbols = []
    seen = set()
    for d in query_results:
        if 'notfound' in d:
            converted_symbols.append(d['query'])
        elif d['query'] not in seen:
            # Only the first hit of a query contributes a symbol.
            converted_symbols.append(d['symbol'])
            seen.add(d['query'])

    return converted_symbols


#test_list = ['ENSMUSG00000000171','abc','ENSMUSG00000000190','efg','ENSMUSG00000000244','ENSMUSG00000000303','ENSMUSG00000000440']
#test = user_convert(test_list)
#print(test[0])
#print(test[1])
Exemple #6
0
def hgnc_to_entrez(iterable):
    """
    Convert HGNC gene symbols to Entrez Gene IDs through mygene.

    :param iterable: HGNC gene symbols.
    :return: pd.Series with one entry per input element, produced by
        ``_match_response_with_query`` from the mygene response.
    """
    import mygene as _mygene

    target = 'entrezgene'
    original_values = iterable

    # When the input offers ``.unique()`` each distinct symbol is queried
    # only once; other iterables are queried as given.
    try:
        iterable = iterable.unique()
    except AttributeError:
        pass

    response = _mygene.MyGeneInfo().querymany(iterable,
                                              scopes=['symbol', 'alias'],
                                              fields=target,
                                              species='human',
                                              as_dataframe=True)

    # Keep only hits that carry an Entrez ID.
    hits = response.loc[response.entrezgene.notnull()].copy()
    # Cast through int to str so the IDs lose any trailing '.0'.
    hits.loc[:, target] = hits.loc[:, target].astype(int).astype(str)

    def _lookup(symbol):
        return _match_response_with_query(hits, symbol, target)

    return _pd.Series(original_values).apply(_lookup)
Exemple #7
0
def entrez_to_name_online(entrezID):
    """Return the ``symbol`` of the first mygene record for an Entrez ID.

    The lookup is restricted to human and to the ``entrezgene`` scope.
    Raises ``KeyError`` when the first record carries no ``symbol``.
    """
    client = mygene.MyGeneInfo()
    records = client.querymany(entrezID,
                               scopes='entrezgene',
                               fields='symbol',
                               species='human')
    first_record = records[0]
    return first_record['symbol']
def query_mygene(entrez_set, tax_id):
    """Query MyGene.info and index the returned records by query string.

    Args:
        entrez_set: identifiers to look up (scopes ``entrezgene`` and
            ``retired``).
        tax_id: value passed as the ``species`` argument of ``querymany``.

    Returns:
        dict mapping each record's ``query`` value to a dict with the keys
        ``source``, ``mygene``, ``ncbigene``, ``ensemblgene``, ``symbol``
        and ``uniprot``; fields missing from a record are ``None``. When
        several records share a query, the last one wins.
    """
    scopes = ['entrezgene', 'retired']
    fields = ['entrezgene', 'ensembl.gene', 'symbol', 'uniprot']

    client = mygene.MyGeneInfo()
    logging.info(f"Querying {scopes} in MyGene.info ...")
    response = client.querymany(entrez_set,
                                scopes=scopes,
                                fields=fields,
                                species=tax_id,
                                returnall=True)

    return {
        record["query"]: {
            'source': record["query"],
            'mygene': record.get('_id', None),
            'ncbigene': record.get('entrezgene', None),
            'ensemblgene': record.get('ensembl', None),
            'symbol': record.get('symbol', None),
            'uniprot': record.get('uniprot', None)
        }
        for record in response['out']
    }
Exemple #9
0
def user_convert(gene_list):
    """Replace mouse Ensembl gene IDs in a gene list by upper-cased symbols.

    Entries starting with ``"ENSMUSG"`` are looked up with mygene; all other
    entries are passed through untouched.

    Args:
        gene_list: iterable of gene identifier strings.

    Returns:
        ``[converted_gene_list, unmapped]`` where ``converted_gene_list`` is
        the pass-through entries followed by the symbols found, and
        ``unmapped`` holds the upper-cased queries that were not found.
    """
    # Single pass split; replaces copy() plus repeated list.remove().
    query, remaining = [], []
    for gene in gene_list:
        (query if gene.startswith("ENSMUSG") else remaining).append(gene)

    if not query:
        # Nothing to convert: skip the client and the network round trip.
        return [remaining, []]

    mg = mygene.MyGeneInfo()
    query_results = mg.querymany(query,
                                 scopes='ensemblgene',
                                 fields='symbol',
                                 species='mouse')

    converted_symbols, unmapped = [], []
    seen = set()
    for d in query_results:
        if 'notfound' in d:
            unmapped.append(d['query'].upper())
        elif d['query'] not in seen:
            # Only the first hit of a query contributes a symbol.
            converted_symbols.append(d['symbol'].upper())
            seen.add(d['query'])

    return [remaining + converted_symbols, unmapped]
Exemple #10
0
    def id_mapping(filename):
        """ Map the gene ids of a tab-separated pathway file to gene symbols
        Args:
            filename (:obj: str): path of the file; it must have an
                'Identifier' column
        Returns:
            :obj:'list': the symbols of the identifiers that could be mapped
        """
        df_id = pd.read_csv(filename, sep='\t', header=0, low_memory=False)
        mg = mygene.MyGeneInfo()
        gene_list = []
        gene_not_found = 0
        print("Now we're mapping the gene list of the pathway.")
        # Iterate the column itself: iterrows() builds one Series per row and
        # does not preserve dtypes across columns, which can turn integer
        # identifiers into floats.
        identifiers = df_id['Identifier']
        for gene_id in identifiers:
            gene_name = mg.getgene(gene_id, fields='symbol')
            # A record without a 'symbol' entry is counted as not found
            # instead of raising KeyError.
            symbol = gene_name.get('symbol') if gene_name is not None else None
            if symbol is not None:
                gene_list.append(symbol)
            else:
                print('ID: ', gene_id, 'is not found.')
                gene_not_found += 1
        count = len(identifiers)

        print("There is/are", count, "gene(s) in this pathway.", gene_not_found,
              "of them is/are not found when id mapping.")
        return gene_list
Exemple #11
0
def main():
    """
    Translate the Ensembl IDs of an RNA-seq table to symbols with mygene.

    Reads the tab-separated file ``snakemake.input[0]`` (column
    'Ensembl_ID'), and writes a comma-separated table with the columns
    'Ensembl_ID' and 'Hugo_Symbol' to ``snakemake.output[0]``.
    """
    # SNAKEMAKE I/O #
    rnaseq_transcripts = snakemake.input[0]
    rnaseq_transcripts_translated = snakemake.output[0]

    print("Annotating Ensembl IDs to HUGO...")

    # Load data and trim the version suffix (text after the first '.').
    ensembl_transcripts = pd.read_csv(rnaseq_transcripts, sep='\t')
    ensembl_transcripts['Ensembl_ID'] = (
        ensembl_transcripts['Ensembl_ID'].str.split('.').str[0].str.strip())

    annotated_names = mygene.MyGeneInfo().getgenes(
        ensembl_transcripts['Ensembl_ID'],
        fields='symbol',
        as_dataframe=True)

    # Drop unknown IDs. The guard avoids a KeyError in case the response
    # has no 'notfound' column at all.
    if 'notfound' in annotated_names.columns:
        annotated_names = annotated_names[annotated_names['notfound'].ne(True)]

    # Drop duplicated rows; per the original author's note a handful of
    # genes are affected and losing them is acceptable.
    annotated_names = annotated_names.drop_duplicates(keep='first')

    # Select the one column that is needed instead of dropping a hard-coded
    # set of bookkeeping columns that may or may not be present.
    annotated_names = annotated_names[['symbol']].reset_index()
    annotated_names.columns = ['Ensembl_ID', 'Hugo_Symbol']

    annotated_names.to_csv(rnaseq_transcripts_translated, sep=',', index=False)
Exemple #12
0
 def __init__(self):
     """Create the mygene client and load the transcript table.

     ``self.trans`` ends up as the DataFrame read from ``trans.txt``
     (tab-separated, columns 'gsymbol' and 'tid').
     """
     ref_path = '/stor/work/Lambowitz/ref/hg19_ref/genes'
     self.mg = mygene.MyGeneInfo()
     # Holds the file path first, then the table loaded from it.
     self.trans = '/'.join((ref_path, 'trans.txt'))
     self.trans = pd.read_csv(self.trans,
                              names=['gsymbol', 'tid'],
                              sep='\t')
def convert_nm_ids_to_flybase(df1):
    """Add a leading 'flybase_id' column looked up from the RefSeq index.

    Rows of ``df1`` that are zero in every column are removed, the remaining
    index values are queried with scope ``refseq`` for species ``fruitfly``,
    and the ``ensembl.gene`` value of each row's first hit is stored in a new
    first column.

    Args:
        df1: DataFrame indexed by RefSeq IDs.

    Returns:
        A new DataFrame (``df1`` is not modified) with 'flybase_id' first.
    """
    # .copy() so the new column is written to an independent frame rather
    # than to a slice of df1.
    df11 = df1.loc[~(df1 == 0).all(axis=1)].copy()

    mg = get_client('gene')

    # Calling mygene to map NM_ IDs to Flybase names
    print("Calling mygene.")
    refseq_list = df11.index.tolist()
    df_geneIDs = mg.querymany(
        refseq_list,
        scopes="refseq",
        fields=["ensembl.gene", "uniprot", "symbol", "reporter"],
        species="fruitfly",
        as_dataframe=True)

    # Align hits with rows by query ID instead of by position: keep the
    # first hit per query and reindex to the row order, so a query with
    # several hits can no longer cause a length mismatch.
    first_hits = df_geneIDs[~df_geneIDs.index.duplicated(keep='first')]
    new_index_list = first_hits["ensembl.gene"].reindex(refseq_list).tolist()

    # Plotting loss of gene IDs per replicate
    plot_NaN("df_merged", df_geneIDs)

    df11['flybase_id'] = new_index_list
    # Move the last column to the front.
    cols = list(df11.columns)
    return df11[[cols[-1]] + cols[:-1]]
def main():
    """
    Replace the Ensembl IDs in the 'Gene Symbol' column by mygene symbols.

    Reads the tab-separated file ``snakemake.input[0]`` and writes the
    updated table, comma-separated, to ``snakemake.output[0]``.
    """
    # SNAKEMAKE INPUT / OUTPUT #
    snp_array_db = snakemake.input[0]
    translated_affy_snp = snakemake.output[0]

    print("Annotating Ensembl IDs to HUGO...")

    ensembl_snp_array = pd.read_csv(snp_array_db, sep='\t')

    # Trim the version suffix (text after the first '.') from each ID.
    trimmed_ids = ensembl_snp_array['Gene Symbol'].str.split('.').str[0]
    ensembl_snp_array['Gene Symbol'] = trimmed_ids.str.strip()

    client = mygene.MyGeneInfo()
    annotated_names = client.getgenes(ensembl_snp_array['Gene Symbol'],
                                      fields='symbol',
                                      as_dataframe=True)

    # NOTE(review): positional assignment; assumes the response has exactly
    # one row per input row, in the same order. Confirm.
    ensembl_snp_array['Gene Symbol'] = annotated_names['symbol'].values

    ensembl_snp_array.to_csv(translated_affy_snp, sep=',', index=False)
Exemple #15
0
def uniprotid_to_geneid(uniprotid_list):
    """Query mygene for the 'uniprot' and 'taxid' fields of a list of IDs.

    Args:
        uniprotid_list: sized collection of identifiers, searched in the
            ``symbol`` and ``accession`` scopes across all species.

    Returns:
        The DataFrame produced by ``querymany``, or an empty list when the
        input is empty (kept for backward compatibility).
    """
    # Check first so an empty input needs neither a client nor a request.
    if not len(uniprotid_list):
        return []
    mg = mygene.MyGeneInfo()
    # The keyword is ``scopes``; the former ``scope=`` spelling did not set
    # the search scopes of querymany.
    return mg.querymany(uniprotid_list, scopes='symbol,accession',
                        fields='uniprot, taxid', species="all", as_dataframe=True)
Exemple #16
0
    def __init__(self):
        """Initialise every attribute to an empty / unset state.

        Paths and the CPU count start as ``None``, collections start empty,
        and a mygene client plus two nested lookup caches are created.
        """
        # Tool and adapter paths.
        self.trimmomatic_path = self.adp_pe_path = self.adp_se_path = None

        # Inputs, reference files and run settings.
        self.l_fastq = []
        self.fasta_path = self.hisat2_index_path = None
        self.gtf_path = self.output_dir_path = self.cpu_num = None

        # Single-end / paired-end bookkeeping (each a separate list).
        self.l_fastq_single, self.l_fastq_paired = [], []
        self.l_sample_single, self.l_sample_paired = [], []

        # Per-sample lists.
        self.l_sample = []
        self.l_sam_path, self.l_bam_path = [], []
        self.l_gtf_path, self.l_tsv_path = [], []

        self.columns, self.data = [], []
        self.all_refseq_id = None
        self.mg = mygene.MyGeneInfo()

        def _none_by_key():
            # Inner level: any missing key reads as None.
            return defaultdict(lambda: None)

        # Two-level caches: cache[a][b] is None until something is stored.
        self.d_search_mygene = defaultdict(_none_by_key)
        self.d_search_entrez = defaultdict(_none_by_key)
Exemple #17
0
def genesymbol2entrez(genelist, species="human"):
    """
    Convert Gene Symbol to Entrez Gene ID.

    Each symbol is queried separately as ``symbol:<gene>``.

    Args:
        genelist: sized collection of gene symbols.
        species: species passed to the mygene query (default "human").

    Returns:
        A list aligned with ``genelist`` holding, per symbol, the
        ``entrezgene`` value of the first hit that has one, or ``"NA"``
        when there is no such hit.
    """
    if len(genelist) == 0:
        # Nothing to look up: skip the client entirely.
        return []

    mg = mygene.MyGeneInfo()
    res_list = []
    for gene in genelist:
        print(gene)
        res = mg.query("symbol:%s" % gene, species=species)
        # Scan the hits themselves (the old loop ran over the length of the
        # response dict) and fall back to "NA", so a symbol without an
        # Entrez ID can no longer inherit the previous symbol's result.
        entrez_geneid = next(
            (hit["entrezgene"] for hit in res["hits"] if "entrezgene" in hit),
            "NA")
        res_list.append(entrez_geneid)

    return res_list
Exemple #18
0
def gene_info_worker(gene):
    """Look up ``gene.gene_symbol`` in mygene and update ``gene.ncbi_uid``.

    For every hit that carries an ``entrezgene`` value: when ``ncbi_uid`` is
    unset it is set to that value and saved; when it is already set, the
    conflict is logged and ``ncbi_uid`` is reset to ``None`` and saved.
    Hits without ``entrezgene`` are skipped.
    """
    client = mygene.MyGeneInfo()

    # Translate the dataset's species to the name used in the query;
    # stays '' for any other species.
    species = ''
    for species_type, label in (
            (SpeciesType.HOMO_SAPIENS, 'human'),
            (SpeciesType.MUS_MUSCULUS, 'mouse'),
            (SpeciesType.RATTUS_NORVEGICUS, 'rat')):
        if gene.dataset.species == species_type:
            species = label

    result = client.query('symbol:{}'.format(gene.gene_symbol),
                          species=species)
    if len(result['hits']) == 0:
        _LOG.info("WARNING IT WAS 0, info: {}".format(result['hits']))
        return

    for hit in result['hits']:
        try:
            if gene.ncbi_uid is None:
                # Raises KeyError for hits without an Entrez ID.
                entrez_id = hit['entrezgene']
                gene.ncbi_uid = entrez_id
                gene.save()
                _LOG.info("Set entrezgene for {}:{} to {} .".format(
                    gene.gene_symbol, species, entrez_id))
            else:
                message = (
                    "ERROR!  Tried to replace ncbi_uid {} with {} for {}:{}"
                    .format(gene.ncbi_uid, hit['entrezgene'],
                            gene.gene_symbol, species))
                _LOG.info(message)
                gene.ncbi_uid = None
                gene.save()
        except KeyError:
            continue
Exemple #19
0
def get_names(table, db):   # table should be an input
    """Look up names and symbols for the IDs in a tab-separated table.

    The first line of ``table`` is skipped as a header; the second column of
    every following line is taken as the ID.

    Args:
        table: path of the tab-separated file.
        db: database label; it is compared against the entries of the
            module-level ``valid_db`` to choose the query scope.

    Returns:
        Whatever ``collect_scinames`` builds from the mygene response.
        Prints a message and exits when ``db`` matches no entry.
    """
    ids = []
    with open(table, 'r') as handle:  # not named ``csv``: avoids shadowing
        handle.readline()  # skip the header line
        for line in handle:
            # Strip the newline first so an ID in the last column is clean.
            elem = line.rstrip('\n').split('\t')
            ids.append(elem[1])

    if db == valid_db[0]:
        scope = 'uniprot'
    elif db == valid_db[1] or db == valid_db[2] or db == valid_db[3]:
        # These IDs look like "<prefix>_<accession>"; keep the second part.
        ids = [entry.split('_')[1] for entry in ids]
        scope = 'uniprot'
    elif db == valid_db[4]:
        scope = 'pdb'
    else:
        print('Sth went wrong in function get_names.')
        exit()

    # One client for the whole call (it used to be re-created per line and
    # was undefined for a header-only file).
    gene = mygene.MyGeneInfo()
    # set verbose 'True' to get info about duplicates/missing values of name parsing
    ret = gene.querymany(ids, scopes=scope, fields='name,symbol', verbose=False)
    return collect_scinames(ret)
Exemple #20
0
def annotate_de(df,
                gene_id_col='gene_ids',
                keys=('scores', 'pvals', 'pvals_adj', 'logfoldchanges')):
    """
    Annotate genes in dataframe by querying for gene name, summary, and related pathways

    Args:
        df: DataFrame with one row per gene.
        gene_id_col: column of ``df`` holding Ensembl gene IDs.
        keys: columns of ``df`` to carry over; each one that exists is
            added to the result as ``'DE_' + key``. Any iterable of
            column names works.

    Returns:
        DataFrame indexed by query ID with the annotation fields followed
        by the carried-over ``DE_*`` columns.
    """
    top_genes = df[gene_id_col]
    mg = mygene.MyGeneInfo()
    fields = ['symbol', 'name', 'summary', 'go.BP.term', 'pathway.kegg.name']
    df_annotation = mg.querymany(top_genes,
                                 scopes='ensembl.gene',
                                 species='human',
                                 fields=fields,
                                 dotfield=True,
                                 as_dataframe=True)
    # reindex instead of .loc: a field missing from the response becomes an
    # all-NaN column rather than a KeyError.
    df_annotation = df_annotation.reindex(columns=fields)

    cols = []
    for key in keys:
        if key in df.columns:
            column = 'DE_' + key
            cols.append(column)
            # Look the value up by gene ID so rows align with the response.
            values_by_gene = df[[gene_id_col, key]].set_index(gene_id_col)[key]
            df_annotation[column] = df_annotation.index.map(
                values_by_gene).values
    return df_annotation.loc[:, fields + cols]
Exemple #21
0
def generate_targets_file(disease_id, outpath, anno_type: str = 'entrezgene') -> None:
    """Write the known-drug targets of a disease to a file, one per line.

    :param disease_id: EFO code from the disease.
    :param outpath: path of the text file to (over)write.
    :param anno_type: `entrezgene` for Entrez Id or `symbol` for Gene symbol.
    :return: None; targets without the requested annotation are skipped.
    """
    ot = OpenTargetsClient()
    assoc = ot.get_associations_for_disease(
        disease_id,
        fields=['association_scoredatatypes', 'target.id']
    ).filter(
        datatype='known_drug'
    )
    ensembl_list = [a['target']['id'] for a in assoc]

    # TODO use the converters.get_converter_to_entrez
    mg = mygene.MyGeneInfo()
    id_mappings = mg.getgenes(ensembl_list, fields=anno_type)

    with open(outpath, 'w') as outfile:
        for mapping in id_mappings:
            if anno_type in mapping:
                # str(): write() only accepts text, and the annotation value
                # is not guaranteed to be a string.
                outfile.write(str(mapping[anno_type]))
                outfile.write('\n')
def getMapFromList(idList,
                   inputType='symbol',
                   outputType='All',
                   Species='human'):
    """Map identifiers to other identifier types with mygene.

    :param idList: identifiers to look up.
    :param inputType: scope(s) the identifiers belong to (default 'symbol').
    :param outputType: field(s) to return; the string 'All' selects a fixed
        list of identifier fields.
    :param Species: species passed to the query (default 'human').
    :return: DataFrame produced by ``querymany``, indexed by query.
    """

    # If not specified, return everything. Compared by value: identity
    # (``is``) against a string literal is implementation-dependent.
    if isinstance(outputType, str) and outputType == 'All':
        outputType = [
            'entrezgene', 'kegg', 'ec', 'refseq', 'refseq.protein', 'HGNC',
            'ensembl.gene', 'ensembl.protein', 'uniprot', 'pdb', 'humancyc',
            'MIM', 'reactome'
        ]

    # Use MyGene API to query identifiers as Pandas dataframe
    mg = mygene.MyGeneInfo()
    return mg.querymany(idList,
                        scopes=inputType,
                        fields=outputType,
                        species=Species,
                        as_dataframe=True,
                        df_index=True,
                        returnall=False)
def get_gene_name(df, gene_id_column, scope):
    """Look up gene names and symbols for the IDs in one column of ``df``.

    Arguments:
        df: data frame holding the gene IDs
        gene_id_column: name of the column containing the gene IDs
        scope: the scope to search (the database, e.g. 'refseq', 'ensembl')

    Returns:
        A new data frame with the columns 'Gene Symbol' and 'Gene Name', one
        row per record returned by mygene. Records lacking a name or a
        symbol get " - " in both columns.
    """
    gene_id_list = df[gene_id_column].tolist()

    gene_name_list = []
    gene_symbol_list = []

    # With no IDs there is nothing to query, so no client is needed.
    if gene_id_list:
        mg = mygene.MyGeneInfo()
        response = mg.querymany(gene_id_list, scopes=scope, returnall=True)
        for record in response['out']:
            # Decide once per record so both lists always grow together
            # (the old try/except could append a name, fail on the symbol
            # and then append a second name placeholder).
            if 'name' in record and 'symbol' in record:
                gene_name_list.append(record['name'])
                gene_symbol_list.append(record['symbol'])
            else:
                gene_name_list.append(" - ")
                gene_symbol_list.append(" - ")

    # Add gene symbol and gene name to new data frame
    outputdf = pd.DataFrame()
    outputdf['Gene Symbol'] = gene_symbol_list
    outputdf['Gene Name'] = gene_name_list
    return outputdf
Exemple #24
0
def gene_symbol_map(df, organism):
    """Attach gene symbols to a UniProt-keyed table and split substrates.

    Args:
        df: DataFrame with a 'UniprotID' column plus the 'Pathway',
            'parameter.*', 'Substrate' and 'Product' columns selected below.
        organism: value passed as the ``species`` argument of the query.

    Returns:
        DataFrame indexed by symbol with one row per ';'-separated substrate
        and duplicated rows removed.
    """
    # mygene API will map UniProt IDs to gene symbols
    mg = mygene.MyGeneInfo()
    uniprot = df["UniprotID"]

    # mapping uniprot ID to gene symbol
    out = mg.querymany(uniprot,
                       species=organism,
                       scopes='uniprot',
                       fields='symbol',
                       as_dataframe=True)
    df = pd.merge(df, out, how='inner', left_on='UniprotID', right_index=True)

    # clean df to contain relevant info
    df = df[['symbol', 'Pathway', 'parameter.type',
             'parameter.associatedSpecies', 'parameter.startValue',
             'Substrate', 'Product']].set_index('symbol').drop_duplicates(
                 keep='first')

    # Expand Substrate Scope: one row per ';'-separated entry.
    # split(expand=True) replaces apply(pd.Series, 1), whose stray
    # positional argument was not meant for Series.apply.
    s = df['Substrate'].str.split(';', expand=True).stack()
    s.index = s.index.droplevel(-1)
    s.name = 'Substrate'

    del df['Substrate']
    df = df.join(s).drop_duplicates(keep='first')

    return df
Exemple #25
0
def mapSingleColumn(args):
    """Map the IDs of a one-column file with mygene and write the result.

    Reads ``args.infile`` (one ID per line), queries the distinct IDs with
    ``args.mapfrom`` / ``args.mapto`` / ``args.species``, and writes every
    mapped value to ``args.outfile``; with ``args.retain_orig`` each line is
    ``<original>\\t<mapped>``. Prints a short summary when done.
    """
    ## read original file (closed again as soon as it is read)
    with open(args.infile) as infile:
        ids = set(line.strip() for line in infile)
    print('Querying %d total IDs:' % (len(ids)))

    ## get gene IDs
    mg = mygene.MyGeneInfo()
    result = mg.querymany(ids, scopes=args.mapfrom, fields=args.mapto,
                          species=args.species)

    ## build mapped dictionary.
    mapped = build_mapped_dict(result, args)

    ## write mapped file; the with-block makes sure it is flushed and closed
    missing = 0
    tot = 0
    with open(args.outfile, 'w') as out:
        for thisid in ids:
            if thisid not in mapped:
                missing += 1
                continue
            for node in mapped[thisid]:
                if args.retain_orig:
                    out.write('%s\t%s\n' % (thisid, node))
                else:
                    out.write('%s\n' % (node))
                tot += 1

    print('%d ids in original file %s' % (len(ids), args.infile))
    print('%d ids written to %s' % (tot, args.outfile))
    print('%d ids were missing' % (missing))
    return
Exemple #26
0
def convert_id(_id, logger):
    """Resolve an ID to its Ensembl gene ID and gene symbol (human).

    Three lookups are chained: the first hit for ``_id`` gives the Entrez
    ID, the first hit for that Entrez ID gives the symbol, and ``getgene``
    on the Entrez ID gives the Ensembl gene.

    Returns:
        Tuple ``(ensembl_id, gene_symbol)``.
    """
    logger.info(colored('Converting ID...', 'blue'))
    client = mygene.MyGeneInfo()

    id_hits = client.query(_id, species='human')['hits']
    entrez_id = id_hits[0]['entrezgene']

    entrez_hits = client.query(entrez_id, species='human')['hits']
    gene_symbol = entrez_hits[0]['symbol']

    gene_record = client.getgene(entrez_id, 'ensembl')
    ensembl_id = gene_record['ensembl']['gene']
    return ensembl_id, gene_symbol
Exemple #27
0
def convert_symbol_to_ensembl(gene_list, species="human"):
    """Look up Ensembl gene information for a list of gene symbols via mygene.

    Note: this can result in an error when non-unique symbols are supplied.

    parameters
    ----------
    gene_list: `list`
        gene symbols to convert
    species: `str` | default = 'human'
        species the supplied symbols belong to

    returns
    -------
    list
        One entry per record returned by ``querymany``: the record's
        ``"ensembl"`` value as delivered by mygene, or ``None`` when the
        record has no such key.

    """
    records = mygene.MyGeneInfo().querymany(gene_list,
                                            scopes="symbol",
                                            fields="ensembl.gene",
                                            species=species)
    return [record.get("ensembl") for record in records]
Exemple #28
0
def handlinginput(infile):
    """
    Step 1: handling input
    Input:
    Raw input from the client
    - list of Entrez gene IDs, Entrez gene symbols or Ensembl ID

    Output:
    - dataframe ('_id', 'name', 'symbol') of the genes mygene recognised,
      without duplicate or incomplete rows ([] when those columns are
      unavailable)
    - the 'dup' entry of the mygene response (duplicated query terms)
    - the 'missing' entry of the mygene response (unrecognised terms)

    When the query itself fails, three empty lists are returned so the
    caller can skip the remaining steps.
    """
    mg = mygene.MyGeneInfo()
    # ``except Exception`` keeps the best-effort behaviour while letting
    # KeyboardInterrupt / SystemExit through.
    try:
        mygene_dict = mg.querymany(infile,
                                   df_index=True,
                                   returnall=True,
                                   as_dataframe=True,
                                   species='human',
                                   scopes="ensemblgene,symbol,entrezgene")
    except Exception:
        return [], [], []
    try:
        idname_df = mygene_dict['out'][['_id', 'name',
                                        'symbol']].drop_duplicates().dropna()
    except Exception:
        idname_df = []
    dupli = mygene_dict['dup']  # genes that mygene noticed as duplicates in the query list
    missing = mygene_dict['missing']  # genes that mygene could not recognize in its database
    print("length mygeneinfo df: %d" % len(idname_df))
    return idname_df, dupli, missing
Exemple #29
0
def convert_ensembl_to_entrez(source_ensembl_list):
    '''
    Convert Ensembl protein IDs to Entrez gene IDs (human) with mygene.

    :param source_ensembl_list: list of Ensembl IDs [ 'ENSXXX','ENSXXX',...]
    :return: dictionary[query ID] = Entrez gene ID, or 'NA' when the record
        has no Entrez ID. The unmapped IDs are also printed.
    '''
    client = mygene.MyGeneInfo()
    records = client.querymany(source_ensembl_list,
                               scopes='ensembl.protein',
                               fields='entrezgene',
                               species='human')

    entrez_by_query = {}
    unmapped = []
    for record in records:
        query_id = record['query']
        entrez_id = record.get('entrezgene', 'NA')
        entrez_by_query[query_id] = entrez_id
        if entrez_id == 'NA':
            unmapped.append(query_id)

    print("NA:", len(unmapped), unmapped)
    return entrez_by_query
Exemple #30
0
def geneToEnsembl(ids):
    """Convert official gene symbols to Ensembl gene IDs (human).

    Args:
        ids: gene symbols to look up.

    Returns:
        Flat list of the ``gene`` values found in the ``ensembl`` entry of
        each record; a record whose entry is a list contributes one value
        per element. Two counts are printed along the way.
    """
    mg = mygene.MyGeneInfo()
    gene_dict_list = mg.querymany(ids,
                                  scopes='symbol',
                                  fields='ensembl.gene',
                                  species='human')

    # Records without an 'ensembl' entry carry no ID.
    ensembl_entries = [
        record['ensembl'] for record in gene_dict_list if 'ensembl' in record
    ]

    print("# of gene names:")
    print(len(ensembl_entries))

    ensembl_list = []
    for entry in ensembl_entries:
        # An entry is either a single mapping or a list of mappings.
        mappings = entry if isinstance(entry, list) else [entry]
        ensembl_list.extend(mapping.get('gene') for mapping in mappings)

    print("# of ensembl ids found:")
    print(len(ensembl_list))

    return ensembl_list