def convert_ID_getAllIsoformsNM(ensemble_ID, boolSave, path, preffix='NM'):
    """Collect the RNA accessions of a gene that start with ``preffix``.

    The gene record is requested once from MyGene.info. The resulting list
    starts with the record's ``symbol`` and is followed by every entry of
    ``accession.rna`` that starts with ``preffix``.

    :param ensemble_ID: gene identifier handed to ``MyGeneInfo.getgene``.
    :param boolSave: ``False`` returns the list; ``True`` forwards the list
        to ``batch_retrieve_seq`` instead (nothing is returned).
    :param path: forwarded unchanged to ``batch_retrieve_seq``.
    :param preffix: accession prefix to keep, default ``'NM'``.
    :return: ``[symbol, accession, ...]`` when ``boolSave`` is ``False``,
        otherwise ``None``.
    """
    print('Converting IDs')
    mg = mygene.MyGeneInfo()

    # A single request serves both the accession list and the symbol.
    gene_record = mg.getgene(ensemble_ID, fields='all', as_dataframe=True)
    rna_accessions = gene_record['accession']['rna']
    matching = [acc for acc in rna_accessions if acc.startswith(preffix)]
    list_transcript = [gene_record['symbol']] + matching
    print('ID converted')

    # Explicit comparisons are kept so non-boolean flags behave as before.
    if boolSave == False:  # noqa: E712
        return list_transcript
    if boolSave == True:  # noqa: E712
        # The list always holds the symbol, so emptiness has to be judged on
        # the matched accessions; otherwise the message below is unreachable.
        if matching:
            batch_retrieve_seq(list_transcript, 'True', path)
        else:
            print('No RefSeq Ids with the Prefix %s found' % preffix)
def geneid_converter(listofids, input_identifier=None, output_identifier=None):
    '''
    Map a list of gene identifiers to other identifier types via MyGene.info.

    Identifiers available:
    accession, alias, biocarta, chr, end, ensemblgene, ensemblprotein,
    ensembltranscript, entrezgene, exons, flybase, generif, go, hgnc,
    homologene, hprd, humancyc, interpro, ipi, kegg, mgi, mim, mirbase,
    mousecyc, name, netpath, pdb, pfam, pharmgkb, pid, pir, prosite, ratmap,
    reactome, reagent, refseq, reporter, retired, rgd, smpdb, start, strand,
    summary, symbol, tair, taxid, type_of_gene, unigene, uniprot,
    wikipathways, wormbase, xenbase, yeastcyc, zfin

    :param listofids: ids to be mapped, e.g. ['1', '10', '10001']
    :param input_identifier: identifier type of the input; "entrezgene" when
        omitted
    :param output_identifier: identifier types to return;
        ["symbol", "ensembl.gene"] when omitted
    :return: DataFrame of mapped ids (human only)
    '''
    scopes = "entrezgene" if input_identifier is None else input_identifier
    if output_identifier is None:
        fields = ["symbol", "ensembl.gene"]
    else:
        fields = output_identifier

    client = mygene.MyGeneInfo()
    return client.querymany(
        listofids,
        scopes=scopes,
        fields=fields,
        species="human",
        as_dataframe=True,
    )
def convert_ensembl_to_symbol(gene_list, species="human"):
    """Look up the gene symbol for each Ensembl gene id.

    The ids are sent to MyGene.info in one ``querymany`` call with scope
    ``ensembl.gene``.

    parameters
    ----------
    gene_list: `list`
        Ensembl gene ids to convert
    species: `str` | default = 'human'
        species the ids belong to

    returns
    -------
    list
        one entry per returned hit: its ``symbol``, or ``None`` when the hit
        carries no symbol
    """
    client = mygene.MyGeneInfo()
    hits = client.querymany(
        gene_list, scopes="ensembl.gene", fields="symbol", species=species
    )
    return [hit.get("symbol") for hit in hits]
def protein_interaction_reference():
    """Load the STRING human protein-links table with gene symbols as nodes.

    Reads ``~/Downloads/9606.protein.links.detailed.v11.0.txt``
    (space-separated), strips everything up to the last ``.`` from the
    ``protein1``/``protein2`` identifiers, and replaces both columns with the
    gene symbol MyGene.info reports for the remaining Ensembl protein id.
    Identifiers without a symbol become NaN.

    Reference downloads:
    https://stringdb-static.org/download/protein.physical.links.detailed.v11.0/9606.protein.physical.links.detailed.v11.0.txt.gz

    :return: the links DataFrame with ``protein1`` and ``protein2`` mapped to
        gene symbols.
    """
    links = pd.read_csv(
        "~/Downloads/9606.protein.links.detailed.v11.0.txt", sep=' ')

    node_columns = ('protein1', 'protein2')
    for column in node_columns:
        links[column] = links[column].str.split(r'\.').str[-1].str.strip()

    # Query the union of both columns once. The previous version built the
    # lookup from protein1 only, so ids seen solely in protein2 were lost.
    protein_ids = (
        pd.concat([links[column] for column in node_columns])
        .dropna()
        .unique()
        .tolist()
    )

    mg = mygene.MyGeneInfo()
    hits = mg.querymany(protein_ids, scopes='ensembl.protein',
                        fields='symbol', species='human', as_dataframe=True)
    symbol_by_protein = pd.Series(hits.symbol.values,
                                  index=hits.index).to_dict()

    for column in node_columns:
        links[column] = links[column].map(symbol_by_protein)
    return links
def convert(file):
    """Convert the mouse Ensembl gene ids in a CSV file to gene symbols.

    The first column of every non-empty row is upper-cased and used as a
    query, except rows whose first cell is the header ``Symbols``.

    :param file: path of the CSV file to read.
    :return: list with, per query, the symbol of its first hit, or the query
        itself when MyGene.info reports it as not found. Empty when the file
        holds no ids.
    """
    # Read from the `file` argument; the old body opened an unrelated
    # `csv_path` name and ignored its parameter.
    with open(file) as csvfile:
        query = [
            row[0].upper()
            for row in csv.reader(csvfile)
            if row and row[0] != 'Symbols'  # skip blank rows and the header
        ]
    if not query:
        return []

    mg = mygene.MyGeneInfo()
    query_results = mg.querymany(query, scopes='ensemblgene',
                                 fields='symbol', species='mouse')

    converted_symbols = []
    seen = set()
    for hit in query_results:
        if 'notfound' in hit:
            converted_symbols.append(hit['query'])
        elif hit['query'] not in seen:
            # Only the first hit of a query is kept.
            converted_symbols.append(hit['symbol'])
            seen.add(hit['query'])
    return converted_symbols
def hgnc_to_entrez(iterable):
    """
    Convert HGNC gene symbols to Entrez Gene IDs.

    Unique symbols are queried against MyGene.info (scopes ``symbol`` and
    ``alias``, human); each input element is then resolved through
    ``_match_response_with_query``.

    :param iterable: HGNC gene symbols.
    :return: pd.Series with one result per input element.
    """
    import mygene as _mygene

    field = 'entrezgene'
    original_input = iterable
    client = _mygene.MyGeneInfo()

    # Objects offering .unique() (e.g. a Series) are de-duplicated first.
    try:
        symbols = iterable.unique()
    except AttributeError:
        symbols = iterable

    response = client.querymany(symbols,
                                scopes=['symbol', 'alias'],
                                fields=field,
                                species='human',
                                as_dataframe=True)
    with_ids = response.loc[response.entrezgene.notnull()].copy()

    # Strip the '.0' from the ids by casting through int, then to string.
    with_ids.loc[:, field] = with_ids.loc[:, field].astype(int).astype(str)

    def _resolve(symbol):
        return _match_response_with_query(with_ids, symbol, field)

    return _pd.Series(original_input).apply(_resolve)
def entrez_to_name_online(entrezID):
    """Return the ``symbol`` of the first MyGene.info hit for an Entrez id.

    The query is restricted to human genes and the ``entrezgene`` scope.
    """
    client = mygene.MyGeneInfo()
    hits = client.querymany(
        entrezID,
        scopes='entrezgene',
        fields='symbol',
        species='human',
    )
    first_hit = hits[0]
    return first_hit['symbol']
def query_mygene(entrez_set, tax_id):
    """Query MyGene.info and index the returned hits by query string.

    :param entrez_set: ids to look up (scopes ``entrezgene`` and ``retired``).
    :param tax_id: species filter passed to ``querymany``.
    :return: dict mapping each query to a dict with the keys ``source``,
        ``mygene``, ``ncbigene``, ``ensemblgene``, ``symbol`` and ``uniprot``;
        fields absent from a hit are ``None``.
    """
    scopes = ['entrezgene', 'retired']
    fields = ['entrezgene', 'ensembl.gene', 'symbol', 'uniprot']

    client = mygene.MyGeneInfo()
    logging.info(f"Querying {scopes} in MyGene.info ...")
    response = client.querymany(entrez_set,
                                scopes=scopes,
                                fields=fields,
                                species=tax_id,
                                returnall=True)

    # A later hit for the same query replaces an earlier one.
    return {
        hit["query"]: {
            'source': hit["query"],
            'mygene': hit.get('_id'),
            'ncbigene': hit.get('entrezgene'),
            'ensemblgene': hit.get('ensembl'),
            'symbol': hit.get('symbol'),
            'uniprot': hit.get('uniprot'),
        }
        for hit in response['out']
    }
def user_convert(gene_list):
    """Replace mouse Ensembl gene ids in a gene list by gene symbols.

    Entries starting with ``ENSMUSG`` are looked up on MyGene.info; all other
    entries are passed through unchanged. The input list is not modified.

    :param gene_list: gene names and/or ``ENSMUSG...`` ids.
    :return: ``[converted_gene_list, unmapped]`` where the first list holds
        the untouched entries followed by the upper-cased symbols, and the
        second the upper-cased ids MyGene.info could not find.
    """
    # Single pass instead of repeated list.remove(), which was quadratic.
    ensembl_ids, remaining = [], []
    for entry in gene_list:
        if entry.startswith("ENSMUSG"):
            ensembl_ids.append(entry)
        else:
            remaining.append(entry)

    if ensembl_ids:
        mg = mygene.MyGeneInfo()
        query_results = mg.querymany(ensembl_ids, scopes='ensemblgene',
                                     fields='symbol', species='mouse')
    else:
        # Nothing to convert: skip the remote call altogether.
        query_results = []

    converted_symbols, unmapped = [], []
    seen = set()
    for hit in query_results:
        if 'notfound' in hit:
            unmapped.append(hit['query'].upper())
        elif hit['query'] not in seen:
            # Only the first hit of a query is kept.
            converted_symbols.append(hit['symbol'].upper())
            seen.add(hit['query'])

    return [remaining + converted_symbols, unmapped]
def id_mapping(filename):
    """
    Map the gene ids of a WikiPathways pathway file to gene symbols.

    Each value of the ``Identifier`` column is looked up individually with
    ``MyGeneInfo.getgene``.

    Args:
        filename (:obj: str): path of the tab-separated pathway file

    Returns:
        :obj:'list': the symbols of all ids that could be resolved
    """
    df_id = pd.read_csv(filename, sep='\t', header=0, low_memory=False)
    mg = mygene.MyGeneInfo()
    gene_list = []
    gene_not_found = 0
    print("Now we're mapping the gene list of the pathway.")

    # Iterate the column itself: iterrows() builds a Series per row and may
    # coerce integer ids to floats.
    identifiers = df_id['Identifier']
    for gene_id in identifiers:
        record = mg.getgene(gene_id, fields='symbol')
        # A record without a 'symbol' key used to raise KeyError; it is now
        # counted as not found, like a missing record.
        if isinstance(record, dict) and 'symbol' in record:
            gene_list.append(record['symbol'])
        else:
            print('ID: ', gene_id, 'is not found.')
            gene_not_found += 1

    count = len(identifiers)
    print("There is/are", count, "gene(s) in this pathway.", gene_not_found,
          "of them is/are not found when id mapping.")
    return gene_list
def main():
    """
    Translates Ensembl IDs to HUGO using mygene queries.

    Reads the tab-separated table at ``snakemake.input[0]``, strips the
    version suffix from its ``Ensembl_ID`` column, queries MyGene.info for
    the symbols and writes an ``Ensembl_ID,Hugo_Symbol`` CSV to
    ``snakemake.output[0]``.
    """
    # SNAKEMAKE I/O #
    rnaseq_transcripts = snakemake.input[0]
    rnaseq_transcripts_translated = snakemake.output[0]

    # Load data and trim version number from transcript ensembl id
    print("Annotating Ensembl IDs to HUGO...")
    ensembl_transcripts = pd.read_csv(rnaseq_transcripts, sep='\t')
    ensembl_transcripts['Ensembl_ID'] = ensembl_transcripts[
        'Ensembl_ID'].str.split('.').str[0].str.strip()

    MyGene = mygene.MyGeneInfo()
    gene_query = ensembl_transcripts['Ensembl_ID']
    annotated_names = MyGene.getgenes(gene_query, fields='symbol',
                                      as_dataframe=True)

    # Get rid of unknown transcript-genes relationships. The guard covers a
    # result without a 'notfound' column, which used to raise KeyError.
    if 'notfound' in annotated_names.columns:
        annotated_names = annotated_names[
            ~(annotated_names['notfound'] == True)]  # noqa: E712

    # Drop duplicates... we are losing some info, but we can afford it. 7
    # genes duplicated. Only one transcript will be used for these
    annotated_names = annotated_names.drop_duplicates(keep='first')

    # Reassign instead of mutating in place: the frame may be a filtered
    # slice. errors='ignore' tolerates bookkeeping columns that are absent.
    annotated_names = annotated_names.drop(
        ['_id', '_score', 'notfound'], axis=1, errors='ignore')
    annotated_names = annotated_names.reset_index()
    annotated_names.columns = ['Ensembl_ID', 'Hugo_Symbol']
    annotated_names.to_csv(rnaseq_transcripts_translated, sep=',',
                           index=False)
def __init__(self):
    """Create the MyGene client and load the hg19 ``trans.txt`` table.

    ``self.trans`` ends up as a DataFrame with the columns ``gsymbol`` and
    ``tid`` read from the tab-separated reference file.
    """
    genes_dir = '/stor/work/Lambowitz/ref/hg19_ref/genes'
    trans_table = genes_dir + '/trans.txt'

    self.mg = mygene.MyGeneInfo()
    self.trans = pd.read_csv(
        trans_table,
        sep='\t',
        names=['gsymbol', 'tid'],
    )
def convert_nm_ids_to_flybase(df1):
    """Add a leading ``flybase_id`` column to a RefSeq-indexed table.

    Rows consisting only of zeros are dropped. The remaining index values
    are queried as RefSeq ids (species ``fruitfly``) and the ``ensembl.gene``
    value of each id's first hit is stored in ``flybase_id``.

    :param df1: DataFrame indexed by RefSeq ids.
    :return: a new DataFrame whose first column is ``flybase_id``; ``df1``
        itself is left unchanged.
    """
    # Remove all rows summing to 0. The explicit copy makes the column
    # assignment below act on an independent frame, not a filtered view.
    df11 = df1.loc[~(df1 == 0).all(axis=1)].copy()

    # The client comes from get_client; the MyGeneInfo instance that used to
    # be created first was overwritten straight away and never used.
    mg = get_client('gene')

    # Calling mygene to map NM_ IDs to Flybase names
    print("Calling mygene.")
    refseq_list = df11.index.tolist()
    df_geneIDs = mg.querymany(
        refseq_list,
        scopes="refseq",
        fields=["ensembl.gene", "uniprot", "symbol", "reporter"],
        species="fruitfly",
        as_dataframe=True)

    # Align by query id (first hit per id) rather than by position, so an id
    # with several hits cannot shift or lengthen the new column.
    gene_ids = df_geneIDs["ensembl.gene"]
    gene_ids = gene_ids[~gene_ids.index.duplicated(keep='first')]
    new_index_list = gene_ids.reindex(refseq_list).tolist()

    # Plotting loss of gene IDs per replicate
    plot_NaN("df_merged", df_geneIDs)

    # Append the ids, then move that last column to the front.
    df11['flybase_id'] = new_index_list
    cols = list(df11.columns)
    cols = [cols[-1]] + cols[:-1]
    df11 = df11[cols]
    return df11
def main():
    """
    Translates Ensembl IDs to HUGO using mygene queries.

    Reads the tab-separated table at ``snakemake.input[0]``, strips the
    version suffix from its ``Gene Symbol`` column, replaces each id by the
    symbol MyGene.info reports for it and writes the table as CSV to
    ``snakemake.output[0]``.
    """
    # SNAKEMAKE INPUT #
    snp_array_db = snakemake.input[0]

    # SNAKEMAKE OUTPUT #
    translated_affy_snp = snakemake.output[0]

    # Load data and trim version number from ensembl id
    print("Annotating Ensembl IDs to HUGO...")
    ensembl_snp_array = pd.read_csv(snp_array_db, sep='\t')
    ensembl_snp_array['Gene Symbol'] = ensembl_snp_array[
        'Gene Symbol'].str.split('.').str[0].str.strip()

    MyGene = mygene.MyGeneInfo()
    gene_query = ensembl_snp_array['Gene Symbol']

    # Ask for every distinct id only once.
    unique_ids = gene_query.dropna().unique().tolist()
    annotated_names = MyGene.getgenes(unique_ids, fields='symbol',
                                      as_dataframe=True)

    # Map by id instead of copying values positionally, so the result does
    # not depend on the reply having exactly one row per input row.
    symbols = annotated_names['symbol']
    symbols = symbols[~symbols.index.duplicated(keep='first')]
    ensembl_snp_array['Gene Symbol'] = gene_query.map(symbols)

    del annotated_names
    ensembl_snp_array.to_csv(translated_affy_snp, sep=',', index=False)
def uniprotid_to_geneid(uniprotid_list):
    """Query MyGene.info for the ``uniprot`` and ``taxid`` fields of ids.

    :param uniprotid_list: sized collection of query terms, matched against
        the ``symbol`` and ``accession`` scopes across all species.
    :return: the ``querymany`` DataFrame, or an empty list when
        ``uniprotid_list`` is empty.
    """
    # Nothing to look up: return before creating a client.
    if not len(uniprotid_list):
        return []

    mg = mygene.MyGeneInfo()
    # The keyword is `scopes`; the former `scope=` never restricted the
    # search to the intended scopes.
    return mg.querymany(uniprotid_list,
                        scopes='symbol,accession',
                        fields='uniprot, taxid',
                        species="all",
                        as_dataframe=True)
def __init__(self):
    """Initialise all attributes to their empty defaults.

    Paths and scalar settings start as ``None``, collections start empty,
    and the two search caches are nested ``defaultdict`` objects that yield
    ``None`` for unknown inner keys.
    """
    # Tool and adapter locations
    self.trimmomatic_path = None
    self.adp_pe_path = None
    self.adp_se_path = None

    # Reference data, output location and resources
    self.fasta_path = None
    self.hisat2_index_path = None
    self.gtf_path = None
    self.output_dir_path = None
    self.cpu_num = None

    # Input FASTQ files and sample names
    self.l_fastq = []
    self.l_fastq_single = []
    self.l_fastq_paired = []
    self.l_sample_single = []
    self.l_sample_paired = []
    self.l_sample = []

    # Per-sample file paths
    self.l_sam_path = []
    self.l_bam_path = []
    self.l_gtf_path = []
    self.l_tsv_path = []

    # Table contents
    self.columns = []
    self.data = []
    self.all_refseq_id = None

    # Annotation client and lookup caches
    self.mg = mygene.MyGeneInfo()
    self.d_search_mygene = defaultdict(lambda: defaultdict(lambda: None))
    self.d_search_entrez = defaultdict(lambda: defaultdict(lambda: None))
def genesymbol2entrez(genelist, species="human"):
    """
    Convert Gene Symbol to Entrez Gene ID.

    Each symbol is queried individually as ``symbol:<gene>``; the Entrez id
    of the first hit carrying an ``entrezgene`` field is used.

    :param genelist: sized collection of gene symbols.
    :param species: species filter for the query (default ``"human"``).
    :return: list with one entry per symbol: the Entrez id, or ``"NA"`` when
        no hit provides one.
    """
    if len(genelist) == 0:
        return []

    mg = mygene.MyGeneInfo()
    res_list = []
    for gene in genelist:
        print(gene)
        res = mg.query("symbol:%s" % gene, species=species)

        # Start from "NA" for every gene. The old loop ran over the keys of
        # the response instead of its hits and could reuse the id of the
        # previous gene (or hit an unbound name) when no hit had the field.
        entrez_geneid = "NA"
        for hit in res["hits"]:
            if "entrezgene" in hit:
                entrez_geneid = hit["entrezgene"]
                break
            print("searching for entrez_geneid")
        res_list.append(entrez_geneid)
    return res_list
def gene_info_worker(gene):
    """Look up the Entrez id for ``gene.gene_symbol`` and store it on ``gene``.

    The species name for the query is derived from ``gene.dataset.species``
    (empty string when it matches none of the three known species). For each
    hit carrying ``entrezgene``: if ``gene.ncbi_uid`` is unset it is set and
    saved; otherwise the conflict is logged and ``ncbi_uid`` is reset to
    ``None`` and saved. Hits without ``entrezgene`` are skipped.
    """
    client = mygene.MyGeneInfo()

    # Checked in reverse order of the original assignments so that the same
    # species wins should more than one comparison ever hold.
    dataset_species = gene.dataset.species
    if dataset_species == SpeciesType.RATTUS_NORVEGICUS:
        species = 'rat'
    elif dataset_species == SpeciesType.MUS_MUSCULUS:
        species = 'mouse'
    elif dataset_species == SpeciesType.HOMO_SAPIENS:
        species = 'human'
    else:
        species = ''

    response = client.query('symbol:{}'.format(gene.gene_symbol),
                            species=species)
    hits = response['hits']
    if len(hits) == 0:
        _LOG.info("WARNING IT WAS 0, info: {}".format(hits))
        return

    for hit in hits:
        try:
            entrez_id = hit['entrezgene']
            if gene.ncbi_uid is None:
                gene.ncbi_uid = entrez_id
                gene.save()
                _LOG.info("Set entrezgene for {}:{} to {} .".format(
                    gene.gene_symbol, species, entrez_id))
            else:
                _LOG.info(
                    "ERROR! Tried to replace ncbi_uid {} with {} for {}:{}"
                    .format(gene.ncbi_uid, entrez_id, gene.gene_symbol,
                            species))
                gene.ncbi_uid = None
                gene.save()
        except KeyError:
            # Covers hits lacking 'entrezgene' as well as a KeyError raised
            # while saving or logging, as before.
            continue
def get_names(table, db):
    """Query MyGene.info for the name and symbol of the ids in a table.

    The second tab-separated field of every line after the header is used
    as id. How the ids are queried depends on ``db``:

    * ``valid_db[0]``: ids are queried as-is with scope ``uniprot``;
    * ``valid_db[1]`` to ``valid_db[3]``: the part after the first ``_`` of
      each id is queried with scope ``uniprot``;
    * ``valid_db[4]``: ids are queried as-is with scope ``pdb``.

    Any other ``db`` prints a message and exits the interpreter.

    :param table: path of the tab-separated input table.
    :param db: database name; must be one of the module-level ``valid_db``.
    :return: the result of ``collect_scinames`` for the query response.
    """
    with open(table, 'r') as handle:
        handle.readline()  # skip the header line
        ids = [line.split('\t')[1] for line in handle]

    # One client for the whole call; it used to be re-created for every line
    # and was undefined when the table held only the header.
    gene = mygene.MyGeneInfo()

    if db == valid_db[0]:
        scope = 'uniprot'
    elif db in valid_db[1:4]:
        ids = [entry.split('_')[1] for entry in ids]
        scope = 'uniprot'
    elif db == valid_db[4]:
        scope = 'pdb'
    else:
        print('Sth went wrong in function get_names.')
        exit()

    # set verbose 'True' to get info about duplicates/missing values of name
    # parsing
    ret = gene.querymany(ids, scopes=scope, fields='name,symbol',
                         verbose=False)
    return collect_scinames(ret)
def annotate_de(df,
                gene_id_col='gene_ids',
                keys=('scores', 'pvals', 'pvals_adj', 'logfoldchanges')):
    """
    Annotate genes in dataframe by querying for gene name, summary, and
    related pathways.

    :param df: DataFrame holding the gene ids and, optionally, the
        differential-expression columns named in ``keys``.
    :param gene_id_col: column of ``df`` with Ensembl gene ids.
    :param keys: columns of ``df`` to carry over; each one that exists is
        added to the result as ``DE_<key>``.
    :return: DataFrame indexed by query id with the annotation fields
        followed by the ``DE_`` columns. Fields that no hit provided are
        present as all-NaN columns.
    """
    fields = ['symbol', 'name', 'summary', 'go.BP.term', 'pathway.kegg.name']

    mg = mygene.MyGeneInfo()
    df_annotation = mg.querymany(df[gene_id_col],
                                 scopes='ensembl.gene',
                                 species='human',
                                 fields=fields,
                                 dotfield=True,
                                 as_dataframe=True)
    # reindex() creates missing field columns instead of raising KeyError
    # and returns an independent frame for the assignments below.
    df_annotation = df_annotation.reindex(columns=fields)

    # The lookup needs a unique index; the first row per gene id is kept.
    by_gene = df.drop_duplicates(subset=gene_id_col).set_index(gene_id_col)

    cols = []
    for key in keys:
        if key in df.columns:
            column = 'DE_' + key
            df_annotation[column] = df_annotation.index.map(
                by_gene[key]).values
            cols.append(column)

    return df_annotation.loc[:, fields + cols]
def generate_targets_file(disease_id,
                          outpath,
                          anno_type: str = 'entrezgene') -> None:
    """Creates a disease list

    Fetches the ``known_drug`` associations of a disease from Open Targets,
    maps the target ids through MyGene.info and writes one mapped identifier
    per line. Targets without the requested annotation are skipped.

    :param disease_id: EFO code from the disease.
    :param outpath: path of the text file to write.
    :param anno_type: `entrezgene` for Entrez Id or `symbol` for Gene symbol.
    :return:
    """
    ot = OpenTargetsClient()
    assoc = ot.get_associations_for_disease(
        disease_id,
        fields=['association_scoredatatypes', 'target.id']
    ).filter(
        datatype='known_drug'
    )
    ensembl_list = [a['target']['id'] for a in assoc]

    # TODO use the converters.get_converter_to_entrez
    mg = mygene.MyGeneInfo()
    id_mappings = mg.getgenes(ensembl_list, fields=anno_type)

    with open(outpath, 'w') as outfile:
        for mapping in id_mappings:
            if anno_type in mapping:
                # Entrez ids may come back as numbers; write() needs str.
                outfile.write(str(mapping[anno_type]))
                outfile.write('\n')
def getMapFromList(idList,
                   inputType='symbol',
                   outputType='All',
                   Species='human'):
    """
    Map identifiers to other identifier types through the MyGene API.

    :param idList: identifiers to look up.
    :param inputType: scope(s) the identifiers belong to.
    :param outputType: field(s) to return; ``'All'`` requests the full
        default set listed below.
    :param Species: species filter for the query.
    :return: DataFrame of the query results, indexed by query.
    """
    # If not specified, return everything. Compared with ==, because an
    # identity test against a string literal is not reliable.
    if outputType == 'All':
        outputType = [
            'entrezgene', 'kegg', 'ec', 'refseq', 'refseq.protein', 'HGNC',
            'ensembl.gene', 'ensembl.protein', 'uniprot', 'pdb', 'humancyc',
            'MIM', 'reactome'
        ]

    # Use MyGene API to query identifiers as Pandas dataframe
    mg = mygene.MyGeneInfo()
    return mg.querymany(idList,
                        scopes=inputType,
                        fields=outputType,
                        species=Species,
                        as_dataframe=True,
                        df_index=True,
                        returnall=False)
def get_gene_name(df, gene_id_column, scope):
    """Look up the gene name and symbol for every gene id in a data frame.

    Has a lot of overlap with Get_Gene_Names but isn't as hardcoded.

    Arguments:
    df: data frame to iterate over
    gene_id_column: name of the column containing the gene ids
    scope: the scope that is to be searched (the database -- 'refseq',
        'ensembl')

    Returns:
    data frame with the columns 'Gene Symbol' and 'Gene Name', one row per
    returned hit; hits lacking a name or a symbol get " - " in both columns.
    """
    placeholder = " - "

    # Initialize mygene
    client = mygene.MyGeneInfo()
    gene_id_list = df[gene_id_column].tolist()
    response = client.querymany(gene_id_list, scopes=scope, returnall=True)

    # Both values are decided together per hit. Previously a hit with a name
    # but no symbol appended to one list twice and left the lists unequal.
    gene_name_list = []
    gene_symbol_list = []
    for hit in response['out']:
        if 'name' in hit and 'symbol' in hit:
            gene_name_list.append(hit['name'])
            gene_symbol_list.append(hit['symbol'])
        else:
            gene_name_list.append(placeholder)
            gene_symbol_list.append(placeholder)

    # Add gene symbol and gene name to new data frame
    outputdf = pd.DataFrame()
    outputdf['Gene Symbol'] = gene_symbol_list
    outputdf['Gene Name'] = gene_name_list
    return outputdf
def gene_symbol_map(df, organism): """ """ # mygene API will map UniProt IDs to gene symbols mg = mygene.MyGeneInfo() #df = pd.read_excel(organism) uniprot = df["UniprotID"] # mapping uniprot ID to gene symbol out = mg.querymany(uniprot, species=organism, scopes='uniprot', fields='symbol', as_dataframe=True) df = pd.merge(df, out, how='inner', left_on='UniprotID', right_index=True) # clean df to contain relevant info df = df[['symbol', 'Pathway', 'parameter.type','parameter.associatedSpecies', \ 'parameter.startValue', 'Substrate', \ 'Product']].set_index('symbol').drop_duplicates(keep='first') # Expand Substrate Scope s = df['Substrate'].str.split(';').apply(pd.Series, 1).stack() s.index = s.index.droplevel(-1) s.name = 'Substrate' del df['Substrate'] df = df.join(s).drop_duplicates(keep='first') return df
def mapSingleColumn(args):
    """Map the ids of a one-column file and write the mapped ids to a file.

    :param args: object with the attributes ``infile``, ``outfile``,
        ``mapfrom``, ``mapto``, ``species`` and ``retain_orig`` (it is also
        handed to ``build_mapped_dict``).
    :return: None. Mapped ids are written one per line, preceded by the
        original id and a tab when ``args.retain_orig`` is set; summary
        counts are printed.
    """
    ## read original file (closed as soon as it has been read)
    with open(args.infile) as infile:
        ids = set(line.strip() for line in infile)
    print('Querying %d total IDs:' % (len(ids)))

    ## get gene IDs
    mg = mygene.MyGeneInfo()
    result = mg.querymany(list(ids), scopes=args.mapfrom, fields=args.mapto,
                          species=args.species)

    ## build mapped dictionary.
    mapped = build_mapped_dict(result, args)

    ## write mapped file; the context manager flushes and closes it
    missing = 0
    tot = 0
    with open(args.outfile, 'w') as out:
        for thisid in ids:
            if thisid not in mapped:
                missing += 1
                continue
            for node in mapped[thisid]:
                if args.retain_orig:
                    out.write('%s\t%s\n' % (thisid, node))
                else:
                    out.write('%s\n' % (node))
                tot += 1

    print('%d ids in original file %s' % (len(ids), args.infile))
    print('%d ids written to %s' % (tot, args.outfile))
    print('%d ids were missing' % (missing))
    return
def convert_id(_id, logger):
    """Resolve an identifier to its Ensembl gene id and gene symbol.

    Three MyGene.info lookups are made: the first human hit for ``_id``
    gives the Entrez id, the first human hit for that Entrez id gives the
    symbol, and the gene record of the Entrez id gives the Ensembl gene id.

    :param _id: identifier to query.
    :param logger: logger used for the progress message.
    :return: tuple ``(ensembl_id, gene_symbol)``.
    """
    logger.info(colored('Converting ID...', 'blue'))
    client = mygene.MyGeneInfo()

    id_hit = client.query(_id, species='human')['hits'][0]
    entrez_id = id_hit['entrezgene']

    entrez_hit = client.query(entrez_id, species='human')['hits'][0]
    gene_symbol = entrez_hit['symbol']

    gene_record = client.getgene(entrez_id, 'ensembl')
    ensembl_id = gene_record['ensembl']['gene']

    return ensembl_id, gene_symbol
def convert_symbol_to_ensembl(gene_list, species="human"):
    """Look up the Ensembl gene annotation for each gene symbol.

    The symbols are sent to MyGene.info in one ``querymany`` call with scope
    ``symbol`` and field ``ensembl.gene``.
    Note: this can result in an error when non-unique symbols are supplied.

    parameters
    ----------
    gene_list: `list`
        gene symbols to convert
    species: `str` | default = 'human'
        species the symbols belong to

    returns
    -------
    list
        one entry per returned hit: the hit's ``ensembl`` value as delivered
        by MyGene.info, or ``None`` when the hit has no such field
    """
    client = mygene.MyGeneInfo()
    hits = client.querymany(
        gene_list, scopes="symbol", fields="ensembl.gene", species=species
    )
    return [hit.get("ensembl") for hit in hits]
def handlinginput(infile):
    """
    Step 1: handling input

    Input: Raw input from the client - list of Entrez gene IDs, Entrez gene
    symbols or Ensembl ID
    Output: Dataframe with valid genes found in the clients input - [query],
    plus the duplicated and the missing queries

    The imported tool 'mygene' generates a dataframe out of the raw gene
    list. Duplicate rows and rows with missing values are stripped from the
    generated dataframe; the duplicate and not found genes reported by
    mygene are returned separately.

    :return: ``(idname_df, dupli, missing)``; three empty lists when the
        query itself fails.
    """
    mg = mygene.MyGeneInfo()
    try:
        # if none of the query genes are recognized, return three empty lists
        # so that tool does not run other functions.
        mygene_dict = mg.querymany(infile, df_index=True, returnall=True,
                                   as_dataframe=True, species='human',
                                   scopes="ensemblgene,symbol,entrezgene")
    except Exception:
        # `except Exception` lets KeyboardInterrupt/SystemExit through.
        return [], [], []

    try:
        idname_df = mygene_dict['out'][
            ['_id', 'name', 'symbol']].drop_duplicates().dropna()
    except Exception:
        # e.g. one of the expected columns is absent from the result
        idname_df = []

    # genes that mygene noticed as duplicates in the query list
    dupli = mygene_dict['dup']
    # genes that mygene could not recognize in its database
    missing = mygene_dict['missing']

    print("length mygeneinfo df: %d" % len(idname_df))
    return idname_df, dupli, missing
def convert_ensembl_to_entrez(source_ensembl_list):
    '''
    Convert Ensembl protein ids to Entrez gene ids (human).

    :param source_ensembl_list: list of Ensembl ids, queried with scope
        ``ensembl.protein``
    :return: dictionary[query id] = Entrez gene id, or 'NA' when the hit has
        no ``entrezgene``; the unmapped ids are also printed
    '''
    client = mygene.MyGeneInfo()
    hits = client.querymany(source_ensembl_list,
                            scopes='ensembl.protein',
                            fields='entrezgene',
                            species='human')

    entrez_by_query = {}
    unmapped = []
    for hit in hits:
        query_id = hit['query']
        entrez_id = hit.get('entrezgene', 'NA')
        entrez_by_query[query_id] = entrez_id
        if entrez_id == 'NA':
            unmapped.append(query_id)

    print("NA:", len(unmapped), unmapped)
    return entrez_by_query
def geneToEnsembl(ids):
    """Convert official gene symbols to Ensembl gene ids (human).

    :param ids: gene symbols to query with scope ``symbol``.
    :return: flat list of the ``gene`` values found in the ``ensembl`` field
        of the hits; a hit whose ``ensembl`` field is a list contributes one
        entry per element. The number of hits with an ``ensembl`` field and
        the number of ids found are printed.
    """
    mg = mygene.MyGeneInfo()
    hits = mg.querymany(ids, scopes='symbol', fields='ensembl.gene',
                        species='human')

    # Direct key lookup replaces the Python-2-only iteritems() scan.
    ensembl_entries = [hit["ensembl"] for hit in hits if "ensembl" in hit]
    print("# of gene names:")
    print(len(ensembl_entries))

    ensembl_list = []
    for entry in ensembl_entries:
        if isinstance(entry, dict):
            ensembl_list.append(entry.get('gene'))
        else:
            # Several Ensembl genes for one symbol arrive as a list of dicts.
            ensembl_list.extend(item.get('gene') for item in entry)

    print("# of ensembl ids found:")
    print(len(ensembl_list))
    return ensembl_list