def get_geneset(filename):
    indices = [pos for pos, char in enumerate(filename) if char == '.']
    outfile = ('/Volumes/My Book/AHS_projectdata/geneSets2/'
               + filename[:indices[-2]] + '.geneset.csv')
    print('running ', filename[:indices[1]])
    genes = pd.read_csv(filename, sep='\t')
    if genes.shape[0] == 0:
        genes.to_csv(outfile, sep='\t')
        return

    dataset = Dataset(name='hsapiens_gene_ensembl', host='grch37.ensembl.org')
    # query in two halves to keep each BioMart filter list short
    gene_ids = genes.gene_ID.tolist()
    half = len(gene_ids) // 2
    df = pd.concat([
        dataset.query(attributes=['ensembl_gene_id', 'hgnc_symbol'],
                      filters={'link_ensembl_gene_id': gene_ids[:half]}),
        dataset.query(attributes=['ensembl_gene_id', 'hgnc_symbol'],
                      filters={'link_ensembl_gene_id': gene_ids[half:]}),
    ], sort=False)
    df.drop_duplicates(subset=['HGNC symbol'], inplace=True)
    df.dropna(inplace=True)
    my_genes = df['HGNC symbol']
    my_genes.to_csv(outfile, index=False, header=False)
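# A minimal sketch (not part of the original code) generalizing the split-query
# pattern above to fixed-size chunks; the halving in get_geneset() presumably
# keeps each BioMart filter list short enough for the server. `chunk_size` is
# an assumed, tunable parameter.
import pandas as pd
from pybiomart import Dataset

def query_in_chunks(dataset, gene_ids, chunk_size=500):
    """Query BioMart in chunks of `chunk_size` IDs and concatenate the results."""
    frames = [
        dataset.query(
            attributes=['ensembl_gene_id', 'hgnc_symbol'],
            filters={'link_ensembl_gene_id': gene_ids[i:i + chunk_size]})
        for i in range(0, len(gene_ids), chunk_size)
    ]
    return pd.concat(frames, sort=False)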
def biomart(ani_list, ani_dict, out, mode):
    # server = Server(host='http://www.ensembl.org')
    # mart = server['ENSEMBL_MART_ENSEMBL']
    # all_name = mart.list_datasets()
    for animal in ani_list:
        dataset = Dataset(name=ani_dict[animal], host='http://www.ensembl.org')
        # dataset.list_filters
        # attr_all_list = dataset.attributes()
        if mode == "GO":
            print("Downloading " + animal + " Gene information")
            if not os.path.exists(out + "/GO"):
                os.mkdir(out + "/GO")
            attr_list = ["ensembl_gene_id", "external_gene_name",
                         "start_position", "end_position", "description",
                         "transcript_count", "chromosome_name"]
            df = dataset.query(attributes=attr_list)
            df.to_csv(out + "/GO/" + animal + "_GO.txt", index=None, header=True)
        elif mode == "GOD":
            print("Downloading " + animal + " Gene Ontology")
            if not os.path.exists(out + "/GOD"):
                os.mkdir(out + "/GOD")
            attr_list = ["ensembl_gene_id", "go_id", "name_1006", "definition_1006"]
            df = dataset.query(attributes=attr_list)
            df.to_csv(out + "/GOD/" + animal + "_GOD.txt", sep='\t',
                      index=None, header=True)
        elif mode == "ORTH":
            print("Downloading " + animal + " Orthologs")
            if not os.path.exists(out + "/ORTH"):
                os.mkdir(out + "/ORTH")
            orth_list = list(ani_dict.keys())
            for o in orth_list:
                if not os.path.exists(out + "/ORTH/" + o):
                    os.mkdir(out + "/ORTH/" + o)
            orth_list.remove(animal)
            sp_list = []
            for key in orth_list:
                sp_name = ani_dict[key].split("_")[0]
                sp_list.append(sp_name)
            for sp in sp_list:
                attr_list = ["ensembl_gene_id", "external_gene_name",
                             sp + "_homolog_ensembl_gene",
                             sp + "_homolog_associated_gene_name",
                             sp + "_homolog_orthology_type"]
                df = dataset.query(attributes=attr_list)
                # map the species prefix back to its animal label in ani_dict
                f = list(ani_dict.keys())[
                    list(ani_dict.values()).index(sp + "_gene_ensembl")]
                print("Downloading orthologs between " + animal + " and " + f)
                df.to_csv(out + "/ORTH/" + animal + "/" + animal + "_" + f + ".txt",
                          index=None, header=True)
        elif mode == "PC":
            print("Downloading protein coding genes information for " + animal)
            if not os.path.exists(out + "/PC"):
                os.mkdir(out + "/PC")
            attr_list = ["ensembl_gene_id", "external_gene_name", "go_id"]
            filter_list = {"biotype": ["protein_coding"]}
            df = dataset.query(attributes=attr_list, filters=filter_list)
            df.to_csv(out + "/PC/" + animal + ".txt", index=None, header=True)
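# Hypothetical usage of biomart() above. The dataset names are real Ensembl
# BioMart datasets, but the animal labels and output directory are assumptions,
# and `out` must already exist because the function only creates its
# subdirectories.
import os

animal_dict = {
    "Cow": "btaurus_gene_ensembl",
    "Pig": "sscrofa_gene_ensembl",
    "Chicken": "ggallus_gene_ensembl",
}
os.makedirs("downloads", exist_ok=True)
biomart(list(animal_dict.keys()), animal_dict, out="downloads", mode="GO")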
def get_ensembl_table():
    dataset = Dataset(name='hsapiens_gene_ensembl', host='http://www.ensembl.org')
    table = dataset.query(
        attributes=['ensembl_gene_id', 'external_gene_name', 'unigene'])
    return table
def test_ensembl(self):
    """Tests example query to ensembl."""
    dataset = Dataset(name='hsapiens_gene_ensembl',
                      host='http://www.ensembl.org',
                      use_cache=False)
    result = dataset.query(
        attributes=['ensembl_gene_id', 'external_gene_name'])
    assert result.shape[0] > 0
    assert result.shape[1] == 2
def get_biomart(species, meta):
    tmp_host = 'http://asia.ensembl.org'
    server = Server(host=tmp_host)
    query_set = None
    try:
        dataset = Dataset(name=species, host=tmp_host)
        if meta:
            query_set = dataset.query(attributes=[
                'ensembl_gene_id', 'external_gene_name', 'description',
                'uniprotswissprot', 'kegg_enzyme', 'metacyc'
            ])
        else:
            query_set = dataset.query(attributes=[
                'ensembl_gene_id', 'external_gene_name', 'description',
                'uniprotswissprot', 'kegg_enzyme'
            ])
    except IndexError:
        mart = server['ENSEMBL_MART_ENSEMBL']
        print('Invalid dataset in BioMart')
        print(mart.list_datasets())
    return query_set
def gene_length_normalize(*, genes_info, genes_col='HGNC symbol',
                          length_col='gene_length', scores_df, samples_col):
    """
    Normalize a dataset by gene length. If a gene-lengths file is not
    provided, the information is retrieved from Ensembl.

    Parameters
    ----------
    genes_info : str
        File containing gene lengths. If not provided, the information is
        retrieved from Ensembl.
    genes_col : str
        Column containing gene names.
    length_col : str
        Column containing the length of each gene.
    scores_df : pd.DataFrame
        Dataframe containing the data to normalize.
    samples_col : str
        Column containing sample IDs.

    Returns
    -------
    pd.DataFrame
        The normalized dataframe.
    """
    unnormalized = []
    if not genes_info:
        dataset = Dataset(name='hsapiens_gene_ensembl',
                          host='http://www.ensembl.org')
        genes_df = dataset.query(
            attributes=['hgnc_symbol', 'start_position', 'end_position'])
        genes_df['gene_length'] = (genes_df['Gene end (bp)']
                                   - genes_df['Gene start (bp)'])
    else:
        genes_df = pd.read_csv(genes_info, sep='\t')
    genes_lengths = genes_df.set_index(genes_col).to_dict()[length_col]
    # .items() is the pandas 2.x equivalent of the removed .iteritems()
    for (name, data) in tqdm(scores_df.drop(columns=[samples_col]).items(),
                             desc="Normalizing genes scores"):
        if name not in genes_lengths.keys():
            unnormalized.append(name)
            continue
        # normalize genes by length
        scores_df[name] = round(scores_df[name] / genes_lengths[name], 5)
    scores_df = scores_df.drop(unnormalized, axis=1)
    return scores_df
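# Hypothetical usage of gene_length_normalize(): the sample IDs, gene columns,
# and lengths below are made up to show the expected input shape.
import pandas as pd

scores = pd.DataFrame({
    'sample_id': ['s1', 's2'],
    'TP53': [10.0, 20.0],
    'BRCA1': [30.0, 40.0],
})
lengths = pd.DataFrame({'HGNC symbol': ['TP53', 'BRCA1'],
                        'gene_length': [25768, 81189]})
lengths.to_csv('gene_lengths.tsv', sep='\t', index=False)
normalized = gene_length_normalize(genes_info='gene_lengths.tsv',
                                   scores_df=scores, samples_col='sample_id')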
def get_homology_lookup() -> pd.DataFrame:
    """
    Returns a lookup table consisting of the Ensembl ID of the reference
    species (C. elegans) and the Ensembl ID, gene symbol, orthology type and
    orthology confidence of the other species (D. melanogaster).
    """
    dataset = Dataset(name=CELEGANS_DATASET_NAME, host=HOST)
    attributes = [ENSEMBL_ID_ATTRIBUTE] + DROSO_HOMO_ATTRIBUTES
    df_lookup = dataset.query(attributes=attributes, filters=None)
    df_lookup.to_csv(LOOKUP_FILENAME, header=True, index=True)
    return df_lookup
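# The module-level constants used by get_homology_lookup() are not shown in
# this snippet. Plausible values might look like the following (an assumption,
# though the homolog attribute names follow Ensembl's <species>_homolog_*
# naming scheme, as also used in the biomart() function above):
HOST = 'http://www.ensembl.org'
CELEGANS_DATASET_NAME = 'celegans_gene_ensembl'
ENSEMBL_ID_ATTRIBUTE = 'ensembl_gene_id'
DROSO_HOMO_ATTRIBUTES = [
    'dmelanogaster_homolog_ensembl_gene',
    'dmelanogaster_homolog_associated_gene_name',
    'dmelanogaster_homolog_orthology_type',
    'dmelanogaster_homolog_orthology_confidence',
]
LOOKUP_FILENAME = 'celegans_dmelanogaster_homology_lookup.csv'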
def get_ref_proteins(gene_list):
    """Get wild type protein sequences for each gene with an alternative junction

    Args:
        gene_list (list): list of gene symbols

    Returns:
        final_gene_df (df): reference protein df with sequence, length, and ID
    """
    dataset = Dataset(name='hsapiens_gene_ensembl',
                      host='http://www.ensembl.org')
    total_gene_df = pd.DataFrame()
    gene_info = dataset.query(attributes=[
        "external_gene_name", "ensembl_gene_id", "ensembl_transcript_id",
        "ensembl_peptide_id", "chromosome_name", "start_position",
        "end_position", "strand", "transcript_start", "transcript_end",
        "transcription_start_site", "transcript_length", "transcript_tsl",
        "transcript_biotype"
    ])
    for gene in gene_list:
        gene_df = gene_info.loc[gene_info["Gene name"] == gene]
        # filter out NaN values
        gene_df = gene_df.loc[gene_df["Transcript support level (TSL)"]
                              .astype(str).str.contains("tsl")]
        gene_df["tsl"] = [
            re.search(r'\d', x).group()
            for x in list(gene_df["Transcript support level (TSL)"])
        ]
        # filter by protein coding and TSL == 1,2
        gene_df = gene_df[(gene_df["Transcript type"] == "protein_coding")
                          & (gene_df["tsl"].isin(["1", "2"]))]
        # fetch each protein sequence from the Ensembl REST API
        gene_df["protein sequence"] = [
            ensembl_rest.sequence_id(x)["seq"]
            for x in list(gene_df["Protein stable ID"])
        ]
        gene_df["protein length"] = [
            len(x) for x in list(gene_df["protein sequence"])
        ]
        final_gene_df = gene_df[[
            "Protein stable ID", "protein sequence", "protein length"
        ]]
        final_gene_df["gene"] = gene
        # pd.concat replaces the DataFrame.append() removed in pandas 2.0
        total_gene_df = pd.concat([total_gene_df, final_gene_df],
                                  ignore_index=True)
    total_gene_df.to_csv("protein_sequences.tsv", sep='\t', index=False)
    return total_gene_df
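# Hypothetical call: fetch reference protein sequences for two example genes.
# Note that each retained protein triggers one Ensembl REST request, so long
# gene lists will be slow.
ref_proteins = get_ref_proteins(["TP53", "BRCA1"])
print(ref_proteins.head())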
def get_species_ens_entrez_lookup(dataset_name: str) -> pd.DataFrame:
    """
    Returns a lookup table for an Ensembl dataset name with two columns:
    Ensembl ID and Entrez ID.
    """
    dataset = Dataset(name=dataset_name, host=HOST)
    df_lookup = dataset.query(
        attributes=[ENSEMBL_ID_ATTRIBUTE, ENTREZ_ID_ATTRIBUTE], filters=None)
    df_lookup.to_csv(RESULTS_DIR / ("{}_ENS_ENTREZ_LOOKUP_.csv"
                                    .format(dataset_name.split("_")[0].upper())),
                     header=True, index=True)
    return df_lookup
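# Hypothetical usage. ENTREZ_ID_ATTRIBUTE would plausibly be 'entrezgene_id'
# (the BioMart attribute for NCBI gene IDs, used elsewhere in this collection)
# and RESULTS_DIR a pathlib.Path; both are assumptions, as the constants are
# not shown in the snippet above.
from pathlib import Path

ENTREZ_ID_ATTRIBUTE = 'entrezgene_id'
RESULTS_DIR = Path('results')
RESULTS_DIR.mkdir(exist_ok=True)
lookup = get_species_ens_entrez_lookup('celegans_gene_ensembl')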
outF = open(evalign_stat, "w")
print("Position\tIsoform\tGene_ID\tReads\tEvent_mean\tEvent_median\tSD\tDistance",
      end="\n", file=outF)
# keys are '<gene>_<isoform>_<position>'; write one summary row per key
for k, v in data_dict.items():
    print(k.split('_')[2] + '\t' + k.split('_')[1] + '\t' + k.split('_')[0]
          + '\t' + str(len(v)) + '\t' + str(np.mean(v)) + '\t'
          + str(np.median(v)) + '\t' + str(np.std(v)) + '\t'
          + str(np.mean(v) - 123.83),
          end="\n", file=outF)
evalign_file.close()
isoform_file.close()
outF.close()
outF2.close()

dataset = Dataset(name='hsapiens_gene_ensembl', host='http://www.ensembl.org')
conversion = dataset.query(
    attributes=['ensembl_gene_id', 'external_gene_name'])
conversion.columns = ['Gene_ID', 'Gene_symbol']
for file in [evalign_stat, evalign_df]:
    df = pd.read_csv(file, sep='\t')
    df_merge = pd.merge(df, conversion, how='inner', on=['Gene_ID'])
    df_merge.to_csv(file, header=True, index=False, sep='\t')
def biomart(ani_list, ani_dict, out, mode):
    # server = Server(host='http://www.ensembl.org')
    # mart = server['ENSEMBL_MART_ENSEMBL']
    # all_name = mart.list_datasets()
    # attr_all_list = dataset.attributes()
    if mode == "Bos_Chromosome_18":
        dataset = Dataset(name="btaurus_gene_ensembl",
                          host='http://www.ensembl.org')
        print("Downloading " + mode + " Gene information")
        if not os.path.exists(path + "temp_Data/Bos_Chromosome_18/"):
            os.mkdir(path + "temp_Data/Bos_Chromosome_18/")
        attr_list = ["ensembl_gene_id"]
        filter_list = {'chromosome_name': ['18']}
        df = dataset.query(attributes=attr_list, filters=filter_list)
        df.to_csv(path + "temp_Data/Bos_Chromosome_18/Cow_C_18.txt",
                  index=None, header=True)
    if mode == "sex":
        for ani in ani_list:
            dataset = Dataset(name=ani_dict[ani], host='http://www.ensembl.org')
            print("Downloading " + ani + mode + " Gene information")
            if not os.path.exists(path + "temp_Data/sex/"):
                os.mkdir(path + "temp_Data/sex/")
            attr_list = ["ensembl_gene_id"]
            if ani == "Chicken":
                filter_list = {'chromosome_name': ['W', 'Z']}
                df = dataset.query(attributes=attr_list, filters=filter_list)
                df.to_csv(path + "temp_Data/sex/" + ani + "_sex.txt",
                          index=None, header=True)
                print(ani + "W Z")
            else:
                try:
                    # assume X and Y were intended here (matching the "X Y"
                    # label below); fall back to X only if the query fails
                    filter_list = {'chromosome_name': ["X", "Y"]}
                    df = dataset.query(attributes=attr_list,
                                       filters=filter_list)
                    df.to_csv(path + "temp_Data/sex/" + ani + "_sex.txt",
                              index=None, header=True)
                    print(ani + "X Y")
                except Exception:
                    filter_list = {'chromosome_name': ["X"]}
                    df = dataset.query(attributes=attr_list,
                                       filters=filter_list)
                    df.to_csv(path + "temp_Data/sex/" + ani + "_sex.txt",
                              index=None, header=True)
                    print(ani + "X")
    if mode == "MT":
        for ani in ani_list:
            dataset = Dataset(name=ani_dict[ani], host='http://www.ensembl.org')
            print("Downloading " + ani + mode + " Gene information")
            if not os.path.exists(path + "temp_Data/MT/"):
                os.mkdir(path + "temp_Data/MT/")
            attr_list = ["ensembl_gene_id"]
            try:
                filter_list = {'chromosome_name': ["MT"]}
                df = dataset.query(attributes=attr_list, filters=filter_list)
                df.to_csv(path + "temp_Data/MT/" + ani + "_MT.txt",
                          index=None, header=True)
            except Exception:
                print("NO Mitochondrial genes in {}".format(ani))
def handle_upload_2(fn):
    patients = []
    patients1 = []
    patients2 = []
    genes = []
    geneNames = []
    # data = {}
    data1 = {}
    data2 = {}
    group1 = []
    group2 = []
    group_labels1 = []
    group_labels2 = []
    group1_data = []
    group2_data = []
    # patient_ids = ['3dbe99d1-e3b8-4ee2-b6a8-2e2e12c6fbe9', '6F4C8D30-47FB-47DF-9EB7-4E5881E3711E',
    #                '95CEF916-5545-455B-920C-773A54FC7676', '67C73260-A242-4BBA-87C5-D2302556DFF7',
    #                '55262FCB-1B01-4480-B322-36570430C917', '3dbe99d1-e3b8-4ee2-b6a8-2e2e12c6fbe9',
    #                '6F4C8D30-47FB-47DF-9EB7-4E5881E3711E', '95CEF916-5545-455B-920C-773A54FC7676',
    #                '67C73260-A242-4BBA-87C5-D2302556DFF7', '55262FCB-1B01-4480-B322-36570430C917']
    # patientfilename = 'nationwidechildrens.org_clinical_patient_brca.txt'
    patients.append([1, 2, 3, 4])
    patients1.append(['1', '2'])
    # patients.append(['3','4'])
    patients2.append(['3', '4'])
    group_labels1.append([1, 1])
    group_labels2.append([2, 2])
    logstring = "Creating Plots for given input files... \n\n"
    logstring += "Reading gene expression data... \n"
    line_no = 0
    patient_ids = []
    survival = []
    survival_yrs = []
    data = []
    # group2.append(group_labels2)
    group1.append([1, 1])
    group2.append([2, 2])
    genes = ()
    dataset = Dataset(name='hsapiens_gene_ensembl', host='http://www.ensembl.org')
    conv = dataset.query(attributes=['ensembl_gene_id', 'external_gene_name'])
    genes_3 = {}
    logstring += "\n\n Matching gene and protein IDs... \n"
    red_patch = mpatches.Patch(color='red', label='Group1')
    blue_patch = mpatches.Patch(color='blue', label='Group2')
    # lut = dict(zip(set(endData[0]), sns.hls_palette(len(set(endData[0])), l=0.5, s=0.8)))
    # col_colors = pd.DataFrame(endData[0])[0].map(lut)
    # print(col_colors)
    # colors = np.array(['#BB0000','#BB0000','#0000BB','#0000BB'])
    # df9 = pd.DataFrame(data=endData[1:,0:],index=geneNames,columns=patients)
    # df2 = pd.DataFrame(data=endData[0,0:], index='',columns=patients)
    # my_palette = dict(zip(df[.unique(), ["orange","yellow","brown"]))
    # row_colors = df2.cyl.map(my_palette)
    # fig, (ax1, ax2) = plt.subplots(1,2,sharex=True,sharey=True)
    colordict = {0: '#BB0000', 1: '#0000BB'}
    # logstring = logstring + str(df9)
    df2 = pd.read_csv(fn, delim_whitespace=True, header=None, index_col=0)
    # print(df2.head())
    df = df2.transpose()
    survival_real = df['SURVIVAL']
    # df['is_train'] = np.random.uniform(0, 1, len(df)) <= .75
    # train, test = df[df['is_train']==True], df[df['is_train']==False]
    features = df.columns[3:60483]
    x = df.iloc[:, 5:500]
    y = df.iloc[:, 2]
    print(df)
    rm = linear_model.LinearRegression()
    rm.fit(x, y)
    # print(rm.intercept_)
    # print(rm.coef_)
    # print(rm.predict(x))
    predictions = rm.predict(x)
    real_values = df.iloc[:, 2].values.tolist()
    ret = []
    for j in range(1, len(survival_real)):
        accur = "FALSE"
        if abs(float(predictions[j]) - float(real_values[j])) < 1.0:
            print("Foo")
            accur = "TRUE"
        ret.append({
            'patient_id': j,
            'real_value': real_values[j],
            'prediction': predictions[j],
            'was_correct': accur
        })
    return ret
######################################## Main code ########################################
print("Start:", datetime.datetime.now())
EnsemblRestClient = EnsemblRestClient()

### DOWNLOAD BASIC GENE INFORMATION FROM ENSEMBL (ID, CHROMOSOME, POSITION, STRAND, BIOTYPE)
dataset = Dataset(name='hsapiens_gene_ensembl', host='http://grch37.ensembl.org')
ensembl_genes = dataset.query(
    attributes=['ensembl_gene_id', 'chromosome_name', 'start_position',
                'end_position', 'strand', 'gene_biotype'],
    filters={'chromosome_name': [str(c) for c in range(1, 23)] + ['X', 'Y', 'MT']})

regions = []
for line in ensembl_genes.iterrows():
    index, data = line
    columns = data.tolist()
    regions.append({
        'id': columns[0],
        'chromosome': columns[1],
        'start': int(columns[2]),
        'end': int(columns[3]),
        'strand': int(columns[4]),
        'biotype': columns[5],
    })
    else:
        return y


if __name__ == '__main__':
    #
    # [H]omo [S]apiens (9606) - [A]liases
    #
    print('Mapping HS')
    # Query bioMart for Gene Name/Description
    ds_HS = Dataset(name='hsapiens_gene_ensembl', host='http://www.ensembl.org')
    df_HS_G = ds_HS.query(attributes=[
        'ensembl_gene_id', 'external_gene_name', 'gene_biotype', 'description'
    ]).set_index('Gene stable ID')

    rCSVFileCG = "../01-diff-gene-exp/results/HS/HS-DGE_Cyte_vs_Gonia.csv"
    rCSVFileCT = "../01-diff-gene-exp/results/HS/HS-DGE_Tid_vs_Cyte.csv"
    df_HS_CG = pd.read_csv(rCSVFileCG, index_col=0).loc[:, ['logFC', 'logCPM', 'FDR']]
    df_HS_CG.index.name = 'id_gene'
    df_HS_CG.index = df_HS_CG.index.map(lambda x: x.split('.')[0])
    df_HS_CG.columns = [x + '_CyteGonia' for x in df_HS_CG.columns]
    df_HS_CT = pd.read_csv(rCSVFileCT, index_col=0).loc[:, ['logFC', 'logCPM', 'FDR']]
    df_HS_CT.columns = [x + '_TidCyte' for x in df_HS_CT.columns]
    df_HS_CT.index.name = 'id_gene'
    df_HS_CT.index = df_HS_CT.index.map(lambda x: x.split('.')[0])
# Now pull the annotations and store them in a map
# Tomato
anotacionesTomato = {}
datasetTomato = Dataset(name='slycopersicum_eg_gene',
                        virtual_schema='plants_mart',
                        host='http://plants.ensembl.org')
# For each gene, issue a query to BioMart
for genSnapdragon in matchesTomato:
    print('tomato')
    genTomato, evalue = matchesTomato[genSnapdragon]
    resultTomato = datasetTomato.query(
        attributes=['ensembl_gene_id', 'ensembl_transcript_id', 'go_id',
                    'go_linkage_type', 'namespace_1003'],
        filters={'link_ensembl_transcript_stable_id': genTomato})
    # If a match was found:
    if len(resultTomato) > 0:
        # check for NaN values
        lr = resultTomato['GO term accession'].tolist()
        lr = [(e, evalue) for e in lr if str(e) != 'nan']
        # if any annotations remain, store them
        if len(lr) > 0:
            anotacionesTomato[genSnapdragon] = lr

# Thaliana
anotacionesThaliana = {}
import math

import mygene
from pybiomart import Dataset

mg = mygene.MyGeneInfo()

# fh1 = open("test_gene_list_2.txt")
# lines = fh1.readlines()
# for line in lines:
#     print(line.split("\t")[1])
#     print(mg.getgene(line.split("\t")[1], fields='all'))

dataset = Dataset(name='hsapiens_gene_ensembl', host='http://www.ensembl.org')
conv = dataset.query(
    attributes=['ensembl_gene_id', 'external_gene_name', 'entrezgene'])
convlist = conv['Gene name'].tolist()
text_file = open("test_go_1.txt", "w")
fh1 = open("test_go_list.txt")
lines = fh1.readlines()
term_dict = {}
ctr = 0
for line in lines:
    if ctr == 50000:
        print(ctr)
        print(term_dict)
        ctr = 0
        for key in term_dict:
            # NaN never compares equal to anything (so `key != float('nan')`
            # is always True); test for NaN explicitly with math.isnan
            if not (isinstance(key, float) and math.isnan(key)):
                text_file.write(
def read_ndex_file_4(fn):
    """
    Given an input string/file, parse the network and return a two-column
    array with interaction partners
    :param fn: Input NDEx file as string
    :return: Printed as strings, two-column array with interaction partners
    """
    lines6 = ""
    # read edges and nodes into arrays
    if "edges" in fn.split("nodes")[1]:
        lines5 = fn.split("{\"nodes\":[")
        lines3 = lines5[1].split("{\"edges\":[")[0]
        # remove "cyTableColumn" from array containing edges
        if "cyTableColumn" in lines5[1]:
            lines4 = lines5[1].split("{\"edges\":[")[1].split(
                "{\"cyTableColumn\":[")[0]
            lines4 = lines4[:-4]
        # take protein name from networkAttributes or nodeAttributes if it is
        # defined there.
        elif "networkAttributes" in lines5[1]:
            lines4 = lines5[1].split("{\"edges\":[")[1].split(
                "{\"networkAttributes\":[")[0]
            lines4 = lines4[:-4]
            after_net_attrs = lines5[1].split("{\"edges\":[")[1].split(
                "{\"networkAttributes\":[")[1]
            if ("nodeAttributes" in after_net_attrs
                    and "UniprotName" in after_net_attrs):
                lines6_temp = after_net_attrs.split("{\"nodeAttributes\":[")[1]
                lines6 = lines6_temp.split("{\"edgeAttributes\":[")[0]
        else:
            lines4 = lines5[1].split("{\"edges\":[")[1]
    # check if edge-array comes before node-array in file
    elif "edges" in fn.split("nodes")[0]:
        lines5 = fn.split("{\"nodes\":[")
        lines3 = lines5[1].split("]},")[0] + "]]]"
        lines4 = lines5[0].split("{\"edges\":[")[1][:-4]

    # lines3 contains the nodes, lines4 the edges, lines6 contains
    # nodeAttributes (information from the ndex file usable for the conversion
    # from node IDs to gene IDs)

    # remove signs to allow automatic json to array conversion
    # (str.replace returns a new string, so the results must be reassigned)
    lines3 = (lines3.replace("@", "").replace("uniprot:", "uniprot")
              .replace("signor:", "signor").replace(" ", "")
              .replace("ncbigene:", "").replace("\\n", ""))
    lines33 = lines3[:-3].replace("}]", "")
    node_line = lines33.replace("ncbigene:", "")
    nodelinesplit = node_line.split(", ")
    dictlist = []
    # node_dict is later filled with keys (node IDs) whose values are NCBI
    # gene IDs
    node_dict = {}
    if not node_line.endswith("}"):
        node_line = node_line + "}"
    node_line_2 = "[" + node_line + "]"
    tmp2 = json.loads(node_line_2)
    node_dict_2 = {}
    # iterate over lines in nodeAttributes
    if not lines6 == "":
        lines6 = "[" + lines6
        # get array with nodeAttributes for current line
        tmp4 = json.loads(lines6[:-4])
        # if node element has attribute "GeneName_A", then the NCBI ID is
        # given in the nodeAttributes
        for item in tmp4:
            if item['n'] == "GeneName_A":
                # use node ID and NCBI ID
                node_dict_2[item['po']] = item['v']

    # conversion table between gene names and NCBI IDs (entrezgene_id)
    dataset = Dataset(name='hsapiens_gene_ensembl',
                      host='http://www.ensembl.org')
    conv = dataset.query(
        attributes=['ensembl_gene_id', 'external_gene_name', 'entrezgene_id'])
    conv_genelist = conv['Gene name'].tolist()
    for item in tmp2:
        dictlist.append(item)
        # write conversion from node ID to gene ID in dictionary, based on
        # nodeAttributes from the data
        if 'r' in item:
            if any(c.islower() for c in item['r']):
                gene_name = item['n']
                if gene_name in conv_genelist:
                    gene_nbr = conv.index[conv['Gene name'] == gene_name]
                    gene_nbr1 = conv.loc[gene_nbr, 'NCBI gene ID'].values[0]
                    node_dict[item['@id']] = gene_nbr1
            else:
                node_dict[item['@id']] = item['r']
        else:
            if item['n'].isdigit():
                # gene ID is already given in the node attributes
                node_dict[item['@id']] = item['n']
            elif item['n'] in node_dict_2:
                # otherwise use conversion table to convert gene ID to NCBI ID
                gene_name = node_dict_2[item['n']]
                if gene_name in conv_genelist:
                    gene_nbr = conv.index[conv['Gene name'] == gene_name]
                    gene_nbr1 = conv.loc[gene_nbr, 'NCBI gene ID'].values[0]
                    node_dict[item['@id']] = gene_nbr1

    # remove signs from string to allow json conversion
    lines4 = (lines4.replace("@", "").replace("uniprot:", "uniprot")
              .replace("signor:", "signor").replace(" ", ""))
    lines4 = lines4.replace("]", "")
    edge_line = lines4.rstrip()
    edge_line_2 = "[" + edge_line + "]"
    edgelinesplit = edge_line.split(", ")
    edgelist = []
    tmp4 = json.loads(edge_line_2)
    # `conv` (Gene name / NCBI gene ID) from the query above is reused here
    ret = []
    # convert node IDs in edges to NCBI IDs
    for item in tmp4:
        if item['s'] in node_dict and item['t'] in node_dict:
            source = node_dict[item['s']]
            target = node_dict[item['t']]
            if (source != target and not math.isnan(float(source))
                    and not math.isnan(float(target))):
                baz = [str(int(source)), str(int(target))]
                ret.append("\t".join(baz))
    return "\n".join(ret)
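# Hypothetical usage of read_ndex_file_4(): read the raw CX/JSON text of a
# downloaded NDEx network and print the extracted edge list ("network.cx" is
# a placeholder filename).
with open("network.cx") as handle:
    cx_text = handle.read()
edge_list = read_ndex_file_4(cx_text)
print(edge_list)  # one "source\ttarget" NCBI gene ID pair per line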
modules = data[celltype][layer]
# for specie in species
print('Calculating GOEA on {celltype:s} {network:s} {threshold:.1f} {layer:s}'.format(
    celltype=celltype, network=network, threshold=threshold, layer=layer))

# Load Gene Population
print("Load gene population (from biomart)")
datamart_name = dict_datamart_names[layer]
ds = Dataset(name=datamart_name, host='http://www.ensembl.org')
if layer == 'DM':
    attributes = ['ensembl_gene_id', 'uniprotswissprot', 'external_gene_name']
elif layer == 'MM':
    attributes = ['ensembl_gene_id', 'uniprotswissprot', 'mgi_id',
                  'external_gene_name']
elif layer == 'HS':
    attributes = ['ensembl_gene_id', 'uniprotswissprot', 'hmmpanther',
                  'external_gene_name']
dfQ = ds.query(attributes=attributes).set_index('Gene stable ID')

# Population of genes (background) to test against
if layer == 'DM':
    pop_flybase = set(dfQ.index.tolist())
    pop_uniprot = set(dfQ['UniProtKB/Swiss-Prot ID'].dropna().tolist())
    pop = pop_flybase.union(pop_uniprot)
elif layer == 'MM':
    pop_mgi = set(dfQ['MGI ID'].dropna().tolist())
    pop_uniprot = set(dfQ['UniProtKB/Swiss-Prot ID'].dropna().tolist())
    pop = pop_mgi.union(pop_uniprot)
elif layer == 'HS':
    pop_uniprot = set(dfQ['UniProtKB/Swiss-Prot ID'].dropna().tolist())
    pop = pop_uniprot

# Load GO
from pybiomart import Server

server = Server(host='http://www.ensembl.org')
server.list_marts()
mart = server['ENSEMBL_MART_ENSEMBL']

from pybiomart import Dataset

dataset = Dataset(name='hsapiens_gene_ensembl', host='http://www.ensembl.org')
dataset.query(attributes=['ensembl_gene_id', 'external_gene_name'],
              filters={'chromosome_name': ['1', '2']})


# pybiomart's lazy accessors: the dataset configuration (filters and
# attributes) is fetched from the server on first access, then cached.
def attributes(self):
    if self._attributes is None:
        self._filters, self._attributes = self._fetch_configuration()
    return self._attributes


dataset.attributes
dataset.list_attributes()


def filters(self):
    if self._filters is None:
        self._filters, self._attributes = self._fetch_configuration()
    return self._filters


dataset.filters
dataset.list_filters()
datamart_names = {
    'HS': 'hsapiens_gene_ensembl',
    'MM': 'mmusculus_gene_ensembl',
    'DM': 'dmelanogaster_gene_ensembl'
}
r = []
for specie in species:
    print("Calculating for species: {specie:s}".format(specie=specie))
    print("Querying Datamart")
    datamart_name = datamart_names[specie]
    ds = Dataset(name=datamart_name, host='http://www.ensembl.org')
    dfQ = ds.query(attributes=[
        'ensembl_gene_id', 'external_gene_name', 'gene_biotype'
    ]).set_index('Gene stable ID')
    # n_genome = len(dfQ)
    dfQpc = dfQ.loc[(dfQ['Gene type'] == 'protein_coding'), :]
    n_genome_pc = len(dfQpc)
    dfQnpc = dfQ.loc[(dfQ['Gene type'] != 'protein_coding'), :]
    n_genome_non_pc = len(dfQnpc)
    print('done.')
    for celltype in celltypes:
        print("Calculating for celltype: {celltype:s}".format(
            celltype=celltype))
        rFPKMfile = '../../02-core_genes/results/FPKM/{specie:s}/{specie:s}-FPKM-{celltype:s}.csv.gz'.format(