def get_geneset(filename):
    indices = [pos for pos, char in enumerate(filename) if char == '.']
    outfile = ('/Volumes/My Book/AHS_projectdata/geneSets2/'
               + filename[:indices[-2]] + '.geneset.csv')
    print('running ', filename[:indices[1]])
    genes = pd.read_csv(filename, sep='\t')
    if genes.shape[0] == 0:
        genes.to_csv(outfile, sep='\t')
        return
    dataset = Dataset(name='hsapiens_gene_ensembl', host='grch37.ensembl.org')
    # Query in two halves to keep the gene ID filter within BioMart request limits
    gene_ids = genes.gene_ID.tolist()
    half = int(len(gene_ids) / 2)
    df = pd.concat([
        dataset.query(
            attributes=['ensembl_gene_id', 'hgnc_symbol'],
            filters={'link_ensembl_gene_id': gene_ids[:half]}),
        dataset.query(
            attributes=['ensembl_gene_id', 'hgnc_symbol'],
            filters={'link_ensembl_gene_id': gene_ids[half:]})
    ], sort=False)
    df.drop_duplicates(subset=["HGNC symbol"], inplace=True)
    df.dropna(inplace=True)
    my_genes = df["HGNC symbol"]
    my_genes.to_csv(outfile, index=False, header=False)
def get_ensembl_table():
    dataset = Dataset(name='hsapiens_gene_ensembl',
                      host='http://www.ensembl.org')
    table = dataset.query(
        attributes=['ensembl_gene_id', 'external_gene_name', 'unigene'])
    return table
def test_ensembl(self):
    """Tests example query to ensembl."""
    dataset = Dataset(name='hsapiens_gene_ensembl',
                      host='http://www.ensembl.org',
                      use_cache=False)
    result = dataset.query(
        attributes=['ensembl_gene_id', 'external_gene_name'])
    assert result.shape[0] > 0
    assert result.shape[1] == 2
def gene_length_normalize(*, genes_info, genes_col='HGNC symbol',
                          length_col='gene_length', scores_df, samples_col):
    """
    Normalize a dataset by gene length. If a gene lengths file is not
    provided, the lengths are retrieved from Ensembl.

    Parameters
    ----------
    genes_info : str
        File containing gene lengths. If no file is provided, the lengths
        are retrieved from Ensembl.
    genes_col : str
        Column containing gene names.
    length_col : str
        Column containing the length of each gene.
    scores_df : pd.DataFrame
        Dataframe containing the data to normalize.
    samples_col : str
        Column containing sample IDs.

    Returns
    -------
    pd.DataFrame
        The normalized dataframe.
    """
    unnormalized = []
    if not genes_info:
        dataset = Dataset(name='hsapiens_gene_ensembl',
                          host='http://www.ensembl.org')
        genes_df = dataset.query(
            attributes=['hgnc_symbol', 'start_position', 'end_position'])
        genes_df['gene_length'] = (genes_df['Gene end (bp)']
                                   - genes_df['Gene start (bp)'])
    else:
        genes_df = pd.read_csv(genes_info, sep='\t')
    genes_lengths = genes_df.set_index(genes_col).to_dict()[length_col]
    for (name, data) in tqdm(scores_df.drop(columns=[samples_col]).iteritems(),
                             desc="Normalizing genes scores"):
        if name not in genes_lengths.keys():
            unnormalized.append(name)
            continue
        # normalize genes by length
        scores_df[name] = round(scores_df[name] / genes_lengths[name], 5)
    scores_df = scores_df.drop(unnormalized, axis=1)
    return scores_df
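# Hedged usage sketch (not part of the original source): the toy dataframe and
# the 'lengths.tsv' path are illustrative assumptions, shown only to make the
# keyword-only arguments of gene_length_normalize concrete. Assumes pandas is
# imported as pd, as elsewhere in this file.
scores = pd.DataFrame({
    'sample': ['s1', 's2'],
    'TP53': [10.0, 20.0],
    'BRCA1': [4.0, 8.0],
})
normalized = gene_length_normalize(
    genes_info='lengths.tsv',   # TSV with 'HGNC symbol' and 'gene_length' columns (assumed)
    scores_df=scores,
    samples_col='sample',
)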
def get_homology_lookup() -> pd.DataFrame:
    """
    Returns a lookup table consisting of the Ensembl ID of the reference
    species (C. elegans) and the Ensembl ID, gene symbol, orthology type and
    orthology confidence of the other species (D. melanogaster).
    """
    dataset = Dataset(name=CELEGANS_DATASET_NAME, host=HOST)
    attributes = [ENSEMBL_ID_ATTRIBUTE] + DROSO_HOMO_ATTRIBUTES
    df_lookup = dataset.query(attributes=attributes, filters=None)
    df_lookup.to_csv(LOOKUP_FILENAME, header=True, index=True)
    return df_lookup
def get_ref_proteins(gene_list):
    """Get wild type protein sequences for each gene with an alternative junction

    Args:
        gene_list (list): list of gene symbols

    Returns:
        total_gene_df (df): reference protein df with sequence, length, and ID
    """
    dataset = Dataset(name='hsapiens_gene_ensembl',
                      host='http://www.ensembl.org')
    total_gene_df = pd.DataFrame()
    gene_info = dataset.query(attributes=[
        "external_gene_name", "ensembl_gene_id", "ensembl_transcript_id",
        "ensembl_peptide_id", "chromosome_name", "start_position",
        "end_position", "strand", "transcript_start", "transcript_end",
        "transcription_start_site", "transcript_length", "transcript_tsl",
        "transcript_biotype"
    ])
    for gene in gene_list:
        gene_df = gene_info.loc[gene_info["Gene name"] == gene]
        # filter out NaN values
        gene_df = gene_df.loc[gene_df["Transcript support level (TSL)"]
                              .astype(str).str.contains("tsl")]
        gene_df["tsl"] = [
            re.search(r'\d', x).group()
            for x in list(gene_df["Transcript support level (TSL)"])
        ]
        # filter by protein coding and TSL == 1,2
        gene_df = gene_df[(gene_df["Transcript type"] == "protein_coding")
                          & (gene_df["tsl"].isin(["1", "2"]))]
        gene_df["protein sequence"] = [
            ensembl_rest.sequence_id(x)["seq"]
            for x in list(gene_df["Protein stable ID"])
        ]
        gene_df["protein length"] = [
            len(x) for x in list(gene_df["protein sequence"])
        ]
        final_gene_df = gene_df[[
            "Protein stable ID", "protein sequence", "protein length"
        ]]
        final_gene_df["gene"] = gene
        total_gene_df = total_gene_df.append(final_gene_df, ignore_index=True)
    total_gene_df.to_csv("protein_sequences.tsv", sep='\t', index=False)
    return total_gene_df
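# Hedged usage sketch (not from the original source): the gene symbols below are
# illustrative; the call also writes protein_sequences.tsv as a side effect.
ref_proteins = get_ref_proteins(["DSG3", "RAB18"])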
def get_species_ens_entrez_lookup(dataset_name: str) -> pd.DataFrame:
    """
    Returns a lookup table for an Ensembl dataset name with 2 columns:
    ensembl id, entrez id.
    """
    dataset = Dataset(name=dataset_name, host=HOST)
    df_lookup = dataset.query(
        attributes=[ENSEMBL_ID_ATTRIBUTE, ENTREZ_ID_ATTRIBUTE], filters=None)
    df_lookup.to_csv(RESULTS_DIR / ("{}_ENS_ENTREZ_LOOKUP_.csv"
                                    .format(dataset_name.split("_")[0].upper())),
                     header=True, index=True)
    return df_lookup
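# Hedged usage sketch (illustrative only): assumes HOST, ENSEMBL_ID_ATTRIBUTE,
# ENTREZ_ID_ATTRIBUTE and RESULTS_DIR are defined as module-level constants
# elsewhere in this file; the dataset name follows the usual Ensembl convention.
mouse_lookup = get_species_ens_entrez_lookup('mmusculus_gene_ensembl')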
def get_biomart(species, meta):
    tmp_host = 'http://asia.ensembl.org'
    server = Server(host=tmp_host)
    query_set = None
    try:
        dataset = Dataset(name=species, host=tmp_host)
        if meta:
            query_set = dataset.query(attributes=[
                'ensembl_gene_id', 'external_gene_name', 'description',
                'uniprotswissprot', 'kegg_enzyme', 'metacyc'
            ])
        else:
            query_set = dataset.query(attributes=[
                'ensembl_gene_id', 'external_gene_name', 'description',
                'uniprotswissprot', 'kegg_enzyme'
            ])
    except IndexError:
        mart = server['ENSEMBL_MART_ENSEMBL']
        print('Invalid dataset in BioMart')
        print(mart.list_datasets())
    return query_set
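# Hedged usage sketch (not from the original source): the dataset name below is
# illustrative; meta=True additionally requests the 'metacyc' attribute.
annotations = get_biomart('hsapiens_gene_ensembl', meta=False)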
def biomart(ani_list, ani_dict, out, mode):
    # server = Server(host='http://www.ensembl.org')
    # mart = server['ENSEMBL_MART_ENSEMBL']
    # all_name = mart.list_datasets()
    for animal in ani_list:
        dataset = Dataset(name=ani_dict[animal], host='http://www.ensembl.org')
        # dataset.list_filters
        # attr_all_list = dataset.attributes()
        if mode == "GO":
            print("Downloading " + animal + " Gene information")
            if not os.path.exists(out + "/" + "GO"):
                os.mkdir(out + "/" + "GO")
            attr_list = ["ensembl_gene_id", "external_gene_name",
                         "start_position", "end_position", "description",
                         "transcript_count", "chromosome_name"]
            df = dataset.query(attributes=attr_list)
            df.to_csv(out + "/" + "GO" + "/" + animal + "_GO.txt",
                      index=None, header=True)
        elif mode == "GOD":
            print("Downloading " + animal + " Gene Ontology")
            if not os.path.exists(out + "/" + "GOD"):
                os.mkdir(out + "/" + "GOD")
            attr_list = ["ensembl_gene_id", "go_id", "name_1006",
                         "definition_1006"]
            df = dataset.query(attributes=attr_list)
            df.to_csv(out + "/" + "GOD" + "/" + animal + "_GOD.txt",
                      sep='\t', index=None, header=True)
        elif mode == "ORTH":
            print("Downloading " + animal + " Orthologs")
            if not os.path.exists(out + "/" + "ORTH"):
                os.mkdir(out + "/" + "ORTH")
            orth_list = list(ani_dict.keys())
            for o in orth_list:
                if not os.path.exists(out + "/" + "ORTH" + "/" + o):
                    os.mkdir(out + "/" + "ORTH" + "/" + o)
            orth_list.remove(animal)
            sp_list = list()
            for key in orth_list:
                sp_name = ani_dict[key].split("_")[0]
                sp_list.append(sp_name)
            for sp in sp_list:
                attr_list = ["ensembl_gene_id", "external_gene_name",
                             sp + "_homolog_ensembl_gene",
                             sp + "_homolog_associated_gene_name",
                             sp + "_homolog_orthology_type"]
                df = dataset.query(attributes=attr_list)
                f = list(animal_name_dict.keys())[
                    list(animal_name_dict.values()).index(sp + "_gene_ensembl")]
                print("Downloading orthologs between " + animal + " and " + f)
                df.to_csv(out + "/" + "ORTH" + "/" + animal + "/" + animal
                          + "_" + f + ".txt", index=None, header=True)
        elif mode == "PC":
            print("Downloading protein coding genes information for " + animal)
            if not os.path.exists(out + "/" + "PC"):
                os.mkdir(out + "/" + "PC")
            attr_list = ["ensembl_gene_id", "external_gene_name", "go_id"]
            filter_list = {"biotype": ["protein_coding"]}
            df = dataset.query(attributes=attr_list, filters=filter_list)
            df.to_csv(out + "/" + "PC" + "/" + animal + ".txt",
                      index=None, header=True)
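# Hedged usage sketch (not from the original source): the species dictionary
# below is an illustrative guess at the expected ani_dict format, where values
# are Ensembl BioMart dataset names whose prefix is reused to build homolog
# attribute names. Assumes the output directory "output" already exists.
ani_dict = {
    "Human": "hsapiens_gene_ensembl",
    "Mouse": "mmusculus_gene_ensembl",
}
biomart(list(ani_dict.keys()), ani_dict, "output", "GO")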
def biomart(ani_list, ani_dict, out, mode):
    # server = Server(host='http://www.ensembl.org')
    # mart = server['ENSEMBL_MART_ENSEMBL']
    # all_name = mart.list_datasets()
    # attr_all_list = dataset.attributes()
    if mode == "Bos_Chromosome_18":
        dataset = Dataset(name="btaurus_gene_ensembl",
                          host='http://www.ensembl.org')
        print("Downloading " + mode + " Gene information")
        if not os.path.exists(path + "temp_Data/" + "Bos_Chromosome_18/"):
            os.mkdir(path + "temp_Data/" + "Bos_Chromosome_18/")
        attr_list = ["ensembl_gene_id"]
        filter_list = {'chromosome_name': ['18']}
        df = dataset.query(attributes=attr_list, filters=filter_list)
        df.to_csv(path + "temp_Data/" + "Bos_Chromosome_18/" + "Cow_C_18.txt",
                  index=None, header=True)
    if mode == "sex":
        for ani in animal_list:
            dataset = Dataset(name=ani_dict[ani],
                              host='http://www.ensembl.org')
            print("Downloading " + ani + mode + " Gene information")
            if not os.path.exists(path + "temp_Data/" + "sex/"):
                os.mkdir(path + "temp_Data/" + "sex/")
            attr_list = ["ensembl_gene_id"]
            if ani == "Chicken":
                filter_list = {'chromosome_name': ['W', "Z"]}
                df = dataset.query(attributes=attr_list, filters=filter_list)
                df.to_csv(path + "temp_Data/" + "sex/" + ani + "_sex.txt",
                          index=None, header=True)
                print(ani + "W Z")
            else:
                try:
                    filter_list = {'chromosome_name': ["X"]}
                    df = dataset.query(attributes=attr_list,
                                       filters=filter_list)
                    df.to_csv(path + "temp_Data/" + "sex/" + ani + "_sex.txt",
                              index=None, header=True)
                    print(ani + "X Y")
                except:
                    filter_list = {'chromosome_name': ["X"]}
                    df = dataset.query(attributes=attr_list,
                                       filters=filter_list)
                    df.to_csv(path + "temp_Data/" + "sex/" + ani + "_sex.txt",
                              index=None, header=True)
                    print(ani + "X")
    if mode == "MT":
        for ani in animal_list:
            dataset = Dataset(name=ani_dict[ani],
                              host='http://www.ensembl.org')
            print("Downloading " + ani + mode + " Gene information")
            if not os.path.exists(path + "temp_Data/" + "MT/"):
                os.mkdir(path + "temp_Data/" + "MT/")
            attr_list = ["ensembl_gene_id"]
            try:
                filter_list = {'chromosome_name': ["MT"]}
                df = dataset.query(attributes=attr_list, filters=filter_list)
                df.to_csv(path + "temp_Data/" + "MT/" + ani + "_MT.txt",
                          index=None, header=True)
            except:
                print("NO Mitochondrial genes in {}".format(ani))
outF = open(evalign_stat, "w")
print("Position\tIsoform\tGene_ID\tReads\tEvent_mean\tEvent_median\tSD\tDistance",
      end="\n", file=outF)
for k, v in data_dict.items():
    print(k.split('_')[2] + '\t' + k.split('_')[1] + '\t' + k.split('_')[0]
          + '\t' + str(len(v)) + '\t' + str(np.mean(v)) + '\t'
          + str(np.median(v)) + '\t' + str(np.std(v)) + '\t'
          + str(np.mean(v) - 123.83),
          end="\n", file=outF)
evalign_file.close()
isoform_file.close()
outF.close()
outF2.close()

dataset = Dataset(name='hsapiens_gene_ensembl', host='http://www.ensembl.org')
conversion = dataset.query(
    attributes=['ensembl_gene_id', 'external_gene_name'])
conversion.columns = ['Gene_ID', 'Gene_symbol']
for file in [evalign_stat, evalign_df]:
    df = pd.read_csv(file, sep='\t')
    df_merge = pd.merge(df, conversion, how='inner', on=['Gene_ID'])
    df_merge.to_csv(file, header=True, index=False, sep='\t')
def handle_upload_2(fn):
    patients = []
    patients1 = []
    patients2 = []
    genes = []
    geneNames = []
    # data = {}
    data1 = {}
    data2 = {}
    group1 = []
    group2 = []
    group_labels1 = []
    group_labels2 = []
    group1_data = []
    group2_data = []
    # patient_ids = ['3dbe99d1-e3b8-4ee2-b6a8-2e2e12c6fbe9', '6F4C8D30-47FB-47DF-9EB7-4E5881E3711E',
    #                '95CEF916-5545-455B-920C-773A54FC7676', '67C73260-A242-4BBA-87C5-D2302556DFF7',
    #                '55262FCB-1B01-4480-B322-36570430C917', '3dbe99d1-e3b8-4ee2-b6a8-2e2e12c6fbe9',
    #                '6F4C8D30-47FB-47DF-9EB7-4E5881E3711E', '95CEF916-5545-455B-920C-773A54FC7676',
    #                '67C73260-A242-4BBA-87C5-D2302556DFF7', '55262FCB-1B01-4480-B322-36570430C917']
    # patientfilename = 'nationwidechildrens.org_clinical_patient_brca.txt'
    patients.append([1, 2, 3, 4])
    patients1.append(['1', '2'])
    # patients.append(['3','4'])
    patients2.append(['3', '4'])
    group_labels1.append([1, 1])
    group_labels2.append([2, 2])
    logstring = "Creating Plots for given input files... \n\n"
    logstring = "Reading gene expression data... \n"
    line_no = 0
    patient_ids = []
    survival = []
    survival_yrs = []
    data = []
    # group2.append(group_labels2)
    group1.append([1, 1])
    group2.append([2, 2])
    genes = ()
    dataset = Dataset(name='hsapiens_gene_ensembl',
                      host='http://www.ensembl.org')
    conv = dataset.query(attributes=['ensembl_gene_id', 'external_gene_name'])
    genes_3 = {}
    logstring = logstring + "\n\n Matching gene and protein IDs... \n"
    red_patch = mpatches.Patch(color='red', label='Group1')
    blue_patch = mpatches.Patch(color='blue', label='Group2')
    # lut = dict(zip(set(endData[0]), sns.hls_palette(len(set(endData[0])), l=0.5, s=0.8)))
    # col_colors = pd.DataFrame(endData[0])[0].map(lut)
    # print(col_colors)
    # colors = np.array(['#BB0000','#BB0000','#0000BB','#0000BB'])
    # df9 = pd.DataFrame(data=endData[1:,0:],index=geneNames,columns=patients)
    # df2 = pd.DataFrame(data=endData[0,0:], index='',columns=patients)
    # my_palette = dict(zip(df[.unique(), ["orange","yellow","brown"]))
    # row_colors = df2.cyl.map(my_palette)
    # fig, (ax1, ax2) = plt.subplots(1,2,sharex=True,sharey=True)
    colordict = {0: '#BB0000', 1: '#0000BB'}
    # logstring = logstring + str(df9)
    df2 = pd.read_csv(fn, delim_whitespace=True, header=None, index_col=0)
    # print(df2.head())
    df = df2.transpose()
    survival_real = df['SURVIVAL']
    # df['is_train'] = np.random.uniform(0, 1, len(df)) <= .75
    # train, test = df[df['is_train']==True], df[df['is_train']==False]
    features = df.columns[3:60483]
    x = df.iloc[:, 5:500]
    y = df.iloc[:, 2]
    print(df)
    rm = linear_model.LinearRegression()
    rm.fit(x, y)
    # print(rm.intercept_)
    # print(rm.coef_)
    # print(rm.predict(x))
    predictions = rm.predict(x)
    real_values = df.iloc[:, 2].values.tolist()
    ret = []
    for j in range(1, len(survival_real)):
        accur = "FALSE"
        if (abs(float(predictions[j]) - float(real_values[j])) < 1.0):
            print("Foo")
            accur = "TRUE"
        ret.append({
            'patient_id': j,
            'real_value': real_values[j],
            'prediction': predictions[j],
            'was_correct': accur
        })
    return (ret)
matchesThaliana = {}
for i, row in dfSalidaThaliana.iterrows():
    genSnapdragon = row['query acc.ver']
    genThaliana = row['subject acc.ver']
    evalue = row['evalue']
    matchesThaliana[genSnapdragon] = (genThaliana, evalue)

# Now we extract the annotations and store them in a map
# Tomato
anotacionesTomato = {}
datasetTomato = Dataset(name='slycopersicum_eg_gene',
                        virtual_schema='plants_mart',
                        host='http://plants.ensembl.org')
# For each gene, run a query against BioMart
for genSnapdragon in matchesTomato:
    print('tomato')
    genTomato, evalue = matchesTomato[genSnapdragon]
    resultTomato = datasetTomato.query(
        attributes=[
            'ensembl_gene_id', 'ensembl_transcript_id', 'go_id',
            'go_linkage_type', 'namespace_1003'
        ],
        filters={'link_ensembl_transcript_stable_id': genTomato})
    # If a match is found:
    if len(resultTomato) > 0:
# anchor = 'A'
# chrom = 'chr4'

# ex 2: DSG3 D + INDEL (1 aa deletion)
# transcript_id = 'ENST00000257189'
junction_coors = [31472788, 31474124]
anchor = 'D'
chrom = 'chr18'

# ex 3: RAB18 NDA + FS
# transcript_id = 'ENST00000356940'
# [stop_exon, start_exon]
# junction_coors = [27509930, 27532507]
# anchor = 'NDA'
# chrom = 'chr10'

# load ensembl dataset with pybiomart
dataset = Dataset(name='hsapiens_gene_ensembl', host='http://www.ensembl.org')

# use fasta to get sequences
ref_fasta = Fasta('/Users/meganrichters/Documents/ref_files/all_sequences.fa')


def get_coding_coordinates(dataset, transcript_id, anchor):
    # get ref info and drop NaN values - ex: exons that are not part of the coding sequence
    ref_tscript_info = dataset.query(attributes=[
        "ensembl_transcript_id", "strand", "transcript_start",
        "transcript_end", "exon_chrom_start", "exon_chrom_end",
        "genomic_coding_start", "genomic_coding_end"
    ], filters={
        'link_ensembl_transcript_stable_id': [transcript_id]
    }).dropna()
        return x
    elif not pd.isna(x):
        return x
    else:
        return y


if __name__ == '__main__':
    #
    # [H]omo [S]apiens (9606) - [A]liases
    #
    print('Mapping HS')
    # Query bioMart for Gene Name/Description
    ds_HS = Dataset(name='hsapiens_gene_ensembl',
                    host='http://www.ensembl.org')
    df_HS_G = ds_HS.query(attributes=[
        'ensembl_gene_id', 'external_gene_name', 'gene_biotype', 'description'
    ]).set_index('Gene stable ID')

    rCSVFileCG = "../01-diff-gene-exp/results/HS/HS-DGE_Cyte_vs_Gonia.csv"
    rCSVFileCT = "../01-diff-gene-exp/results/HS/HS-DGE_Tid_vs_Cyte.csv"
    df_HS_CG = pd.read_csv(rCSVFileCG,
                           index_col=0).loc[:, ['logFC', 'logCPM', 'FDR']]
    df_HS_CG.index.name = 'id_gene'
    df_HS_CG.index = df_HS_CG.index.map(lambda x: x.split('.')[0])
    df_HS_CG.columns = [x + '_CyteGonia' for x in df_HS_CG.columns]
    df_HS_CT = pd.read_csv(rCSVFileCT,
                           index_col=0).loc[:, ['logFC', 'logCPM', 'FDR']]
    df_HS_CT.columns = [x + '_TidCyte' for x in df_HS_CT.columns]
    df_HS_CT.index.name = 'id_gene'
def import_ndex(network_id, force_update=False):
    """ Download and process the PPI network directly from ndexbio.org
    :param network_id: String, UUID of the network to download
    :param force_update: Boolean, if true the cached version will be ignored and updated
    :return: String, one line per interaction, separated by tabs
    """
    ndex_server = 'public.ndexbio.org'

    # --- Check if we can use a cached version
    # Connect to NDEx server anonymously, download metadata and get modification time
    network_metadata = ndex2.client.Ndex2(ndex_server) \
        .get_network_summary(network_id)
    network_modification_time = datetime.fromtimestamp(
        network_metadata['modificationTime'] / 1000.0, tz=timezone.utc)

    # Try and retrieve a cached version. Check if the modification date is within spec,
    # return the cached network
    if not force_update:
        try:
            ppi_network_cache = PpiNetworkCache.objects.get(
                network_id=network_id)
            datetime_now = timezone.now()
            # The network data modification date must be the same as the one just retrieved,
            # the network cache must have been created within the last 24h
            if ppi_network_cache.data_last_modified == network_modification_time and \
                    datetime_now - timedelta(hours=24) <= ppi_network_cache.last_modified:
                print(
                    f'Network cached on {ppi_network_cache.last_modified.isoformat()}'
                )
                return ppi_network_cache.network_string
        except PpiNetworkCache.DoesNotExist:
            # Download and generate network if no cache exists
            pass

    # Import NDEx from server based on UUID
    nice_cx_network = ndex2.create_nice_cx_from_server(server=ndex_server,
                                                       uuid=network_id)

    # --- Create a node_id to gene_id dict which maps from the node_id to the gene_id
    node_to_gene_df = pd.DataFrame([x[1] for x in nice_cx_network.get_nodes()]) \
        .rename({'@id': 'Node ID', 'n': 'Gene name'}, axis='columns')

    # If we are using APID, then we need to use another attribute
    if network_id == '9c38ce6e-c564-11e8-aaa6-0ac135e8bacf':
        node_to_gene_df['Gene name'] = node_to_gene_df['Node ID'].map(
            lambda x: nice_cx_network.get_node_attribute_value(
                x, 'GeneName_A'))

    # --- Create gene_id to other_id dict which maps from gene_id to other ID e.g. NCBI IDs
    query_attributes = ['external_gene_name', 'entrezgene_id']
    gene_mapping_df = Dataset(name='hsapiens_gene_ensembl',
                              host='http://www.ensembl.org').query(
                                  attributes=query_attributes).dropna()
    gene_mapping_df.columns = query_attributes

    # Set the Gene name (the one used in the networks as ID) as index. Then convert
    # the entrez IDs into int and then to string
    gene_mapping_df = gene_mapping_df \
        .drop_duplicates(subset=['external_gene_name'], keep='first') \
        .set_index('external_gene_name') \
        .astype(int).astype(str)
    # Create the mapping dict
    gene_mapping_dict = gene_mapping_df.to_dict()[
        'entrezgene_id']  # Get the entrez IDs

    # --- Apply gene mapping from gene name to NCBI IDs to the node_to_gene_df and drop missing values
    node_to_gene_df['Gene name'] = node_to_gene_df['Gene name'].map(
        gene_mapping_dict)
    node_to_gene_dict = node_to_gene_df \
        .set_index('Node ID') \
        .dropna() \
        .to_dict()['Gene name']

    # --- Create the network PPI file
    # Iterate over all edges
    result_list = []
    for _, edge in nice_cx_network.get_edges():
        edge_source = edge.get('s')
        edge_target = edge.get('t')
        if edge_source != edge_target:
            # Convert source and target to NCBI IDs and write into string
            try:
                result_list.append(node_to_gene_dict[edge_source] + '\t' +
                                   node_to_gene_dict[edge_target])
            except KeyError:
                # If no mapping can be found, skip this node
                continue

    # --- Save version to cache (db) and return result network string
    result_string = '\n'.join(result_list)
    PpiNetworkCache.objects.update_or_create(
        network_id=network_id,
        defaults={
            'data_last_modified': network_modification_time,
            'network_string': result_string
        })
    return result_string
def read_ndex_file_4(fn):
    """ Given an input string/file, parse the network and return two-column array with interaction partners
    :param fn: Input NDEx file as string
    :return: Printed as strings, two-column array with interaction partners
    """
    lines6 = ""
    # read edges and nodes into arrays
    if ("edges" in fn.split("nodes")[1]):
        lines5 = fn.split("{\"nodes\":[")
        lines3 = lines5[1].split("{\"edges\":[")[0]
        # remove "cyTableColumn" from array containing edges
        if ("cyTableColumn" in lines5[1]):
            lines4 = lines5[1].split("{\"edges\":[")[1].split(
                "{\"cyTableColumn\":[")[0]
            lines4 = lines4[:-4]
        # take protein name from networkAttributes or nodeAttributes if it is defined there.
        elif ("networkAttributes" in lines5[1]):
            lines4 = lines5[1].split("{\"edges\":[")[1].split(
                "{\"networkAttributes\":[")[0]
            lines4 = lines4[:-4]
            if ("nodeAttributes" in lines5[1].split("{\"edges\":[")[1].split(
                    "{\"networkAttributes\":[")[1]
                    and "UniprotName" in lines5[1].split("{\"edges\":[")[1].split(
                        "{\"networkAttributes\":[")[1]):
                lines6_temp = \
                    lines5[1].split("{\"edges\":[")[1].split("{\"networkAttributes\":[")[1].split(
                        "{\"nodeAttributes\":[")[1]
                lines6 = lines6_temp.split("{\"edgeAttributes\":[")[0]
        else:
            lines4 = lines5[1].split("{\"edges\":[")[1]
    # check if edge-array comes before node-array in file
    elif ("edges" in fn.split("nodes")[0]):
        lines5 = fn.split("{\"nodes\":[")
        lines3 = lines5[1].split("]},")[0] + "]]]"
        lines4 = lines5[0].split("{\"edges\":[")[1][:-4]

    # lines3 contains the nodes, lines4 the edges, lines6 contains nodeAttributes
    # (information from the ndex file usable for the conversion from node IDs to gene IDs)
    # remove signs to allow automatic json to array conversion
    # NOTE: str.replace returns a new string, so the unassigned calls below
    # leave lines3 (and later lines4) unchanged; they are kept as in the original.
    lines3.replace("@", "")
    lines3.replace("uniprot:", "uniprot")
    lines3.replace("signor:", "signor")
    lines3.replace(" ", "")
    lines3.replace("ncbigene:", "")
    lines3.replace("\\n", "")
    lines33 = lines3[:-3].replace("}]", "")
    node_line = lines33.replace("ncbigene:", "")
    nodelinesplit = node_line.split(", ")
    dictlist = []
    # node dict is later filled with keys (node IDs) and the values are NCBI gene IDs
    node_dict = {}
    if not (node_line.endswith("}")):
        node_line = node_line + "}"
    node_line_2 = "[" + node_line + "]"
    tmp2 = json.loads(node_line_2)
    node_dict_2 = {}
    # iterate over lines in nodeAttributes
    if not (lines6 == ""):
        lines6 = "[" + lines6
        # get array with nodeAttributes for current line
        tmp4 = json.loads(lines6[:-4])
        # if node element has attribute "GeneName_A", then the NCBI ID is given in the nodeAttributes
        for item in tmp4:
            if (item['n'] == "GeneName_A"):
                # use node ID and NCBI ID
                node_dict_2[item['po']] = item['v']
                # print(str(item['po']) + " " + str(item['v']))
    # print(node_dict_2)
    dataset = Dataset(name='hsapiens_gene_ensembl',
                      host='http://www.ensembl.org')
    conv = dataset.query(
        attributes=['ensembl_gene_id', 'external_gene_name', 'entrezgene_id'])
    conv_genelist = conv['Gene name'].tolist()
    for item in tmp2:
        dictlist.append(item)
        # write conversion from node ID to gene ID in dictionary, based on nodeAttributes from the data
        if ('r' in item):
            if (any(c.islower() for c in item['r'])):
                gene_name = item['n']
                if (gene_name in conv_genelist):
                    gene_nbr = conv.index[conv['Gene name'] == gene_name]
                    gene_nbr1 = conv.loc[gene_nbr, 'NCBI gene ID'].values[0]
                    node_dict[item['@id']] = gene_nbr1
                    # print(item)
            else:
                node_dict[item['@id']] = item['r']
                # print(item)
        else:
            if (item['n'].isdigit()):
                # if gene ID is in node attributes
                # print(item)
                node_dict[item['@id']] = item['n']
            elif (item['n'] in node_dict_2):
                # otherwise use conversion table to convert gene ID to NCBI ID
                gene_name = node_dict_2[item['n']]
                # print(gene_name)
                if (gene_name in conv_genelist):
                    gene_nbr = conv.index[conv['Gene name'] == gene_name]
                    gene_nbr1 = conv.loc[gene_nbr, 'NCBI gene ID'].values[0]
                    node_dict[item['@id']] = gene_nbr1
                    # print(gene_nbr1)
    # print(node_dict)

    # remove signs from string to allow json conversion
    lines4.replace("@", "")
    lines4.replace("uniprot:", "uniprot")
    lines4.replace("signor:", "signor")
    lines4.replace(" ", "")
    lines4 = lines4.replace("]", "")
    edge_line = lines4.rstrip()
    edge_line_2 = "[" + edge_line + "]"
    edgelinesplit = edge_line.split(", ")
    edgelist = []
    tmp4 = json.loads(edge_line_2)
    # get dictionary with gene names and NCBI IDs (entrezgene_id)
    dataset = Dataset(name='hsapiens_gene_ensembl',
                      host='http://www.ensembl.org')
    conv = dataset.query(
        attributes=['ensembl_gene_id', 'external_gene_name', 'entrezgene_id'])
    ret = []
    # convert node IDs in edges to NCBI IDs
    for item in tmp4:
        # print(item)
        if (item['s'] in node_dict and item['t'] in node_dict):
            source = node_dict[item['s']]
            target = node_dict[item['t']]
            # print(source)
            # print(target)
            if (source != target and not (math.isnan(float(source)))
                    and not (math.isnan(float(target)))):
                baz = [str(int(source)), str(int(target))]
                ret.append("\t".join(baz))
    # print("\n".join(ret))
    return ("\n".join(ret))
#             continue
#         fasta.write(">" + svid + "." + read.query_name + "\n")
#         fasta.write(read.seq + "\n")
fasta.close()
bamfile.close()


######################################## Main code ########################################

print("Start:", datetime.datetime.now())

EnsemblRestClient = EnsemblRestClient()

### DOWNLOAD BASIC GENE INFORMATION FROM ENSEMBL (ID, CHROMOSOME, POSITION, STRAND, BIOTYPE)
dataset = Dataset(name='hsapiens_gene_ensembl', host='http://grch37.ensembl.org')
ensembl_genes = dataset.query(attributes=[
    'ensembl_gene_id', 'chromosome_name', 'start_position', 'end_position',
    'strand', 'gene_biotype'
], filters={
    'chromosome_name': [
        '1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13',
        '14', '15', '16', '17', '18', '19', '20', '21', '22', 'X', 'Y', 'MT'
    ]
})

regions = []
for line in ensembl_genes.iterrows():
    },
    'enterocyte': {
        'HS': enterocyte_pca_modules_hs,
        'MM': enterocyte_pca_modules_mm,
        'DM': enterocyte_pca_modules_dm,
    }
}
modules = data[celltype][layer]

# for specie in species
print('Calculating GOEA on {celltype:s} {network:s} {threshold:.1f} {layer:s}'.format(
    celltype=celltype, network=network, threshold=threshold, layer=layer))

# Load Gene Population
print("Load gene population (from biomart)")
datamart_name = dict_datamart_names[layer]
ds = Dataset(name=datamart_name, host='http://www.ensembl.org')
if layer == 'DM':
    attributes = ['ensembl_gene_id', 'uniprotswissprot', 'external_gene_name']
elif layer == 'MM':
    attributes = ['ensembl_gene_id', 'uniprotswissprot', 'mgi_id', 'external_gene_name']
elif layer == 'HS':
    attributes = ['ensembl_gene_id', 'uniprotswissprot', 'hmmpanther', 'external_gene_name']
dfQ = ds.query(attributes=attributes).set_index('Gene stable ID')

# Population of genes (background) to test against
if layer == 'DM':
    pop_flybase = set(dfQ.index.tolist())
    pop_uniprot = set(dfQ['UniProtKB/Swiss-Prot ID'].dropna().tolist())
    pop = pop_flybase.union(pop_uniprot)
elif layer == 'MM':
    pop_mgi = set(dfQ['MGI ID'].dropna().tolist())
from pybiomart import Server

server = Server(host='http://www.ensembl.org')
server.list_marts()
mart = server['ENSEMBL_MART_ENSEMBL']

from pybiomart import Dataset

dataset = Dataset(name='hsapiens_gene_ensembl', host='http://www.ensembl.org')
dataset.query(attributes=['ensembl_gene_id', 'external_gene_name'],
              filters={'chromosome_name': ['1', '2']})


def attributes(self):
    if self._attributes is None:
        self._filters, self._attributes = self._fetch_configuration()
    return self._attributes


dataset.attributes
dataset.list_attributes()


def filters(self):
    if self._filters is None:
        self._filters, self._attributes = self._fetch_configuration()
    return self._filters


dataset.filters
dataset.list_filters()
#
datamart_names = {
    'HS': 'hsapiens_gene_ensembl',
    'MM': 'mmusculus_gene_ensembl',
    'DM': 'dmelanogaster_gene_ensembl'
}
r = []
for specie in species:
    print("Calculating for species: {specie:s}".format(specie=specie))
    print("Querying Datamart")
    datamart_name = datamart_names[specie]
    ds = Dataset(name=datamart_name, host='http://www.ensembl.org')
    dfQ = ds.query(attributes=[
        'ensembl_gene_id', 'external_gene_name', 'gene_biotype'
    ]).set_index('Gene stable ID')
    # n_genome = len(dfQ)
    dfQpc = dfQ.loc[(dfQ['Gene type'] == 'protein_coding'), :]
    n_genome_pc = len(dfQpc)
    DFQnpc = dfQ.loc[(dfQ['Gene type'] != 'protein_coding'), :]
    n_genome_non_pc = len(DFQnpc)
    print('done.')
    for celltype in celltypes:
        print("Calculating for celltype: {celltype:s}".format(
            celltype=celltype))