def get(self, datasetId, elements="file"):
    """Return a stored dataset, its gene annotations, or descriptive stats.

    :param datasetId: intern identifier of the dataset in the HDF store
    :param elements: selects the representation --
        "file"  -> the raw data frame as a downloadable TSV resource,
        "genes" -> the gene list enriched with Ensembl info,
        "desc"  -> per-gene descriptive statistics (mean/sd/max)
    :returns: (resource, 201) tuple; falls through to None for an
        unrecognized ``elements`` value (behavior kept as-is)
    """
    print("get dataset")
    print(datasetId)
    # path to the HDF store, assembled from the application config
    path = APP_CONFIG["application_files_location"] + APP_CONFIG["application_store_name"]
    if elements == "file":
        print("creating frame")
        print("accepting ")
        print(request.accept_mimetypes)
        # convert public to intern...
        return cr.DataFrameApiResource(dr.get_data_frame_from_hdf(datasetId, path),
                                       datasetId + ".tsv"), 201
    if elements == "genes":
        print("retrieving genes")
        dataset_with_ensembl = dr.get_dataset_genes_with_ensembl_info(datasetId, path)
        return cr.JsonResource(dataset_with_ensembl), 201
    if elements == "desc":
        dataset = dr.get_data_frame_from_hdf(datasetId, path)
        # mask of genes whose counts sum to zero across all samples
        filtered = dataset.sum(1) == 0
        # FIX: was `dataset[-filtered]` -- unary minus on a boolean Series is
        # deprecated/removed in pandas; `~` is the supported inversion operator
        d_filtered = dataset[~filtered]
        filtered_data_set = d_filtered  # scaling (pd.DataFrame(scale(...))) intentionally disabled
        json_message = {
            "meta": {"samples": len(filtered_data_set.columns),
                     "genes": len(filtered_data_set.index)},
            "samples": [{"id": s} for s in filtered_data_set.columns],
            "descriptive": [
                {"identifier": item[0],
                 "mean": item[1].mean(),
                 "sd": item[1].std(),
                 "max": item[1].max()}
                for item in filtered_data_set.iterrows()
            ]}
        return cr.JsonResource(json_message), 201
def stream_detect_outliers_in_data(data, method, min_dist, max_samples, mining_id):
    """Run an outlier-detection method over a stored dataset.

    :param data: intern identifier of the dataset in the HDF store
    :param method: "kmeans_outliers" or "mad"
    :param min_dist: minimum distance threshold (kmeans method only)
    :param max_samples: maximum number of samples/outliers to report
    :param mining_id: identifier of the mining run (kmeans method only)
    :returns: the mining result from ``om``; None for an unknown method
        (behavior kept as-is)
    """
    # TODO(review): "if exists -> get from db" caching is not implemented yet
    processing_result = None
    store_path = APP_CONFIG["application_files_location"] + APP_CONFIG["application_store_name"]
    data = dr.get_data_frame_from_hdf(data, store_path)
    # For demo purposes (cleaning): drop genes with a total count of 5 or less.
    # FIX: was `-(...)` -- unary minus on a boolean Series is deprecated in
    # pandas; `~` is the supported inversion operator
    data = data[~(data.sum(1) <= 5)]
    if method == "kmeans_outliers":
        return om.cluster_outliers(data, data.index, max_samples=max_samples,
                                   min_dist=min_dist, mining_id=mining_id)
    if method == "mad":
        return om.mad_outliers(data, data.index, max_samples)
def plot_comparison(dataset_identifiers, features, type="boxplot"):
    """Plot the selected features of several datasets side by side.

    :param dataset_identifiers: intern ids of the datasets to compare
        (two axes are created, so two identifiers are expected)
    :param features: list of gene identifiers (index labels) to plot
    :param type: "boxplot", "scatter" or "pca" (name kept for API
        compatibility even though it shadows the builtin)
    :returns: the (resized) matplotlib figure, or the bokeh conversion
        when the generator is switched to bokeh
    """
    sns.set_style("whitegrid")
    sns.set_palette("deep")
    generator = "matplot"
    f, axes = plt.subplots(1, 2)
    path = APP_CONFIG["application_files_location"] + APP_CONFIG["application_store_name"]
    i = 0
    subset = None
    # FIX: was the Python 2 statement `print features`; the parenthesized
    # form behaves identically and also runs on Python 3
    print(features)
    for d in dataset_identifiers:
        dataset = dr.get_data_frame_from_hdf(d, path)
        if type != "pca":
            # FIX: `.ix` is deprecated; `.loc` does the same label-based lookup
            subset = dataset.loc[features, :]
        if type == "boxplot":
            axes[i].boxplot(subset)
            create_csv_version(subset.transpose(), d, type, subset.columns)
        if type == "scatter":
            generator = "matplot"
            t = axes[i].plot(subset.transpose(), 'o')
            create_csv_version(subset.transpose(), d, type, subset.columns)
            plugins.connect(f, plugins.PointLabelTooltip(t[0], labels=list(subset.columns)))
        if type == "pca":
            print(type)
            # project samples onto the first two principal components
            pca = PCA(n_components=2)
            t_data = dataset.transpose()
            pca_result = pca.fit(t_data)
            pca_transformed = pca_result.transform(t_data)
            t = axes[i].plot(pca_transformed[:, 0], pca_transformed[:, 1], 'o')
            create_csv_version(pca_transformed, d, type, dataset.columns)
            plugins.connect(f, plugins.PointLabelTooltip(t[0], labels=list(dataset.columns)))
        axes[i].set_xlabel(d)
        axes[i].set_ylabel(str(features))
        i += 1
    if generator == "bokeh":
        bk = to_bokeh(name="descriptive")
    else:
        bk = plt.gcf()
    # enlarge the figure so both subplots stay readable
    sz = bk.get_size_inches()
    bk.set_size_inches((sz[0] * 2.5, sz[1] * 2))
    return bk
def sample_distribution_for_genes(dataset_identifier, genes, title="", samples=None,
                                  unit="zscore", chart_type="heat", chart_rendering="html"):
    """Render the distribution of selected genes across all samples.

    :param dataset_identifier: intern id of the dataset in the HDF store
    :param genes: list of gene identifiers (index labels) to chart
    :param title: base chart title, extended per chart type
    :param samples: currently unused -- TODO confirm intended sample filter
    :param unit: currently unused -- presumably a display unit; verify callers
    :param chart_type: "heat", "box" or "scatter"
    :param chart_rendering: "html" (bokeh, interactive) or "pdf" (matplotlib)
    :returns: {"generator": "bokeh"|"matplot", "chart": figure}; None when an
        exception was caught (logged to stdout, behavior kept as-is)
    """
    try:
        generator = None
        convert = False
        sns.set(style="whitegrid")
        # get path to HDF store from config file
        path_to_store = APP_CONFIG["application_files_location"] + APP_CONFIG["application_store_name"]
        print("path to store " + path_to_store)
        print(genes)
        # get data set as a pandas frame
        dataset = dr.get_data_frame_from_hdf(dataset_identifier, path_to_store)
        tooltip = None
        # filter selected genes
        gene_list = dataset.loc[genes, :]
        labels = list(gene_list.columns)
        # matplotlib handles; stay None unless a matplot branch runs
        ax = None
        fig = None
        # heatmap: html = dynamic plot, pdf = pdf file
        if chart_type == "heat":
            if chart_rendering == "html":
                print("bokeh heatmap")
                sns.set()
                title += " heatmap"
                generator = "bokeh"
                print("sns heatmap")
                fig = HeatMap(gene_list.dropna(), title=title)
            elif chart_rendering == "pdf":
                plt.switch_backend('Agg')
                title += " heatmap pdf"
                sns.set_context("paper", font_scale=0.8)
                generator = "matplot"
                ax = sns.heatmap(gene_list)
                # adapt label fontsize
                for tick in ax.xaxis.get_major_ticks():
                    tick.label.set_fontsize(2.0)
                print("axis loaded")
            print("heatmap with bokeh")
        elif chart_type == "box":
            # boxplot charts
            title += " boxplot"
            generator = "bokeh"
            TOOLS = "resize,crosshair,pan,wheel_zoom,box_zoom,reset,previewsave,hover"
            print("creating box plot")
            t_genes = gene_list.transpose()
            fig = BoxPlot(t_genes, legend=True, tools=TOOLS)
            hover = fig.select(dict(type=HoverTool))
            hover.tooltips = [
                ("sample", "$index")
            ]
        elif chart_type == "scatter":
            if chart_rendering == "html":
                title += " scatter"
                generator = "bokeh"
                TOOLS = "resize, crosshair, pan, wheel_zoom, box_zoom,reset, previewsave, hover"
                fig = bpl.figure(title=title, tools=TOOLS)
                current_palette = sns.color_palette()
                i = 0
                colors = bp.Spectral10
                # one scatter series per gene, with per-point sample tooltips
                for e in gene_list.iterrows():
                    val = [v for v in e[1]]
                    idx = range(0, len(val))
                    col_data_source = ColumnDataSource(dict(
                        x=idx,
                        y=val,
                        sample=gene_list.columns[idx],
                    ))
                    fig.scatter('x', 'y', color=colors[i], legend=e[0], source=col_data_source)
                    i += 1
                hover = fig.select(dict(type=HoverTool))
                hover.tooltips = [
                    ("sample", "@sample"),
                    ("count", "$y")
                ]
            elif chart_rendering == "pdf":
                plt.switch_backend('PDF')
                sns.color_palette("deep")
                title += " scatter pdf"
                generator = "matplot"
                print("starting with matplot")
                transposed_data = gene_list.transpose()
                ax = transposed_data.plot(style='o')
                legend = plt.legend(frameon=1)
                frame = legend.get_frame()
                frame.set_edgecolor('blue')
                coll_len = len(gene_list.columns)
                # label the samples lying above the 99th percentile per gene
                for idx, gene in enumerate(gene_list.index):
                    for x, y in zip(range(0, coll_len), gene_list.iloc[idx, :]):
                        if y > gene_list.iloc[idx, :].quantile(0.99):
                            ax.annotate(labels[x], xy=(x, y), textcoords='data')
                print("plotted with matplot")
        # FIX: the original if/elif pair reduced to exactly this condition
        # (both branches set the same title on the same axes object)
        if ax is not None and generator == "matplot":
            ax.set_title(title)
        if generator == "matplot":
            fig = plt.gcf()
        if convert is True and generator == "bokeh":
            fig = plt.gcf()
            fig = mpl.to_bokeh(fig)
        return {"generator": generator, "chart": fig}
    except Exception as e:
        # FIX: was `except BaseException`, which also swallowed
        # KeyboardInterrupt/SystemExit; still logs and returns None
        print(e.__str__())
def post(self):
    """Create a dataset: either upload a new file or derive one by pre-processing.

    Two modes, selected by ``args.dataset_identifier``:
      * None -> a file upload; the CSV is parsed, stored to HDF and registered,
        the temporary upload is always removed afterwards.
      * set  -> an existing dataset is normalized with the requested method.

    :returns: (JsonResource, 201) on success; (StringApiResource, 409) on a
        duplicate public identifier; (StringApiResource, 400) on any other error
    """
    args = dataset_parser.parse_args()
    print("dataset post received")
    # upload new dataset
    if args.dataset_identifier is None:
        print("creating new data set")
        new_file = args.file
        filename = secure_filename(new_file.filename)
        print("uploaded" + filename)
        intern_identifier = ig.get_generated_guid_as_string()
        new_file.save(filename)
        try:
            with open(filename, "rb") as fl:
                frame = dr.get_data_frame_from_csv(fl)
                # raw counts by default; the store call is kept for its side effect
                dr.store_data_frame_to_hdf(frame, intern_identifier)
                data_entity = dr.create_data_set(intern_identifier,
                                                 public_identifier=filename,
                                                 dataset_type="raw gene counts",
                                                 experiment_identifier="raw_data_container")
                # For demo -> add exp identifier
                server_hash = ig.md5_for_file(fl)
                print(filename + " is saved")
                print("file removed")
                return cr.JsonResource(
                    {"filename": filename,
                     "intern_identifier": data_entity.intern_identifier,
                     "public_identifier": data_entity.public_identifier,
                     "server_md5": server_hash}), 201
        except IntegrityError:
            return cr.StringApiResource("Public identifier already taken"), 409
        except Exception:
            # FIX: was a bare `except:`, which also caught SystemExit and
            # KeyboardInterrupt; the client-facing message is unchanged
            return cr.StringApiResource("An error has occured, check if your data set comply with the expected format"), 400
        finally:
            # always remove the temporary upload, success or failure
            if filename is not None:
                os.remove(filename)
    else:
        # a new dataset is created based on source
        try:
            print("pre-processing existing dataset")
            source_data = args.dataset_identifier
            print("source data : " + source_data)
            # target_data = args.target_dataset_identifier
            # TODO link to experiment identifier!
            method_identifier = args.preprocessing_method_identifier
            print("source " + source_data + "method " + method_identifier)
            path = APP_CONFIG["application_files_location"] + APP_CONFIG["application_store_name"]
            df = dr.get_data_frame_from_hdf(source_data, path)
            print("data frame is loaded")
            new_data_set = ed.normalize_gene_counts(df, method_identifier,
                                                    target_identifier=source_data,
                                                    experiment_identifier=args.experiment_identifier)
            return cr.JsonResource(eto.SummaryDatasetView(new_data_set).to_json()), 201
        except Exception as e:
            print(e.__str__())
            return cr.StringApiResource("Explosion! Tool down..."), 400