def rebuild_comparisons(allsorts_clf, probabilities, ui, size=10):
    """
    Rebuild the comparison artefacts that are stored alongside the model.
    ...

    Two things are produced:

    1. ``comparisons.csv`` - for every value of the "True" column, up to
       ``size`` randomly drawn rows of ``probabilities`` whose "True" value is
       contained in their "Pred" value.
    2. A UMAP fitted on those matching samples, restricted to the genes with
       non-zero coefficients in the final classifiers, plus the labels and
       gene list needed to reuse it.

    Parameters
    __________
    allsorts_clf : ALLSorts object
        Fitted pipeline. Must provide ``transform`` and ``steps``; the last
        step is read for ``.fitted`` and the third-from-last for ``.genes``.
    probabilities : Pandas DataFrame
        Indexed by sample, with at least the columns "True" and "Pred".
    ui : object
        Provides ``samples`` (passed to ``transform``) and ``labels``.
    size : int
        Maximum number of rows sampled per "True" group. Default: 10.

    Output
    __________
    comparisons.csv in <root_dir>/models/allsorts/, and umap.sav,
    comparison_labels.csv, comparison_genes.csv in
    <root_dir>/models/allsorts/comparisons/.
    """
    X_filtered = allsorts_clf.transform(ui.samples)["counts"]

    # ---- Probability comparisons ----

    # Samples for which the true label appears in the prediction.
    true_samples = [
        sample
        for sample, row in probabilities.iterrows()
        if row["True"] in row["Pred"]
    ]
    comparisons = probabilities.loc[true_samples]

    def _subsample(group):
        # Draw at most `size` rows; small groups are taken whole (shuffled).
        return group.sample(n=min(size, group.shape[0]))

    new_samples = comparisons.groupby("True").apply(_subsample)

    # The grouped result is indexed by (group, sample); keep the sample part.
    new_index = [entry[1] for entry in new_samples.index]

    base = str(root_dir())
    destination = base + "/models/allsorts/" + "comparisons.csv"
    cfinal = comparisons[comparisons.index.isin(new_index)]
    cfinal.to_csv(destination)

    # ---- UMAP visualisation ----

    # Collect the genes used in the final models and filter counts by them.
    classifiers = allsorts_clf.steps[-1][-1].fitted
    feature_selection = allsorts_clf.steps[-3][-1].genes

    chosen_genes = []
    for subtype, clf in classifiers.items():
        coefficients = pd.DataFrame(clf.coef_,
                                    columns=feature_selection[subtype])
        # Zero coefficients become NaN, so dropna removes those genes.
        nonzero = coefficients[coefficients != 0].dropna(axis=1)
        chosen_genes.extend(nonzero.columns)
    chosen_genes = list(set(chosen_genes))

    X_filtered = X_filtered.loc[true_samples, chosen_genes]
    labels = ui.labels.loc[true_samples]

    # Fit the embedding on the comparison samples.
    u = UMAP(n_neighbors=10).fit(X_filtered)

    # Persist the embedding together with its labels and gene order.
    comparison_dir = base + "/models/allsorts/comparisons/"
    joblib.dump(u, comparison_dir + 'umap.sav')
    labels.to_csv(comparison_dir + 'comparison_labels.csv')
    pd.Series(X_filtered.columns).to_csv(
        comparison_dir + 'comparison_genes.csv')
def _loadChrom(self):
    """
    Load the chromosome reference table and index its genes by chromosome.
    ...

    Reads the tab-separated, header-less file <root_dir>/data/chrom_refs.txt,
    drops columns 1, 2, 5, 6 and 7 and names the four remaining columns
    "chrom", "start", "end" and "meta".

    Sets
    __________
    self.chrom_ref : Pandas DataFrame
        Indexed by chromosome, with columns "start", "end", "length"
        (absolute difference of start and end) and "gene".
    self.chrom_dict : dict
        Maps each chromosome found in the file to the list of its gene names,
        in file order.
    self.chroms : list
        The integers 1..22 followed by "X" and "Y".
    """
    chrom_ref_path = str(root_dir()) + "/data/chrom_refs.txt"

    self.chrom_ref = pd.read_csv(chrom_ref_path, sep="\t", header=None)
    self.chrom_ref.drop([1, 2, 5, 6, 7], axis=1, inplace=True)
    self.chrom_ref.columns = ["chrom", "start", "end", "meta"]
    self.chrom_ref["length"] = (self.chrom_ref["start"] -
                                self.chrom_ref["end"]).abs()

    # Extract gene names: the third ';'-separated field of the meta column,
    # taking the text inside its first pair of double quotes.
    # Iterating the column directly replaces Series.iteritems(), which was
    # removed in pandas 2.0 (the index value it yielded was never used).
    self.chrom_ref["gene"] = [
        meta.split(";")[2].split('"')[1] for meta in self.chrom_ref["meta"]
    ]

    self.chrom_ref.index = self.chrom_ref["chrom"]
    self.chrom_ref.drop(["chrom", "meta"], axis=1, inplace=True)
    self.chrom_ref["start"] = pd.to_numeric(self.chrom_ref["start"])
    self.chrom_ref["end"] = pd.to_numeric(self.chrom_ref["end"])

    # Create dictionary of genes per chromosome (file order is preserved).
    self.chrom_dict = {}
    for chrom, gene in zip(self.chrom_ref.index, self.chrom_ref["gene"]):
        self.chrom_dict.setdefault(chrom, []).append(gene)

    self.chroms = list(range(1, 23)) + ["X", "Y"]
def load_classifier(path=False):
    """
    Read a serialised ALLSorts classifier from disk.
    ...

    Parameters
    __________
    path : str
        Location of the pickled ALLSorts model. When falsy, the bundled model
        at <root_dir>/models/allsorts/allsorts.pkl.gz is used.

    Returns
    __________
    allsorts_clf : ALLSorts object
        Whatever object joblib.load restores from the file.
    """
    # NOTE: joblib.load unpickles the file, so only trusted models should be
    # passed in.
    model_path = path or str(root_dir()) + "/models/allsorts/allsorts.pkl.gz"

    message("Loading classifier...")
    return joblib.load(model_path)
def __init__(self):
    """
    Populate the interface from the command-line arguments.
    ...

    If self._is_cli() is falsy, a hint is printed via message() and the
    process exits with status 0. Otherwise the parsed arguments are stored
    in self.input and copied onto the instance with these fallbacks:

    labels, destination : False when not supplied.
    model_dir : <root_dir>/models/allsorts/ when not supplied.
    train, comparison, verbose, force, parents : coerced to bool.
    n_jobs : int(njobs), or 1 when not supplied.
    cv : int(cv), or 3 when not supplied.
    samples, test, ball : copied unchanged.

    Finally self._input_checks() and self._load_samples() are run.
    """
    if not self._is_cli():
        message(
            "No arguments supplied. Please use allsorts --help for further information about input."
        )
        sys.exit(0)

    self.cli = True
    self.input = self._get_args()
    args = self.input

    self.samples = args.samples
    self.labels = args.labels or False
    # The default directory is only computed when none was supplied.
    self.model_dir = args.model_dir or str(root_dir()) + "/models/allsorts/"
    self.destination = args.destination or False
    self.test = args.test
    self.train = bool(args.train)
    self.comparison = bool(args.comparison)
    self.n_jobs = int(args.njobs) if args.njobs else 1
    self.verbose = bool(args.verbose)
    self.force = bool(args.force)
    self.cv = int(args.cv) if args.cv else 3
    self.parents = bool(args.parents)
    self.ball = args.ball

    self._input_checks()
    self._load_samples()
def get_figures(samples, allsorts, destination, probabilities, plots=None):
    """
    Make figures of the results.
    ...

    Parameters
    __________
    samples : Pandas DataFrame
        Pandas DataFrame that represents the raw counts of your samples
        (rows) x genes (columns)). Only used for the "manifold" plot.
    allsorts : ALLSorts object
        Object providing predict_dist, predict_waterfall and predict_plot,
        each of which must return an object with a savefig method when
        called with return_plot=True.
    destination : str
        Location of where the results should be saved.
    probabilities : Pandas DataFrame
        The result of running the get_predictions(samples, labels=False,
        parents=False) function. See function for further usage. If it has a
        "True" column, the waterfall plot is drawn without comparison data;
        otherwise <root_dir>/models/allsorts/comparisons.csv is loaded and
        passed as the comparison.
    plots : List
        List of plots required, any of "distributions", "waterfalls" and
        "manifold". Default (None): ["distributions", "waterfalls"].
        Unrecognised names are ignored.
        See https://github.com/Oshlack/AllSorts/ for examples.

    Output
    __________
    distributions.png, waterfalls.png and/or manifold.png inside the
    destination directory.
    """
    # A None sentinel avoids sharing one mutable default list between calls.
    if plots is None:
        plots = ["distributions", "waterfalls"]

    message("Saving figures...")

    for plot in plots:
        if plot == "distributions":
            dist_plot = allsorts.predict_dist(probabilities, return_plot=True)
            dist_plot.savefig(destination + "/distributions.png")

        elif plot == "waterfalls":
            if "True" in probabilities.columns:
                # Labelled input: no stored comparison set is used.
                comparisons = False
            else:
                comparisons = pd.read_csv(
                    str(root_dir()) + "/models/allsorts/comparisons.csv",
                    index_col=0)

            waterfall_plot = allsorts.predict_waterfall(probabilities,
                                                        compare=comparisons,
                                                        return_plot=True)
            waterfall_plot.savefig(destination + "/waterfalls.png")

        elif plot == "manifold":
            umap_plot = allsorts.predict_plot(samples, return_plot=True)
            umap_plot.savefig(destination + "/manifold.png")
def predict_plot(self, X, return_plot=False):
    """
    Given the raw counts, embed these within a UMAP visualisation consisting
    of the comparison data.
    ...

    The fitted UMAP, the comparison labels and the gene list are read from
    <root_dir>/models/allsorts/comparisons/ (umap.sav, comparison_labels.csv,
    comparison_genes.csv). X is passed through self.transform, its "counts"
    are restricted to the comparison genes and projected with the stored
    UMAP. New samples are drawn as black dots, comparison samples as crosses
    coloured through the module-level c_subtypes mapping, and each label's
    name is written at the median position of its comparison samples.

    Parameters
    __________
    X : Pandas DataFrame
        Pandas DataFrame that represents the raw counts of your samples
        (rows) x genes (columns)).
    return_plot : bool
        Rather than showing the plot through whatever IDE is being used, send
        it back to the function call. Likely so it can be saved.

    Returns
    __________
    The plt object holding the drawn figure when return_plot is True,
    otherwise None (the figure is shown instead).

    Output
    __________
    UMAP Plot figure.
    """
    plt.figure(figsize=(20, 10))

    comparison_dir = str(root_dir()) + "/models/allsorts/comparisons/"
    u = joblib.load(comparison_dir + "umap.sav")

    c_labels = pd.read_csv(comparison_dir + "comparison_labels.csv",
                           index_col=0)
    c_labels = c_labels["labels"]

    c_genes = pd.read_csv(comparison_dir + "comparison_genes.csv",
                          index_col=0)
    c_genes = list(c_genes.iloc[:, 0])

    # Coordinates of the comparison samples the UMAP was fitted on.
    u_c = u.embedding_

    # Project the new samples using the same genes as the stored embedding.
    X_t = self.transform(X)
    X_t = X_t["counts"].loc[:, c_genes]
    u_t = u.transform(X_t)

    plt.scatter(u_t[:, 0], u_t[:, 1], c="#000000", alpha=0.4)
    plt.scatter(u_c[:, 0], u_c[:, 1],
                c=[c_subtypes[r] for r in c_labels],
                alpha=0.4, marker="x")

    # Annotate each label at the median position of its comparison samples.
    transformed_positions = pd.DataFrame(u_c)
    transformed_positions["label"] = list(c_labels)
    median = transformed_positions.groupby("label").median()

    for name, centre in median.iterrows():
        # The keyword must be lowercase: Matplotlib 3.5 removed the
        # case-insensitive handling that used to accept "FontSize".
        plt.text(centre[0], centre[1], name, fontsize=16)

    if return_plot:
        return plt
    else:
        plt.show()