Example #1
0
def rebuild_comparisons(allsorts_clf, probabilities, ui, size=10):
    """
    Rebuild the stored comparison artefacts: a per-group sample of the
    probabilities table and a UMAP fitted on the genes used by the models.

    ...

    Parameters
    __________
    allsorts_clf : ALLSorts object
        Fitted pipeline. Its transform() output ("counts"), the `fitted`
        classifiers of its last step and the `genes` of its third-from-last
        step are read here.
    probabilities : Pandas DataFrame
        Table with (at least) "True" and "Pred" columns, indexed by sample.
    ui : object
        Provides `samples` (passed to transform) and `labels`.
    size : int
        Upper bound on the number of rows sampled per "True" group.

    Output
    __________
    comparisons.csv under /models/allsorts/, plus umap.sav,
    comparison_labels.csv and comparison_genes.csv under
    /models/allsorts/comparisons/ (all relative to root_dir()).
    """

    counts = allsorts_clf.transform(ui.samples)["counts"]

    # --- Probability comparisons ---

    # Keep the samples whose "True" value is contained in their "Pred" value
    true_samples = [
        sample for sample, row in probabilities.iterrows()
        if row["True"] in row["Pred"]
    ]

    comparisons = probabilities.loc[true_samples]

    # Randomly draw at most `size` rows from every "True" group
    new_samples = comparisons.groupby("True").apply(
        lambda group: group.sample(n=min(group.shape[0], size)))

    # Index entries are read as (group, sample) pairs; keep the sample part
    sampled_ids = [entry[1] for entry in new_samples.index]

    models_dir = str(root_dir()) + "/models/allsorts/"
    selected = comparisons[comparisons.index.isin(sampled_ids)]
    selected.to_csv(models_dir + "comparisons.csv")

    # --- Umap Visualisation ---

    # Get genes used in the final models and filter counts by them
    classifiers = allsorts_clf.steps[-1][-1].fitted
    feature_selection = allsorts_clf.steps[-3][-1].genes

    chosen_genes = []
    for subtype, clf in classifiers.items():
        coefficients = pd.DataFrame(clf.coef_,
                                    columns=feature_selection[subtype])
        # Zero coefficients become NaN, so dropna removes every gene that
        # has a zero coefficient in any row
        nonzero = coefficients[coefficients != 0].dropna(axis=1)
        chosen_genes.extend(nonzero.columns)

    chosen_genes = list(set(chosen_genes))
    counts = counts.loc[true_samples, chosen_genes]
    labels = ui.labels.loc[true_samples]

    # Create UMAPs
    umap_model = UMAP(n_neighbors=10).fit(counts)

    # Save
    comparisons_dir = str(root_dir()) + "/models/allsorts/comparisons/"
    joblib.dump(umap_model, comparisons_dir + 'umap.sav')
    labels.to_csv(comparisons_dir + 'comparison_labels.csv')
    pd.Series(counts.columns).to_csv(comparisons_dir + 'comparison_genes.csv')
Example #2
0
	def _loadChrom(self):
		"""
		Load the chromosome reference table and group gene names by chromosome.

		...

		Reads the tab-separated, header-less file "/data/chrom_refs.txt" under
		root_dir().

		Output
		__________
		self.chrom_ref : Pandas DataFrame
			Indexed by chromosome, with "start", "end", "length" and "gene" columns.
		self.chrom_dict : dict
			Maps each chromosome (as it appears in the index) to the list of its
			gene names, in file order.
		self.chroms : list
			1..22 followed by "X" and "Y".
		"""

		chrom_ref_path = str(root_dir()) + "/data/chrom_refs.txt"

		self.chrom_ref = pd.read_csv(chrom_ref_path, sep="\t", header=None)
		# Discard the unused columns and name the four that are left
		self.chrom_ref.drop([1, 2, 5, 6, 7], axis=1, inplace=True)
		self.chrom_ref.columns = ["chrom", "start", "end", "meta"]
		self.chrom_ref["length"] = (
			self.chrom_ref["start"] - self.chrom_ref["end"]
		).abs()

		# Extract gene names: third ";"-separated field of the meta column,
		# taking the text between its first pair of double quotes.
		# Iterating the column directly avoids Series.iteritems(), which was
		# removed in pandas 2.0.
		self.chrom_ref["gene"] = [
			meta.split(";")[2].split('"')[1]
			for meta in self.chrom_ref["meta"]
		]

		self.chrom_ref.index = self.chrom_ref["chrom"]
		self.chrom_ref.drop(["chrom", "meta"], axis=1, inplace=True)
		self.chrom_ref["start"] = pd.to_numeric(self.chrom_ref["start"])
		self.chrom_ref["end"] = pd.to_numeric(self.chrom_ref["end"])

		# Create dictionary of genes per chromosome
		self.chrom_dict = {}
		for chrom, gene in zip(self.chrom_ref.index, self.chrom_ref["gene"]):
			self.chrom_dict.setdefault(chrom, []).append(gene)

		# NOTE(review): autosomes are ints here; whether chrom_dict keys share
		# that type depends on how pandas parsed column 0 - confirm before
		# using these values as lookup keys.
		self.chroms = list(range(1, 23)) + ["X", "Y"]
Example #3
0
def load_classifier(path=False):
    """
    Unpickle and return the ALLSorts classifier.

    ...

    Parameters
    __________
    path : str
        Location of the pickled ALLSorts model. Any falsy value falls back to
        "/models/allsorts/allsorts.pkl.gz" under root_dir().

    Returns
    __________
    allsorts_clf : ALLSorts object
        The object joblib.load returns for that path.
    """

    model_path = path if path else str(
        root_dir()) + "/models/allsorts/allsorts.pkl.gz"

    message("Loading classifier...")

    # NOTE: joblib.load unpickles the file, so only point this at model files
    # from a trusted source.
    return joblib.load(model_path)
Example #4
0
 def __init__(self):
     """
     Populate the instance from the command-line arguments, then run the
     input checks and load the samples.

     When self._is_cli() is falsy a usage hint is printed via message() and
     the process exits with status 0.
     """
     if not self._is_cli():
         message(
             "No arguments supplied. Please use allsorts --help for further information about input."
         )
         sys.exit(0)

     self.cli = True
     self.input = self._get_args()
     args = self.input

     # Values passed through, with False standing in for "not supplied"
     self.samples = args.samples
     self.labels = args.labels or False
     # root_dir() is only consulted when no model directory was supplied
     self.model_dir = args.model_dir or str(root_dir()) + "/models/allsorts/"
     self.destination = args.destination or False
     self.test = args.test

     # Flags and numeric options, normalised to bool / int
     self.train = bool(args.train)
     self.comparison = bool(args.comparison)
     self.n_jobs = int(args.njobs) if args.njobs else 1
     self.verbose = bool(args.verbose)
     self.force = bool(args.force)
     self.cv = int(args.cv) if args.cv else 3
     self.parents = bool(args.parents)
     self.ball = args.ball

     self._input_checks()
     self._load_samples()
Example #5
0
def get_figures(samples,
                allsorts,
                destination,
                probabilities,
                plots=("distributions", "waterfalls")):
    """
    Make figures of the results.

    ...

    Parameters
    __________
    samples : Pandas DataFrame
        Pandas DataFrame that represents the raw counts of your samples (rows) x genes (columns)).
        Only used for the "manifold" plot.
    allsorts : ALLSorts object
        Object providing predict_dist, predict_waterfall and predict_plot.
    destination : str
        Location of where the results should be saved.
    probabilities : Pandas DataFrame
        The result of running the get_predictions(samples, labels=False, parents=False) function.
        See function for further usage.
    plots : iterable of str
        Plots required. Default: "distributions" and "waterfalls".
        "manifold" is also recognised; any other name is ignored.
        See https://github.com/Oshlack/AllSorts/ for examples.

    Output
    __________
    distributions.png, waterfalls.png and/or manifold.png (one per requested
    plot) inside the destination path.

    """

    message("Saving figures...")

    for plot in plots:

        if plot == "distributions":
            dist_plot = allsorts.predict_dist(probabilities, return_plot=True)
            dist_plot.savefig(destination + "/distributions.png")

        elif plot == "waterfalls":
            # With a "True" column present no stored comparison set is passed;
            # otherwise the saved comparisons table is loaded and handed over.
            if "True" in probabilities.columns:
                comparisons = False
            else:
                comparisons = pd.read_csv(str(root_dir()) +
                                          "/models/allsorts/comparisons.csv",
                                          index_col=0)

            waterfall_plot = allsorts.predict_waterfall(probabilities,
                                                        compare=comparisons,
                                                        return_plot=True)
            waterfall_plot.savefig(destination + "/waterfalls.png")

        elif plot == "manifold":
            umap_plot = allsorts.predict_plot(samples, return_plot=True)
            umap_plot.savefig(destination + "/manifold.png")
Example #6
0
    def predict_plot(self, X, return_plot=False):
        """
        Given the raw counts, embed these within a UMAP visualisation consisting of the comparison data.

        ...

        Parameters
        __________
        X : Pandas DataFrame
            Pandas DataFrame that represents the raw counts of your samples (rows) x genes (columns)).
        return_plot : bool
            Rather than showing the plot through whatever IDE is being used, send it back to the function call.
            Likely so it can be saved.

        Returns
        __________
        The module-level `plt` object (presumably matplotlib.pyplot) holding the
        drawn figure when return_plot is True; otherwise None, after plt.show().

        Output
        __________
        UMAP Plot figure.

        """

        plt.figure(figsize=(20, 10))

        # Stored comparison artefacts: fitted UMAP, its labels and its genes
        comparisons_dir = str(root_dir()) + "/models/allsorts/comparisons/"
        u = joblib.load(comparisons_dir + "umap.sav")
        c_labels = pd.read_csv(comparisons_dir + "comparison_labels.csv",
                               index_col=0)
        c_labels = c_labels["labels"]
        c_genes = pd.read_csv(comparisons_dir + "comparison_genes.csv",
                              index_col=0)
        c_genes = list(c_genes.iloc[:, 0])

        # Embedding of the comparison data, and the new samples projected
        # into it using the same gene columns
        u_c = u.embedding_
        X_t = self.transform(X)
        X_t = X_t["counts"].loc[:, c_genes]
        u_t = u.transform(X_t)

        plt.scatter(u_t[:, 0], u_t[:, 1], c="#000000", alpha=0.4)
        plt.scatter(u_c[:, 0],
                    u_c[:, 1],
                    c=[c_subtypes[r] for r in c_labels],
                    alpha=0.4,
                    marker="x")

        # Write each label's name at the median position of its points
        transformed_positions = pd.DataFrame(u_c)
        transformed_positions["label"] = list(c_labels)

        median = transformed_positions.groupby("label").median()
        for name, centre in median.iterrows():
            # Property names are case-sensitive in current Matplotlib, so the
            # keyword must be the lowercase `fontsize`.
            plt.text(centre[0], centre[1], name, fontsize=16)

        if return_plot:
            return plt
        else:
            plt.show()