예제 #1
0
def load_classifier(path=False):
    """
    Unpickle the ALLSorts classifier from disk.

    ...

    Parameters
    __________
    path : str
        Path to a pickle object that holds the ALLSorts model.
        When falsy (the default), "/models/allsorts/allsorts.pkl.gz"
        underneath root_dir() is used instead.

    Returns
    __________
    allsorts_clf : ALLSorts object
        Whatever joblib.load() returns for that file.
    """

    # NOTE: joblib.load() unpickles the file, so only point this at model
    # files that come from a trusted source.
    model_path = path or str(root_dir()) + "/models/allsorts/allsorts.pkl.gz"

    message("Loading classifier...")
    return joblib.load(model_path)
예제 #2
0
    def transform(self, counts, y=False):
        """ Pre-process input counts as per parameters determined by fit().

        Parameters
        __________
        counts: Pandas DataFrame
            The counts to transform (samples/rows x genes/columns).
            Its index is cast to str in place.
        y :
            Not used by this method.

        Returns
        __________
        counts: Pandas DataFrame
            The counts restricted to self.genes (when self.filter_genes is
            set) and passed through self.tmm_norm (when self.norm == "TMM").
        """

        counts.index = counts.index.astype("str")

        # Check for missing genes BEFORE reindexing: reindex() adds every
        # label in self.genes as a column, so a check performed afterwards
        # can never find anything missing and the warning would never fire.
        missing_genes = list(set(self.genes).difference(counts.columns))
        if len(missing_genes) > 0:
            message(
                "Note: " + str(len(missing_genes)) +
                " genes not found in supplied samples, filling with zeroes.\n"
                + "This WILL impact classification performance.\n" +
                "Follow the counts guide on Github (http://) to resolve.",
                level="w")

        # Filter genes. fill_value=0 makes the newly introduced (missing)
        # gene columns zero, as the warning above promises, instead of NaN.
        if self.filter_genes:
            counts = counts.reindex(self.genes, axis=1, fill_value=0)

        # Normalise with TMM
        if self.norm == "TMM":
            counts = self.tmm_norm.transform(counts)

        return counts
예제 #3
0
def run_predictions(ui, allsorts):
    """
    Predict subtypes for the supplied samples and write the results to disk.

    ...

    Parameters
    __________
    ui : User Input Class
        Carries all information required to execute ALLSorts, see UserInput class for further information.
        This function reads ui.samples, ui.parents, ui.labels and ui.destination.
    allsorts : ALLSorts object
        The classifier handed on to get_predictions() and get_figures().

    Output
    __________
    probabilities.csv and predictions.csv at the ui.destination path, plus
    whatever figures get_figures() saves there.

    """

    out_dir = ui.destination

    predictions, probabilities = get_predictions(ui.samples,
                                                 allsorts,
                                                 parents=ui.parents)
    probabilities["Pred"] = list(predictions["Prediction"])

    # ui.labels is a bool when no labels were supplied.
    if not isinstance(ui.labels, bool):
        probabilities["True"] = ui.labels

    probabilities.round(3).to_csv(out_dir + "/probabilities.csv")
    predictions.to_csv(out_dir + "/predictions.csv")

    # The "B-ALL" column, when present, is left out of the figures.
    plot_probabilities = probabilities
    if "B-ALL" in probabilities.columns:
        plot_probabilities = probabilities.drop("B-ALL", axis=1)
    get_figures(ui.samples, allsorts, out_dir, plot_probabilities)

    message("Finished. Thanks for using ALLSorts!")
예제 #4
0
파일: user.py 프로젝트: Oshlack/ALLSorts
 def __init__(self):
     """Populate the instance from the command line arguments.

     When self._is_cli() is false a usage hint is printed and the process
     exits with status 0. Otherwise every option returned by
     self._get_args() is copied onto the instance (with a default where the
     option is empty), then self._input_checks() and self._load_samples()
     are run.
     """
     if not self._is_cli():
         message(
             "No arguments supplied. Please use allsorts --help for further information about input."
         )
         sys.exit(0)

     self.cli = True
     self.input = self._get_args()
     args = self.input

     self.samples = args.samples
     self.labels = args.labels or False
     # Fall back to the bundled model directory when none was supplied.
     self.model_dir = args.model_dir or str(root_dir()) + "/models/allsorts/"
     self.destination = args.destination or False
     self.test = args.test
     self.train = bool(args.train)
     self.comparison = bool(args.comparison)
     self.n_jobs = int(args.njobs) if args.njobs else 1
     self.verbose = bool(args.verbose)
     self.force = bool(args.force)
     self.cv = int(args.cv) if args.cv else 3
     self.parents = bool(args.parents)
     self.ball = args.ball

     self._input_checks()
     self._load_samples()
예제 #5
0
def get_figures(samples,
                allsorts,
                destination,
                probabilities,
                plots=("distributions", "waterfalls")):
    """
    Make figures of the results.

    ...

    Parameters
    __________
    samples : Pandas DataFrame
        Pandas DataFrame that represents the raw counts of your samples (rows) x genes (columns)).
        Only used for the "manifold" plot.
    allsorts : ALLSorts object
        The classifier whose predict_dist / predict_waterfall / predict_plot
        methods draw the figures.
    destination : str
        Location of where the results should be saved.
    probabilities : Pandas DataFrame
        The result of running the get_predictions(samples, labels=False, parents=False) function.
        See function for further usage.
    plots : iterable of str
        Plots required. Default: "distributions" and "waterfalls".
        "manifold" is also supported but is not drawn by default.
        Any other name is silently ignored.
        See https://github.com/Oshlack/AllSorts/ for examples.

    Output
    __________
    distributions.png, waterfalls.png and/or manifold.png at the destination path,
    depending on the plots requested.

    """

    message("Saving figures...")

    for plot in plots:

        if plot == "distributions":
            dist_plot = allsorts.predict_dist(probabilities, return_plot=True)
            dist_plot.savefig(destination + "/distributions.png")

        elif plot == "waterfalls":
            # When a "True" column is present no stored comparisons are
            # loaded; otherwise the bundled comparisons.csv is passed on.
            if "True" in probabilities.columns:
                comparisons = False
            else:
                comparisons = pd.read_csv(str(root_dir()) +
                                          "/models/allsorts/comparisons.csv",
                                          index_col=0)

            waterfall_plot = allsorts.predict_waterfall(probabilities,
                                                        compare=comparisons,
                                                        return_plot=True)
            waterfall_plot.savefig(destination + "/waterfalls.png")

        elif plot == "manifold":
            umap_plot = allsorts.predict_plot(samples, return_plot=True)
            umap_plot.savefig(destination + "/manifold.png")
예제 #6
0
파일: user.py 프로젝트: Oshlack/ALLSorts
    def _input_checks(self):
        """Validate the combination of options and exit on a usage error.

        Reads self.train, self.labels, self.samples and self.destination.
        On either error a message is printed and the process exits with a
        non-zero status so that calling scripts can detect the failure
        (a bare sys.exit() would report success).
        """

        # NOTE(review): truthiness is used here, so this presumably runs while
        # labels/samples are still the raw command line values - confirm.
        # NOTE(review): the message mentions -params/-p but only labels and
        # samples are actually checked.
        if self.train and not (self.labels and self.samples):
            message(
                "Error: if -train is set both -labels/-l, -params/-p, -samples/-s must be also. Exiting."
            )
            sys.exit(1)

        if not self.train and not self.destination:
            message(
                "Error: if -train is not set a destination (-d /path/to/output/) is required. Exiting."
            )
            sys.exit(1)
예제 #7
0
def run(ui=False):
    """
    Run ALLSorts in one of three modes: Training, Comparison building, Prediction.

    - Training mode (ui.train) calls train(ui=ui) and reports the elapsed time.
    - Comparison mode (ui.comparison) loads the classifier and calls
      run_comparison_builder() with the supplied samples and labels.
    - Prediction mode (neither of the above) loads the classifier and calls
      run_predictions() on the supplied samples.

    ...

    Parameters
    __________
    ui : User Input Class
        Carries all information required to execute ALLSorts, see UserInput class for further information.
        When falsy (the default) a UserInput() is created and the banner is printed.
    """

    if not ui:
        ui = UserInput()
        message(allsorts_asci)

    if ui.train:
        message("Training Mode", level=1)
        started = time.time()
        train(ui=ui)
        elapsed = round(time.time() - started, 2)  # Seconds
        message("Total Train time " + str(elapsed))
        return

    # Comparison and prediction share the same classifier set-up and differ
    # only in the banner and in the routine that is finally called.
    building_comparisons = bool(ui.comparison)
    if building_comparisons:
        message("Rebuilding Comparisons", level=1)
    else:
        message("Prediction Mode", level=1)

    allsorts_clf = load_classifier()
    allsorts_clf = _set_njobs(ui.n_jobs, allsorts_clf)
    # ui.ball is compared against the string "True", not the boolean.
    allsorts_clf.steps[-1][-1].filter_healthy = ui.ball == "True"

    if building_comparisons:
        run_comparison_builder(ui, allsorts_clf)
    else:
        run_predictions(ui, allsorts_clf)
예제 #8
0
def get_predictions(samples, allsorts, labels=False, parents=False):
    """
    Run ALLSorts over a set of samples and return predictions and probabilities.

    ...

    Parameters
    __________
    samples : Pandas DataFrame
        Pandas DataFrame that represents the raw counts of your samples (rows) x genes (columns)).
    allsorts : ALLSorts object
        The classifier; its predict_proba() and predict() methods are used.
    labels : Pandas Series
        Pandas series that has a label associated with each sample. Only used
        when it really is a Pandas Series; anything else is ignored.
    parents : bool
        True/False as to whether to include parents in the hierarchy in the output, i.e. Ph Group.

    Returns
    __________
    predictions: Pandas DataFrame
        A single "Prediction" column, indexed like samples.
    probabilities: Pandas DataFrame
        Probabilities returned by ALLSorts for each prediction - samples (rows) x subtype/meta-subtype (columns),
        plus a "True" column when labels were supplied.
        Note: These do not have to add to 1 column-wise - see paper (when it is released!)
    """

    message("Making predictions...")
    probabilities = allsorts.predict_proba(samples, parents=parents)

    # The "B-ALL" column, when present, is not passed on to predict().
    subtype_probabilities = probabilities
    if "B-ALL" in probabilities.columns:
        subtype_probabilities = probabilities.drop("B-ALL", axis=1)

    calls = allsorts.predict(subtype_probabilities,
                             probabilities=True,
                             parents=parents)
    predictions = pd.DataFrame(calls,
                               columns=["Prediction"],
                               index=samples.index)

    if isinstance(labels, pd.Series):
        probabilities["True"] = labels

    return predictions, probabilities
예제 #9
0
def run_comparison_builder(ui, allsorts):
    """
    Build the comparison results that future predictions are compared against.

    I.e. what the waterfall plot displays in addition to the predicted sample probabilities.
    ...

    Parameters
    __________
    ui : User Input Class
        Carries all information required to execute ALLSorts, see UserInput class for further information.
        This function reads ui.samples and ui.labels.
    allsorts : ALLSorts object
        The classifier handed on to get_predictions() and rebuild_comparisons().

    """

    # Parents are always requested here, regardless of ui.parents.
    prediction_args = {"labels": ui.labels, "parents": True}
    predictions, probabilities = get_predictions(ui.samples, allsorts,
                                                 **prediction_args)
    probabilities["Pred"] = list(predictions["Prediction"])

    message("Building comparisons...")
    rebuild_comparisons(allsorts, probabilities, ui)
    message("Finished.")
예제 #10
0
파일: train.py 프로젝트: Oshlack/ALLSorts
def train(ui=False):
    """ TRAINING A MODEL (OUTER LOOP)
        --
        This operation requires three steps, repeated for ui.cv folds:
        1. With a tuned estimator returned from inner loop, calibrate optimal thresholds.
        2. Score this method
        3. Train a final model, using the average of the final thresholds.

        Writes cross_val_results.csv and allsorts.pkl.gz underneath
        ui.model_dir (and normed_counts.csv in the working directory).
    """

    message("Cross Validation (this will take awhile):", level=2)

    # Create results path
    search_path = ui.model_dir + "gridsearch/"
    create_dir([ui.model_dir, search_path])

    # CV results: one entry per fold for every metric / subtype threshold
    subtypes = list(ui.labels.unique())
    thresholds_cv = {}
    results_cv = {"accuracy": [], "precision": [], "recall": [], "f1": []}
    weighted_scorers = (
        ("precision", precision_score),
        ("recall", recall_score),
        ("f1", f1_score),
    )

    for fold in range(1, ui.cv + 1):

        message("Fold: " + str(fold))

        # A fresh random 80/20 stratified split per fold.
        seed = np.random.randint(1, 1000)
        x_train, x_test, y_train, y_test = train_test_split(
            ui.samples,
            ui.labels,
            stratify=ui.labels,
            test_size=0.2,
            random_state=seed)

        # Inner loop (hyperparameter tuning)
        fold_clf = _tune(ui, x_train, y_train, fold=fold)

        probabilities = fold_clf.predict_proba(x_test, parents=True)
        final_stage = fold_clf.steps[-1][-1]

        # Optimise Prediction Thresholds
        thresholds = fit_thresholds(probabilities, final_stage.f_hierarchy,
                                    y_test)
        final_stage.thresholds = thresholds

        for subtype, fold_thresh in thresholds.items():
            thresholds_cv.setdefault(subtype, []).append(fold_thresh)

        # Score fold
        y_pred = fold_clf.predict(x_test, parents=True)

        hierarchy = {
            "High Sig": {
                "High hyperdiploid": False,
                "Low hyperdiploid": False,
                "Near haploid": False,
            },
            "Low hypodiploid": False,
            "iAMP21": False,
            "NUTM1": False,
            "BCL2/MYC": False,
            "TCF3-PBX1": False,
            "MEF2D": False,
            "HLF": False,
            "IKZF1 N159Y": False,
            "PAX5 P80R": False,
            "Ph Group": {
                "Ph-like": False,
                "Ph": False,
            },
            "PAX5alt": False,
            "ETV6-RUNX1 Group": {
                "ETV6-RUNX1": False,
                "ETV6-RUNX1-like": False,
            },
            "ZNF384 Group": False,
            "KMT2A Group": False,
            "DUX4": False,
        }

        flat = _flat_hierarchy(hierarchy, flat_hierarchy={})
        probs = fold_clf.predict_proba(x_test, parents=True)
        # NOTE(review): these two prints look like debugging leftovers.
        print(probs)
        print(flat)
        fold_preds(y_test, probs, f_hierarchy=flat)

        results_cv["accuracy"].append(round(accuracy_score(y_test, y_pred), 4))
        for metric, scorer in weighted_scorers:
            fold_score = scorer(y_test,
                                y_pred,
                                average="weighted",
                                zero_division=0,
                                labels=subtypes)
            results_cv[metric].append(round(fold_score, 4))

    # Train final model using all samples
    allsorts_clf = _tune(ui, ui.samples, ui.labels)

    # NOTE(review): this dump to the working directory looks like a
    # debugging leftover - confirm it is still wanted.
    normed = allsorts_clf.transform(ui.samples)
    normed["True"] = ui.labels
    normed["counts"].to_csv("normed_counts.csv")

    # Average thresholds over the folds
    thresholds = {
        subtype: round(sum(sub_thresh) / len(sub_thresh), 4)
        for subtype, sub_thresh in thresholds_cv.items()
    }
    allsorts_clf.steps[-1][-1].thresholds = thresholds

    # Save results and model
    scores = pd.DataFrame(results_cv, index=list(range(1, ui.cv + 1)))
    scores.to_csv(ui.model_dir + "cross_val_results.csv")

    save_path_model = ui.model_dir + "allsorts.pkl.gz"
    message("Saving model to: " + save_path_model)
    allsorts_clf.save(path=save_path_model)
예제 #11
0
파일: train.py 프로젝트: Oshlack/ALLSorts
def _tune(ui, x_train, y_train, fold="all"):
    """ TUNING A MODEL (INNER LOOP)
        --
        This operation requires two steps:
        1. Construct a pipeline using the ALLSorts class (A Sklearn pipeline extension).
        2. Gridsearch the parameter space that is outlined.

        Currently this is achieved by editing this function. Although, in future, this will be included within
        a passable JSON file that contains the below. Given that this is likely only to be run once in a blue moon,
        this is not a priority.

        For those wishing to use the ALLSorts model, with some substitutions of algorithms, simply edit this file after
        making a copy of the original (save it somewhere so you can always revert). Note, setting up an ALLSorts
        pipeline and grid search is identical to setting up a usual sklearn pipeline.

        For more information on how to achieve this visit:
        https://scikit-learn.org/stable/modules/generated/sklearn.pipeline.Pipeline.html
        https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html

        The grid search results are written to
        ui.model_dir + "gridsearch/gridsearch_fold<fold>.csv" and the best
        estimator found is returned.
    """

    hierarchy = {
        "High Sig": {
            "High hyperdiploid": False,
            "Low hyperdiploid": False,
            "Near haploid": False,
        },
        "Low hypodiploid": False,
        "iAMP21": False,
        "NUTM1": False,
        "BCL2/MYC": False,
        "TCF3-PBX1": False,
        "MEF2D": False,
        "HLF": False,
        "IKZF1 N159Y": False,
        "PAX5 P80R": False,
        "Ph Group": {
            "Ph-like": False,
            "Ph": False,
        },
        "PAX5alt": False,
        "ETV6-RUNX1 Group": {
            "ETV6-RUNX1": False,
            "ETV6-RUNX1-like": False,
        },
        "ZNF384 Group": False,
        "KMT2A Group": False,
        "DUX4": False,
    }

    # NOTE(review): the flattened hierarchy is not used below; the call is
    # kept in case _flat_hierarchy() has side effects - confirm.
    f_hierarchy = _flat_hierarchy(hierarchy, flat_hierarchy={})

    # Fusions to be added as features
    fusion_list = ["BCR_ABL1", "ETV6_RUNX1", "TCF3_PBX1", "TCF3_HLF"]

    # Set parameters to be used in GridSearchCV
    lr = LogisticRegression(penalty="l1",
                            solver="liblinear",
                            max_iter=500,
                            multi_class="auto",
                            class_weight="balanced")
    lr_params = [{"C": 0.3}]
    classifier = HierarchicalClassifier()

    # Create parameter grid for input into GridSearchCV
    hierarchy_grid = {
        "standardisation": [Scaler(scaler="std")],
        "centroids": [
            CentroidCreate(hierarchy=hierarchy,
                           distance_function=euclidean_distances)
        ],
        "feature_select": [
            FeatureSelection(hierarchy=hierarchy, method="all", test=False)
        ],
        "feature_create__chrom_feature": [True],
        "feature_create__iamp21_feature": [True],
        "feature_create__fusion_feature": [True],
        "train_model__hierarchy": [hierarchy],
        "train_model__model": [lr],
        "train_model__params": lr_params,
    }
    allsorts_params = [hierarchy_grid]

    # Note: Once benchmarks per cpu is made, estimate time of compute and distribute it accordingly
    training_x_models = len(list(ParameterGrid(allsorts_params)))
    grid_search_cv = 2

    # Give the jobs to the grid search when it has at least as many fits as
    # there are jobs; otherwise give them to the feature creation stage.
    if training_x_models * grid_search_cv >= ui.n_jobs:
        grid_jobs, stage_jobs = ui.n_jobs, 1
    else:
        grid_jobs, stage_jobs = 1, ui.n_jobs

    # Create Pipeline
    pipeline_steps = [
        ("preprocess", Preprocessing(filter_genes=True, norm="TMM")),
        ("feature_create",
         FeatureCreation(n_jobs=stage_jobs,
                         kernel_div=30,
                         fusions=fusion_list)),
        ("standardisation", Scaler()),
        ("feature_select", FeatureSelection()),
        ("centroids", CentroidCreate()),
        ("train_model", classifier),
    ]
    allsorts_pipe = ALLSorts(pipeline_steps, verbose=ui.verbose)

    # Tell the user how many models are about to be trained (first fold only)
    if fold == 1:
        total_fits = grid_search_cv * ui.cv * training_x_models
        message("Important: Training " + str(training_x_models) + " models (" +
                str(total_fits) +
                " with cross validation).",
                important=True)

    # Perform Grid Search - Likely to take a lot of time.
    grid = GridSearchCV(allsorts_pipe,
                        param_grid=allsorts_params,
                        cv=grid_search_cv,
                        n_jobs=grid_jobs,
                        scoring="balanced_accuracy")
    allsorts_grid = grid.fit(x_train, y_train)

    grid_results = _grid_save(allsorts_grid)
    grid_results.to_csv(ui.model_dir + "gridsearch/gridsearch_fold" +
                        str(fold) + ".csv")

    # Pick the estimator that maximised the score in our gridsearchcv
    return allsorts_grid.best_estimator_