Example #1
    def kfold_prediction(self, k=10):
        # generate indices for kfold cross validation
        self.num_pred = 0  # number of predictions

        onco_prob = pd.Series(0.0, index=self.y.index)
        tsg_prob = pd.Series(0.0, index=self.y.index)

        for i in range(self.total_iter):
            # randomize for another round
            self.x, self.y = futils.randomize(self.x, self.prng)
            futils.check_num_classes(self.y)  # warn user if not 3 classes

            # set up stratified kfold iterator (sklearn.model_selection.StratifiedKFold)
            skf = StratifiedKFold(n_splits=k)
            k_fold = skf.split(self.x, self.y)

            # obtain predictions from single round of kfold validation
            for train_ix, test_ix in k_fold:
                # retrieve pandas index labels from the fold's row numbers
                tmp_train_ix = self.x.iloc[train_ix].index
                tmp_test_ix = self.x.iloc[test_ix].index

                if self.is_weighted_sample:
                    # weight each class inversely to its size in the training fold
                    num_train = len(train_ix)
                    sample_weight = np.zeros(num_train)
                    onco_ix = np.nonzero(
                        self.y.loc[tmp_train_ix] == self.onco_num)[0]
                    tsg_ix = np.nonzero(
                        self.y.loc[tmp_train_ix] == self.tsg_num)[0]
                    other_ix = np.nonzero(
                        self.y.loc[tmp_train_ix] == self.other_num)[0]
                    sample_weight[onco_ix] = 1. / len(onco_ix)
                    sample_weight[tsg_ix] = 1. / len(tsg_ix)
                    sample_weight[other_ix] = 1. / len(other_ix)

                    # do training with sample weighting
                    self.clf.fit(self.x.loc[tmp_train_ix].copy(),
                                 self.y.loc[tmp_train_ix].copy(),
                                 sample_weight=sample_weight)
                else:
                    # do training without weighting
                    self.clf.fit(self.x.loc[tmp_train_ix].copy(),
                                 self.y.loc[tmp_train_ix].copy())

                # predict test data in kfold validation
                tmp_prob = self.clf.predict_proba(self.x.loc[tmp_test_ix])
                onco_prob.loc[tmp_test_ix] += tmp_prob[:, self.onco_num]
                tsg_prob.loc[tmp_test_ix] += tmp_prob[:, self.tsg_num]

            self.num_pred += 1

        # average the accumulated probabilities over the iterations
        onco_prob /= self.num_pred
        tsg_prob /= self.num_pred
        other_prob = 1 - (onco_prob + tsg_prob)

        return onco_prob, tsg_prob, other_prob
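The is_weighted_sample branch gives every class the same total weight: each member of a class receives 1/|class|, so no single class dominates training. A minimal self-contained sketch of that scheme (the toy labels below are illustrative, not project data):

import numpy as np

# toy training labels; 1/np.bincount(y)[y] reproduces the per-class weights above
y_train = np.array([0, 0, 0, 0, 1, 1, 2, 2, 2])
sample_weight = 1.0 / np.bincount(y_train)[y_train]

# every class contributes equal total weight regardless of its size
for label in np.unique(y_train):
    assert np.isclose(sample_weight[y_train == label].sum(), 1.0)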
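Out-of-fold probabilities are accumulated by index label and averaged over the shuffled rounds, so each gene's final score is the mean of total_iter held-out predictions. A self-contained sketch of that pattern (the toy data, N_ROUNDS, and RandomForestClassifier are stand-ins for the class attributes, not the project's actual configuration):

import numpy as np
import pandas as pd
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold

# toy stand-ins for self.x / self.y
X, y = make_classification(n_samples=120, n_classes=3, n_informative=5, random_state=0)
x_df, y_sr = pd.DataFrame(X), pd.Series(y)

N_ROUNDS, K = 3, 10  # stand-ins for self.total_iter and k
clf = RandomForestClassifier(random_state=0)
rng = np.random.RandomState(0)

prob = pd.DataFrame(0.0, index=y_sr.index, columns=range(3))
for _ in range(N_ROUNDS):
    # shuffle rows each round, mirroring futils.randomize
    order = rng.permutation(len(y_sr))
    x_df, y_sr = x_df.iloc[order], y_sr.iloc[order]
    for train_ix, test_ix in StratifiedKFold(n_splits=K).split(x_df, y_sr):
        clf.fit(x_df.iloc[train_ix], y_sr.iloc[train_ix])
        # accumulate out-of-fold class probabilities by index label
        prob.loc[x_df.index[test_ix]] += clf.predict_proba(x_df.iloc[test_ix])
prob /= N_ROUNDS  # rows now average to a probability distribution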
Example #3
    def train_cv(self, k=10):
        """Train classifier on entire data set provided, but done in cross-validation."""
        # generate indices for kfold cross validation
        self.num_pred = 0  # number of predictions
        self.test_fold_df = pd.DataFrame({l + 1: 0 for l in range(self.total_iter)},
                                         index=self.x.index)

        for i in range(self.total_iter):
            # randomize for another round
            self.x, self.y = futils.randomize(self.x, self.prng)
            futils.check_num_classes(self.y)  # warn user if not 3 classes

            # set up stratified kfold iterator (sklearn.model_selection.StratifiedKFold)
            skf = StratifiedKFold(n_splits=k)
            k_fold = skf.split(self.x, self.y)

            # obtain predictions from single round of kfold validation
            for nfold, (train_ix, test_ix) in enumerate(k_fold):
                # retrieve pandas index labels from the fold's row numbers
                tmp_train_ix = self.x.iloc[train_ix].index

                # save which genes are in the test fold
                tmp_test_ix = self.x.iloc[test_ix].index
                self.test_fold_df.loc[tmp_test_ix, i + 1] = nfold + 1

                if self.is_weighted_sample:
                    # figure out sample weights
                    num_train = len(train_ix)
                    sample_weight = np.zeros(num_train)
                    onco_ix = np.nonzero(
                        self.y.loc[tmp_train_ix] == self.onco_num)[0]
                    tsg_ix = np.nonzero(
                        self.y.loc[tmp_train_ix] == self.tsg_num)[0]
                    other_ix = np.nonzero(
                        self.y.loc[tmp_train_ix] == self.other_num)[0]
                    sample_weight[onco_ix] = 1. / len(onco_ix)
                    sample_weight[tsg_ix] = 1. / len(tsg_ix)
                    sample_weight[other_ix] = 1. / len(other_ix)

                    # do training with sample weighting
                    self.clf.fit(self.x.loc[tmp_train_ix].copy(),
                                 self.y.loc[tmp_train_ix].copy(),
                                 sample_weight=sample_weight)
                else:
                    # do training without weighting
                    self.clf.fit(self.x.loc[tmp_train_ix].copy(),
                                 self.y.loc[tmp_train_ix].copy())
                self.clf.append_fold_result()  # add the training result from each fold
            self.clf.append_cv_result()  # add the training result for a single CV to the R variable

            self.num_pred += 1
        self.clf.set_cv_fold(self.test_fold_df)
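test_fold_df records, for every gene, which held-out fold it fell into on each iteration: columns are 1-based iteration numbers and cells are 1-based fold numbers. A small sketch of that bookkeeping (the gene labels and class vector are illustrative stand-ins for self.x.index and self.y):

import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold

genes = pd.Index(['g%d' % i for i in range(30)])
y = pd.Series(np.tile([0, 1, 2], 10), index=genes)

total_iter, k = 2, 5
fold_df = pd.DataFrame({i + 1: 0 for i in range(total_iter)}, index=genes)
for i in range(total_iter):
    splits = StratifiedKFold(n_splits=k).split(np.zeros(len(y)), y)
    for nfold, (_, test_ix) in enumerate(splits):
        fold_df.loc[genes[test_ix], i + 1] = nfold + 1

# every gene is held out exactly once per iteration
assert (fold_df > 0).all().all()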
Example #5
    def train(self):
        """Train classifier on entire data set provided."""
        self.x, self.y = futils.randomize(self.x, self.prng)
        futils.check_num_classes(self.y)  # warn user if not 3 classes
        self.clf.fit(self.x, self.y)
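futils.randomize is a project helper; the same shuffle-then-fit step can be written with stock scikit-learn utilities. In this sketch sklearn.utils.shuffle stands in for it, purely for illustration:

import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.utils import shuffle

# toy stand-ins for self.x / self.y
x = pd.DataFrame({'f1': range(6), 'f2': range(6, 12)})
y = pd.Series([0, 1, 2, 0, 1, 2])

# shuffle features and labels together, then fit on the full data set
x, y = shuffle(x, y, random_state=0)
clf = RandomForestClassifier(random_state=0)
clf.fit(x, y)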
Example #6
    def kfold_validation(self, k=10):
        """Records the performance in terms of ROC and PR AUC for cross-validation.

        Parameters
        ----------
        k : int (10)
            Number of cross-validation folds
        """
        self.num_pred = 0  # number of predictions

        for i in range(self.total_iter):
            # randomize for another round
            self.x, self.y = futils.randomize(self.x, self.prng)
            futils.check_num_classes(self.y)  # warn user if not 3 classes

            # initialize predicted results variables
            num_genes = len(self.y)
            onco_pred = np.zeros(num_genes)
            onco_prob = np.zeros(num_genes)
            tsg_pred = np.zeros(num_genes)
            tsg_prob = np.zeros(num_genes)
            overall_pred = np.zeros(num_genes)

            # set up stratified kfold iterator (sklearn.model_selection.StratifiedKFold)
            k_fold = StratifiedKFold(n_splits=k).split(self.x, self.y)

            # evaluate k-fold cross validation
            for train_ix, test_ix in k_fold:
                if self.is_weighted_sample:
                    # weight classes by using sample weights
                    num_train = len(train_ix)
                    sample_weight = np.zeros(num_train)
                    onco_ix = np.nonzero(self.y.iloc[train_ix] == self.onco_num)[0]
                    tsg_ix = np.nonzero(self.y.iloc[train_ix] == self.tsg_num)[0]
                    other_ix = np.nonzero(self.y.iloc[train_ix] == self.other_num)[0]
                    sample_weight[onco_ix] = 1. / len(onco_ix)
                    sample_weight[tsg_ix] = 1. / len(tsg_ix)
                    sample_weight[other_ix] = 1. / len(other_ix)

                    # do training
                    self.clf.fit(self.x.iloc[train_ix].copy(),
                                 self.y.iloc[train_ix].copy(),
                                 sample_weight=sample_weight)
                else:
                    # do training without sample weights
                    self.clf.fit(self.x.iloc[train_ix].copy(),
                                 self.y.iloc[train_ix].copy())

                # do prediction
                y_pred = self.clf.predict(self.x.iloc[test_ix])
                proba_ = self.clf.predict_proba(self.x.iloc[test_ix])

                # update information
                overall_pred[test_ix] = y_pred  # prediction including all classes
                onco_pred[test_ix] = (y_pred == self.onco_num).astype(int)  # predicted oncogenes
                onco_prob[test_ix] = proba_[:, self.onco_num]  # oncogene probability
                tsg_pred[test_ix] = (y_pred == self.tsg_num).astype(int)  # predicted tumor suppressors
                tsg_prob[test_ix] = proba_[:, self.tsg_num]  # tumor suppressor probability

            # update information
            true_onco = (self.y == self.onco_num).astype(int)
            self._update_onco_metrics(true_onco, onco_pred, onco_prob)
            true_tsg = (self.y == self.tsg_num).astype(int)  # true tumor suppressors
            self._update_tsg_metrics(true_tsg, tsg_pred, tsg_prob)
            self._update_metrics(self.y, overall_pred, onco_prob, tsg_prob)
            self.num_pred += 1

        self._on_finish()  # update info for kfold cross-validation
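_update_onco_metrics, _update_tsg_metrics, and _on_finish are project-specific, but the per-class ROC and PR AUC they presumably record can be computed from the out-of-fold arrays with stock scikit-learn metrics. A sketch under that assumption (one_vs_rest_scores is a hypothetical helper, not part of the project):

import numpy as np
from sklearn.metrics import average_precision_score, roc_auc_score

def one_vs_rest_scores(y_true, class_label, class_prob):
    """ROC AUC and PR AUC for one class against the rest,
    computed from out-of-fold probabilities (hypothetical helper)."""
    binary_truth = (np.asarray(y_true) == class_label).astype(int)
    return (roc_auc_score(binary_truth, class_prob),
            average_precision_score(binary_truth, class_prob))

# e.g. after one iteration of kfold_validation:
# onco_roc, onco_pr = one_vs_rest_scores(self.y, self.onco_num, onco_prob)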