Exemple #1
0
 def model_param(self):
     """Print the model's hyper-parameter configuration to stdout."""
     print_section('PARAMETERS')
     # label/value pairs printed one per line (labels kept verbatim)
     settings = (
         ("learning rate:", self.nu),
         ("Lambda:", self.Lambda),
         ("maximum iteration:", self.maxiter),
         ("kernel function: ", self.kernel),
         ("method: ", self.method),
     )
     for label, value in settings:
         print(label, value)
     return
Exemple #2
0
 def input_summary(self):
     """Print a summary of the loaded data set dimensions to stdout."""
     print_section('SUMMARY')
     # label/value pairs printed one per line (labels kept verbatim)
     summary = (
         ("Analysis type:", self.problem),
         ("input folder:", self.input_folder),
         ("output folder:", self.output_folder),
         ("number of training samples:", self.Ntrain),
         ("number of testing samples:", self.Ntest),
         ("number of pathways:", self.Ngroup),
         ("number of gene predictors:", self.Npred),
         ("number of clinical predictors:", self.Npred_clin),
     )
     for label, value in summary:
         print(label, value)
     return
Exemple #3
0
    def proc_input(self):
        """
        Load pathway, predictor, response, clinical and weight data.

        Reads the CSV files named by the ``*_file`` attributes from
        ``self.input_folder``, creates ``self.output_folder`` if needed,
        records the data dimensions (Ntrain, Ngroup, Npred, Npred_clin)
        and sets ``self.loaded`` to True when done.
        """
        print_section('LOAD DATA')
        # make output folder; exist_ok avoids the check-then-create race
        os.makedirs(self.output_folder, exist_ok=True)

        # pathway membership: group name -> space-separated gene names.
        # pd.Series.from_csv was removed in pandas 1.0; read_csv with
        # header=None + index_col=0, squeezed to a Series, replicates it.
        thisfile = os.path.join(self.input_folder, self.group_file)
        have_file(thisfile)
        self.pred_sets = pd.read_csv(thisfile, header=None,
                                     index_col=0).squeeze("columns")

        # training predictors (samples x genes)
        thisfile = os.path.join(self.input_folder, self.train_predictor_file)
        have_file(thisfile)
        self.train_predictors = pd.read_csv(thisfile, index_col=0)

        # training response
        thisfile = os.path.join(self.input_folder, self.train_response_file)
        have_file(thisfile)
        self.train_response = pd.read_csv(thisfile, index_col=0)

        # clinical covariates (optional)
        if self.hasClinical:
            thisfile = os.path.join(self.input_folder, self.clinical_file)
            have_file(thisfile)
            self.train_clinical = pd.read_csv(thisfile, index_col=0)
            self.clin_names = self.train_clinical.columns
        # pathway weights (optional); the squeeze=True keyword of read_csv
        # was removed in pandas 2.0, so squeeze the frame explicitly
        if self.hasWeights:
            thisfile = os.path.join(self.input_folder, self.weights_file)
            have_file(thisfile)
            raw_weights = pd.read_csv(thisfile, header=None,
                                      index_col=0).squeeze("columns")
            self.proc_weight(raw_weights)

        # data summary
        self.Ntrain = self.train_predictors.shape[0]
        self.Ngroup = self.pred_sets.shape[0]
        self.Npred = self.train_predictors.shape[1]
        if self.hasClinical:
            self.Npred_clin = self.train_clinical.shape[1]
        self.group_names = self.pred_sets.index

        # change loaded indicator
        self.loaded = True
        return
Exemple #4
0
    def data_preprocessing(self, center=False, norm=False):
        """
        Preprocess loaded training data in place.

        Optionally centers and/or scales the genomic predictors, drops
        pathway groups sharing no genes with the predictor matrix, and
        appends an intercept column to the clinical design matrix.

        Parameters
        ----------
        center : bool
            If True, remove column means from train_predictors.
        norm : bool
            If True, scale train_predictors columns (means untouched).
        """
        print_section('PROCESS DATA')
        # guard: proc_input must have run first (it sets self.loaded)
        if not self.loaded:
            print("No data loaded. Can not preprocess.")
            return

        # center genomic data
        # NOTE(review): `scale` is presumably sklearn.preprocessing.scale;
        # copy=False mutates train_predictors' underlying array — confirm.
        if center:
            print('Centering data.')
            scale(self.train_predictors, copy=False, with_std=False)

        # normalize data (with_mean=False leaves the means untouched)
        if norm:
            print("Normalizing data.")
            scale(self.train_predictors, copy=False, with_mean=False)

        # check groups: restrict each group to genes present in the
        # predictor matrix; groups with no overlap are dropped entirely
        print("Checking groups.")
        to_drop = []
        for i in range(len(self.pred_sets)):
            # each pred_sets entry is a space-separated string of gene names
            genes = self.pred_sets.values[i].split(" ")
            shared = np.intersect1d(self.train_predictors.columns.values,
                                    genes)
            if len(shared) == 0:
                print("Drop group:", self.pred_sets.index[i])
                to_drop.append(i)
            else:
                # write-through via .values mutates the Series in place
                self.pred_sets.values[i] = ' '.join(shared)
        if len(to_drop) > 0:
            self.pred_sets = self.pred_sets.drop(self.pred_sets.index[to_drop])
            self.group_names = self.pred_sets.index

        # add intercept column to clinical data; the survival problem
        # skips the intercept when clinical covariates are present
        intercept_col = pd.DataFrame({'intercept': np.ones(self.Ntrain)},
                                     index=self.train_predictors.index)
        if self.hasClinical:
            if self.problem != "survival":
                self.train_clinical = pd.concat(
                    [self.train_clinical, intercept_col], axis=1)
        else:
            self.train_clinical = intercept_col

        # calculate summary
        self.Ngroup = len(self.pred_sets)
        return
Exemple #5
0
    def data_split(self):
        """
        Split loaded data into train/test sets by sample label.

        Reads one sample label per line from ``self.test_file`` and moves
        those rows into ``test_predictors``/``test_response``/
        ``test_clinical``; remaining rows stay as training data.
        Updates ``Ntrain`` and ``Ntest``.  No-op when ``hasTest`` is False.
        """
        if not self.hasTest:
            return
        print_section('SPLIT DATA')
        print("Using test label: ", self.test_file)
        # load test file; `with` guarantees the handle is closed even if
        # reading raises (the original leaked it on error)
        thisfile = self.input_folder + '/' + self.test_file
        with open(thisfile, 'r') as f:
            test_ind = [x.strip() for x in f]
        # split data
        self.test_predictors = self.train_predictors.loc[test_ind]
        self.test_response = self.train_response.loc[test_ind]
        self.test_clinical = self.train_clinical.loc[test_ind]
        train_ind = np.setdiff1d(self.train_predictors.index.values,
                                 np.array(test_ind))
        self.train_predictors = self.train_predictors.loc[train_ind]
        self.train_response = self.train_response.loc[train_ind]
        self.train_clinical = self.train_clinical.loc[train_ind]

        # update summary
        self.Ntest = len(self.test_response)
        self.Ntrain = len(self.train_response)
        return
Exemple #6
0
def CV_PKB(inputs,
           K_train,
           Lambda,
           nfold=3,
           ESTOP=50,
           parallel=False,
           gr_sub=False,
           plot=False):
    """
    Select the optimal number of boosting iterations by cross-validation.

    Parameters
    ----------
    inputs : input object holding train_response/train_clinical, problem
        type ('classification'/'survival'/'regression'), maxiter, nu,
        method ('L1'/'L2') and output_folder.
    K_train : 3-d kernel array for training samples, indexed as
        (sample, sample, pathway) via np.ix_ below.
    Lambda : penalty parameter forwarded to oneiter_L1/oneiter_L2.
    nfold : number of CV folds.
    ESTOP : early-stopping patience (iterations without improvement).
    parallel, gr_sub : forwarded to the per-iteration solvers.
    plot : if True, save the CV-loss curve to <output_folder>/CV_loss.pdf.

    Returns
    -------
    int
        Iteration number achieving the minimum mean test loss.

    Raises
    ------
    ValueError
        If inputs.problem is not one of the three supported types
        (the original code failed later with a NameError).
    """
    ########## split data ###############
    # map sample labels -> positional indices
    temp = pd.Series(range(inputs.Ntrain), index=inputs.train_response.index)
    if inputs.problem == "classification":
        # stratify on the class label column
        test_inds = subsamp(inputs.train_response,
                            inputs.train_response.columns[0], nfold)
    elif inputs.problem == 'survival':
        # stratify on the censoring indicator column
        test_inds = subsamp(inputs.train_response,
                            inputs.train_response.columns[1], nfold)
    elif inputs.problem == "regression":
        test_inds = simple_subsamp(inputs.train_response, nfold)
    else:
        raise ValueError("unknown problem type: {}".format(inputs.problem))
    # folds[i] = [test positions, train positions]
    folds = []
    for i in range(nfold):
        folds.append([
            temp[test_inds[i]].values,
            np.setdiff1d(temp.values, temp[test_inds[i]].values)
        ])

    ########## initiate model for each fold ###############
    Ztrain_ls = [
        inputs.train_clinical.values[folds[i][1], :] for i in range(nfold)
    ]
    Ztest_ls = [
        inputs.train_clinical.values[folds[i][0], :] for i in range(nfold)
    ]
    K_train_ls = [
        K_train[np.ix_(folds[i][1], folds[i][1])] for i in range(nfold)
    ]
    K_test_ls = [
        K_train[np.ix_(folds[i][1], folds[i][0])] for i in range(nfold)
    ]

    if inputs.problem == "classification":
        ytrain_ls = [
            np.squeeze(inputs.train_response.iloc[folds[i][1]].values)
            for i in range(nfold)
        ]
        ytest_ls = [
            np.squeeze(inputs.train_response.iloc[folds[i][0]].values)
            for i in range(nfold)
        ]
        inputs_class = [
            CVinputs(inputs, ytrain_ls[i], ytest_ls[i]) for i in range(nfold)
        ]
        models = [
            assist.Classification.PKB_Classification(inputs_class[i],
                                                     ytrain_ls[i], ytest_ls[i])
            for i in range(nfold)
        ]
    elif inputs.problem == 'survival':
        # survival responses keep both columns (time, event)
        ytrain_ls = [
            inputs.train_response.iloc[folds[i][1], ].values
            for i in range(nfold)
        ]
        ytest_ls = [
            inputs.train_response.iloc[folds[i][0], ].values
            for i in range(nfold)
        ]
        inputs_class = [
            CVinputs(inputs, ytrain_ls[i], ytest_ls[i]) for i in range(nfold)
        ]
        models = [
            assist.Survival.PKB_Survival(inputs_class[i], ytrain_ls[i],
                                         ytest_ls[i]) for i in range(nfold)
        ]
    else:  # regression (problem type already validated above)
        ytrain_ls = [
            np.squeeze(inputs.train_response.iloc[folds[i][1]].values)
            for i in range(nfold)
        ]
        ytest_ls = [
            np.squeeze(inputs.train_response.iloc[folds[i][0]].values)
            for i in range(nfold)
        ]
        inputs_class = [
            CVinputs(inputs, ytrain_ls[i], ytest_ls[i]) for i in range(nfold)
        ]
        models = [
            assist.Regression.PKB_Regression(inputs_class[i], ytrain_ls[i],
                                             ytest_ls[i]) for i in range(nfold)
        ]

    for model in models:
        model.init_F()

    ########## boosting for each fold ###############
    opt_iter = 0
    # starting point: mean test loss across folds before any boosting
    min_loss = np.mean([model.test_loss[0] for model in models])
    ave_loss = [min_loss]

    print_section('Cross-Validation')
    print("{:>9}\t{:>14}\t{:>24}".format("iteration", "Mean test loss",
                                         "time (if no E-stop)"))

    time0 = time.time()
    for t in range(1, inputs.maxiter + 1):
        # one boosting iteration per fold
        for k in range(nfold):
            if inputs.method == 'L2':
                m, beta, gamma = oneiter_L2(K_train_ls[k], Ztrain_ls[k],
                                            models[k], Lambda=Lambda,
                                            parallel=parallel,
                                            group_subset=gr_sub)
            elif inputs.method == 'L1':
                m, beta, gamma = oneiter_L1(K_train_ls[k], Ztrain_ls[k],
                                            models[k], Lambda=Lambda,
                                            parallel=parallel,
                                            group_subset=gr_sub)
            # line search: scale the proposed update by the optimal step
            step = line_search(K_train_ls[k], Ztrain_ls[k], models[k],
                               [m, beta, gamma])
            beta *= step
            gamma *= step

            # update model with the kernel slices of the chosen pathway m
            models[k].update([m, beta, gamma], K_train_ls[k][:, :, m],
                             K_test_ls[k][:, :, m], Ztrain_ls[k], Ztest_ls[k],
                             inputs.nu)

        # track mean test loss and the best iteration so far
        cur_loss = np.mean([model.test_loss[-1] for model in models])
        if cur_loss < min_loss:
            min_loss = cur_loss
            opt_iter = t
        ave_loss.append(cur_loss)

        # progress report every 10 iterations with remaining-time estimate
        if t % 10 == 0:
            iter_persec = t / (time.time() - time0)  # iterations per second
            rem_time = (inputs.maxiter - t) / iter_persec  # remaining time
            print("{:9.0f}\t{:14.4f}\t{:24.4f}".format(t, cur_loss,
                                                       rem_time / 60))

        # early stop after ESTOP iterations without improvement
        if t - opt_iter >= ESTOP:
            print('Early stop criterion satisfied: break CV.')
            break

    # print the number of iterations used
    print('using iteration number:', opt_iter)

    # visualization
    if plot:
        folder = inputs.output_folder
        fig = plt.figure()
        plt.plot(ave_loss)
        plt.xlabel("iterations")
        plt.ylabel("CV loss")
        fig.savefig(folder + '/CV_loss.pdf')
    return opt_iter