Code example #1
def readcrossval(feat_m1,
                 config,
                 sinkfolder,
                 patientinfo,
                 outputfolder,
                 feat_m2=None,
                 feat_m3=None,
                 alpha=0.95,
                 label_type=None,
                 survival=False,
                 n_classifiers=[1, 5, 10]):
    # n_classifiers = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 20 ,25, 30, 40 , 50]
    n_classifiers = [1]  # NOTE: overrides the n_classifiers argument; only the top-1 classifier is evaluated
    config = config_io.load_config(config)
    sinks = glob.glob(sinkfolder + 'RS*.hdf5')

    # Sort sinks by modification time
    sinktimes = [os.path.getmtime(f) for f in sinks]
    sinks = [s for _, s in sorted(zip(sinktimes, sinks))]

    if label_type is None:
        label_type = config['Genetics']['mutation_type']

    if survival:
        # Also extract the time to event and the event indicator from the mutation data
        labels = [label_type, ['E'], ['T']]
    else:
        labels = [[label_type]]

    if feat_m1:
        label_data, _ =\
            readdata(feat_m1, feat_m2, feat_m3, patientinfo,
                     labels)
    else:
        # No feature files found
        label_data, _ = findmutationdata(patientinfo, labels)

    for n_class in n_classifiers:
        output_json = os.path.join(
            outputfolder, ('performance_{}.json').format(str(n_class)))

        sensitivity = list()
        specificity = list()
        precision = list()
        accuracy = list()
        auc = list()
        # auc_train = list()
        f1_score_list = list()

        patient_classification_list = dict()

        patient_IDs = label_data['patient_IDs']
        mutation_label = label_data['mutation_label']

        trained_classifiers = list()

        y_score = list()
        y_test = list()
        pid_test = list()
        y_predict = list()

        # For SVR
        r2score = list()
        MSE = list()
        coefICC = list()
        PearsonC = list()
        PearsonP = list()
        SpearmanC = list()
        SpearmanP = list()

        if survival:
            cindex = list()
            coxp = list()
            coxcoef = list()

        patient_MSE = dict()

        csvfile = os.path.join(outputfolder, 'scores.csv')
        towrite = list()

        empty_scores = {k: '' for k in natsort.natsorted(patient_IDs)}
        empty_scores = collections.OrderedDict(sorted(empty_scores.items()))
        towrite.append(["Patient"] + empty_scores.keys())
        params = dict()
        for num, s in enumerate(sinks):
            scores = empty_scores.copy()
            print("Processing {} / {}.").format(str(num + 1), str(len(sinks)))
            # pd.read_hdf expects a path (or HDFStore), not an open file handle
            sr = pd.read_hdf(s)
            sr = sr['Constructed crossvalidation']
            t = sr.trained_classifier
            trained_classifiers.append(sr.trained_classifier)

            # Extract test info
            test_patient_IDs = sr.patient_ID_test
            X_test = sr.X_test
            Y_test = sr.Y_test

            # Extract sample size
            N_1 = float(len(sr.patient_ID_train))
            N_2 = float(len(sr.patient_ID_test))

            test_indices = list()
            for i_ID in test_patient_IDs:
                test_indices.append(np.where(patient_IDs == i_ID)[0][0])

                if i_ID not in patient_classification_list:
                    patient_classification_list[i_ID] = dict()
                    patient_classification_list[i_ID]['N_test'] = 0
                    patient_classification_list[i_ID]['N_correct'] = 0
                    patient_classification_list[i_ID]['N_wrong'] = 0

                patient_classification_list[i_ID]['N_test'] += 1

            # y_truth = [mutation_label[0][k] for k in test_indices]
            # FIXME: order can be switched, need to find a smart fix
            # 1 for normal, 0 for KM
            y_truth = [mutation_label[0][k][0] for k in test_indices]

            # Predict using the top N classifiers
            results = t.cv_results_['rank_test_score']
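            # rank_test_score assigns rank 1 to the best scoring parameter
            # setting, so sorting the candidate indices by rank and keeping the
            # first n_class entries below selects the top-N settings.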
            indices = range(0, len(results))
            sortedindices = [x for _, x in sorted(zip(results, indices))]
            sortedindices = sortedindices[0:n_class]
            y_prediction = np.zeros([n_class, len(y_truth)])
            y_score = np.zeros([n_class, len(y_truth)])

            # Get some base objects required
            feature_labels = pd.read_hdf(feat_m1[0]).feature_labels
            base_estimator = t.estimator
            X_train = [(x, feature_labels) for x in sr.X_train]
            y_train = sr.Y_train
            y_train_prediction = np.zeros([n_class, len(y_train)])
            scorer = t.scorer_
            train = np.asarray(range(0, len(y_train)))
            test = train
            del sr  # Save some memory
            # cv_iter = list(t.cv.iter(X_train, y_train))

            # NOTE: need to build this in the SearchCVFastr Object
            for i, index in enumerate(sortedindices):
                print("Processing number {} of {} classifiers.").format(
                    str(i + 1), str(n_class))
                X_testtemp = X_test[:]

                # Get the parameters from the index
                parameters_est = t.cv_results_['params'][index]
                parameters_all = t.cv_results_['params_all'][index]

                # NOTE: kernel parameter can be unicode
                kernel = str(parameters_est[u'kernel'])
                del parameters_est[u'kernel']
                del parameters_all[u'kernel']
                parameters_est['kernel'] = kernel
                parameters_all['kernel'] = kernel

                # Refit a classifier using the settings given
                print("Refitting classifier with best settings.")
                best_estimator = clone(base_estimator).set_params(
                    **parameters_est)

                ret, GroupSel, VarSel, SelectModel, feature_labels[0], scaler =\
                    fit_and_score(best_estimator, X_train, y_train, scorer,
                                  train, test, True, parameters_all,
                                  t.fit_params,
                                  t.return_train_score,
                                  True, True, True,
                                  t.error_score)
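                # GroupSel, VarSel and SelectModel are the fitted feature
                # selection objects and scaler the fitted feature scaler;
                # they are re-applied to both the training and the test
                # features below before refitting the estimator.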

                X = [x[0] for x in X_train]
                if GroupSel is not None:
                    X = GroupSel.transform(X)
                    X_testtemp = GroupSel.transform(X_testtemp)

                if SelectModel is not None:
                    X = SelectModel.transform(X)
                    X_testtemp = SelectModel.transform(X_testtemp)

                if VarSel is not None:
                    X = VarSel.transform(X)
                    X_testtemp = VarSel.transform(X_testtemp)

                if scaler is not None:
                    X = scaler.transform(X)
                    X_testtemp = scaler.transform(X_testtemp)

                if y_train is not None:
                    best_estimator.fit(X, y_train, **t.fit_params)
                else:
                    best_estimator.fit(X, **t.fit_params)

                # Predict the posteriors using the fitted classifier for the training set
                print("Evaluating performance on training set.")
                if hasattr(best_estimator, 'predict_proba'):
                    probabilities = best_estimator.predict_proba(X)
                    y_train_prediction[i, :] = probabilities[:, 1]
                else:
                    # Regression has no probabilities
                    probabilities = best_estimator.predict(X)
                    y_train_prediction[i, :] = probabilities[:]

                # Predict the posteriors using the fitted classifier for the test set
                print("Evaluating performance on test set.")
                if hasattr(best_estimator, 'predict_proba'):
                    probabilities = best_estimator.predict_proba(X_testtemp)
                    y_prediction[i, :] = probabilities[:, 1]
                else:
                    # Regression has no probabilities
                    probabilities = best_estimator.predict(X_testtemp)
                    y_prediction[i, :] = probabilities[:]

                if type(t.estimator) == sklearn.svm.classes.SVC:
                    y_score[i, :] = best_estimator.decision_function(
                        X_testtemp)
                else:
                    y_score[i, :] = best_estimator.decision_function(
                        X_testtemp)[:, 0]

                # Record the parameter settings that were used
                for k in parameters_all.keys():
                    if k not in params.keys():
                        params[k] = list()
                    params[k].append(parameters_all[k])

                # Save some memory
                del best_estimator, X, X_testtemp, ret, GroupSel, VarSel, SelectModel, scaler, parameters_est, parameters_all, probabilities

            # Take mean over posteriors of top n
            y_train_prediction_m = np.mean(y_train_prediction, axis=0)
            y_prediction_m = np.mean(y_prediction, axis=0)

            # NOTE: Not sure if this is best way to compute AUC
            y_score = y_prediction_m

            if type(t.estimator) == sklearn.svm.classes.SVC:
                # Look for optimal F1 performance on training set
                thresholds = np.arange(0, 1, 0.01)
                f1_scores = list()
                y_train_prediction = np.zeros(y_train_prediction_m.shape)
                for thresh in thresholds:  # avoid shadowing the trained classifier 't'
                    for ip, y in enumerate(y_train_prediction_m):
                        if y > thresh:
                            y_train_prediction[ip] = 1
                        else:
                            y_train_prediction[ip] = 0

                    f1_scores.append(
                        f1_score(y_train_prediction,
                                 y_train,
                                 average='weighted'))

                # Use best threshold to determine test score
                best_index = np.argmax(f1_scores)
                best_thresh = thresholds[best_index]
                best_thresh = 0.5  # NOTE: overrides the threshold found above with a fixed 0.5
                y_prediction = np.zeros(y_prediction_m.shape)
                for ip, y in enumerate(y_prediction_m):
                    if y > best_thresh:
                        y_prediction[ip] = 1
                    else:
                        y_prediction[ip] = 0

                # y_prediction = t.predict(X_temp)

                y_prediction = [min(max(y, 0), 1) for y in y_prediction]
            else:
                y_prediction = y_prediction_m
                y_prediction = [min(max(y, 0), 1) for y in y_prediction]

            print "Truth: ", y_truth
            print "Prediction: ", y_prediction

            for k, v in zip(test_patient_IDs, y_prediction):
                scores[k] = v

            # for k, v in scores.iteritems():
            #     print k, v
            #
            # raise IOError
            towrite.append(["Iteration " + str()] + scores.values())

            if type(t.estimator) == sklearn.svm.classes.SVC:
                for i_truth, i_predict, i_test_ID in zip(
                        y_truth, y_prediction, test_patient_IDs):
                    if i_truth == i_predict:
                        patient_classification_list[i_test_ID][
                            'N_correct'] += 1
                    else:
                        patient_classification_list[i_test_ID]['N_wrong'] += 1

            if type(t.estimator) == sklearn.svm.classes.SVC:
                c_mat = confusion_matrix(y_truth, y_prediction)
                TN = c_mat[0, 0]
                FN = c_mat[1, 0]
                TP = c_mat[1, 1]
                FP = c_mat[0, 1]

                if FN == 0 and TP == 0:
                    sensitivity.append(0)
                else:
                    sensitivity.append(float(TP) / (TP + FN))
                if FP == 0 and TN == 0:
                    specificity.append(0)
                else:
                    specificity.append(float(TN) / (FP + TN))
                if TP == 0 and FP == 0:
                    precision.append(0)
                else:
                    precision.append(float(TP) / (TP + FP))
                accuracy.append(accuracy_score(y_truth, y_prediction))
                # y_score = t.decision_function(X_temp)
                auc.append(roc_auc_score(y_truth, y_score))
                f1_score_list.append(
                    f1_score(y_truth, y_prediction, average='weighted'))
            # elif type(t.estimator) == sklearn.svm.classes.SVR:
            else:
                # y_score.extend(svm[k].ix('svms')[0].predict_proba(X_test))
                # y_predict.extend(svm[k].ix('svms')[0].predict(X_test))
                # y_test.extend(Y_test)
                # pid_test.extend(pidt)
                r2score.append(r2_score(y_truth, y_prediction))
                MSE.append(mean_squared_error(y_truth, y_prediction))
                coefICC.append(ICC(np.column_stack((y_prediction, y_truth))))
                C = pearsonr(y_prediction, y_truth)
                PearsonC.append(C[0])
                PearsonP.append(C[1])
                C = spearmanr(y_prediction, y_truth)
                SpearmanC.append(C.correlation)
                SpearmanP.append(C.pvalue)

                if survival:
                    # Extract time to event and event from label data
                    E_truth = np.asarray(
                        [mutation_label[1][k][0] for k in test_indices])
                    T_truth = np.asarray(
                        [mutation_label[2][k][0] for k in test_indices])

                    # Concordance index
                    cindex.append(1 - ll.utils.concordance_index(
                        T_truth, y_prediction, E_truth))

                    # Fit Cox model using SVR output, time to event and event
                    data = {
                        'predict': y_prediction,
                        'E': E_truth,
                        'T': T_truth
                    }
                    data = pd.DataFrame(data=data, index=test_patient_IDs)

                    try:
                        cph = ll.CoxPHFitter()
                        cph.fit(data, duration_col='T', event_col='E')

                        coxcoef.append(cph.summary['coef']['predict'])
                        coxp.append(cph.summary['p']['predict'])
                    except ValueError:
                        # Convergence halted, delta contains nan values?
                        coxcoef.append(1)
                        coxp.append(0)
                    except np.linalg.LinAlgError:
                        #FIXME: Singular matrix
                        coxcoef.append(1)
                        coxp.append(0)

        towrite = zip(*towrite)
        with open(csvfile, 'wb') as csv_file:
            writer = csv.writer(csv_file)
            for w in towrite:
                writer.writerow(w)

        # print(N_1)
        # print(N_2)

        if type(t.estimator) == sklearn.svm.classes.SVC:
            N_iterations = len(sinks)
            accuracy_mean = np.mean(accuracy)
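            # This appears to be the corrected resampled variance for repeated
            # random-split cross-validation (cf. Nadeau & Bengio): S_uj is the
            # sample variance of the per-split accuracies, and the
            # (1 / N_iterations + N_2 / N_1) factor corrects for the overlap
            # between training sets.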
            S_uj = 1.0 / max((N_iterations - 1), 1) * np.sum(
                (accuracy_mean - accuracy)**2.0)

            # print Y_test

            accuracy_var = np.sqrt((1.0 / N_iterations + N_2 / N_1) * S_uj)
            # print(accuracy_var)
            # print(np.sqrt(1/N_iterations*S_uj))
            # print(st.sem(accuracy))

            stats = dict()
            stats["Accuracy 95%:"] = str(CI(accuracy, N_1, N_2, alpha))

            stats["AUC 95%:"] = str(CI(auc, N_1, N_2, alpha))

            stats["F1-score 95%:"] = str(CI(f1_score_list, N_1, N_2, alpha))

            stats["Precision 95%:"] = str(CI(precision, N_1, N_2, alpha))

            stats["Sensitivity 95%: "] = str(CI(sensitivity, N_1, N_2, alpha))

            stats["Specificity 95%:"] = str(CI(specificity, N_1, N_2, alpha))

            print("Accuracy 95%:" + str(CI(accuracy, N_1, N_2, alpha)))

            print("AUC 95%:" + str(CI(auc, N_1, N_2, alpha)))

            print("F1-score 95%:" + str(CI(f1_score_list, N_1, N_2, alpha)))

            print("Precision 95%:" + str(CI(precision, N_1, N_2, alpha)))

            print("Sensitivity 95%: " + str(CI(sensitivity, N_1, N_2, alpha)))

            print("Specificity 95%:" + str(CI(specificity, N_1, N_2, alpha)))

            alwaysright = dict()
            alwayswrong = dict()
            for i_ID in patient_classification_list:
                percentage_right = patient_classification_list[i_ID][
                    'N_correct'] / float(
                        patient_classification_list[i_ID]['N_test'])

                # print(i_ID + ' , ' + str(patient_classification_list[i_ID]['N_test']) + ' : ' + str(percentage_right) + '\n')
                if percentage_right == 1.0:
                    label = mutation_label[0][np.where(i_ID == patient_IDs)]
                    label = label[0][0]
                    alwaysright[i_ID] = label
                    # alwaysright.append(('{} ({})').format(i_ID, label))
                    print(("Always Right: {}, label {}").format(i_ID, label))

                if percentage_right == 0:
                    label = mutation_label[0][np.where(
                        i_ID == patient_IDs)].tolist()
                    label = label[0][0]
                    alwayswrong[i_ID] = label
                    # alwayswrong.append(('{} ({})').format(i_ID, label))
                    print(("Always Wrong: {}, label {}").format(i_ID, label))

            stats["Always right"] = alwaysright
            stats["Always wrong"] = alwayswrong
            # Gather all scores for all patients and average
            pid_unique = list(set(pid_test))
            pid_unique = sorted(pid_unique)
            posteriors = dict()
            for pid in pid_unique:
                posteriors[pid] = list()

                counts = 0
                for num, allid in enumerate(pid_test):
                    if allid == pid:
                        counts += 1
                        posteriors[pid].append(y_score[num][0])
                        truelabel = y_test[num]

                posteriors[pid] = [np.mean(posteriors[pid]), truelabel, counts]
        # elif type(t.estimator) == sklearn.svm.classes.SVR:
        else:
            # Compute confidence intervals from cross validations
            stats = dict()
            stats["r2_score 95%:"] = str(CI(r2score, N_1, N_2, alpha))
            stats["MSE 95%:"] = str(CI(MSE, N_1, N_2, alpha))
            stats["ICC 95%:"] = str(CI(coefICC, N_1, N_2, alpha))
            stats["PearsonC 95%:"] = str(CI(PearsonC, N_1, N_2, alpha))
            stats["SpearmanC 95%: "] = str(CI(SpearmanC, N_1, N_2, alpha))
            stats["PearsonP 95%:"] = str(CI(PearsonP, N_1, N_2, alpha))
            stats["SpearmanP 95%: "] = str(CI(SpearmanP, N_1, N_2, alpha))

            if survival:
                stats["Concordance 95%:"] = str(CI(cindex, N_1, N_2, alpha))
                stats["Cox coef. 95%:"] = str(CI(coxcoef, N_1, N_2, alpha))
                stats["Cox p 95%:"] = str(CI(coxp, N_1, N_2, alpha))

            # Calculate and sort individual patient MSE
            patient_MSE = {k: np.mean(v) for k, v in patient_MSE.iteritems()}
            order = np.argsort(patient_MSE.values())
            sortedkeys = np.asarray(patient_MSE.keys())[order].tolist()
            sortedvalues = np.asarray(patient_MSE.values())[order].tolist()
            patient_MSE = [(k, v) for k, v in zip(sortedkeys, sortedvalues)]

            for p in patient_MSE:
                print p[0], p[1]

            stats["Patient_MSE"] = patient_MSE

            for k, v in stats.iteritems():
                print k, v

        # Check which parameters were most often used
        params = paracheck(params)
        # params = dict()
        # for num, classf in enumerate(trained_classifiers):
        #     params_temp = classf.best_params_
        #     if num == 0:
        #         for k in params_temp.keys():
        #             params[k] = list()
        #             params[k].append(params_temp[k])
        #     else:
        #         for k in params_temp.keys():
        #             params[k].append(params_temp[k])
        #
        # print params

        # # Make histograms or box plots of params
        # for k in params.keys():
        #     para = params[k]
        #     print k
        #     if type(para[0]) is unicode:
        #         letter_counts = Counter(para)
        #         values = letter_counts.values()
        #         keys = letter_counts.keys()
        #         print keys, values
        #         plt.bar(range(len(values)), values, align='center')
        #         plt.xticks(range(len(keys)), keys)
        #         plt.show()
        #     else:
        #         # Make a standard boxplot
        #         plt.figure()
        #         plt.boxplot(para, 0, 'gD')
        #         plt.show()

        # Save output
        savedict = dict()
        savedict["Statistics"] = stats
        savedict['Parameters'] = params

        if type(output_json) is list:
            output_json = ''.join(output_json)

        if not os.path.exists(os.path.dirname(output_json)):
            os.makedirs(os.path.dirname(output_json))

        with open(output_json, 'w') as fp:
            json.dump(savedict, fp, indent=4)

        print("Saved data!")
Code example #2
def trainclassifier(feat_train,
                    patientinfo_train,
                    config,
                    output_hdf,
                    output_json,
                    feat_test=None,
                    patientinfo_test=None,
                    fixedsplits=None,
                    verbose=True):
    '''
    Train a classifier using machine learning from features. By default, if no
    split in training and test is supplied, a cross validation
    will be performed.

    Parameters
    ----------
    feat_train: string, mandatory
            contains the paths to all .hdf5 feature files used, in the format:
            modalityname1=file1,file2,file3,... modalityname2=file1,...
            Thus, modality names are always between a space and an equal
            sign, and files are separated by commas. We assume that the lists
            of files for each modality have the same length. Files at the
            same position in each list should belong to the same patient.

    patientinfo_train: string, mandatory
            Contains the path referring to a .txt file containing the
            patient label(s) and value(s) to be used for learning. See
            the Github Wiki for the format.

    config: string, mandatory
            path referring to a .ini file containing the parameters
            used for feature extraction. See the Github Wiki for the possible
            fields and their description.

    output_hdf: string, mandatory
            path referring to a .hdf5 file to which the final classifier and
            its properties will be written.

    output_json: string, mandatory
            path referring to a .json file to which the performance of the final
            classifier will be written. This file is generated through one of
            the PREDICT plotting functions.

    feat_test: string, optional
            When this argument is supplied, the machine learning will not be
            trained using a cross validation, but rather using a fixed training
            and test split. This field should contain the paths of the test set
            feature files, similar to the feat_train argument.

    patientinfo_test: string, optional
            When feat_test is supplied, you can optionally supply a patient label
            file through which the performance will be evaluated.

    fixedsplits: string, optional
            By default, random split cross validation is used to train and
            evaluate the machine learning methods. Optionally, you can provide
            a .xlsx file containing fixed splits to be used. See the Github Wiki
            for the format.

    verbose: boolean, default True
            Whether to print the final feature values and labels to the command line.

    '''
    # Load variables from the config file
    config = config_io.load_config(config)

    if type(feat_train) is list:
        feat_train = ''.join(feat_train)

    if type(patientinfo_train) is list:
        patientinfo_train = ''.join(patientinfo_train)

    if type(config) is list:
        config = ''.join(config)

    label_type = config['Genetics']['label_names']

    # Split the features per modality
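    # Illustrative example (hypothetical file names): a feat_train string such
    # as 'MR=pat1_MR.hdf5,pat2_MR.hdf5 CT=pat1_CT.hdf5,pat2_CT.hdf5' is parsed
    # below into [['pat1_MR.hdf5', 'pat2_MR.hdf5'],
    #             ['pat1_CT.hdf5', 'pat2_CT.hdf5']],
    # i.e. one list of feature files per modality, aligned per patient.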
    feat_train_temp = [str(item).strip() for item in feat_train.split('=')]
    feat_train_temp = feat_train_temp[
        1::]  # First item is the first modality name
    feat_train = list()
    for feat_mod in feat_train_temp:
        feat_mod_temp = [str(item).strip() for item in feat_mod.split(',')]

        # Last item contains the name of the next modality if multiple, separated by a space
        space = feat_mod_temp[-1].find(' ')
        if space != -1:
            feat_mod_temp[-1] = feat_mod_temp[-1][0:space]
        feat_train.append(feat_mod_temp)

    # Read the features and classification data
    label_data_train, image_features_train =\
        load_data(feat_train, patientinfo_train,
                       label_type)

    if feat_test is not None:
        # Split the features per modality
        feat_test_temp = [str(item).strip() for item in feat_test.split('=')]
        feat_test_temp = feat_test_temp[
            1::]  # First item is the first modality name
        feat_test = list()
        for feat_mod in feat_test_temp:
            feat_mod_temp = [str(item).strip() for item in feat_mod.split(',')]

            # Last item contains the name of the next modality if multiple, separated by a space
            space = feat_mod_temp[-1].find(' ')
            if space != -1:
                feat_mod_temp[-1] = feat_mod_temp[-1][0:space]
            feat_test.append(feat_mod_temp)

        label_data_test, image_features_test =\
            load_data(feat_test, patientinfo_test,
                           label_type)

    # Create tempdir name from patientinfo file name
    basename = os.path.basename(patientinfo_train)
    filename, _ = os.path.splitext(basename)
    path = patientinfo_train
    for i in range(4):
        # Use temp dir: result -> sample# -> parameters -> temppath
        path = os.path.dirname(path)

    _, path = os.path.split(path)
    path = os.path.join(path, 'trainclassifier', filename)

    # Construct the required classifier
    classifier, param_grid =\
        cc.construct_classifier(config)

    # Append the feature groups to the parameter grid
    if config['General']['FeatureCalculator'] == 'CalcFeatures':
        param_grid['SelectGroups'] = 'True'
        for group in config['SelectFeatGroup'].keys():
            param_grid[group] = config['SelectFeatGroup'][group]

    # if config['FeatureSelection']['SelectFromModel']:
    #     param_grid['SelectFromModel'] = ['Lasso', False]

    if config['FeatureScaling']['scale_features']:
        if type(config['FeatureScaling']['scaling_method']) is not list:
            param_grid['FeatureScaling'] = [
                config['FeatureScaling']['scaling_method']
            ]
        else:
            param_grid['FeatureScaling'] = config['FeatureScaling'][
                'scaling_method']

    param_grid['Featsel_Variance'] = config['Featsel']['Variance']

    # For N_iter, perform k-fold crossvalidation
    if feat_test is None:
        trained_classifier = cv.crossval(
            config,
            label_data_train,
            image_features_train,
            classifier,
            param_grid,
            use_fastr=config['Classification']['fastr'],
            fixedsplits=fixedsplits)
    else:
        trained_classifier = cv.nocrossval(config, label_data_train,
                                           label_data_test,
                                           image_features_train,
                                           image_features_test, classifier,
                                           param_grid,
                                           config['Classification']['fastr'])

    if type(output_hdf) is list:
        output_hdf = ''.join(output_hdf)

    if not os.path.exists(os.path.dirname(output_hdf)):
        os.makedirs(os.path.dirname(output_hdf))

    trained_classifier.to_hdf(output_hdf, 'SVMdata')

    # Calculate statistics of performance
    if feat_test is None:
        if type(classifier) == sklearn.svm.SVR:
            statistics = plot_single_SVR(trained_classifier, label_data_train,
                                         label_type)
        else:
            statistics = plot_single_SVM(trained_classifier, label_data_train,
                                         label_type)
    else:
        if patientinfo_test is not None:
            if type(classifier) == sklearn.svm.SVR:
                statistics = plot_single_SVR(trained_classifier,
                                             label_data_test, label_type)
            else:
                statistics = plot_single_SVM(trained_classifier,
                                             label_data_test, label_type)
        else:
            statistics = None

    # Save output
    savedict = dict()
    savedict["Statistics"] = statistics

    if type(output_json) is list:
        output_json = ''.join(output_json)

    if not os.path.exists(os.path.dirname(output_json)):
        os.makedirs(os.path.dirname(output_json))

    with open(output_json, 'w') as fp:
        json.dump(savedict, fp, indent=4)

    print("Saved data!")
Code example #3
def trainclassifier(feat_m1,
                    patientinfo,
                    config,
                    parameter_file,
                    output_svm,
                    output_json,
                    feat_m2=None,
                    feat_m3=None,
                    verbose=True):
    # Load variables from the config file
    config = config_io.load_config(config)
    if type(parameter_file) is list:
        parameter_file = ''.join(parameter_file)

    if type(patientinfo) is list:
        patientinfo = ''.join(patientinfo)

    if type(config) is list:
        config = ''.join(config)

    with open(parameter_file) as data_file:
        parameters = json.load(data_file)

    label_type = config['Genetics']['mutation_type']

    # Read the features and classification data
    image_features_select, labels, label_data =\
        readdata(feat_m1, feat_m2, feat_m3, patientinfo,
                 label_type, parameters)

    # Delete features which are the same in more than 99% of patients
    # TODO: Separate this into a different tool
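    # A VarianceThreshold of p * (1 - p) removes (near-)boolean features whose
    # majority value occurs in more than a fraction p of the samples; with
    # p = 0.99 this drops features that are identical in over 99% of patients.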
    sel = VarianceThreshold(threshold=0.99 * (1 - 0.99))
    sel = sel.fit(image_features_select)
    image_features_select = sel.transform(image_features_select)
    labels = sel.transform(labels).tolist()[0]

    # If we have too few features left, don't proceed
    if len(image_features_select[1]) > 7:

        # Create tempdir name from parameter file name
        basename = os.path.basename(parameter_file)
        filename, _ = os.path.splitext(basename)
        path = parameter_file
        for i in range(4):
            # Use temp dir: result -> sample# -> parameters -> temppath
            path = os.path.dirname(path)

        _, path = os.path.split(path)
        path = os.path.join(path, 'trainclassifier', filename)

        # Construct the required classifier
        classifier, param_grid =\
            cc.construct_classifier(config,
                                    image_features_select[0])

        # For N_iter, perform k-fold crossvalidation
        if config['Classification']['fastr']:
            trained_classifier = cv.crossvalfastr(config, label_data,
                                                  image_features_select,
                                                  classifier, param_grid, path)
        else:
            trained_classifier = cv.crossval(config, label_data,
                                             image_features_select, classifier,
                                             param_grid, path)
        # Add labels to dataframe
        # TODO: Works only if single mutation is present
        labels_pd =\
            pd.Series([labels],
                      index=[trained_classifier.keys()[0]],
                      name='feature_labels')
        classifier = trained_classifier.append(labels_pd)

        # Calculate statistics of performance
        statistics = plot_single_SVM(classifier, label_data)

    else:
        statistics = "None"

        labels = ["Too Few Features."]
        feat = ["None"]

        panda_dict = dict(zip(labels, feat))

        classifier = pd.Series(panda_dict)

    # Save output
    savedict = dict()
    savedict["Parameters"] = parameters
    savedict["Statistics"] = statistics

    print("Saving data!")
    if type(output_svm) is list:
        output_svm = ''.join(output_svm)

    if type(output_json) is list:
        output_json = ''.join(output_json)

    # TODO: output_svm/json are list objects!
    classifier.to_hdf(output_svm, 'SVMdata')
    with open(output_json, 'w') as fp:
        json.dump(savedict, fp, indent=4)
Code example #4
def StatisticalTestFeatures(features,
                            patientinfo,
                            config,
                            output=None,
                            verbose=True):
    '''
    Perform several statistical tests on features, such as a Student's t-test.
    Usage is similar to trainclassifier.

    Parameters
    ----------
    features: string, mandatory
            contains the paths to all .hdf5 feature files used, in the format:
            modalityname1=file1,file2,file3,... modalityname2=file1,...
            Thus, modality names are always between a space and an equal
            sign, and files are separated by commas. We assume that the lists
            of files for each modality have the same length. Files at the
            same position in each list should belong to the same patient.

    patientinfo: string, mandatory
            Contains the path referring to a .txt file containing the
            patient label(s) and value(s) to be used for learning. See
            the Github Wiki for the format.

    config: string, mandatory
            path referring to a .ini file containing the parameters
            used for feature extraction. See the Github Wiki for the possible
            fields and their description.

    # TODO: outputs

    verbose: boolean, default True
            Whether to print the final feature values and labels to the command line.

    '''
    # Load variables from the config file
    config = config_io.load_config(config)

    if type(patientinfo) is list:
        patientinfo = ''.join(patientinfo)

    if type(config) is list:
        config = ''.join(config)

    if type(output) is list:
        output = ''.join(output)

    # Create the output folder if required
    if output is not None and not os.path.exists(os.path.dirname(output)):
        os.makedirs(os.path.dirname(output))

    label_type = config['Genetics']['label_names']

    # Read the features and classification data
    print("Reading features and label data.")
    label_data, image_features =\
        load_features(features, patientinfo, label_type)

    # Extract feature labels and put values in an array
    feature_labels = image_features[0][1]
    feature_values = np.zeros([len(image_features), len(feature_labels)])
    for num, x in enumerate(image_features):
        feature_values[num, :] = x[0]

    # -----------------------------------------------------------------------
    # Perform statistical tests
    print("Performing statistical tests.")
    label_value = label_data['mutation_label']
    label_name = label_data['mutation_name']

    header = list()
    subheader = list()
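    # Each label gets a six-column block in the output CSV: the label name in
    # the header row, then 'Label', the four test p-value columns and a blank
    # separator column in the subheader row.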
    for i_name in label_name:
        header.append(str(i_name[0]))
        header.append('')
        header.append('')
        header.append('')
        header.append('')
        header.append('')

        subheader.append('Label')
        subheader.append('Ttest')
        subheader.append('Welch')
        subheader.append('Wilcoxon')
        subheader.append('Mann-Whitney')
        subheader.append('')

    # Open the output file
    if output is not None:
        myfile = open(output, 'wb')
        wr = csv.writer(myfile, quoting=csv.QUOTE_ALL)
        wr.writerow(header)
        wr.writerow(subheader)

    savedict = dict()
    for i_class, i_name in zip(label_value, label_name):
        savedict[i_name[0]] = dict()
        pvalues = list()
        pvalueswelch = list()
        pvalueswil = list()
        pvaluesmw = list()

        for num, fl in enumerate(feature_labels):
            fv = feature_values[:, num]
            classlabels = i_class.ravel()

            class1 = [i for j, i in enumerate(fv) if classlabels[j] == 1]
            class2 = [i for j, i in enumerate(fv) if classlabels[j] == 0]

            pvalues.append(ttest_ind(class1, class2)[1])
            pvalueswelch.append(ttest_ind(class1, class2, equal_var=False)[1])
            pvalueswil.append(ranksums(class1, class2)[1])
            try:
                pvaluesmw.append(mannwhitneyu(class1, class2)[1])
            except ValueError as e:
                print("[PREDICT Warning] " + str(e) +
                      '. Replacing metric value by 1.')
                pvaluesmw.append(1)

        # Sort based on p-values:
        pvalues = np.asarray(pvalues)
        indices = np.argsort(pvalues)
        pvalues = pvalues[indices].tolist()
        feature_labels_o = np.asarray(feature_labels)[indices].tolist()
        pvalueswelch = np.asarray(pvalueswelch)[indices].tolist()
        pvalueswil = np.asarray(pvalueswil)[indices].tolist()
        pvaluesmw = np.asarray(pvaluesmw)[indices].tolist()

        savedict[i_name[0]]['ttest'] = pvalues
        savedict[i_name[0]]['welch'] = pvalueswelch
        savedict[i_name[0]]['wil'] = pvalueswil
        savedict[i_name[0]]['mw'] = pvaluesmw
        savedict[i_name[0]]['labels'] = feature_labels_o

    if output is not None:
        for num in range(0, len(savedict[i_name[0]]['ttest'])):
            writelist = list()
            for i_name in savedict.keys():
                labeldict = savedict[i_name]
                writelist.append(labeldict['labels'][num])
                writelist.append(labeldict['ttest'][num])
                writelist.append(labeldict['welch'][num])
                writelist.append(labeldict['wil'][num])
                writelist.append(labeldict['mw'][num])
                writelist.append('')

            wr.writerow(writelist)

        print("Saved data!")

    return savedict
Code example #5
def trainclassifier(feat_train, patientinfo_train, config,
                    output_hdf, output_json,
                    feat_test=None, patientinfo_test=None,
                    fixedsplits=None, verbose=True):
    '''
    Train a classifier using machine learning from features. By default, if no
    split in training and test is supplied, a cross validation
    will be performed.

    Parameters
    ----------
    feat_train: string, mandatory
            contains the paths to all .hdf5 feature files used, in the format:
            modalityname1=file1,file2,file3,... modalityname2=file1,...
            Thus, modality names are always between a space and an equal
            sign, and files are separated by commas. We assume that the lists
            of files for each modality have the same length. Files at the
            same position in each list should belong to the same patient.

    patientinfo_train: string, mandatory
            Contains the path referring to a .txt file containing the
            patient label(s) and value(s) to be used for learning. See
            the Github Wiki for the format.

    config: string, mandatory
            path referring to a .ini file containing the parameters
            used for feature extraction. See the Github Wiki for the possible
            fields and their description.

    output_hdf: string, mandatory
            path referring to a .hdf5 file to which the final classifier and
            its properties will be written.

    output_json: string, mandatory
            path referring to a .json file to which the performance of the final
            classifier will be written. This file is generated through one of
            the PREDICT plotting functions.

    feat_test: string, optional
            When this argument is supplied, the machine learning will not be
            trained using a cross validation, but rather using a fixed training
            and test split. This field should contain the paths of the test set
            feature files, similar to the feat_train argument.

    patientinfo_test: string, optional
            When feat_test is supplied, you can optionally supply a patient label
            file through which the performance will be evaluated.

    fixedsplits: string, optional
            By default, random split cross validation is used to train and
            evaluate the machine learning methods. Optionally, you can provide
            a .xlsx file containing fixed splits to be used. See the Github Wiki
            for the format.

    verbose: boolean, default True
            Whether to print the final feature values and labels to the command line.

    '''

    # Convert inputs from lists to strings
    if type(patientinfo_train) is list:
        patientinfo_train = ''.join(patientinfo_train)

    if type(config) is list:
        config = ''.join(config[0])

    if type(output_hdf) is list:
        if len(output_hdf) == 1:
            output_hdf = ''.join(output_hdf)
        else:
            # FIXME
            print('[PREDICT Warning] You provided multiple output hdf files: only the first one will be used!')
            output_hdf = output_hdf[0]

    if type(output_json) is list:
        if len(output_json) == 1:
            output_json = ''.join(output_json)
        else:
            # FIXME
            print('[PREDICT Warning] You provided multiple output json files: only the first one will be used!')
            output_json = output_json[0]

    # Load variables from the config file
    config = config_io.load_config(config)
    label_type = config['Genetics']['label_names']
    print label_type, type(label_type)

    # Load the feature files and match to label data
    label_data_train, image_features_train =\
        load_features(feat_train, patientinfo_train, label_type)

    if feat_test:
        label_data_test, image_features_test =\
            load_features(feat_test, patientinfo_test, label_type)

    # Create tempdir name from patientinfo file name
    basename = os.path.basename(patientinfo_train)
    filename, _ = os.path.splitext(basename)
    path = patientinfo_train
    for i in range(4):
        # Use temp dir: result -> sample# -> parameters -> temppath
        path = os.path.dirname(path)

    _, path = os.path.split(path)
    path = os.path.join(path, 'trainclassifier', filename)

    # Construct the required classifier
    classifier, param_grid =\
        cc.construct_classifier(config, image_features_train)

    # Append the feature groups to the parameter grid
    if config['General']['FeatureCalculator'] == 'CalcFeatures':
        param_grid['SelectGroups'] = 'True'
        for group in config['SelectFeatGroup'].keys():
            param_grid[group] = config['SelectFeatGroup'][group]

    # If scaling is to be applied, add to parameters
    if config['FeatureScaling']['scale_features']:
        if type(config['FeatureScaling']['scaling_method']) is not list:
            param_grid['FeatureScaling'] = [config['FeatureScaling']['scaling_method']]
        else:
            param_grid['FeatureScaling'] = config['FeatureScaling']['scaling_method']

    # Extract hyperparameter grid settings for SearchCV from config
    param_grid['Featsel_Variance'] = config['Featsel']['Variance']

    param_grid['Imputation'] = config['Imputation']['Use']
    param_grid['ImputationMethod'] = config['Imputation']['strategy']
    param_grid['ImputationNeighbours'] = config['Imputation']['n_neighbors']

    param_grid['SelectFromModel'] = config['Featsel']['SelectFromModel']

    param_grid['UsePCA'] = config['Featsel']['UsePCA']
    param_grid['PCAType'] = config['Featsel']['PCAType']

    param_grid['StatisticalTestUse'] =\
        config['Featsel']['StatisticalTestUse']
    param_grid['StatisticalTestMetric'] =\
        config['Featsel']['StatisticalTestMetric']
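    # Assuming uniform here is scipy.stats.uniform, the threshold below is
    # sampled from the interval [loc, loc + scale] during the hyperparameter
    # search.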
    param_grid['StatisticalTestThreshold'] =\
        uniform(loc=config['Featsel']['StatisticalTestThreshold'][0],
                scale=config['Featsel']['StatisticalTestThreshold'][1])

    # For N_iter, perform k-fold crossvalidation
    outputfolder = os.path.dirname(output_hdf)
    if feat_test is None:
        trained_classifier = cv.crossval(config, label_data_train,
                                         image_features_train,
                                         classifier, param_grid,
                                         use_fastr=config['Classification']['fastr'],
                                         fastr_plugin=config['Classification']['fastr_plugin'],
                                         fixedsplits=fixedsplits,
                                         ensemble=config['Ensemble'],
                                         outputfolder=outputfolder,
                                         tempsave=config['General']['tempsave'])
    else:
        trained_classifier = cv.nocrossval(config, label_data_train,
                                           label_data_test,
                                           image_features_train,
                                           image_features_test,
                                           classifier, param_grid,
                                           use_fastr=config['Classification']['fastr'],
                                           fastr_plugin=config['Classification']['fastr_plugin'],
                                           ensemble=config['Ensemble'])

    if not os.path.exists(os.path.dirname(output_hdf)):
        os.makedirs(os.path.dirname(output_hdf))

    trained_classifier.to_hdf(output_hdf, 'SVMdata')

    # Calculate statistics of performance
    if feat_test is None:
        if type(classifier) == sklearn.svm.SVR:
            statistics = plot_single_SVR(trained_classifier, label_data_train,
                                         label_type)
        else:
            statistics = plot_SVM(trained_classifier, label_data_train,
                                  label_type)
    else:
        if patientinfo_test is not None:
            if type(classifier) == sklearn.svm.SVR:
                statistics = plot_single_SVR(trained_classifier,
                                             label_data_test,
                                             label_type)
            else:
                statistics = plot_SVM(trained_classifier,
                                      label_data_test,
                                      label_type)
        else:
            statistics = None

    # Save output
    savedict = dict()
    savedict["Statistics"] = statistics

    if not os.path.exists(os.path.dirname(output_json)):
        os.makedirs(os.path.dirname(output_json))

    with open(output_json, 'w') as fp:
        json.dump(savedict, fp, indent=4)

    print("Saved data!")
Code example #6
def trainclassifier_old(feat_m1,
                        patientinfo,
                        config,
                        output_svm,
                        output_json,
                        feat_m2=None,
                        feat_m3=None,
                        fixedsplits=None,
                        verbose=True):
    # Load variables from the config file
    config = config_io.load_config(config)

    if type(patientinfo) is list:
        patientinfo = ''.join(patientinfo)

    if type(config) is list:
        config = ''.join(config)

    label_type = config['Genetics']['mutation_type']

    # Read the features and classification data
    label_data, image_features =\
        readdata(feat_m1, feat_m2, feat_m3, patientinfo,
                 label_type)

    # Create tempdir name from patientinfo file name
    basename = os.path.basename(patientinfo)
    filename, _ = os.path.splitext(basename)
    path = patientinfo
    for i in range(4):
        # Use temp dir: result -> sample# -> parameters -> temppath
        path = os.path.dirname(path)

    _, path = os.path.split(path)
    path = os.path.join(path, 'trainclassifier', filename)

    # Construct the required classifier
    classifier, param_grid =\
        cc.construct_classifier(config,
                                image_features[0][0])

    # Append the feature groups to the parameter grid
    if config['General']['FeatureCalculator'] == 'CalcFeatures':
        param_grid['SelectGroups'] = 'True'
        for group in config['SelectFeatGroup'].keys():
            param_grid[group] = config['SelectFeatGroup'][group]

    if config['FeatureScaling']['scale_features']:
        if type(config['FeatureScaling']['scaling_method']) is not list:
            param_grid['FeatureScaling'] = [
                config['FeatureScaling']['scaling_method']
            ]
        else:
            param_grid['FeatureScaling'] = config['FeatureScaling'][
                'scaling_method']

    param_grid['Featsel_Variance'] = config['Featsel']['Variance']

    # For N_iter, perform k-fold crossvalidation
    trained_classifier = cv.crossval(
        config,
        label_data,
        image_features,
        classifier,
        param_grid,
        use_fastr=config['Classification']['fastr'],
        fixedsplits=fixedsplits)

    if type(output_svm) is list:
        output_svm = ''.join(output_svm)

    if not os.path.exists(os.path.dirname(output_svm)):
        os.makedirs(os.path.dirname(output_svm))

    trained_classifier.to_hdf(output_svm, 'SVMdata')

    # Calculate statistics of performance
    if type(classifier) == sklearn.svm.SVR:
        statistics = plot_single_SVR(trained_classifier, label_data,
                                     label_type)
    else:
        statistics = plot_single_SVM(trained_classifier, label_data,
                                     label_type)

    # Save output
    savedict = dict()
    savedict["Statistics"] = statistics

    if type(output_json) is list:
        output_json = ''.join(output_json)

    if not os.path.exists(os.path.dirname(output_json)):
        os.makedirs(os.path.dirname(output_json))

    with open(output_json, 'w') as fp:
        json.dump(savedict, fp, indent=4)

    print("Saved data!")