def readcrossval(feat_m1, config, sinkfolder, patientinfo, outputfolder,
                 feat_m2=None, feat_m3=None, alpha=0.95, label_type=None,
                 survival=False, n_classifiers=[1, 5, 10]):
    # n_classifiers = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
    #                  20, 25, 30, 40, 50]
    # NOTE: the n_classifiers argument is currently overridden here
    n_classifiers = [1]
    config = config_io.load_config(config)
    sinks = glob.glob(sinkfolder + 'RS*.hdf5')

    # Sort sinks based on creation date
    sinktimes = [os.path.getmtime(f) for f in sinks]
    sinks = [s for _, s in sorted(zip(sinktimes, sinks))]

    if label_type is None:
        label_type = config['Genetics']['mutation_type']

    if survival:
        # Also extract time to event and whether an event occurred
        # from the mutation data
        labels = [label_type, ['E'], ['T']]
    else:
        labels = [[label_type]]

    if feat_m1:
        label_data, _ =\
            readdata(feat_m1, feat_m2, feat_m3, patientinfo, labels)
    else:
        # No feature files found
        label_data, _ = findmutationdata(patientinfo, labels)

    for n_class in n_classifiers:
        output_json = os.path.join(outputfolder,
                                   'performance_{}.json'.format(str(n_class)))

        sensitivity = list()
        specificity = list()
        precision = list()
        accuracy = list()
        auc = list()
        # auc_train = list()
        f1_score_list = list()

        patient_classification_list = dict()

        patient_IDs = label_data['patient_IDs']
        mutation_label = label_data['mutation_label']

        trained_classifiers = list()

        y_score = list()
        y_test = list()
        pid_test = list()
        y_predict = list()

        # For SVR
        r2score = list()
        MSE = list()
        coefICC = list()
        PearsonC = list()
        PearsonP = list()
        SpearmanC = list()
        SpearmanP = list()

        if survival:
            cindex = list()
            coxp = list()
            coxcoef = list()

        patient_MSE = dict()

        csvfile = os.path.join(outputfolder, 'scores.csv')
        towrite = list()
        empty_scores = {k: '' for k in natsort.natsorted(patient_IDs)}
        empty_scores = collections.OrderedDict(sorted(empty_scores.items()))
        towrite.append(["Patient"] + list(empty_scores.keys()))
        params = dict()

        for num, s in enumerate(sinks):
            scores = empty_scores.copy()
            print("Processing {} / {}.".format(str(num + 1), str(len(sinks))))
            with open(s, 'r') as fp:
                sr = pd.read_hdf(fp)
            sr = sr['Constructed crossvalidation']
            t = sr.trained_classifier
            trained_classifiers.append(sr.trained_classifier)

            # Extract test info
            test_patient_IDs = sr.patient_ID_test
            X_test = sr.X_test
            Y_test = sr.Y_test

            # Extract sample sizes
            N_1 = float(len(sr.patient_ID_train))
            N_2 = float(len(sr.patient_ID_test))

            test_indices = list()
            for i_ID in test_patient_IDs:
                test_indices.append(np.where(patient_IDs == i_ID)[0][0])

                if i_ID not in patient_classification_list:
                    patient_classification_list[i_ID] = dict()
                    patient_classification_list[i_ID]['N_test'] = 0
                    patient_classification_list[i_ID]['N_correct'] = 0
                    patient_classification_list[i_ID]['N_wrong'] = 0

                patient_classification_list[i_ID]['N_test'] += 1

            # y_truth = [mutation_label[0][k] for k in test_indices]
            # FIXME: order can be switched, need to find a smart fix
            # 1 for normal, 0 for KM
            y_truth = [mutation_label[0][k][0] for k in test_indices]

            # Predict using the top N classifiers
            results = t.cv_results_['rank_test_score']
            indices = range(0, len(results))
            sortedindices = [x for _, x in sorted(zip(results, indices))]
            sortedindices = sortedindices[0:n_class]
            y_prediction = np.zeros([n_class, len(y_truth)])
            y_score = np.zeros([n_class, len(y_truth)])

            # Get some base objects required
            feature_labels = pd.read_hdf(feat_m1[0]).feature_labels
            base_estimator = t.estimator
            X_train = [(x, feature_labels) for x in sr.X_train]
            y_train = sr.Y_train
            y_train_prediction = np.zeros([n_class, len(y_train)])
            scorer = t.scorer_
            train = np.asarray(range(0, len(y_train)))
            test = train
            del sr  # Save some memory
            # cv_iter = list(t.cv.iter(X_train, y_train))
            # NOTE: need to build this in the SearchCVFastr object

            for i, index in enumerate(sortedindices):
                print("Processing number {} of {} classifiers.".format(
                    str(i + 1), str(n_class)))
                X_testtemp = X_test[:]

                # Get the parameters from the index
                parameters_est = t.cv_results_['params'][index]
                parameters_all = t.cv_results_['params_all'][index]

                # NOTE: kernel parameter can be unicode
                kernel = str(parameters_est[u'kernel'])
                del parameters_est[u'kernel']
                del parameters_all[u'kernel']
                parameters_est['kernel'] = kernel
                parameters_all['kernel'] = kernel

                # Refit a classifier using the settings given
                print("Refitting classifier with best settings.")
                best_estimator = clone(base_estimator).set_params(
                    **parameters_est)

                ret, GroupSel, VarSel, SelectModel, feature_labels[0], scaler =\
                    fit_and_score(best_estimator, X_train, y_train, scorer,
                                  train, test, True, parameters_all,
                                  t.fit_params, t.return_train_score,
                                  True, True, True, t.error_score)

                X = [x[0] for x in X_train]
                if GroupSel is not None:
                    X = GroupSel.transform(X)
                    X_testtemp = GroupSel.transform(X_testtemp)

                if SelectModel is not None:
                    X = SelectModel.transform(X)
                    X_testtemp = SelectModel.transform(X_testtemp)

                if VarSel is not None:
                    X = VarSel.transform(X)
                    X_testtemp = VarSel.transform(X_testtemp)

                if scaler is not None:
                    X = scaler.transform(X)
                    X_testtemp = scaler.transform(X_testtemp)

                if y_train is not None:
                    best_estimator.fit(X, y_train, **t.fit_params)
                else:
                    best_estimator.fit(X, **t.fit_params)

                # Predict the posteriors using the fitted classifier for
                # the training set
                print("Evaluating performance on training set.")
                if hasattr(best_estimator, 'predict_proba'):
                    probabilities = best_estimator.predict_proba(X)
                    y_train_prediction[i, :] = probabilities[:, 1]
                else:
                    # Regression has no probabilities
                    probabilities = best_estimator.predict(X)
                    y_train_prediction[i, :] = probabilities[:]

                # Predict the posteriors using the fitted classifier for
                # the test set
                print("Evaluating performance on test set.")
                if hasattr(best_estimator, 'predict_proba'):
                    probabilities = best_estimator.predict_proba(X_testtemp)
                    y_prediction[i, :] = probabilities[:, 1]
                else:
                    # Regression has no probabilities
                    probabilities = best_estimator.predict(X_testtemp)
                    y_prediction[i, :] = probabilities[:]

                if type(t.estimator) == sklearn.svm.classes.SVC:
                    y_score[i, :] = best_estimator.decision_function(
                        X_testtemp)
                else:
                    y_score[i, :] = best_estimator.decision_function(
                        X_testtemp)[:, 0]

                # Record the parameter settings of this iteration
                for k in parameters_all.keys():
                    if k not in params.keys():
                        params[k] = list()
                    params[k].append(parameters_all[k])

                # Save some memory
                del best_estimator, X, X_testtemp, ret, GroupSel, VarSel, \
                    SelectModel, scaler, parameters_est, parameters_all, \
                    probabilities

            # Take the mean over the posteriors of the top n classifiers
            y_train_prediction_m = np.mean(y_train_prediction, axis=0)
            y_prediction_m = np.mean(y_prediction, axis=0)

            # NOTE: Not sure if this is the best way to compute the AUC
            y_score = y_prediction_m

            if type(t.estimator) == sklearn.svm.classes.SVC:
                # Look for the threshold with optimal F1 score on the
                # training set. NOTE: the loop variable must not be named t,
                # as that would shadow the trained classifier used below.
                thresholds = np.arange(0, 1, 0.01)
                f1_scores = list()
                y_train_prediction = np.zeros(y_train_prediction_m.shape)
                for thresh in thresholds:
                    for ip, y in enumerate(y_train_prediction_m):
                        if y > thresh:
                            y_train_prediction[ip] = 1
                        else:
                            y_train_prediction[ip] = 0

                    f1_scores.append(f1_score(y_train_prediction, y_train,
                                              average='weighted'))

                # Use the best threshold to determine the test score
                best_index = np.argmax(f1_scores)
                best_thresh = thresholds[best_index]
                # NOTE: the optimised threshold is overridden by a fixed 0.5
                best_thresh = 0.5
                y_prediction = np.zeros(y_prediction_m.shape)
                for ip, y in enumerate(y_prediction_m):
                    if y > best_thresh:
                        y_prediction[ip] = 1
                    else:
                        y_prediction[ip] = 0

                # y_prediction = t.predict(X_temp)

                y_prediction = [min(max(y, 0), 1) for y in y_prediction]
            else:
                y_prediction = y_prediction_m
                y_prediction = [min(max(y, 0), 1) for y in y_prediction]

            print("Truth: {}".format(y_truth))
            print("Prediction: {}".format(y_prediction))

            for k, v in zip(test_patient_IDs, y_prediction):
                scores[k] = v

            # for k, v in scores.items():
            #     print k, v
            #
            # raise IOError

            towrite.append(["Iteration " + str(num)] + list(scores.values()))

            if type(t.estimator) == sklearn.svm.classes.SVC:
                for i_truth, i_predict, i_test_ID in zip(y_truth,
                                                         y_prediction,
                                                         test_patient_IDs):
                    if i_truth == i_predict:
                        patient_classification_list[i_test_ID]['N_correct'] += 1
                    else:
                        patient_classification_list[i_test_ID]['N_wrong'] += 1

            if type(t.estimator) == sklearn.svm.classes.SVC:
                c_mat = confusion_matrix(y_truth, y_prediction)
                TN = c_mat[0, 0]
                FN = c_mat[1, 0]
                TP = c_mat[1, 1]
                FP = c_mat[0, 1]

                if FN == 0 and TP == 0:
                    sensitivity.append(0)
                else:
                    sensitivity.append(float(TP) / (TP + FN))

                if FP == 0 and TN == 0:
                    specificity.append(0)
                else:
                    specificity.append(float(TN) / (FP + TN))

                if TP == 0 and FP == 0:
                    precision.append(0)
                else:
                    precision.append(float(TP) / (TP + FP))

                accuracy.append(accuracy_score(y_truth, y_prediction))
                # y_score = t.decision_function(X_temp)
                auc.append(roc_auc_score(y_truth, y_score))
                f1_score_list.append(f1_score(y_truth, y_prediction,
                                              average='weighted'))

            # elif type(t.estimator) == sklearn.svm.classes.SVR:
            else:
                # y_score.extend(svm[k].ix('svms')[0].predict_proba(X_test))
                # y_predict.extend(svm[k].ix('svms')[0].predict(X_test))
                # y_test.extend(Y_test)
                # pid_test.extend(pidt)
                r2score.append(r2_score(y_truth, y_prediction))
                MSE.append(mean_squared_error(y_truth, y_prediction))
                coefICC.append(ICC(np.column_stack((y_prediction, y_truth))))
                C = pearsonr(y_prediction, y_truth)
                PearsonC.append(C[0])
                PearsonP.append(C[1])
                C = spearmanr(y_prediction, y_truth)
                SpearmanC.append(C.correlation)
                SpearmanP.append(C.pvalue)

                if survival:
                    # Extract time to event and event from the label data
                    E_truth = np.asarray([mutation_label[1][k][0]
                                          for k in test_indices])
                    T_truth = np.asarray([mutation_label[2][k][0]
                                          for k in test_indices])

                    # Concordance index
                    cindex.append(1 - ll.utils.concordance_index(
                        T_truth, y_prediction, E_truth))

                    # Fit a Cox model using the SVR output, time to event
                    # and event
                    data = {'predict': y_prediction,
                            'E': E_truth,
                            'T': T_truth}
                    data = pd.DataFrame(data=data, index=test_patient_IDs)

                    try:
                        cph = ll.CoxPHFitter()
                        cph.fit(data, duration_col='T', event_col='E')

                        coxcoef.append(cph.summary['coef']['predict'])
                        coxp.append(cph.summary['p']['predict'])
                    except ValueError:
                        # Convergence halted, delta contains nan values?
                        coxcoef.append(1)
                        coxp.append(0)
                    except np.linalg.LinAlgError:
                        # FIXME: Singular matrix
                        coxcoef.append(1)
                        coxp.append(0)

        towrite = zip(*towrite)
        with open(csvfile, 'wb') as csv_file:
            writer = csv.writer(csv_file)
            for w in towrite:
                writer.writerow(w)

        # print(N_1)
        # print(N_2)

        if type(t.estimator) == sklearn.svm.classes.SVC:
            N_iterations = len(sinks)
            accuracy_mean = np.mean(accuracy)
            # Corrected resampled variance over the cross-validation
            # iterations; the 1/N term is inflated by N_2/N_1 to account
            # for the overlap between resampled training sets
            S_uj = 1.0 / max((N_iterations - 1), 1) * np.sum(
                (accuracy_mean - accuracy)**2.0)
            # print Y_test
            accuracy_var = np.sqrt((1.0 / N_iterations + N_2 / N_1) * S_uj)
            # print(accuracy_var)
            # print(np.sqrt(1 / N_iterations * S_uj))
            # print(st.sem(accuracy))

            stats = dict()
            stats["Accuracy 95%:"] = str(CI(accuracy, N_1, N_2, alpha))
            stats["AUC 95%:"] = str(CI(auc, N_1, N_2, alpha))
            stats["F1-score 95%:"] = str(CI(f1_score_list, N_1, N_2, alpha))
            stats["Precision 95%:"] = str(CI(precision, N_1, N_2, alpha))
            stats["Sensitivity 95%:"] = str(CI(sensitivity, N_1, N_2, alpha))
            stats["Specificity 95%:"] = str(CI(specificity, N_1, N_2, alpha))

            print("Accuracy 95%: " + str(CI(accuracy, N_1, N_2, alpha)))
            print("AUC 95%: " + str(CI(auc, N_1, N_2, alpha)))
            print("F1-score 95%: " + str(CI(f1_score_list, N_1, N_2, alpha)))
            print("Precision 95%: " + str(CI(precision, N_1, N_2, alpha)))
            print("Sensitivity 95%: " + str(CI(sensitivity, N_1, N_2, alpha)))
            print("Specificity 95%: " + str(CI(specificity, N_1, N_2, alpha)))

            alwaysright = dict()
            alwayswrong = dict()
            for i_ID in patient_classification_list:
                percentage_right =\
                    patient_classification_list[i_ID]['N_correct'] /\
                    float(patient_classification_list[i_ID]['N_test'])

                # print(i_ID + ' , ' + str(patient_classification_list[i_ID]['N_test']) + ' : ' + str(percentage_right) + '\n')
                if percentage_right == 1.0:
                    label = mutation_label[0][np.where(i_ID == patient_IDs)]
                    label = label[0][0]
                    alwaysright[i_ID] = label
                    # alwaysright.append('{} ({})'.format(i_ID, label))
                    print("Always Right: {}, label {}".format(i_ID, label))

                if percentage_right == 0:
                    label = mutation_label[0][np.where(
                        i_ID == patient_IDs)].tolist()
                    label = label[0][0]
                    alwayswrong[i_ID] = label
                    # alwayswrong.append('{} ({})'.format(i_ID, label))
                    print("Always Wrong: {}, label {}".format(i_ID, label))

            stats["Always right"] = alwaysright
            stats["Always wrong"] = alwayswrong

            # Gather all scores for all patients and average
            pid_unique = list(set(pid_test))
            pid_unique = sorted(pid_unique)
            posteriors = dict()
            for pid in pid_unique:
                posteriors[pid] = list()

                counts = 0
                for num, allid in enumerate(pid_test):
                    if allid == pid:
                        counts += 1
                        posteriors[pid].append(y_score[num][0])
                        truelabel = y_test[num]

                posteriors[pid] = [np.mean(posteriors[pid]), truelabel,
                                   counts]

        # elif type(t.estimator) == sklearn.svm.classes.SVR:
        else:
            # Compute confidence intervals from the cross-validations
            stats = dict()
            stats["r2_score 95%:"] = str(CI(r2score, N_1, N_2, alpha))
            stats["MSE 95%:"] = str(CI(MSE, N_1, N_2, alpha))
            stats["ICC 95%:"] = str(CI(coefICC, N_1, N_2, alpha))
            stats["PearsonC 95%:"] = str(CI(PearsonC, N_1, N_2, alpha))
            stats["SpearmanC 95%:"] = str(CI(SpearmanC, N_1, N_2, alpha))
            stats["PearsonP 95%:"] = str(CI(PearsonP, N_1, N_2, alpha))
            stats["SpearmanP 95%:"] = str(CI(SpearmanP, N_1, N_2, alpha))

            if survival:
                stats["Concordance 95%:"] = str(CI(cindex, N_1, N_2, alpha))
                stats["Cox coef. 95%:"] = str(CI(coxcoef, N_1, N_2, alpha))
                stats["Cox p 95%:"] = str(CI(coxp, N_1, N_2, alpha))

            # Calculate and sort the individual patient MSEs
            patient_MSE = {k: np.mean(v) for k, v in patient_MSE.items()}
            order = np.argsort(list(patient_MSE.values()))
            sortedkeys = np.asarray(list(patient_MSE.keys()))[order].tolist()
            sortedvalues =\
                np.asarray(list(patient_MSE.values()))[order].tolist()
            patient_MSE = [(k, v) for k, v in zip(sortedkeys, sortedvalues)]

            for p in patient_MSE:
                print("{} {}".format(p[0], p[1]))

            stats["Patient_MSE"] = patient_MSE

        for k, v in stats.items():
            print("{} {}".format(k, v))

        # Check which parameters were most often used
        params = paracheck(params)

        # params = dict()
        # for num, classf in enumerate(trained_classifiers):
        #     params_temp = classf.best_params_
        #     if num == 0:
        #         for k in params_temp.keys():
        #             params[k] = list()
        #             params[k].append(params_temp[k])
        #     else:
        #         for k in params_temp.keys():
        #             params[k].append(params_temp[k])
        # print params
        #
        # # Make histograms or box plots of the parameters
        # for k in params.keys():
        #     para = params[k]
        #     print k
        #     if type(para[0]) is unicode:
        #         letter_counts = Counter(para)
        #         values = letter_counts.values()
        #         keys = letter_counts.keys()
        #         print keys, values
        #         plt.bar(range(len(values)), values, align='center')
        #         plt.xticks(range(len(keys)), keys)
        #         plt.show()
        #     else:
        #         # Make a standard boxplot
        #         plt.figure()
        #         plt.boxplot(para, 0, 'gD')
        #         plt.show()

        # Save output
        savedict = dict()
        savedict["Statistics"] = stats
        savedict['Parameters'] = params

        if type(output_json) is list:
            output_json = ''.join(output_json)

        if not os.path.exists(os.path.dirname(output_json)):
            os.makedirs(os.path.dirname(output_json))

        with open(output_json, 'w') as fp:
            json.dump(savedict, fp, indent=4)

        print("Saved data!")
def trainclassifier(feat_train, patientinfo_train, config,
                    output_hdf, output_json,
                    feat_test=None, patientinfo_test=None,
                    fixedsplits=None, verbose=True):
    '''
    Train a classifier using machine learning from features. By default, if
    no split in training and test is supplied, a cross validation will be
    performed.

    Parameters
    ----------
    feat_train: string, mandatory
        contains the paths to all .hdf5 feature files used.
        modalityname1=file1,file2,file3,... modalityname2=file1,...
        Thus, modality names are always between a space and an equal sign,
        files are split by commas. We assume that the lists of files for
        each modality have the same length. Files on the same position in
        each list should belong to the same patient.

    patientinfo: string, mandatory
        Contains the path referring to a .txt file containing the patient
        label(s) and value(s) to be used for learning. See the Github Wiki
        for the format.

    config: string, mandatory
        path referring to a .ini file containing the parameters used for
        feature extraction. See the Github Wiki for the possible fields and
        their description.

    output_hdf: string, mandatory
        path referring to a .hdf5 file to which the final classifier and its
        properties will be written.

    output_json: string, mandatory
        path referring to a .json file to which the performance of the final
        classifier will be written. This file is generated through one of
        the PREDICT plotting functions.

    feat_test: string, optional
        When this argument is supplied, the machine learning will not be
        trained using a cross validation, but rather using a fixed training
        and test split. This field should contain paths of the test set
        feature files, similar to the feat_train argument.

    patientinfo_test: string, optional
        When feat_test is supplied, you can optionally supply a patient
        label file through which the performance will be evaluated.

    fixedsplits: string, optional
        By default, random split cross validation is used to train and
        evaluate the machine learning methods. Optionally, you can provide
        a .xlsx file containing fixed splits to be used. See the Github Wiki
        for the format.

    verbose: boolean, default True
        print final feature values and labels to command line or not.
    '''
    # Convert list inputs (as passed by fastr) to strings before loading
    # the config; otherwise load_config would receive a list
    if type(feat_train) is list:
        feat_train = ''.join(feat_train)

    if type(patientinfo_train) is list:
        patientinfo_train = ''.join(patientinfo_train)

    if type(config) is list:
        config = ''.join(config)

    # Load variables from the config file
    config = config_io.load_config(config)

    label_type = config['Genetics']['label_names']

    # Split the features per modality
    feat_train_temp = [str(item).strip() for item in feat_train.split('=')]
    # First item is the first modality name
    feat_train_temp = feat_train_temp[1::]
    feat_train = list()
    for feat_mod in feat_train_temp:
        feat_mod_temp = [str(item).strip() for item in feat_mod.split(',')]

        # Last item contains the name of the next modality if there are
        # multiple, separated by a space
        space = feat_mod_temp[-1].find(' ')
        if space != -1:
            feat_mod_temp[-1] = feat_mod_temp[-1][0:space]

        feat_train.append(feat_mod_temp)

    # Read the features and classification data
    label_data_train, image_features_train =\
        load_data(feat_train, patientinfo_train, label_type)

    if feat_test is not None:
        # Split the features per modality
        feat_test_temp = [str(item).strip() for item in feat_test.split('=')]
        # First item is the first modality name
        feat_test_temp = feat_test_temp[1::]
        feat_test = list()
        for feat_mod in feat_test_temp:
            feat_mod_temp = [str(item).strip()
                             for item in feat_mod.split(',')]

            # Last item contains the name of the next modality if there are
            # multiple, separated by a space
            space = feat_mod_temp[-1].find(' ')
            if space != -1:
                feat_mod_temp[-1] = feat_mod_temp[-1][0:space]

            feat_test.append(feat_mod_temp)

        label_data_test, image_features_test =\
            load_data(feat_test, patientinfo_test, label_type)

    # Create tempdir name from patientinfo file name
    basename = os.path.basename(patientinfo_train)
    filename, _ = os.path.splitext(basename)
    path = patientinfo_train
    for i in range(4):
        # Use temp dir: result -> sample# -> parameters -> temppath
        path = os.path.dirname(path)

    _, path = os.path.split(path)
    path = os.path.join(path, 'trainclassifier', filename)

    # Construct the required classifier
    classifier, param_grid =\
        cc.construct_classifier(config)

    # Append the feature groups to the parameter grid
    if config['General']['FeatureCalculator'] == 'CalcFeatures':
        param_grid['SelectGroups'] = 'True'
        for group in config['SelectFeatGroup'].keys():
            param_grid[group] = config['SelectFeatGroup'][group]

    # if config['FeatureSelection']['SelectFromModel']:
    #     param_grid['SelectFromModel'] = ['Lasso', False]

    if config['FeatureScaling']['scale_features']:
        if type(config['FeatureScaling']['scaling_method']) is not list:
            param_grid['FeatureScaling'] =\
                [config['FeatureScaling']['scaling_method']]
        else:
            param_grid['FeatureScaling'] =\
                config['FeatureScaling']['scaling_method']

    param_grid['Featsel_Variance'] = config['Featsel']['Variance']

    # For N_iter, perform k-fold cross-validation
    if feat_test is None:
        trained_classifier = cv.crossval(
            config, label_data_train, image_features_train, classifier,
            param_grid, use_fastr=config['Classification']['fastr'],
            fixedsplits=fixedsplits)
    else:
        trained_classifier = cv.nocrossval(
            config, label_data_train, label_data_test, image_features_train,
            image_features_test, classifier, param_grid,
            config['Classification']['fastr'])

    if type(output_hdf) is list:
        output_hdf = ''.join(output_hdf)

    if not os.path.exists(os.path.dirname(output_hdf)):
        os.makedirs(os.path.dirname(output_hdf))

    trained_classifier.to_hdf(output_hdf, 'SVMdata')

    # Calculate statistics of performance
    if feat_test is None:
        if type(classifier) == sklearn.svm.SVR:
            statistics = plot_single_SVR(trained_classifier,
                                         label_data_train,
                                         label_type)
        else:
            statistics = plot_single_SVM(trained_classifier,
                                         label_data_train,
                                         label_type)
    else:
        if patientinfo_test is not None:
            if type(classifier) == sklearn.svm.SVR:
                statistics = plot_single_SVR(trained_classifier,
                                             label_data_test,
                                             label_type)
            else:
                statistics = plot_single_SVM(trained_classifier,
                                             label_data_test,
                                             label_type)
        else:
            statistics = None

    # Save output
    savedict = dict()
    savedict["Statistics"] = statistics

    if type(output_json) is list:
        output_json = ''.join(output_json)

    if not os.path.exists(os.path.dirname(output_json)):
        os.makedirs(os.path.dirname(output_json))

    with open(output_json, 'w') as fp:
        json.dump(savedict, fp, indent=4)

    print("Saved data!")
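# Hedged sketch of the modality string format documented in trainclassifier
# above: the helper name and file names are hypothetical, but the splitting
# logic mirrors the loop in the function body.
def _parse_modality_string(feat):
    """Split 'mod1=f1,f2 mod2=f3,...' into per-modality file lists."""
    # Drop the first item, which is the first modality name
    parts = [item.strip() for item in feat.split('=')][1:]
    parsed = list()
    for part in parts:
        files = [item.strip() for item in part.split(',')]
        # The last item may carry the next modality name after a space
        space = files[-1].find(' ')
        if space != -1:
            files[-1] = files[-1][:space]
        parsed.append(files)
    return parsed

# _parse_modality_string('MR=p1_MR.hdf5,p2_MR.hdf5 CT=p1_CT.hdf5,p2_CT.hdf5')
# -> [['p1_MR.hdf5', 'p2_MR.hdf5'], ['p1_CT.hdf5', 'p2_CT.hdf5']]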
def trainclassifier(feat_m1, patientinfo, config, parameter_file, output_svm,
                    output_json, feat_m2=None, feat_m3=None, verbose=True):
    # Load variables from the config file
    config = config_io.load_config(config)

    if type(parameter_file) is list:
        parameter_file = ''.join(parameter_file)

    if type(patientinfo) is list:
        patientinfo = ''.join(patientinfo)

    if type(config) is list:
        config = ''.join(config)

    with open(parameter_file) as data_file:
        parameters = json.load(data_file)

    label_type = config['Genetics']['mutation_type']

    # Read the features and classification data
    image_features_select, labels, label_data =\
        readdata(feat_m1, feat_m2, feat_m3, patientinfo,
                 label_type, parameters)

    # Delete features which are the same in more than 99% of patients
    # TODO: Separate this into a different tool
    sel = VarianceThreshold(threshold=0.99 * (1 - 0.99))
    sel = sel.fit(image_features_select)
    image_features_select = sel.transform(image_features_select)
    labels = sel.transform(labels).tolist()[0]

    # If we have too few features left, don't proceed
    if len(image_features_select[1]) > 7:
        # Create tempdir name from parameter file name
        basename = os.path.basename(parameter_file)
        filename, _ = os.path.splitext(basename)
        path = parameter_file
        for i in range(4):
            # Use temp dir: result -> sample# -> parameters -> temppath
            path = os.path.dirname(path)

        _, path = os.path.split(path)
        path = os.path.join(path, 'trainclassifier', filename)

        # Construct the required classifier
        classifier, param_grid =\
            cc.construct_classifier(config, image_features_select[0])

        # For N_iter, perform k-fold cross-validation
        if config['Classification']['fastr']:
            trained_classifier = cv.crossvalfastr(config, label_data,
                                                  image_features_select,
                                                  classifier, param_grid,
                                                  path)
        else:
            trained_classifier = cv.crossval(config, label_data,
                                             image_features_select,
                                             classifier, param_grid, path)

        # Add the feature labels to the trained classifier dataframe
        # TODO: Works only if a single mutation is present
        labels_pd =\
            pd.Series([labels],
                      index=[trained_classifier.keys()[0]],
                      name='feature_labels')
        # NOTE: the labels belong with the trained classifier series, not
        # with the bare estimator
        classifier = trained_classifier.append(labels_pd)

        # Calculate statistics of performance
        statistics = plot_single_SVM(classifier, label_data)
    else:
        statistics = "None"

        labels = ["Too Few Features."]
        feat = ["None"]
        panda_dict = dict(zip(labels, feat))

        classifier = pd.Series(panda_dict)

    # Save output
    savedict = dict()
    savedict["Parameters"] = parameters
    savedict["Statistics"] = statistics

    print("Saving data!")
    if type(output_svm) is list:
        output_svm = ''.join(output_svm)

    if type(output_json) is list:
        output_json = ''.join(output_json)

    # TODO: output_svm/json are list objects!
    classifier.to_hdf(output_svm, 'SVMdata')

    with open(output_json, 'w') as fp:
        json.dump(savedict, fp, indent=4)
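# Hedged sketch of the variance-based filter used in trainclassifier above:
# for near-constant (Bernoulli-like) features, threshold 0.99 * (1 - 0.99)
# drops features taking the same value in more than 99% of patients. The
# demo function and its data are hypothetical.
def _variance_filter_demo():
    import numpy as np
    from sklearn.feature_selection import VarianceThreshold

    X = np.array([[0.0, 1.0, 0.5],
                  [0.0, 1.0, 0.6],
                  [0.0, 0.0, 0.4],
                  [0.0, 1.0, 0.7]])  # first column is constant
    sel = VarianceThreshold(threshold=0.99 * (1 - 0.99))
    return sel.fit_transform(X)  # the constant first column is dropped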
def StatisticalTestFeatures(features, patientinfo, config, output=None,
                            verbose=True):
    '''
    Perform several statistical tests on features, such as a Student's
    t-test. Usage is similar to trainclassifier.

    Parameters
    ----------
    features: string, mandatory
        contains the paths to all .hdf5 feature files used.
        modalityname1=file1,file2,file3,... modalityname2=file1,...
        Thus, modality names are always between a space and an equal sign,
        files are split by commas. We assume that the lists of files for
        each modality have the same length. Files on the same position in
        each list should belong to the same patient.

    patientinfo: string, mandatory
        Contains the path referring to a .txt file containing the patient
        label(s) and value(s) to be used for learning. See the Github Wiki
        for the format.

    config: string, mandatory
        path referring to a .ini file containing the parameters used for
        feature extraction. See the Github Wiki for the possible fields and
        their description.

    # TODO: outputs

    verbose: boolean, default True
        print final feature values and labels to command line or not.
    '''
    # Load variables from the config file
    config = config_io.load_config(config)

    if type(patientinfo) is list:
        patientinfo = ''.join(patientinfo)

    if type(config) is list:
        config = ''.join(config)

    if type(output) is list:
        output = ''.join(output)

    # Create output folder if required
    if not os.path.exists(os.path.dirname(output)):
        os.makedirs(os.path.dirname(output))

    label_type = config['Genetics']['label_names']

    # Read the features and classification data
    print("Reading features and label data.")
    label_data, image_features =\
        load_features(features, patientinfo, label_type)

    # Extract feature labels and put values in an array
    feature_labels = image_features[0][1]
    feature_values = np.zeros([len(image_features), len(feature_labels)])
    for num, x in enumerate(image_features):
        feature_values[num, :] = x[0]

    # -----------------------------------------------------------------------
    # Perform statistical tests
    print("Performing statistical tests.")
    label_value = label_data['mutation_label']
    label_name = label_data['mutation_name']

    header = list()
    subheader = list()
    for i_name in label_name:
        header.append(str(i_name[0]))
        header.extend([''] * 5)

        subheader.append('Label')
        subheader.append('Ttest')
        subheader.append('Welch')
        subheader.append('Wilcoxon')
        subheader.append('Mann-Whitney')
        subheader.append('')

    # Open the output file
    if output is not None:
        myfile = open(output, 'wb')
        wr = csv.writer(myfile, quoting=csv.QUOTE_ALL)
        wr.writerow(header)
        wr.writerow(subheader)

    savedict = dict()
    for i_class, i_name in zip(label_value, label_name):
        savedict[i_name[0]] = dict()
        pvalues = list()
        pvalueswelch = list()
        pvalueswil = list()
        pvaluesmw = list()
        for num, fl in enumerate(feature_labels):
            fv = feature_values[:, num]
            classlabels = i_class.ravel()

            class1 = [i for j, i in enumerate(fv) if classlabels[j] == 1]
            class2 = [i for j, i in enumerate(fv) if classlabels[j] == 0]

            pvalues.append(ttest_ind(class1, class2)[1])
            pvalueswelch.append(ttest_ind(class1, class2,
                                          equal_var=False)[1])
            pvalueswil.append(ranksums(class1, class2)[1])
            try:
                pvaluesmw.append(mannwhitneyu(class1, class2)[1])
            except ValueError as e:
                print("[PREDICT Warning] " + str(e) +
                      '. Replacing metric value by 1.')
                pvaluesmw.append(1)

        # Sort based on the p-values
        pvalues = np.asarray(pvalues)
        indices = np.argsort(pvalues)
        pvalues = pvalues[indices].tolist()
        feature_labels_o = np.asarray(feature_labels)[indices].tolist()
        pvalueswelch = np.asarray(pvalueswelch)[indices].tolist()
        pvalueswil = np.asarray(pvalueswil)[indices].tolist()
        pvaluesmw = np.asarray(pvaluesmw)[indices].tolist()

        savedict[i_name[0]]['ttest'] = pvalues
        savedict[i_name[0]]['welch'] = pvalueswelch
        savedict[i_name[0]]['wil'] = pvalueswil
        savedict[i_name[0]]['mw'] = pvaluesmw
        savedict[i_name[0]]['labels'] = feature_labels_o

    if output is not None:
        for num in range(0, len(savedict[i_name[0]]['ttest'])):
            writelist = list()
            for i_name in savedict.keys():
                labeldict = savedict[i_name]
                writelist.append(labeldict['labels'][num])
                writelist.append(labeldict['ttest'][num])
                writelist.append(labeldict['welch'][num])
                writelist.append(labeldict['wil'][num])
                writelist.append(labeldict['mw'][num])
                writelist.append('')

            wr.writerow(writelist)

    print("Saved data!")

    return savedict
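# Hedged sketch of the per-feature testing in StatisticalTestFeatures above,
# for a single hypothetical feature. mannwhitneyu raises a ValueError when
# all values are identical, which is why the function falls back to a
# p-value of 1. The demo function and its data are illustrative only.
def _stat_tests_demo():
    import numpy as np
    from scipy.stats import ttest_ind, ranksums, mannwhitneyu

    fv = np.array([1.2, 3.4, 2.2, 0.5, 4.1, 3.3])  # hypothetical feature values
    classlabels = np.array([1, 1, 1, 0, 0, 0])     # hypothetical binary labels
    class1 = fv[classlabels == 1]
    class2 = fv[classlabels == 0]
    return {'ttest': ttest_ind(class1, class2)[1],
            'welch': ttest_ind(class1, class2, equal_var=False)[1],
            'wilcoxon': ranksums(class1, class2)[1],
            'mannwhitney': mannwhitneyu(class1, class2)[1]}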
def trainclassifier(feat_train, patientinfo_train, config,
                    output_hdf, output_json,
                    feat_test=None, patientinfo_test=None,
                    fixedsplits=None, verbose=True):
    '''
    Train a classifier using machine learning from features. By default, if
    no split in training and test is supplied, a cross validation will be
    performed.

    Parameters
    ----------
    feat_train: string, mandatory
        contains the paths to all .hdf5 feature files used.
        modalityname1=file1,file2,file3,... modalityname2=file1,...
        Thus, modality names are always between a space and an equal sign,
        files are split by commas. We assume that the lists of files for
        each modality have the same length. Files on the same position in
        each list should belong to the same patient.

    patientinfo: string, mandatory
        Contains the path referring to a .txt file containing the patient
        label(s) and value(s) to be used for learning. See the Github Wiki
        for the format.

    config: string, mandatory
        path referring to a .ini file containing the parameters used for
        feature extraction. See the Github Wiki for the possible fields and
        their description.

    output_hdf: string, mandatory
        path referring to a .hdf5 file to which the final classifier and its
        properties will be written.

    output_json: string, mandatory
        path referring to a .json file to which the performance of the final
        classifier will be written. This file is generated through one of
        the PREDICT plotting functions.

    feat_test: string, optional
        When this argument is supplied, the machine learning will not be
        trained using a cross validation, but rather using a fixed training
        and test split. This field should contain paths of the test set
        feature files, similar to the feat_train argument.

    patientinfo_test: string, optional
        When feat_test is supplied, you can optionally supply a patient
        label file through which the performance will be evaluated.

    fixedsplits: string, optional
        By default, random split cross validation is used to train and
        evaluate the machine learning methods. Optionally, you can provide
        a .xlsx file containing fixed splits to be used. See the Github Wiki
        for the format.

    verbose: boolean, default True
        print final feature values and labels to command line or not.
    '''
    # Convert inputs from lists to strings
    if type(patientinfo_train) is list:
        patientinfo_train = ''.join(patientinfo_train)

    if type(config) is list:
        # Use only the first configuration file
        config = config[0]

    if type(output_hdf) is list:
        if len(output_hdf) == 1:
            output_hdf = ''.join(output_hdf)
        else:
            # FIXME
            print('[PREDICT Warning] You provided multiple output hdf5 files: only the first one will be used!')
            output_hdf = output_hdf[0]

    if type(output_json) is list:
        if len(output_json) == 1:
            output_json = ''.join(output_json)
        else:
            # FIXME
            print('[PREDICT Warning] You provided multiple output json files: only the first one will be used!')
            output_json = output_json[0]

    # Load variables from the config file
    config = config_io.load_config(config)
    label_type = config['Genetics']['label_names']
    print("{} {}".format(label_type, type(label_type)))

    # Load the feature files and match to label data
    label_data_train, image_features_train =\
        load_features(feat_train, patientinfo_train, label_type)

    if feat_test:
        label_data_test, image_features_test =\
            load_features(feat_test, patientinfo_test, label_type)

    # Create tempdir name from patientinfo file name
    basename = os.path.basename(patientinfo_train)
    filename, _ = os.path.splitext(basename)
    path = patientinfo_train
    for i in range(4):
        # Use temp dir: result -> sample# -> parameters -> temppath
        path = os.path.dirname(path)

    _, path = os.path.split(path)
    path = os.path.join(path, 'trainclassifier', filename)

    # Construct the required classifier
    classifier, param_grid =\
        cc.construct_classifier(config, image_features_train)

    # Append the feature groups to the parameter grid
    if config['General']['FeatureCalculator'] == 'CalcFeatures':
        param_grid['SelectGroups'] = 'True'
        for group in config['SelectFeatGroup'].keys():
            param_grid[group] = config['SelectFeatGroup'][group]

    # If scaling is to be applied, add to the parameters
    if config['FeatureScaling']['scale_features']:
        if type(config['FeatureScaling']['scaling_method']) is not list:
            param_grid['FeatureScaling'] =\
                [config['FeatureScaling']['scaling_method']]
        else:
            param_grid['FeatureScaling'] =\
                config['FeatureScaling']['scaling_method']

    # Extract hyperparameter grid settings for SearchCV from the config
    param_grid['Featsel_Variance'] = config['Featsel']['Variance']
    param_grid['Imputation'] = config['Imputation']['Use']
    param_grid['ImputationMethod'] = config['Imputation']['strategy']
    param_grid['ImputationNeighbours'] =\
        config['Imputation']['n_neighbors']
    param_grid['SelectFromModel'] = config['Featsel']['SelectFromModel']
    param_grid['UsePCA'] = config['Featsel']['UsePCA']
    param_grid['PCAType'] = config['Featsel']['PCAType']
    param_grid['StatisticalTestUse'] =\
        config['Featsel']['StatisticalTestUse']
    param_grid['StatisticalTestMetric'] =\
        config['Featsel']['StatisticalTestMetric']
    param_grid['StatisticalTestThreshold'] =\
        uniform(loc=config['Featsel']['StatisticalTestThreshold'][0],
                scale=config['Featsel']['StatisticalTestThreshold'][1])

    # For N_iter, perform k-fold cross-validation
    outputfolder = os.path.dirname(output_hdf)
    if feat_test is None:
        trained_classifier = cv.crossval(
            config, label_data_train, image_features_train, classifier,
            param_grid, use_fastr=config['Classification']['fastr'],
            fastr_plugin=config['Classification']['fastr_plugin'],
            fixedsplits=fixedsplits, ensemble=config['Ensemble'],
            outputfolder=outputfolder,
            tempsave=config['General']['tempsave'])
    else:
        trained_classifier = cv.nocrossval(
            config, label_data_train, label_data_test, image_features_train,
            image_features_test, classifier, param_grid,
            use_fastr=config['Classification']['fastr'],
            fastr_plugin=config['Classification']['fastr_plugin'],
            ensemble=config['Ensemble'])

    if not os.path.exists(os.path.dirname(output_hdf)):
        os.makedirs(os.path.dirname(output_hdf))

    trained_classifier.to_hdf(output_hdf, 'SVMdata')

    # Calculate statistics of performance
    if feat_test is None:
        if type(classifier) == sklearn.svm.SVR:
            statistics = plot_single_SVR(trained_classifier,
                                         label_data_train,
                                         label_type)
        else:
            statistics = plot_SVM(trained_classifier,
                                  label_data_train,
                                  label_type)
    else:
        if patientinfo_test is not None:
            if type(classifier) == sklearn.svm.SVR:
                statistics = plot_single_SVR(trained_classifier,
                                             label_data_test,
                                             label_type)
            else:
                statistics = plot_SVM(trained_classifier,
                                      label_data_test,
                                      label_type)
        else:
            statistics = None

    # Save output
    savedict = dict()
    savedict["Statistics"] = statistics

    if not os.path.exists(os.path.dirname(output_json)):
        os.makedirs(os.path.dirname(output_json))

    with open(output_json, 'w') as fp:
        json.dump(savedict, fp, indent=4)

    print("Saved data!")
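# Hedged sketch of the scipy uniform() entries added to param_grid above:
# uniform(loc, scale) defines the interval [loc, loc + scale], so the two
# StatisticalTestThreshold config values act as lower bound and width from
# which a randomized hyperparameter search can sample. The demo function and
# its values are hypothetical.
def _threshold_distribution_demo():
    from scipy.stats import uniform

    threshold_dist = uniform(loc=0.05, scale=0.90)  # samples from [0.05, 0.95]
    return threshold_dist.rvs(3)  # e.g. three candidate thresholds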
def trainclassifier_old(feat_m1, patientinfo, config, output_svm,
                        output_json, feat_m2=None, feat_m3=None,
                        fixedsplits=None, verbose=True):
    # Convert list inputs (as passed by fastr) to strings before loading
    # the config; otherwise load_config would receive a list
    if type(patientinfo) is list:
        patientinfo = ''.join(patientinfo)

    if type(config) is list:
        config = ''.join(config)

    # Load variables from the config file
    config = config_io.load_config(config)
    label_type = config['Genetics']['mutation_type']

    # Read the features and classification data
    label_data, image_features =\
        readdata(feat_m1, feat_m2, feat_m3, patientinfo, label_type)

    # Create tempdir name from patientinfo file name
    basename = os.path.basename(patientinfo)
    filename, _ = os.path.splitext(basename)
    path = patientinfo
    for i in range(4):
        # Use temp dir: result -> sample# -> parameters -> temppath
        path = os.path.dirname(path)

    _, path = os.path.split(path)
    path = os.path.join(path, 'trainclassifier', filename)

    # Construct the required classifier
    classifier, param_grid =\
        cc.construct_classifier(config, image_features[0][0])

    # Append the feature groups to the parameter grid
    if config['General']['FeatureCalculator'] == 'CalcFeatures':
        param_grid['SelectGroups'] = 'True'
        for group in config['SelectFeatGroup'].keys():
            param_grid[group] = config['SelectFeatGroup'][group]

    if config['FeatureScaling']['scale_features']:
        if type(config['FeatureScaling']['scaling_method']) is not list:
            param_grid['FeatureScaling'] =\
                [config['FeatureScaling']['scaling_method']]
        else:
            param_grid['FeatureScaling'] =\
                config['FeatureScaling']['scaling_method']

    param_grid['Featsel_Variance'] = config['Featsel']['Variance']

    # For N_iter, perform k-fold cross-validation
    trained_classifier = cv.crossval(
        config, label_data, image_features, classifier, param_grid,
        use_fastr=config['Classification']['fastr'],
        fixedsplits=fixedsplits)

    if type(output_svm) is list:
        output_svm = ''.join(output_svm)

    if not os.path.exists(os.path.dirname(output_svm)):
        os.makedirs(os.path.dirname(output_svm))

    trained_classifier.to_hdf(output_svm, 'SVMdata')

    # Calculate statistics of performance
    if type(classifier) == sklearn.svm.SVR:
        statistics = plot_single_SVR(trained_classifier, label_data,
                                     label_type)
    else:
        statistics = plot_single_SVM(trained_classifier, label_data,
                                     label_type)

    # Save output
    savedict = dict()
    savedict["Statistics"] = statistics

    if type(output_json) is list:
        output_json = ''.join(output_json)

    if not os.path.exists(os.path.dirname(output_json)):
        os.makedirs(os.path.dirname(output_json))

    with open(output_json, 'w') as fp:
        json.dump(savedict, fp, indent=4)

    print("Saved data!")
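# Hedged usage sketch for trainclassifier_old; all paths are hypothetical,
# and a valid PREDICT .ini configuration and patient label file must exist
# for this call to run.
# trainclassifier_old(feat_m1=['/data/features_MR.hdf5'],
#                     patientinfo='/data/patientlabels.txt',
#                     config='/data/config.ini',
#                     output_svm='/out/classifier.hdf5',
#                     output_json='/out/performance.json')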