plot_outlier_fraction = None # extra fraction of plot to show (not used if plot_outlier_fraction is None) plot_f_edge = 0.05 # compute and save features, or laod previously computed features if os.path.isfile(feature_file): X = np.loadtxt(feature_file) data_files = None else: print 'Computing features...' X, data_files = features.compute_feature_matrix(data_dir, feature_functions, feature_labels, save_file=feature_file, verbose=True) X, outlier_indices = features.remove_outliers(X, n_sigma=2, verbose=True) #X = features.scale_features(X) # plot features for each segment fig, axs = plt.subplots(n_features-1, n_features-1, figsize=(9,9), sharex='col', sharey='row') plt.subplots_adjust(hspace=0.001, wspace=0.001) for i in range(n_features-1): for j in range(i+1, n_features-1): fig.delaxes(axs[i,j]) for seg_type, color, marker in zip([-1, 0, 1], ['b', 'y', 'r'], ['o', 'x', '+']): # select rows matching a given segment type and remove (hour, type) cols.
def train_model(
    features_files,
    feature_columns,
    classifier,
    model_args,
    outlier_sigma=None,
    scale_features=True,
    submission_file=None,
    save_settings=False,
    plot=False,
    normalize_probs=None,
    n_cv=10,
    f_cv=0.3,
    verbose=False,
):
    """
    Fit a classification model and estimate its AUC by repeated cross-validation.

    The feature matrix is loaded from features_files, optionally stripped of
    outliers and rescaled, and then split n_cv times into training and CV
    subsamples. For each split, class probabilities are predicted for the CV
    subsample and the ROC AUC is recorded.

    Args:
        features_files: iterable of feature file names. It is passed to
            features.load_features and, when a submission is written, each
            name is also used to locate a companion "<name>_data_files.txt"
            list of data files.
        feature_columns: indices of the columns of the feature matrix that
            are used as model inputs.
        classifier: callable used to build the model; it is invoked as
            classifier(**model_args).
        model_args: dict of keyword arguments for classifier.
        outlier_sigma: if not None, outliers are removed with
            features.remove_outliers(X, n_sigma=outlier_sigma).
        scale_features: if true, the matrix is passed through
            features.scale_features.
        submission_file: if not None, probabilities predicted for the rows
            whose type column equals -1 are passed to
            submission.update_submission together with this file name.
        save_settings: if true, selected call arguments and the AUC are
            written to a text file. With a submission file the text is
            appended to "<submission base name>_settings.txt"; otherwise
            "train_model_settings.txt" is overwritten.
        plot: if true, draw per-split and averaged learning curves and ROC
            curves.
        normalize_probs: forwarded unchanged to learning_curve and
            predict_probs.
        n_cv: number of random training/CV splits.
        f_cv: forwarded to cv.cv_split_by_hour as n_pre_hrs (described by the
            original author as the fraction of data in the CV subsample).
        verbose: if true, print progress and AUC values.

    Returns:
        Tuple (model, auc_mean, auc_std): the model object, and the mean and
        standard deviation of the CV AUC over the n_cv splits. Both statistics
        are NaN when n_cv is not positive.
    """
    # Snapshot of the call arguments for save_settings; it has to be taken
    # before any other local name is bound.
    settings = dict(locals())

    # Column of the feature matrix holding the class label. The code below
    # treats 0 and 1 as labelled (training) rows and -1 as test rows.
    type_column = 1
    columns = np.array(feature_columns)

    # read in feature matrix from file(s)
    X = features.load_features(features_files)

    # remove outliers
    if outlier_sigma is not None:
        X, retained_indices = features.remove_outliers(X, n_sigma=outlier_sigma)

    # scale features
    if scale_features:
        X = features.scale_features(X)

    # set up model
    model = classifier(**model_args)

    # set up plot
    if plot:
        fig = plt.figure(figsize=(8, 4))
        fig.set_tight_layout(True)
        ax0 = plt.subplot(121)
        ax1 = plt.subplot(122)

    # Accumulators for the averaged curves. n_learn counts, per learning-curve
    # point, how many splits contributed to the running sums.
    n_learn = np.zeros(10)
    learn_cv_avg = np.zeros(len(n_learn))
    learn_train_avg = np.zeros(len(n_learn))
    fp_rate_avg = np.linspace(0, 1, num=100)
    tp_rate_avg = np.zeros(len(fp_rate_avg))

    # loop over training-CV sample splittings
    auc_values = []
    n_train_last = 0  # training-set size of the most recent split
    for i_cv in range(n_cv):
        cv_indices = cv.cv_split_by_hour(X, n_pre_hrs=f_cv)
        n_train_last = len(cv_indices["train"])
        if verbose:
            print("\nCV iteration %s" % (i_cv + 1))
            print("%s training instances" % len(cv_indices["train"]))
            print("%s CV instances" % len(cv_indices["cv"]))

        # get feature matrices and class arrays for training and CV samples
        train_features_all = X[cv_indices["train"], :]
        cv_features_all = X[cv_indices["cv"], :]
        train_features = train_features_all[:, columns]
        cv_features = cv_features_all[:, columns]
        train_class = train_features_all[:, type_column]
        cv_class = cv_features_all[:, type_column]

        # compute learning curve
        if plot:
            learn_mask, n_train, learn_train, learn_cv = learning_curve(
                model,
                (train_features, train_class),
                (cv_features, cv_class),
                n=len(n_learn),
                normalize_probs=normalize_probs,
            )
            # learn_mask selects the learning-curve points that were
            # actually evaluated for this split.
            if len(learn_mask) > 0:
                n_learn[learn_mask] += 1
                learn_train_avg[learn_mask] += learn_train
                learn_cv_avg[learn_mask] += learn_cv
                ax0.plot(n_train, learn_train, linestyle="-", color=(1, 0.6, 0.6))
                ax0.plot(n_train, learn_cv, linestyle="-", color=(0.7, 0.7, 0.7))

        # predict probabilities
        train_prob, cv_prob = predict_probs(
            model, train_class, train_features, cv_features, normalize_probs
        )
        check_for_nan(train_prob)
        check_for_nan(cv_prob)

        if verbose:
            # Best-effort report: not every classifier exposes coef_. Catch
            # Exception rather than using a bare except so KeyboardInterrupt
            # and SystemExit still propagate.
            try:
                model_coef = model.coef_
            except Exception:
                pass
            else:
                print("Feature coefficients: %s" % (model_coef,))

        # compute AUC
        auc = roc_auc_score(cv_class, cv_prob)
        auc_values.append(auc)
        if verbose:
            print("training AUC = %s" % roc_auc_score(train_class, train_prob))
            print("CV AUC = %s" % auc)

        # plot ROC curve
        if plot:
            fp_rate, tp_rate, _ = roc_curve(cv_class, cv_prob)
            # Resample onto the common grid so curves can be averaged.
            tp_rate_avg += np.interp(fp_rate_avg, fp_rate, tp_rate)
            ax1.plot(fp_rate, tp_rate, linestyle="-", color=(0.7, 0.7, 0.7))

    # compute mean and std. dev. of AUC over CV iterations
    auc_mean = np.mean(auc_values)
    auc_std = np.std(auc_values)
    if verbose:
        print("\nAverage AUC: %s +/- %s" % (auc_mean, auc_std))

    # update submission CSV file
    if submission_file is not None:
        # All labelled rows are used for training; rows of type -1 are scored.
        labelled = (X[:, type_column] == 0) | (X[:, type_column] == 1)
        train_features_all = X[labelled, :]
        train_features = train_features_all[:, columns]
        train_class = train_features_all[:, type_column]
        test_features_all = X[X[:, type_column] == -1, :]
        test_features = test_features_all[:, columns]
        train_prob, test_prob = predict_probs(
            model, train_class, train_features, test_features, normalize_probs
        )
        check_for_nan(train_prob, message="Replacing NaN probabilities with 0.")
        check_for_nan(test_prob, message="Replacing NaN probabilities with 0.")

        # Collect the data file names listed alongside each features file,
        # in the same order as features_files.
        data_lines = []
        for ff in features_files:
            data_list_file = ".".join(ff.split(".")[:-1]) + "_data_files.txt"
            with open(data_list_file, "r") as df:
                data_lines.extend(df.readlines())
        data_files = np.array(data_lines)

        # Keep the file list aligned with the rows that survived outlier removal.
        if outlier_sigma is not None:
            data_files = data_files[retained_indices]

        # NOTE(review): this pairing assumes the files whose name contains
        # "test" appear in the same order as the rows of type -1 -- confirm.
        test_files = [f.strip() for f in data_files if "test" in f]
        submission.update_submission(dict(zip(test_files, test_prob)), submission_file)

    # save settings
    if save_settings:
        if submission_file is not None:
            settings_file = ".".join(submission_file.split(".")[:-1]) + "_settings.txt"
            open_mode = "a"
        else:
            settings_file = "train_model_settings.txt"
            open_mode = "w"
        with open(settings_file, open_mode) as sf:
            for s in [
                "features_files",
                "feature_columns",
                "classifier",
                "model_args",
                "outlier_sigma",
                "scale_features",
                "submission_file",
                "normalize_probs",
            ]:
                if s in settings:
                    sf.write(s + ": " + str(settings[s]) + "\n")
            sf.write("AUC = {0:.2f}+/-{1:.2f}\n\n".format(auc_mean, auc_std))

    # plot average learning curves and ROC curve
    if plot:
        # x positions of the averaged learning curve: equal steps up to the
        # training-set size of the last split.
        n_train_array = n_train_last / float(len(n_learn)) * np.arange(1, len(n_learn) + 1)
        # The small offset avoids division by zero for points no split reached.
        ax0.plot(n_train_array, learn_train_avg / (n_learn + 1.0e-3), "r-", linewidth=3)
        ax0.plot(n_train_array, learn_cv_avg / (n_learn + 1.0e-3), "k-", linewidth=3)
        if n_cv > 0:
            tp_rate_avg /= float(n_cv)
        ax1.plot(fp_rate_avg, tp_rate_avg, "k-", linewidth=3)

        # display plot
        ax0.set_ylim((0.5, 1))
        ax0.set_xlabel("number of training instances")
        ax0.set_ylabel("AUC")
        ax1.plot(np.linspace(0, 1), np.linspace(0, 1), "k:", linewidth=2)
        ax1.set_xlabel("false positive rate")
        ax1.set_ylabel("true positive rate")
        plt.show(block=False)

    return (model, auc_mean, auc_std)