def train_full_model(cfm_dev, cfm_test, best_alphas, exp_dir=None, name=None, label_file=None, target=None,
                     feature_list=None, model_type=None, n_eval_iters=None, eval_prop=None, min_alpha_exp=-4,
                     max_alpha_exp=5, alpha_exp_base=np.sqrt(10), reuse=False, orig_T=0.04, tau=0.01, verbose=1,
                     best_alpha=None, weight_col=-1, additional_label_files=None, additional_label_weights=None,
                     predict_all=False, metric='f1', only_unanimous=True, **kwargs):
    """Tune the regularization strength by cross-validation, estimate dev performance on
    random held-out splits, and train a final model on all documents, writing predictions
    and results to exp_dir."""
    # load the labels
    all_items, target_name, labels, weights, unanimous_pairs = lr.get_labels(label_file, target, weight_col=weight_col)
    n_labels = np.max(labels) + 1
    n_dev_folds = ds.get_n_dev_folds()
    if only_unanimous:
        unanimous_subset = set(unanimous_pairs.keys())
        print len(unanimous_subset)
    else:
        unanimous_subset = None

    # create experiments directory and save the parameters for this experiment
    print exp_dir

    # load the features
    print "Loading features"
    _, feature_names, X = exp2.load_features(feature_list, all_items, verbose=verbose)

    # deal with loading items for pseudo-documents (i.e. partial documents, dependent on annotations)
    if additional_label_files is not None:
        print "Loading features for pseudodocuments"
        extra_items = []
        extra_labels = []
        extra_weights = []
        extra_Xs = []
        full_doc_index = {}
        for fi, f in enumerate(additional_label_files):
            basename = fh.get_basename_wo_ext(f)
            # assume the last part of the label file is a unique type
            parts = basename.split('_')
            pseudotype = parts[-1]
            # get the factor by which to multiply the weights for these items
            weight_factor = additional_label_weights[fi]
            # get the index which maps pseudodocuments to real documents
            doc_index = fh.read_json(fh.make_filename(dirs.data_raw_index_dir, basename + '_index', 'json'))
            # update a global index
            full_doc_index.update(doc_index)
            # get the labels for these pseudodocuments
            f_items, f_target_name, f_labels, f_weights, _ = lr.get_labels(f, target, weight_col=weight_col)
            # add to our overall list of items, labels, and weights (for pseudodocuments)
            extra_items.extend(f_items)
            extra_labels.append(f_labels)
            extra_weights.append(weight_factor * f_weights)
            # create updated feature types for these pseudodocuments
            # (use a separate comprehension variable so the label-file loop variable f is not shadowed)
            pseudo_feature_list = [feat + ',pseudotype=' + pseudotype for feat in feature_list]
            # load these features
            _, _, extra_X = exp2.load_features(pseudo_feature_list, f_items, doc_index=doc_index, verbose=verbose)
            # put them in a list to make one big matrix of pseudo-documents
            extra_Xs.append(extra_X)
        # concatenate the numeric values from each pseudo-document
        extra_labels = np.concatenate(extra_labels)
        extra_weights = np.concatenate(extra_weights)
        extra_X = sparse.vstack(extra_Xs)
    else:
        extra_items = None
        extra_labels = None
        extra_weights = None
        extra_X = None
        full_doc_index = None
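
    # At this point (when additional_label_files is given) the pseudo-documents from all
    # extra label files have been stacked together. Assuming each file shares the same
    # p feature columns and contributes n1, n2, ... pseudo-documents:
    #
    #     extra_X.shape       == (n1 + n2 + ..., p)
    #     extra_labels.shape  == (n1 + n2 + ...,)
    #     extra_weights.shape == (n1 + n2 + ...,)
    #
    # and full_doc_index maps each pseudo-document id back to its source document.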

    n_test_folds = ds.get_n_test_folds()
    models = {}

    alphas = np.linspace(np.min(best_alphas), np.max(best_alphas), 10).tolist()
    f1_sums = np.zeros(len(alphas))
    for test_fold in range(n_test_folds):
        print "Tuning hyperparameters"
        model = SparseModel(model_type=model_type, column_names=feature_names, metric=metric, **kwargs)
        if additional_label_files is not None:
            valid_f1s, best_alpha = model.tune_by_cv(X, labels, all_items, alphas, test_fold, n_dev_folds,
                                                     sample_weights=weights, extra_X=extra_X, extra_y=extra_labels,
                                                     extra_items=extra_items, extra_weights=extra_weights,
                                                     doc_index=full_doc_index, dev_test_subset=unanimous_subset,
                                                     verbose=verbose)
        else:
            valid_f1s, best_alpha = model.tune_by_cv(X, labels, all_items, alphas, test_fold, n_dev_folds,
                                                     sample_weights=weights, dev_test_subset=unanimous_subset,
                                                     verbose=verbose)
        f1_sums += np.mean(valid_f1s.as_matrix(), axis=0)
        print target, '; y_sum = ' + str(labels.sum()) + '; best alpha =', best_alpha, \
            "; valid f1 = ", valid_f1s.mean(axis=0)[str(best_alpha)]

    for a_i, a in enumerate(alphas):
        print a, f1_sums[a_i] / n_test_folds

    best_index = np.argmax(f1_sums)
    best_alpha = alphas[best_index]

    params_list = [name, exp_dir, label_file, target, target_name, feature_list, model_type, min_alpha_exp,
                   max_alpha_exp, alpha_exp_base, weight_col, reuse, orig_T, tau, best_alpha,
                   additional_label_files, additional_label_weights, metric, only_unanimous] + kwargs.values()
    params_names = """name, exp_dir, label_file, target, target_name, feature_list, model_type, min_alpha_exp, max_alpha_exp, alpha_exp_base, weight_col, reuse, orig_T, tau, best_alpha, additional_label_files, additional_label_weights, metric, only_unanimous"""
    for key in kwargs.keys():
        params_names += ', ' + key
    write_log(exp_dir, params_names, params_list)
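
    # Assumption about exp2.get_indices (its implementation is not shown in this file):
    # the hold-out evaluation below only relies on it returning the integer positions of
    # the requested items within all_items, roughly
    #
    #     def get_indices(all_items, subset):
    #         subset = set(subset)
    #         return [i for i, item in enumerate(all_items) if item in subset]
    #
    # so that labels, weights, and predictions indexed by these positions line up with
    # the chosen evaluation items.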

    # use a set number of random splits to evaluate dev performance
    print "Estimating hold-out performance"
    model = SparseModel(model_type=model_type, column_names=feature_names, metric=metric,
                        alpha=float(best_alpha), **kwargs)
    valid_f1s = []
    valid_f1s_unan = []
    masked_f1s = []
    validation_loss_vector = []
    validation_items_list = []
    confusion_matrix_sum = np.zeros([n_labels, n_labels])
    for dev_fold in range(n_eval_iters):
        n_eval = int(round(eval_prop * len(all_items)))
        eval_items = set(np.random.choice(all_items, n_eval, replace=False).tolist())
        train_items = set(all_items) - eval_items
        if only_unanimous:
            unan_eval_items = list(set(eval_items).intersection(unanimous_subset))
            unan_train_items = list(set(train_items).intersection(unanimous_subset))
        else:
            unan_eval_items = eval_items
            unan_train_items = train_items
        eval_indices = exp2.get_indices(all_items, eval_items)
        train_indices = exp2.get_indices(all_items, train_items)
        unan_eval_indices = exp2.get_indices(all_items, unan_eval_items)
        unan_train_indices = exp2.get_indices(all_items, unan_train_items)
        if additional_label_files is not None:
            predictions, probs, all_probs = exp2.train_and_predict(model, train_indices, train_items, X, labels,
                                                                   sample_weights=weights, extra_X=extra_X,
                                                                   extra_y=extra_labels, extra_items=extra_items,
                                                                   extra_weights=extra_weights,
                                                                   doc_index=full_doc_index, verbose=1)
        else:
            predictions, probs, all_probs = exp2.train_and_predict(model, train_indices, train_items, X, labels,
                                                                   sample_weights=weights, verbose=1)
        #validation_loss_vector.extend(get_loss_vector(labels, predictions, weights, eval_indices))
        #validation_items_list.extend([i for index, i in enumerate(all_items) if index in eval_indices])
        train_macro_f1, valid_macro_f1 = exp2.evaluate_predictions(labels, predictions, eval_indices, train_indices,
                                                                   weights=weights, metric=metric)
        train_macro_f1_unan, valid_macro_f1_unan = exp2.evaluate_predictions(labels, predictions, unan_eval_indices,
                                                                             unan_train_indices, weights=weights,
                                                                             metric=metric)
        confusion_matrix = exp2.compute_confution_matrix(labels, weights, predictions, eval_indices, n_labels)
        confusion_matrix_sum += confusion_matrix
        valid_f1s_unan.append(valid_macro_f1_unan)
        valid_f1s.append(valid_macro_f1)
        print("Fold %d: train %s: %f; train unan %s: %f; valid %s: %f; valid unan %s: %f" %
              (dev_fold, metric, train_macro_f1, metric, train_macro_f1_unan, metric,
               valid_macro_f1, metric, valid_macro_f1_unan))
        print confusion_matrix

    print "Mean unan dev f1 =", np.mean(valid_f1s_unan)

    #fh.write_to_json(zip(validation_items_list, validation_loss_vector), fh.make_filename(exp_dir, 'loss_vector', 'json'))
    fh.write_to_json(valid_f1s_unan, fh.make_filename(exp_dir, 'dev_f1s_unan', 'json'))
    fh.write_to_json(valid_f1s, fh.make_filename(exp_dir, 'dev_f1s', 'json'))
    fh.write_to_json(masked_f1s, fh.make_filename(exp_dir, 'dev_f1s_masked', 'json'))

    # finally, train one final model
    print "Training final model"
    #train_dict, valid_dict, test_dict = get_item_dicts(datasets, test_fold, dev_subfold)
    train_items = ds.get_all_documents()
    if only_unanimous:
        unan_train_items = list(set(train_items).intersection(unanimous_subset))
    else:
        unan_train_items = train_items
    train_indices = exp2.get_indices(all_items, train_items)
    unan_train_indices = exp2.get_indices(all_items, unan_train_items)
    if verbose > 0:
        n, p = X.shape
        print ' n_train =', len(train_items), '; n_features =', p
    if additional_label_files is not None:
        predictions, probs, all_probs = exp2.train_and_predict(model, train_indices, train_items, X, labels,
                                                               sample_weights=weights, extra_X=extra_X,
                                                               extra_y=extra_labels, extra_items=extra_items,
                                                               extra_weights=extra_weights, doc_index=full_doc_index,
                                                               verbose=1)
    else:
        predictions, probs, all_probs = exp2.train_and_predict(model, train_indices, train_items, X, labels,
                                                               sample_weights=weights, verbose=1)

    train_f1, test_f1 = exp2.evaluate_predictions(labels, predictions, train_indices, train_indices,
                                                  weights=weights, metric='f1')
    train_acc, test_acc = exp2.evaluate_predictions(labels, predictions, train_indices, train_indices,
                                                    weights=weights, metric='acc')
    unan_train_f1, unan_test_f1 = exp2.evaluate_predictions(labels, predictions, train_indices, unan_train_indices,
                                                            weights=weights, metric='f1')

    #test_loss_vector = get_loss_vector(labels, predictions, weights, test_indices)
    #fh.write_to_json(zip(test_items, test_loss_vector), fh.make_filename(exp_dir, 'loss_vector_test', 'json'))
    evaluation.write_prediction_errors(labels, predictions, train_indices, all_items, weights,
                                       fh.make_filename(exp_dir, 'train_errors', 'csv'))
    #evaluation.write_prediction_errors(labels, predictions, test_indices, all_items, weights,
    #                                   fh.make_filename(exp_dir, 'test_errors', 'csv'))

    output = pd.DataFrame(predictions, index=all_items, columns=[target_name])
    output.to_csv(fh.make_filename(exp_dir, 'predictions', 'csv'))
    output = pd.DataFrame(probs, index=all_items, columns=[target_name])
    output.to_csv(fh.make_filename(exp_dir, 'probs', 'csv'))

    fh.write_to_json({'train_f1': train_f1, 'train_acc': train_acc, 'train_f1_subset': unan_train_f1,
                      'mean_dev_f1_subset': float(np.mean(valid_f1s_unan))},
                     fh.make_filename(exp_dir, 'results', 'json'))

    model.write_to_file(fh.make_filename(exp_dir, 'model', 'json'))
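
    # The predictions written above can be reloaded for inspection with pandas
    # (a usage sketch, not part of this pipeline):
    #
    #     preds = pd.read_csv(fh.make_filename(exp_dir, 'predictions', 'csv'), index_col=0)
    #
    # which yields one row per item in all_items with a single column named after the target.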

    if predict_all:
        data = fh.read_json(dirs.data_processed_text_file)
        all_items = data.keys()
        all_items.sort()
        _, feature_names, X = exp2.load_features(feature_list, None, all_items, stage='test', verbose=verbose)
        predictions = model.predict(X)
        df = pd.DataFrame(predictions, index=all_items, columns=[target_name])
        df.to_csv(fh.make_filename(exp_dir, 'full_predictions', 'csv'))
        if model_type == 'LR':
            probs = model.predict_max_probs(X)
            df = pd.DataFrame(probs, index=all_items, columns=[target_name])
            df.to_csv(fh.make_filename(exp_dir, 'full_predictions_probs', 'csv'))

    if only_unanimous:
        loss = float(-np.mean(valid_f1s_unan))
    else:
        loss = float(-np.mean(valid_f1s))

    mean_confusion_matrix = exp2.normalize_confusion_matrix(confusion_matrix_sum)
    print "dev matrix"
    print cfm_dev
    print "test matrix"
    print cfm_test
    print "new matrix"
    print mean_confusion_matrix
    proportions_eval = {'confusion_matrix': mean_confusion_matrix.tolist()}
    fh.write_to_json(proportions_eval, fh.make_filename(exp_dir, 'proportions', 'json'))

    print("Overall: train f1: %f; mean valid f1 unan: %f; f1 unan: %f;" % (unan_train_f1, -loss, unan_test_f1))
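
# Example call to train_full_model (all values below are hypothetical and only illustrate
# the expected argument types; the feature string format is a guess based on the
# ',pseudotype=' suffix used above):
#
#     train_full_model(cfm_dev, cfm_test, best_alphas,
#                      exp_dir='experiments/example', name='example',
#                      label_file='labels/example.csv', target=1,
#                      feature_list=['ngrams,n=1'], model_type='LR',
#                      n_eval_iters=10, eval_prop=0.1, metric='f1')
#
# cfm_dev and cfm_test are confusion matrices from earlier runs and are only printed for
# comparison; best_alphas is a list of previously tuned regularization strengths whose
# min and max bound the grid searched here.
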

def main():
    """Reload a model trained in old_exp_dir and apply it to sentences from a new
    processed directory, writing predictions to output_dir."""
    usage = "%prog old_project old_exp_dir new_processed_dir old_splits_file output_dir"
    parser = OptionParser(usage=usage)
    parser.add_option('-f', dest='test_fold', default=0,
                      help='Test fold: default=%default')
    parser.add_option('-n', dest='new_name', default=None,
                      help='New name for experiment: default= old name + _rerun')
    parser.add_option('-a', dest='alternate_label_file', default=None,
                      help='Alternate label file: default=%default')
    parser.add_option('--sentences', action="store_true", dest="sentences", default=False,
                      help='Predict at the level of sentences (instead of documents): default=%default')

    (options, args) = parser.parse_args()
    #args = ['mfc_labeled', '/Users/dcard/Projects/CMU/ARK/guac/datasets/mfc_labeled/experiments/test_fold_0/primary/primary_test/', '/Users/dcard/Projects/CMU/ARK/guac/datasets/AILA_CIS/data/processed/', '/Users/dcard/Projects/CMU/ARK/guac/datasets/AILA_CIS/experiments/test_fold_0/blog/primary_test']
    print args
    project = args[0]
    old_exp_dir = args[1]
    new_processed_dir = args[2]
    new_splits_file = args[3]
    output_dir = args[4]

    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    dirs.set_project(project, new_splits_file)
    log_filename = os.path.join(old_exp_dir, 'log.json')
    model_file = os.path.join(old_exp_dir, 'model.json')

    test_fold = int(options.test_fold)
    new_name = options.new_name
    use_sentences = options.sentences
    alternate_label_file = options.alternate_label_file

    log = fh.read_json(log_filename)
    if new_name is None:
        new_name = log['name'] + '_rerun'
    log['name'] = new_name

    float_vars = ['best_alpha', 'alpha_exp_base', 'max_alpha_exp', 'min_alpha_exp', 'orig_T', 'tau']
    for v in float_vars:
        if v in log:
            if log[v] is not None:
                log[v] = float(log[v])
            else:
                log[v] = None

    target_col = int(log['target'])
    weight_col = int(log['weight_col'])
    label_file = log['label_file']

    # load the features
    print "Loading features"
    feature_list = log['feature_list']

    sentences = fh.read_json(os.path.join(new_processed_dir, '..', 'raw', 'text', 'sentences.json'))
    items = sentences.keys()
    print len(items), items[0]
    _, feature_names, X = experiment2.load_features(feature_list, items, stage='predict',
                                                    data_processed_dir=new_processed_dir)
    """
    else:
        basename = os.path.split(fh.get_basename_wo_ext(label_file))[-1]
        index_filename = os.path.join(dirs.data_raw_index_dir, basename + '_sentence_index.json')
        # get the index which maps pseudodocuments to real documents
        doc_index = fh.read_json(index_filename)
        items, target_name, labels, weights, _ = lr.get_labels(alternate_label_file, target_col, weight_col=weight_col)
        # add to our overall list of items, labels, and weights (for pseudodocuments)
        # load these features
        _, feature_names, X = experiment.load_features(feature_list, test_fold, items, doc_index=doc_index,
                                                       stage='testing', data_processed_dir=dirs.data_processed_dir)
    """

    model = SparseModel()
    model.load(model_file)
    predictions = model.predict(X)
    print predictions.shape
    probs = model.predict_probs(X)
    print probs.shape

    df1 = pd.DataFrame(predictions, index=items, columns=['prediction'])
    df2 = pd.DataFrame(probs[:, 1], index=items, columns=['prob'])
    df = pd.concat([df1, df2], axis=1)

    output_file = os.path.join(output_dir, label_file + '.csv')
    df.to_csv(output_file)

    # need to refactor the following to make use of test data for the new data, but that's not even necessarily coherent
    """