def find_entities(n_files=None, use_lemmas=False):
    parsed_dir = os.path.join(dirs.data_stanford_dir, 'parsed')
    parsed_files = glob.glob(os.path.join(parsed_dir, '*.json'))

    dependencies_file = os.path.join(dirs.data_stanford_dir, 'dependency_tuple_ids.json')
    dependencies = fh.read_json(dependencies_file)

    coref_file = os.path.join(dirs.data_stanford_dir, 'coref_heads.json')
    coref_heads = fh.read_json(coref_file)

    supersense_tags = fh.read_json(os.path.join(dirs.data_amalgram_dir, 'all_tags.json'))

    trees = {}
    clustered_indices = {}

    print "Building trees and finding story elements"
    if n_files is None:
        n_files = len(parsed_files)
    else:
        n_files = int(n_files)

    for f_i, f in enumerate(parsed_files[:n_files]):
        sentences = fh.read_json(f)
        basename = fh.get_basename_wo_ext(f)
        trees[f] = build_tree(sentences, dependencies[basename], coref_heads[basename],
                              supersense_tags[basename], basename, use_lemmas)
        clustered_indices[f] = find_entities_in_article(trees[f])
        if f_i % 1000 == 0 and f_i > 0:
            print f_i

    return trees, clustered_indices
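
# Usage sketch: build trees and cluster entity mentions for only the first 500
# parsed files (the cap of 500 is arbitrary, for illustration only), passing
# use_lemmas through to build_tree as above.
def example_find_entities():
    trees, clustered_indices = find_entities(n_files=500, use_lemmas=True)
    return trees, clustered_indices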
def write_sentences(f):
    output_dir = fh.makedirs(dirs.data_semafor_dir, 'temp')
    index = 0
    sent_index = {}

    responses = fh.read_json(f)
    keys = responses.keys()
    keys.sort()

    #all_items = ds.get_all_documents()
    #unlabeled = list(set(keys) - all_items)
    #print len(unlabeled)

    for k in keys:
        sentence_filename = os.path.join(output_dir, k + '.txt')
        #index_filename = fh.make_filename(output_dir, fh.get_basename(f), 'json')
        with codecs.open(sentence_filename, 'w', encoding='utf-8') as output_file:
            text = responses[k]
            paragraphs = text.split('\n\n')
            paragraphs = [p for p in paragraphs if p != '']
            for p in paragraphs:
                sentences = tokenizer.split_sentences(p)
                for sent in sentences:
                    sent = sent.lstrip()
                    sent = sent.rstrip()
                    if len(sent) > 0:
                        output_file.write(sent + '\n')
def preprocess_for_brown_clustering():
    input_filename = dirs.data_processed_text_file
    articles = fh.read_json(input_filename)
    keys = articles.keys()
    keys.sort()
    items = keys
    print len(items)

    processed_dict = {}
    output_filename = fh.make_filename(dirs.data_processed_brown_dir, 'input', 'txt')
    with codecs.open(output_filename, 'w', encoding='utf-8') as output_file:
        for k in keys:
            text = articles[k]
            tokens = []
            sentences = text.split('\n')
            for s in sentences:
                sent_tokens = tokenizer.split_into_words(s, reattach=False, split_off_quotes=False,
                                                         lemmatize=False, replace_numbers=True)
                tokens = tokens + sent_tokens
            if k in items:
                output_file.write(' '.join(tokens) + '\n')
            processed_dict[k] = tokens

    output_filename = fh.make_filename(dirs.data_processed_brown_dir, 'processed', 'json')
    fh.write_to_json(processed_dict, output_filename)
def split_into_files(input_filename, output_dir):
    data = fh.read_json(input_filename)
    keys = data.keys()
    keys.sort()

    filelist = []
    for key in keys:
        key = key.rstrip('\n')
        line = data[key].rstrip('\n')
        normalized_filename = os.path.join(output_dir, key + '.txt')
        filelist.append(normalized_filename)
        with codecs.open(normalized_filename, 'w', encoding='utf-8') as output_file:
            output_file.write(line)

    filelist_filename = fh.make_filename(output_dir, 'filelist', 'txt')
    fh.write_list_to_text(filelist, filelist_filename)
    return filelist_filename
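
# Usage sketch (hypothetical paths): write one .txt file per key of the input
# JSON into the output directory and keep the path of the file list that
# split_into_files writes and returns.
def example_split_into_files():
    filelist_path = split_into_files('data/articles.json', 'data/articles_split')
    return filelist_path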
def write_tagged_text(parsed_filename, output_filename):
    data = fh.read_json(parsed_filename)
    tagged_text = {}
    for key, sentences in data.items():
        tagged_sentences = []
        for sentence in sentences:
            tagged_tokens = []
            for token in sentence:
                word = token.get('word', '__MISSING__')
                POS = token.get('POS', '__MISSING__')
                lemma = token.get('lemma', '__MISSING__')
                NER = token.get('NER', '__MISSING__')
                #tagged = word + '_' + POS
                tagged = POS + '_POS_'
                tagged_tokens.append(tagged)
            tagged_sentence = ' '.join(tagged_tokens)
            tagged_sentences.append(tagged_sentence)
        tagged_text[fh.get_basename_wo_ext(key)] = ' '.join(tagged_sentences)
    fh.write_to_json(tagged_text, output_filename, sort_keys=False)
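
# Usage sketch (hypothetical paths): the input JSON is assumed to map document
# names to sentences, each a list of token dicts with 'word', 'POS', 'lemma'
# and 'NER' keys, as read above; the output maps each document to one string of
# POS placeholders (note the commented-out word_POS variant in the function).
def example_write_tagged_text():
    write_tagged_text('data/stanford/parsed.json', 'data/stanford/tagged_pos.json')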
def preprocess_for_easysrl():
    input_filename = dirs.data_processed_text_file
    articles = fh.read_json(input_filename)
    keys = articles.keys()
    keys.sort()

    labeled = list(ds.get_all_documents())
    labeled.sort()

    processed_dict = {}
    output_filename = fh.make_filename(dirs.data_easysrl_dir, 'input', 'txt')
    with codecs.open(output_filename, 'w', encoding='utf-8') as output_file:
        count = 0
        for k in labeled:
            output_file.write(k + ' starts here\n')
            text = articles[k]
            paragraphs = text.split('\n\n')
            for p in paragraphs:
                sentences = tokenizer.split_sentences(p.strip())
                for s in sentences:
                    output_file.write(s.strip() + '\n')
def test_over_time(project_dir, subset, config_file, model_type, field, train_start, train_end, test_start, test_end,
                   n_train=None, n_calib=0, penalty='l2', suffix='', loss='log', objective='f1', do_ensemble=True,
                   dh=300, label='label', intercept=True, n_dev_folds=5, average='micro', seed=None,
                   alpha_min=0.01, alpha_max=1000.0, n_alphas=8, sample_labels=False, group_identical=False,
                   annotated_subset=None, nonlinearity='tanh', init_lr=1e-2, min_epochs=2, max_epochs=50,
                   patience=5, tol=1e-4, list_size=1, repeats=1, oracle=False, lower=None, interactive=False,
                   stoplist_file=None, cshift=False, n_cshift=None, do_cfm=True, do_platt=True, dropout=0.0,
                   min_test=None, test_prop=None, verbose=False):
    # Just run a regular model, one per year, training on the past, and save the results
    if seed is not None:
        seed = int(seed)
        np.random.seed(seed)

    log = {
        'project': project_dir,
        'subset': subset,
        'config_file': config_file,
        'model_type': model_type,
        'field': field,
        'train_start': train_start,
        'train_end': train_end,
        'test_start': test_start,
        'test_end': test_end,
        'n_train': n_train,
        'n_calib': n_calib,
        'penalty': penalty,
        'cshift': cshift,
        'n_cshift': n_cshift,
        'suffix': suffix,
        'loss': loss,
        'objective': objective,
        'do_ensemble': do_ensemble,
        'dh': dh,
        'label': label,
        'intercept': intercept,
        'n_dev_folds': n_dev_folds,
        'average': average,
        'seed': seed,
        'alpha_min': alpha_min,
        'alpha_max': alpha_max,
        'n_alphas': n_alphas,
        'sample_labels': sample_labels,
        'group_identical': group_identical,
        'annotated_subset': annotated_subset,
        'nonlinearity': nonlinearity,
        'init_lr': init_lr,
        'min_epochs': min_epochs,
        'max_epochs': max_epochs,
        'patience': patience,
        'tol': tol,
        'interactive': interactive,
        'stoplist_file': stoplist_file,
        'list_size': list_size
    }

    model_basename = make_model_basename(log)

    # save the experiment parameters to a log file
    logfile = os.path.join(dirs.dir_logs(project_dir), model_basename + '.json')
    fh.makedirs(dirs.dir_logs(project_dir))
    fh.write_to_json(log, logfile)

    # load the features specified in the config file
    config = fh.read_json(config_file)
    feature_defs = []
    for f in config['feature_defs']:
        feature_defs.append(features.parse_feature_string(f))

    # load the file that contains metadata about each item
    metadata_file = os.path.join(dirs.dir_subset(project_dir, subset), 'metadata.csv')
    metadata = fh.read_csv_to_df(metadata_file)
    field_vals = list(set(metadata[field].values))
    field_vals.sort()
    print("Splitting data according to %s" % field)
    print("Values:", field_vals)

    print("\nTesting on %s to %s" % (test_start, test_end))

    # first, split into training and non-train data based on the field of interest
    all_items = list(metadata.index)
    test_selector_all = (metadata[field] >= int(test_start)) & (metadata[field] <= int(test_end))
    test_subset_all = metadata[test_selector_all]
    test_items_all = test_subset_all.index.tolist()
    n_test_all = len(test_items_all)

    if min_test is not None:
        if n_test_all < min_test:
            print("Not enough test samples; exiting")
            return

    if train_end is None:
        if train_start is None:
            train_selector_all = metadata[field] < int(test_start)
        else:
            train_selector_all = (metadata[field] < int(test_start)) & (metadata[field] >= train_start)
    else:
        if train_start is None:
            train_selector_all = metadata[field] <= int(train_end)
        else:
            train_selector_all = (metadata[field] <= int(train_end)) & (metadata[field] >= train_start)

    train_subset_all = metadata[train_selector_all]
    train_items_all = list(train_subset_all.index)
    n_train_all = len(train_items_all)

    # only keep the items in the train and test sets
    all_items = train_items_all + test_items_all

    print("Train: %d, Test: %d (labeled and unlabeled)" % (n_train_all, n_test_all))

    # load all labels
    label_dir = dirs.dir_labels(project_dir, subset)
    labels_df = fh.read_csv_to_df(os.path.join(label_dir, label + '.csv'), index_col=0, header=0)
    labels_df = labels_df.loc[all_items]

    # if desired, attempt to learn weights for the training data using techniques for covariate shift
    if cshift:
        print("Training a classifier for covariate shift")
        # start by learning to discriminate train from non-train data
        # label items based on whether they come from train or test
        train_test_labels = np.zeros((len(all_items), 2), dtype=int)
        train_test_labels[:n_train_all, 0] = 1
        train_test_labels[n_train_all:, 1] = 1
        if np.sum(train_test_labels[:, 0]) < np.sum(train_test_labels[:, 1]):
            cshift_pos_label = 0
        else:
            cshift_pos_label = 1
        train_test_labels_df = pd.DataFrame(train_test_labels, index=all_items, columns=[0, 1])

        if n_cshift is not None and len(all_items) >= n_cshift:
            print("Taking a random sample of %d items for reweighting" % n_cshift)
            #np.random.shuffle(all_items)
            cshift_items = np.random.choice(all_items, size=n_cshift, replace=False)
        else:
            print("Using all train items")
            cshift_items = all_items

        print(train_test_labels_df.loc[cshift_items].mean(axis=0))

        # create a cshift model using the same specification as our model below (e.g. LR/MLP, etc.)
        model_name = model_basename + '_' + str(test_start) + '-' + str(test_end) + 'cshift'
        model, dev_f1, dev_acc, dev_cal, dev_cal_overall = train.train_model_with_labels(
            project_dir, model_type, loss, model_name, subset, train_test_labels_df, feature_defs,
            items_to_use=cshift_items, penalty=penalty, alpha_min=alpha_min, alpha_max=alpha_max,
            n_alphas=n_alphas, intercept=intercept, n_dev_folds=n_dev_folds, save_model=True,
            do_ensemble=False, dh=dh, seed=seed, pos_label=cshift_pos_label, verbose=False)
        print("cshift results: %0.4f f1, %0.4f acc" % (dev_f1, dev_acc))

        #X_cshift, features_concat = predict.load_data(project_dir, model_name, subset, items_to_use=all_items)
        X_cshift, features_concat = predict.load_data(project_dir, model_name, subset, items_to_use=all_items)
        cshift_pred_probs = model.predict_probs(X_cshift)
        f_items = features_concat.get_items()
        assert len(f_items) == len(all_items)
        for i in range(len(all_items)):
            assert all_items[i] == f_items[i]

        cshift_pred_probs_df = pd.DataFrame(cshift_pred_probs, index=features_concat.get_items(), columns=range(2))

        # display the min and max probs
        print("Min: %0.6f" % cshift_pred_probs_df[1].values[:n_train_all].min())
        print("Mean: %0.6f" % cshift_pred_probs_df[1].values[:n_train_all].mean())
        print("Max: %0.6f" % cshift_pred_probs_df[1].values[:n_train_all].max())

        # HACK: need to prevent 0s in prob(y=0|x)
        p_train_values = cshift_pred_probs_df[0].values
        threshold = 0.01
        p_train_values[p_train_values < threshold] = threshold
        print("After thresholding")
        print("Min: %0.6f" % p_train_values[:n_train_all].min())
        print("Mean: %0.6f" % p_train_values[:n_train_all].mean())
        print("Max: %0.6f" % p_train_values[:n_train_all].max())

        # use the estimated probability of each item being a training item to compute item weights
        weights = n_train_all / float(n_test_all) * (1.0/p_train_values - 1)
        weights_df_all = pd.DataFrame(weights, index=all_items)

        # print a summary of the weights from just the training items
        print("Min weight: %0.4f" % weights[:n_train_all].min())
        print("Ave weight: %0.4f" % weights[:n_train_all].mean())
        print("Max weight: %0.4f" % weights[:n_train_all].max())

        # print a summary of all weights
        #print("Min weight: %0.4f" % weights.min())
        #print("Ave weight: %0.4f" % weights.mean())
        #print("Max weight: %0.4f" % weights.max())

        # create a data frame with this information
    else:
        weights_df_all = None

    # find the labeled items
    print("Subsetting items with labels")
    label_sums_df = labels_df.sum(axis=1)
    labeled_item_selector = label_sums_df > 0
    labels_df = labels_df[labeled_item_selector]
    n_labeled_items, n_classes = labels_df.shape
    print("%d labeled items" % n_labeled_items)
    labeled_items = set(labels_df.index)

    train_items_labeled = [i for i in train_items_all if i in labeled_items]
    test_items = [i for i in test_items_all if i in labeled_items]
    #n_train = len(train_items)
    n_test = len(test_items)

    for r in range(repeats):
        # set the seed very explicitly here to make sure experiments are comparable
        if seed is not None:
            seed += 1
            np.random.seed(seed)

        print("* Starting repetition %d *" % r)
        model_name = model_basename + '_' + str(test_start) + '-' + str(test_end) + '_' + str(r)

        if n_train is not None and len(train_items_labeled) >= n_train:
            np.random.shuffle(train_items_labeled)
            train_items = np.random.choice(train_items_labeled, size=n_train, replace=False)
        else:
            print("Using all train items")
            train_items = train_items_labeled
        n_train_r = len(train_items)

        # now, choose a calibration set
        if n_calib > 0 and n_test >= n_calib:
            np.random.shuffle(test_items)
            calib_items = np.random.choice(test_items, size=n_calib, replace=False)
        elif n_test < n_calib:
            print("Error: Only %d labeled test instances available" % n_test)
            calib_items = test_items
        else:
            calib_items = []

        if weights_df_all is not None:
            weights_df = weights_df_all[labeled_item_selector]
        else:
            weights_df = None

        print("Labeled train: %d, test: %d" % (n_train_r, n_test))

        # create a data frame to hold a summary of the results
        output_df = pd.DataFrame([], columns=['N', 'training data', 'test data', 'cal', 'estimate', 'MAE',
                                              '95lcl', '95ucl', 'contains_test'])

        test_labels_df = labels_df.loc[test_items]

        # do a fake adjustment of the test label proportions
        if test_prop is not None:
            test_prop = float(test_prop)
            test_label_values = test_labels_df.values
            test_label_props = test_label_values[:, 1] / (test_label_values[:, 1] + test_label_values[:, 0])
            order = list(np.argsort(test_label_props))
            true_prop = np.mean(test_label_props)
            if test_prop < true_prop:
                i = 0
                running = test_label_props[order[i]]
                new_test_items = [test_items[order[i]]]
                i += 1
                while (running / i) <= test_prop:
                    running += test_label_props[order[i]]
                    new_test_items.append(test_items[order[i]])
                    i += 1
                print("Taking %d test_items" % len(new_test_items))
                test_items = new_test_items[:]
            else:
                order.reverse()
                i = 0
                running = test_label_props[order[i]]
                new_test_items = [test_items[order[i]]]
                i += 1
                while (running / i) >= test_prop:
                    running += test_label_props[order[i]]
                    new_test_items.append(test_items[order[i]])
                    i += 1
                print("Taking %d test_items" % len(new_test_items))
                test_items = new_test_items[:]

            test_labels_df = labels_df.loc[test_items]
            test_label_values = test_labels_df.values
            test_label_props = test_label_values[:, 1] / (test_label_values[:, 1] + test_label_values[:, 0])
            print("New props = %0.3f" % np.mean(test_label_props))

        # if instructed, sample labels in proportion to annotations (to simulate having one label per item)
        if sample_labels:
            print("Sampling labels")
            # normalize the labels
            temp = labels_df.values / np.array(labels_df.values.sum(axis=1).reshape((n_labeled_items, 1)), dtype=float)
            samples = np.zeros([n_labeled_items, n_classes], dtype=int)
            for i in range(n_labeled_items):
                index = np.random.choice(np.arange(n_classes), size=1, p=temp[i, :])
                samples[i, index] = 1
            sampled_labels_df = pd.DataFrame(samples, index=labels_df.index, columns=labels_df.columns)
        else:
            sampled_labels_df = labels_df

        train_labels_df = sampled_labels_df.loc[train_items].copy()
        if n_calib > 0:
            calib_labels_df = sampled_labels_df.loc[calib_items].copy()
        else:
            calib_labels_df = None

        # get the true proportion of labels in the test OR non-training data (calibration and test combined)
        target_props, target_estimate, target_std = get_estimate_and_std(test_labels_df, use_n_annotations=True)
        output_df.loc['target'] = [n_test, 'test', 'test', 'n/a', target_estimate, 0,
                                   target_estimate - 2 * target_std, target_estimate + 2 * target_std, np.nan]

        # get the same estimate from training data
        train_props, train_estimate, train_std = get_estimate_and_std(train_labels_df, use_n_annotations=True)
        print("Train props:", train_props, train_estimate)
        train_rmse = np.abs(train_estimate - target_estimate)
        train_contains_test = target_estimate > train_estimate - 2 * train_std and target_estimate < train_estimate + 2 * train_std
        output_df.loc['train'] = [n_train_r, 'train', 'test', 'n/a', train_estimate, train_rmse,
                                  train_estimate - 2 * train_std, train_estimate + 2 * train_std, train_contains_test]

        # get the same estimate from the calibration data
        if n_calib > 0:
            calib_props, calib_estimate, calib_std = get_estimate_and_std(calib_labels_df, use_n_annotations=True)
            # compute the error of this estimate
            calib_rmse = np.abs(calib_estimate - target_estimate)
            calib_contains_test = target_estimate > calib_estimate - 2 * calib_std and target_estimate < calib_estimate + 2 * calib_std
            output_df.loc['calib'] = [n_calib, 'calib', 'test', 'n/a', calib_estimate, calib_rmse,
                                      calib_estimate - 2 * calib_std, calib_estimate + 2 * calib_std, calib_contains_test]
        else:
            calib_estimate = 0.0
            calib_std = 1.0
            output_df.loc['calib'] = [n_calib, 'calib', 'test', 'n/a', np.nan, np.nan, np.nan, np.nan, np.nan]

        if train_estimate > 0.5:
            pos_label = 0
        else:
            pos_label = 1
        print("Using %d as the positive label" % pos_label)

        results_df = pd.DataFrame([], columns=['f1', 'acc', 'mae', 'estimated calibration'])

        # Now train a model on the training data, saving the calibration data for calibration
        if stoplist_file is not None:
            stoplist = fh.read_text(stoplist_file)
            stoplist = {s.strip() for s in stoplist}
            print(stoplist)
        else:
            stoplist = None

        print("Training a LR model")
        model, dev_f1, dev_acc, dev_cal_mae, dev_cal_est = train.train_model_with_labels(
            project_dir, model_type, 'log', model_name, subset, sampled_labels_df, feature_defs,
            weights_df=weights_df, items_to_use=train_items, penalty=penalty, alpha_min=alpha_min,
            alpha_max=alpha_max, n_alphas=n_alphas, intercept=intercept, objective=objective,
            n_dev_folds=n_dev_folds, do_ensemble=do_ensemble, dh=dh, seed=seed, pos_label=pos_label,
            vocab=None, group_identical=group_identical, nonlinearity=nonlinearity, init_lr=init_lr,
            min_epochs=min_epochs, max_epochs=max_epochs, patience=patience, do_cfm=do_cfm,
            do_platt=do_platt, lower=lower, stoplist=stoplist, dropout=dropout, verbose=verbose)
        results_df.loc['cross_val'] = [dev_f1, dev_acc, dev_cal_mae, dev_cal_est]

        X_test, features_concat = predict.load_data(project_dir, model_name, subset, items_to_use=test_items)
        test_predictions = model.predict(X_test)
        test_predictions_df = pd.DataFrame(test_predictions, index=features_concat.get_items(), columns=[label])
        test_pred_probs = model.predict_probs(X_test)
        _, n_labels = test_pred_probs.shape
        test_pred_probs_df = pd.DataFrame(test_pred_probs, index=features_concat.get_items(), columns=range(n_labels))

        f1_test, acc_test = evaluate_predictions.evaluate_predictions(test_labels_df, test_predictions_df,
                                                                      test_pred_probs_df, pos_label=pos_label,
                                                                      average=average)
        true_test_vector = np.argmax(test_labels_df.as_matrix(), axis=1)
        test_cal_est = evaluation.evaluate_calibration_rmse(true_test_vector, test_pred_probs_df.as_matrix(),
                                                            min_bins=1, max_bins=1)

        test_cc_estimate, test_pcc_estimate = model.predict_proportions(X_test)

        test_cc_mae = np.mean(np.abs(test_cc_estimate[1] - target_estimate))
        test_pcc_mae = np.mean(np.abs(test_pcc_estimate[1] - target_estimate))

        results_df.loc['test'] = [f1_test, acc_test, test_pcc_mae, test_cal_est]

        output_df.loc['CC'] = [n_train_r, 'train', 'test', 'n/a', test_cc_estimate[1], test_cc_mae, np.nan, np.nan, np.nan]
        output_df.loc['PCC'] = [n_train_r, 'train', 'test', 'n/a', test_pcc_estimate[1], test_pcc_mae, np.nan, np.nan, np.nan]

        test_acc_estimate_internal, test_acc_ms_estimate_internal = model.predict_proportions(X_test, do_cfm=do_cfm)

        test_acc_rmse_internal = np.abs(test_acc_estimate_internal[1] - target_estimate)
        test_acc_ms_rmse_internal = np.abs(test_acc_ms_estimate_internal[1] - target_estimate)

        output_df.loc['ACC_internal'] = [n_train_r, 'train', 'test', 'n/a', test_acc_estimate_internal[1],
                                         test_acc_rmse_internal, np.nan, np.nan, np.nan]
        output_df.loc['MS_internal'] = [n_train_r, 'train', 'nontrain', 'predicted', test_acc_ms_estimate_internal[1],
                                        test_acc_ms_rmse_internal, np.nan, np.nan, np.nan]

        test_platt1_estimate, test_platt2_estimate = model.predict_proportions(X_test, do_platt=do_platt)

        test_platt1_rmse = np.abs(test_platt1_estimate[1] - target_estimate)
        test_platt2_rmse = np.abs(test_platt2_estimate[1] - target_estimate)

        output_df.loc['PCC_platt1'] = [n_train_r, 'train', 'test', 'n/a', test_platt1_estimate[1],
                                       test_platt1_rmse, np.nan, np.nan, np.nan]
        output_df.loc['PCC_platt2'] = [n_train_r, 'train', 'nontrain', 'predicted', test_platt2_estimate[1],
                                       test_platt2_rmse, np.nan, np.nan, np.nan]

        if n_calib > 0:
            cc_plus_cal_estimate = (test_cc_estimate[1] + calib_estimate) / 2.0
            pcc_plus_cal_estimate = (test_pcc_estimate[1] + calib_estimate) / 2.0
            cc_plus_cal_mae = np.mean(np.abs(cc_plus_cal_estimate - target_estimate))
            pcc_plus_cal_mae = np.mean(np.abs(pcc_plus_cal_estimate - target_estimate))
            #output_df.loc['CC_plus_cal'] = [n_train, 'train', 'test', 'n/a', cc_plus_cal_estimate, cc_plus_cal_mae, np.nan, np.nan, np.nan]
            output_df.loc['PCC_plus_cal'] = [n_train_r, 'train', 'test', 'n/a', pcc_plus_cal_estimate,
                                             pcc_plus_cal_mae, np.nan, np.nan, np.nan]

        results_df.to_csv(os.path.join(dirs.dir_models(project_dir), model_name, 'accuracy.csv'))
        output_df.to_csv(os.path.join(dirs.dir_models(project_dir), model_name, 'results.csv'))
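
# Both experiment drivers in this file call get_estimate_and_std, which is
# presumably defined elsewhere in the project. The sketch below is only a
# hypothetical reconstruction, inferred from how its three return values are
# used above (class proportions, a point estimate for class 1, and a standard
# error that is doubled to form a ~95% interval); the real helper may differ.
# It assumes the module-level numpy import (np) used throughout this file.
def _get_estimate_and_std_sketch(labels_df, use_n_annotations=False):
    counts = labels_df.values.astype(float)       # items x classes annotation counts
    props = counts.sum(axis=0) / counts.sum()     # overall class proportions
    estimate = props[1]                           # proportion of the positive class (class 1)
    n = counts.sum() if use_n_annotations else float(counts.shape[0])
    std = np.sqrt(estimate * (1.0 - estimate) / max(n, 1.0))  # binomial-style standard error
    return props, estimate, std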
def cross_train_and_eval(project_dir, reference_model_dir, subset, field_name, config_file, n_train=100,
                         field_val=None, vocab_file=None, group_identical=False, suffix='', model_type='MLP',
                         loss='log', do_ensemble=True, dh=100, label='label', n_dev_folds=5, repeats=1,
                         verbose=False, average='micro', objective='calibration', seed=None, init_lr=1e-4,
                         min_epochs=2, max_epochs=50, early_stopping=False, tol=1e-4, patience=8):
    n_calib = 0
    model_basename = subset + '_' + label + '_' + field_name + '_' + model_type
    if model_type == 'MLP':
        model_basename += '_' + str(dh)
    model_basename += '_' + str(n_train) + '_' + str(n_calib) + '_' + objective
    model_basename += suffix

    # save the experiment parameters to a log file
    logfile = os.path.join(dirs.dir_logs(project_dir), model_basename + '.json')
    fh.makedirs(dirs.dir_logs(project_dir))
    log = {
        'project': project_dir,
        'subset': subset,
        'field_name': field_name,
        'config_file': config_file,
        'n_calib': n_calib,
        'n_train': n_train,
        'suffix': suffix,
        'model_type': model_type,
        'loss': loss,
        'dh': dh,
        'do_ensemble': do_ensemble,
        'label': label,
        'field_val': field_val,
        'n_dev_folds': n_dev_folds,
        'repeats': repeats,
        'average': average,
        'objective': objective,
    }
    fh.write_to_json(log, logfile)

    # load the features specified in the config file
    config = fh.read_json(config_file)
    feature_defs = []
    for f in config['feature_defs']:
        feature_defs.append(features.parse_feature_string(f))

    # load the file that contains metadata about each item
    metadata_file = os.path.join(dirs.dir_subset(project_dir, subset), 'metadata.csv')
    metadata = fh.read_csv_to_df(metadata_file)
    field_vals = list(set(metadata[field_name].values))
    field_vals.sort()
    print("Splitting data according to :", field_vals)
    print(field_vals)

    if field_val is not None:
        field_vals = [field_val]

    # repeat the following process for each fold of the partition of interest (up to max_folds, if given)
    for v_i, v in enumerate(field_vals):
        print("\nTesting on %s" % v)
        # first, split into training and non-train data based on the field of interest
        train_selector = metadata[field_name] != v
        train_subset = metadata[train_selector]
        train_items = list(train_subset.index)
        n_train_cshift = len(train_items)

        non_train_selector = metadata[field_name] == v
        non_train_subset = metadata[non_train_selector]
        non_train_items = non_train_subset.index.tolist()
        n_non_train_cshift = len(non_train_items)

        print("Train: %d, non-train: %d" % (n_train_cshift, n_non_train_cshift))

        # load all labels
        label_dir = dirs.dir_labels(project_dir, subset)
        labels_df = fh.read_csv_to_df(os.path.join(label_dir, label + '.csv'), index_col=0, header=0)
        n_items, n_classes = labels_df.shape

        weights_df = None

        # add in a stage to eliminate items with no labels?
        print("Subsetting items with labels")
        label_sums_df = labels_df.sum(axis=1)
        labeled_item_selector = label_sums_df > 0
        labels_df = labels_df[labeled_item_selector]
        n_items, n_classes = labels_df.shape
        labeled_items = set(labels_df.index)

        train_items = [i for i in train_items if i in labeled_items]
        non_train_items = [i for i in non_train_items if i in labeled_items]
        n_non_train = len(non_train_items)

        if weights_df is not None:
            weights_df = weights_df[labeled_item_selector]

        print("Starting repeats")
        # repeat the following process multiple times with different random splits of train / calibration / test data
        for r in range(repeats):
            print("* Repetition %d *" % r)

            # next, take a random subset of the training data (and ignore the rest), to simulate fewer annotated items
            if n_train > 0:
                np.random.shuffle(train_items)
                train_items_r = np.random.choice(train_items, size=n_train, replace=False)
            else:
                train_items_r = train_items
            n_train_r = len(train_items_r)

            # create a data frame to hold a summary of the results
            output_df = pd.DataFrame([], columns=['N', 'training data', 'test data', 'cal', 'estimate', 'RMSE',
                                                  '95lcl', '95ucl', 'contains_test'])

            # create a unique name for this model
            model_name = model_basename + '_' + str(v) + '_' + str(r)

            # now, divide the non-train data into a calibration and a test set
            #n_calib = int(calib_prop * n_non_train)
            np.random.shuffle(non_train_items)
            if n_calib > n_non_train:
                n_calib = int(n_non_train / 2)
                print("Warning!!: only %d non-train items; using 1/2 for calibration" % n_non_train)

            calib_items = non_train_items[:n_calib]
            test_items = non_train_items[n_calib:]
            n_test = len(test_items)

            print("Train: %d, calibration: %d, test: %d" % (n_train_r, n_calib, n_test))

            test_labels_df = labels_df.loc[test_items]
            non_train_labels_df = labels_df.loc[non_train_items]
            sampled_labels_df = labels_df

            train_labels_r_df = sampled_labels_df.loc[train_items_r].copy()
            calib_labels_df = sampled_labels_df.loc[calib_items].copy()

            # get the true proportion of labels in the test OR non-training data (calibration and test combined)
            target_props, target_estimate, target_std = get_estimate_and_std(non_train_labels_df)
            output_df.loc['target'] = [n_test, 'nontrain', 'nontrain', 'given', target_estimate, 0,
                                       target_estimate - 2 * target_std, target_estimate + 2 * target_std, np.nan]

            # get the same estimate from training data
            train_props, train_estimate, train_std = get_estimate_and_std(train_labels_r_df)
            # compute the error of this estimate
            train_rmse = np.sqrt((train_estimate - target_estimate)**2)
            train_contains_test = target_estimate > train_estimate - 2 * train_std and target_estimate < train_estimate + 2 * train_std
            output_df.loc['train'] = [n_train_r, 'train', 'train', 'n/a', train_estimate, train_rmse,
                                      np.nan, np.nan, np.nan]

            print("target proportions: (%0.3f, %0.3f); train proportions: %0.3f" %
                  (target_estimate - 2 * target_std, target_estimate + 2 * target_std, train_estimate))

            if train_estimate > 0.5:
                pos_label = 0
            else:
                pos_label = 1
            print("Using %d as the positive label" % pos_label)

            # repeat for labeled calibration data
            if n_calib > 0:
                calib_props, calib_estimate, calib_std = get_estimate_and_std(calib_labels_df)
                calib_rmse = np.sqrt((calib_estimate - target_estimate)**2)
                # check if the test estimate is within 2 standard deviations of the estimate
                calib_contains_test = target_estimate > calib_estimate - 2 * calib_std and calib_estimate < calib_estimate + 2 * calib_std
                output_df.loc['calibration'] = [n_calib, 'calibration', 'nontrain', 'given', calib_estimate, calib_rmse,
                                                calib_estimate - 2 * calib_std, calib_estimate + 2 * calib_std,
                                                calib_contains_test]

                # do a test using the number of annotations rather than the number of items
                calib_props2, calib_estimate2, calib_std2 = get_estimate_and_std(calib_labels_df, use_n_annotations=True)
                calib_rmse2 = np.sqrt((calib_estimate2 - target_estimate)**2)
                calib_contains_test2 = target_estimate > calib_estimate2 - 2 * calib_std2 and calib_estimate < calib_estimate2 + 2 * calib_std2
                output_df.loc['calibration_n_annotations'] = [n_calib, 'calibration', 'nontrain', 'given',
                                                              calib_estimate2, calib_rmse2,
                                                              calib_estimate2 - 2 * calib_std2,
                                                              calib_estimate2 + 2 * calib_std2,
                                                              calib_contains_test2]

            results_df = pd.DataFrame([], columns=['f1', 'acc', 'calibration', 'calib overall'])

            # Now train a model on the training data, saving the calibration data for calibration
            print("Training model on training data only")
            model, dev_f1, dev_acc, dev_cal, dev_cal_overall = train.train_brier_grouped(
                project_dir, model_name, subset, sampled_labels_df, feature_defs, weights_df=weights_df,
                vocab_file=vocab_file, group_identical=group_identical, items_to_use=train_items_r,
                intercept=True, n_dev_folds=n_dev_folds, do_ensemble=do_ensemble, dh=dh, seed=seed,
                pos_label=pos_label, verbose=verbose, init_lr=init_lr, min_epochs=min_epochs,
                max_epochs=max_epochs, early_stopping=early_stopping, tol=tol, patience=patience)
            results_df.loc['cross_val'] = [dev_f1, dev_acc, dev_cal, dev_cal_overall]

            # predict on calibration data
            if n_calib > 0:
                calib_predictions_df, calib_pred_probs_df, calib_pred_proportions = predict.predict(
                    project_dir, model, model_name, subset, label, items_to_use=calib_items,
                    verbose=verbose, force_dense=True)
                calib_cc, calib_pcc, calib_acc, calib_pvc = calib_pred_proportions
                f1_cal, acc_cal = evaluate_predictions.evaluate_predictions(calib_labels_df, calib_predictions_df,
                                                                            calib_pred_probs_df, pos_label=pos_label,
                                                                            average=average, verbose=False)
                true_calib_vector = np.argmax(calib_labels_df.as_matrix(), axis=1)
                calib_cal_rmse = evaluation.evaluate_calibration_rmse(true_calib_vector, calib_pred_probs_df.as_matrix())
                calib_cal_rmse_overall = evaluation.evaluate_calibration_rmse(true_calib_vector,
                                                                              calib_pred_probs_df.as_matrix(),
                                                                              min_bins=1, max_bins=1)
                results_df.loc['calibration'] = [f1_cal, acc_cal, calib_cal_rmse, calib_cal_rmse_overall]

            # predict on test data
            test_predictions_df, test_pred_probs_df, test_pred_proportions = predict.predict(
                project_dir, model, model_name, subset, label, items_to_use=test_items,
                verbose=verbose, force_dense=True)
            f1_test, acc_test = evaluate_predictions.evaluate_predictions(test_labels_df, test_predictions_df,
                                                                          test_pred_probs_df, pos_label=pos_label,
                                                                          average=average)
            true_test_vector = np.argmax(test_labels_df.as_matrix(), axis=1)
            test_cal_rmse = evaluation.evaluate_calibration_rmse(true_test_vector, test_pred_probs_df.as_matrix())
            test_cal_rmse_overall = evaluation.evaluate_calibration_rmse(true_test_vector,
                                                                         test_pred_probs_df.as_matrix(),
                                                                         min_bins=1, max_bins=1)
            results_df.loc['test'] = [f1_test, acc_test, test_cal_rmse, test_cal_rmse_overall]
            test_cc_estimate, test_pcc_estimate, test_acc_estimate_internal, test_pvc_estimate_internal = test_pred_proportions

            # predict on calibration and test data combined
            nontrain_predictions_df, nontrain_pred_probs_df, nontrain_pred_proportions = predict.predict(
                project_dir, model, model_name, subset, label, items_to_use=non_train_items,
                verbose=verbose, force_dense=True)
            nontrain_cc_estimate, nontrain_pcc_estimate, nontrain_acc_estimate_internal, nontrain_pvc_estimate_internal = nontrain_pred_proportions

            if n_calib > 0:
                cc_calib_rmse = np.sqrt((calib_cc[1] - calib_estimate)**2)
                output_df.loc['CC_cal'] = [n_non_train, 'train', 'calibration', 'predicted', calib_cc[1],
                                           cc_calib_rmse, np.nan, np.nan, np.nan]

                pcc_calib_rmse = np.sqrt((calib_pcc[1] - calib_estimate)**2)
                output_df.loc['PCC_cal'] = [n_non_train, 'train', 'calibration', 'predicted', calib_pcc[1],
                                            pcc_calib_rmse, np.nan, np.nan, np.nan]

            cc_rmse = np.sqrt((nontrain_cc_estimate[1] - target_estimate)**2)
            pcc_rmse = np.sqrt((nontrain_pcc_estimate[1] - target_estimate)**2)

            output_df.loc['CC_nontrain'] = [n_non_train, 'train', 'nontrain', 'predicted', nontrain_cc_estimate[1],
                                            cc_rmse, np.nan, np.nan, np.nan]
            output_df.loc['PCC_nontrain'] = [n_non_train, 'train', 'nontrain', 'predicted', nontrain_pcc_estimate[1],
                                             pcc_rmse, np.nan, np.nan, np.nan]

            if n_calib > 0:
                averaged_cc_estimate = (test_cc_estimate[1] * n_test + calib_estimate * n_calib) / float(n_test + n_calib)
                averaged_pcc_estimate = (test_pcc_estimate[1] * n_test + calib_estimate * n_calib) / float(n_test + n_calib)

                averaged_cc_rmse = np.sqrt((averaged_cc_estimate - target_estimate)**2)
                averaged_pcc_rmse = np.sqrt((averaged_pcc_estimate - target_estimate)**2)

                output_df.loc['CC_nontrain_averaged'] = [n_non_train, 'train', 'nontrain', 'given',
                                                         averaged_cc_estimate, averaged_cc_rmse, np.nan, np.nan, np.nan]
                output_df.loc['PCC_nontrain_averaged'] = [n_non_train, 'train', 'nontrain', 'given',
                                                          averaged_pcc_estimate, averaged_pcc_rmse, np.nan, np.nan, np.nan]

            """
            nontrain_acc_rmse_internal = np.sqrt((nontrain_acc_estimate_internal[1] - target_estimate) ** 2)
            nontrain_pvc_rmse_internal = np.sqrt((nontrain_pvc_estimate_internal[1] - target_estimate) ** 2)
            output_df.loc['ACC_internal'] = [n_non_train, 'train', 'nontrain', 'predicted', nontrain_acc_estimate_internal[1], nontrain_acc_rmse_internal, np.nan, np.nan, np.nan]
            output_df.loc['PVC_internal'] = [n_non_train, 'train', 'nontrain', 'predicted', nontrain_pvc_estimate_internal[1], nontrain_pvc_rmse_internal, np.nan, np.nan, np.nan]

            if n_calib > 0:
                averaged_acc_estimate_internal = (test_acc_estimate_internal[1] * n_test + calib_estimate * n_calib) / float(n_test + n_calib)
                averaged_pvc_estimate_internal = (test_pvc_estimate_internal[1] * n_test + calib_estimate * n_calib) / float(n_test + n_calib)
                averaged_acc_rmse_internal = np.sqrt((averaged_acc_estimate_internal - target_estimate) ** 2)
                averaged_pvc_rmse_internal = np.sqrt((averaged_pvc_estimate_internal - target_estimate) ** 2)
                output_df.loc['ACC_internal_averaged'] = [n_non_train, 'train', 'nontrain', 'given', averaged_acc_estimate_internal, averaged_acc_rmse_internal, np.nan, np.nan, np.nan]
                output_df.loc['PVC_internal_averaged'] = [n_non_train, 'train', 'nontrain', 'given', averaged_pvc_estimate_internal, averaged_pvc_rmse_internal, np.nan, np.nan, np.nan]

            # do calibration here using calibration data
            if n_calib > 0:
                # expand the data so as to only have singly-labeled, weighted items
                _, calib_labels, calib_weights, calib_predictions = train.prepare_data(np.zeros([n_calib, 2]), calib_labels_df.values, predictions=calib_predictions_df.values)
                #calib_labels_expanded, calib_weights_expanded, calib_predictions_expanded = expand_labels(calib_labels.values, calib_predictions.values)
                acc = calibration.compute_acc(calib_labels, calib_predictions, n_classes, weights=calib_weights)
                acc_corrected = calibration.apply_acc_binary(nontrain_predictions_df.values, acc)
                acc_estimate = acc_corrected[1]
                acc_rmse = np.sqrt((acc_estimate - target_estimate) ** 2)
                output_df.loc['ACC'] = [n_non_train, 'train', 'nontrain', 'predicted', acc_estimate, acc_rmse, np.nan, np.nan, np.nan]

                pvc = calibration.compute_pvc(calib_labels, calib_predictions, n_classes, weights=calib_weights)
                pvc_corrected = calibration.apply_pvc(nontrain_predictions_df.values, pvc)
                pvc_estimate = pvc_corrected[1]
                pvc_rmse = np.sqrt((pvc_estimate - target_estimate) ** 2)
                output_df.loc['PVC'] = [n_non_train, 'train', 'nontrain', 'predicted', pvc_estimate, pvc_rmse, np.nan, np.nan, np.nan]

                acc_corrected = calibration.apply_acc_binary(test_predictions_df.values, acc)
                acc_estimate = acc_corrected[1]
                averaged_acc_estimate = (acc_estimate * n_test + calib_estimate * n_calib) / float(n_test + n_calib)
                averaged_acc_rmse = np.sqrt((acc_estimate - target_estimate) ** 2)
                output_df.loc['ACC_averaged'] = [n_non_train, 'train', 'nontrain', 'given', averaged_acc_estimate, averaged_acc_rmse, np.nan, np.nan, np.nan]

                pvc_corrected = calibration.apply_pvc(test_predictions_df.values, pvc)
                pvc_estimate = pvc_corrected[1]
                averaged_pvc_estimate = (pvc_estimate * n_test + calib_estimate * n_calib) / float(n_test + n_calib)
                averaged_pvc_rmse = np.sqrt((pvc_estimate - target_estimate) ** 2)
                output_df.loc['PVC_averaged'] = [n_non_train, 'train', 'nontrain', 'given', averaged_pvc_estimate, averaged_pvc_rmse, np.nan, np.nan, np.nan]

            print("Venn internal nontrain")
            #models = list(model._models.values())
            nontrain_pred_ranges_internal, nontrain_preds_internal = ivap.estimate_probs_from_labels_internal(project_dir, model, model_name, subset, non_train_items)

            pred_range = np.mean(nontrain_pred_ranges_internal, axis=0)
            venn_estimate = np.mean(nontrain_preds_internal)

            venn_rmse = np.sqrt((venn_estimate - target_estimate)**2)
            venn_contains_test = pred_range[0] < target_estimate < pred_range[1]

            output_df.loc['Venn_internal'] = [n_non_train, 'train', 'nontrain', 'predicted', venn_estimate, venn_rmse, pred_range[0], pred_range[1], venn_contains_test]

            if n_calib > 0:
                print("Venn internal test")
                test_pred_ranges_internal, test_preds_internal = ivap.estimate_probs_from_labels_internal(project_dir, model, model_name, subset, test_items)

                pred_range = np.mean(test_pred_ranges_internal, axis=0)
                venn_estimate = (np.mean(test_preds_internal) * n_test + calib_estimate * n_calib) / float(n_test + n_calib)
                venn_rmse = np.sqrt((venn_estimate - target_estimate)**2)
                averaged_lower = (pred_range[0] * n_test + (calib_estimate - 2 * calib_std) * n_calib) / float(n_test + n_calib)
                averaged_upper = (pred_range[1] * n_test + (calib_estimate + 2 * calib_std) * n_calib) / float(n_test + n_calib)
                venn_contains_test = averaged_lower < target_estimate < averaged_upper
                output_df.loc['Venn_internal_averaged'] = [n_non_train, 'train', 'nontrain', 'given', venn_estimate, venn_rmse, averaged_lower, averaged_upper, venn_contains_test]

            # Venn prediction using proper calibration data
            print("Venn calibration")
            calib_pred_ranges, calib_preds, calib_props_in_range, list_of_n_levels = ivap.estimate_probs_from_labels_cv(project_dir, model, model_name, sampled_labels_df, subset, calib_items=calib_items)
            print("Venn test")
            test_pred_ranges, test_preds = ivap.estimate_probs_from_labels(project_dir, model, model_name, sampled_labels_df, subset, subset, calib_items=calib_items, test_items=test_items)

            nontrain_pred_ranges = np.vstack([calib_pred_ranges, test_pred_ranges])
            nontrain_preds = np.r_[calib_preds, test_preds]

            nontrain_pred_range = np.mean(nontrain_pred_ranges, axis=0)
            nontrain_venn_estimate = np.mean(nontrain_preds)
            nontrain_venn_rmse = np.sqrt((nontrain_venn_estimate - target_estimate)**2)
            nontrain_contains_test = nontrain_pred_range[0] < target_estimate < nontrain_pred_range[1]
            output_df.loc['Venn'] = [n_non_train, 'train', 'nontrain', 'predicted', nontrain_venn_estimate, nontrain_venn_rmse, nontrain_pred_range[0], nontrain_pred_range[1], nontrain_contains_test]

            test_pred_range = np.mean(test_pred_ranges, axis=0)
            averaged_venn_estimate = (np.mean(test_preds) * n_test + calib_estimate * n_calib) / float(n_test + n_calib)
            averaged_venn_rmse = np.sqrt((averaged_venn_estimate - target_estimate)**2)
            averaged_lower = (test_pred_range[0] * n_test + (calib_estimate - 2 * calib_std) * n_calib) / float(n_test + n_calib)
            averaged_upper = (test_pred_range[1] * n_test + (calib_estimate + 2 * calib_std) * n_calib) / float(n_test + n_calib)
            venn_contains_test = averaged_lower < target_estimate < averaged_upper
            output_df.loc['Venn_averaged'] = [n_non_train, 'train', 'nontrain', 'given', averaged_venn_estimate, averaged_venn_rmse, averaged_lower, averaged_upper, venn_contains_test]

            fh.write_list_to_text(calib_props_in_range, os.path.join(dirs.dir_models(project_dir), model_name, 'venn_calib_props_in_range.csv'))
            fh.write_list_to_text(list_of_n_levels, os.path.join(dirs.dir_models(project_dir), model_name, 'list_of_n_levels.csv'))

            results_df.to_csv(os.path.join(dirs.dir_models(project_dir), model_name, 'accuracy.csv'))

            # now train a model on the training and calibration data combined
            if run_all:
                print("Training model on all labeled data")
                calib_and_train_items_r = np.array(list(calib_items) + list(train_items_r))
                model, dev_f1, dev_acc, dev_cal, dev_cal_overall = train.train_model_with_labels(project_dir, model_type, loss, model_name, subset, sampled_labels_df, feature_defs, weights_df=weights_df, items_to_use=calib_and_train_items_r, penalty=penalty, alpha_min=alpha_min, alpha_max=alpha_max, intercept=intercept, objective=objective, n_dev_folds=n_dev_folds, do_ensemble=do_ensemble, dh=dh, seed=seed, pos_label=pos_label, verbose=verbose)
                results_df.loc['cross_val_all'] = [dev_f1, dev_acc, dev_cal, dev_cal_overall]

                # get labels for test data
                test_predictions_df, test_pred_probs_df, test_pred_proportions = predict.predict(project_dir, model, model_name, subset, label, items_to_use=test_items, verbose=verbose)
                f1_test, acc_test = evaluate_predictions.evaluate_predictions(test_labels_df, test_predictions_df, test_pred_probs_df, pos_label=pos_label, average=average)
                test_cc_estimate, test_pcc_estimate, test_acc_estimate_internal, test_pvc_estimate_internal = test_pred_proportions
                true_test_vector = np.argmax(test_labels_df.as_matrix(), axis=1)
                test_cal_rmse = evaluation.evaluate_calibration_rmse(true_test_vector, test_pred_probs_df.as_matrix())
                results_df.loc['test'] = [f1_test, acc_test, test_cal_rmse, 0]
                results_df.loc['test_all'] = [f1_test, acc_test, test_cal_rmse, 0]

                nontrain_predictions_df, nontrain_pred_probs_df, nontrain_pred_proportions = predict.predict(project_dir, model, model_name, subset, label, items_to_use=non_train_items, verbose=verbose)
                nontrain_cc_estimate, nontrain_pcc_estimate, nontrain_acc_estimate_internal, nontrain_pvc_estimate_internal = nontrain_pred_proportions
                cc_rmse = np.sqrt((nontrain_cc_estimate[1] - target_estimate)**2)
                pcc_rmse = np.sqrt((nontrain_pcc_estimate[1] - target_estimate)**2)

                output_df.loc['CC_nontrain_all'] = [n_non_train, 'nontest', 'nontrain', 'predicted', nontrain_cc_estimate[1], cc_rmse, np.nan, np.nan, np.nan]
                output_df.loc['PCC_nontrain_all'] = [n_non_train, 'nontest', 'nontrain', 'predicted', nontrain_pcc_estimate[1], pcc_rmse, np.nan, np.nan, np.nan]

                if n_calib > 0:
                    averaged_cc_estimate = (test_cc_estimate[1] * n_test + calib_estimate * n_calib) / float(n_test + n_calib)
                    averaged_pcc_estimate = (test_pcc_estimate[1] * n_test + calib_estimate * n_calib) / float(n_test + n_calib)

                    averaged_cc_rmse = np.sqrt((averaged_cc_estimate - target_estimate)**2)
                    averaged_pcc_rmse = np.sqrt((averaged_pcc_estimate - target_estimate)**2)

                    output_df.loc['CC_nontrain_averaged_all'] = [n_non_train, 'nontest', 'nontrain', 'given', averaged_cc_estimate, averaged_cc_rmse, np.nan, np.nan, np.nan]
                    output_df.loc['PCC_nontrain_averaged_all'] = [n_non_train, 'nontest', 'nontrain', 'given', averaged_pcc_estimate, averaged_pcc_rmse, np.nan, np.nan, np.nan]

                nontrain_acc_rmse_internal = np.sqrt((nontrain_acc_estimate_internal[1] - target_estimate) ** 2)
                nontrain_pvc_rmse_internal = np.sqrt((nontrain_pvc_estimate_internal[1] - target_estimate) ** 2)
                output_df.loc['ACC_internal_all'] = [n_non_train, 'nontest', 'nontrain', 'predicted', nontrain_acc_estimate_internal[1], nontrain_acc_rmse_internal, np.nan, np.nan, np.nan]
                output_df.loc['PVC_internal_all'] = [n_non_train, 'nontest', 'nontrain', 'predicted', nontrain_pvc_estimate_internal[1], nontrain_pvc_rmse_internal, np.nan, np.nan, np.nan]

                if n_calib > 0:
                    averaged_acc_estimate_internal = (test_acc_estimate_internal[1] * n_test + calib_estimate * n_calib) / float(n_test + n_calib)
                    averaged_pvc_estimate_internal = (test_pvc_estimate_internal[1] * n_test + calib_estimate * n_calib) / float(n_test + n_calib)
                    averaged_acc_rmse_internal = np.sqrt((averaged_acc_estimate_internal - target_estimate) ** 2)
                    averaged_pvc_rmse_internal = np.sqrt((averaged_pvc_estimate_internal - target_estimate) ** 2)
                    output_df.loc['ACC_internal_averaged_all'] = [n_non_train, 'nontest', 'nontrain', 'given', averaged_acc_estimate_internal, averaged_acc_rmse_internal, np.nan, np.nan, np.nan]
                    output_df.loc['PVC_internal_averaged_all'] = [n_non_train, 'nontest', 'nontrain', 'given', averaged_pvc_estimate_internal, averaged_pvc_rmse_internal, np.nan, np.nan, np.nan]

                print("Venn internal nontrain")
                nontrain_pred_ranges_internal, nontrain_preds_internal = ivap.estimate_probs_from_labels_internal(project_dir, model, model_name, subset, non_train_items)

                pred_range = np.mean(nontrain_pred_ranges_internal, axis=0)
                venn_estimate = np.mean(nontrain_preds_internal)
                venn_rmse = np.sqrt((venn_estimate - target_estimate)**2)
                venn_contains_test = pred_range[0] < target_estimate < pred_range[1]
                output_df.loc['Venn_internal_all'] = [n_non_train, 'nontest', 'nontrain', 'predicted', venn_estimate, venn_rmse, pred_range[0], pred_range[1], venn_contains_test]

                if n_calib > 0:
                    print("Venn internal test")
                    test_pred_ranges_internal, test_preds_internal = ivap.estimate_probs_from_labels_internal(project_dir, model, model_name, subset, test_items)

                    pred_range = np.mean(test_pred_ranges_internal, axis=0)
                    venn_estimate = (np.mean(test_preds_internal) * n_test + calib_estimate * n_calib) / float(n_test + n_calib)
                    venn_rmse = np.sqrt((venn_estimate - target_estimate)**2)
                    averaged_lower = (pred_range[0] * n_test + (calib_estimate - 2 * calib_std) * n_calib) / float(n_test + n_calib)
                    averaged_upper = (pred_range[1] * n_test + (calib_estimate + 2 * calib_std) * n_calib) / float(n_test + n_calib)
                    venn_contains_test = averaged_lower < target_estimate < averaged_upper
                    output_df.loc['Venn_internal_averaged_all'] = [n_non_train, 'nontest', 'nontrain', 'given', venn_estimate, venn_rmse, averaged_lower, averaged_upper, venn_contains_test]
            """

            results_df.to_csv(os.path.join(dirs.dir_models(project_dir), model_name, 'accuracy.csv'))
            output_df.to_csv(os.path.join(dirs.dir_models(project_dir), model_name, 'results.csv'))
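
# Aggregation sketch (hypothetical helper): each repetition above writes
# 'accuracy.csv' and 'results.csv' under dirs.dir_models(project_dir)/<model_name>/.
# This collects the per-model results.csv files into a single DataFrame for
# analysis; the function name and glob pattern are illustrative only, and the
# module-level os/pandas imports used above are assumed.
def collect_results_sketch(models_dir, pattern='*'):
    import glob  # local import so the sketch stays self-contained
    frames = []
    for path in glob.glob(os.path.join(models_dir, pattern, 'results.csv')):
        df = pd.read_csv(path, index_col=0)
        df['model_name'] = os.path.basename(os.path.dirname(path))  # record which model produced the rows
        frames.append(df)
    return pd.concat(frames) if frames else pd.DataFrame()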
def test_over_time(project_dir, subset, config_file, first_year, stage1_logfile=None, penalty='l2', suffix='',
                   model_type='LR', loss='log', objective='f1', do_ensemble=True, dh=100, label='label',
                   intercept=True, n_dev_folds=5, verbose=False, average='micro', seed=None,
                   alpha_min=0.01, alpha_max=1000.0, n_alphas=8, sample_labels=False, group_identical=False,
                   annotated_subset=None, n_terms=0, nonlinearity='tanh', init_lr=1e-4, min_epochs=2,
                   max_epochs=100, patience=8, tol=1e-4, early_stopping=False, DL=False):
    # Just run a regular model, one per year, training on the past, and save the results
    log = {
        'project': project_dir,
        'subset': subset,
        'config_file': config_file,
        'first_year': first_year,
        'stage1_logfile': stage1_logfile,
        'penalty': penalty,
        'suffix': suffix,
        'model_type': model_type,
        'loss': loss,
        'objective': objective,
        'do_ensemble': do_ensemble,
        'dh': dh,
        'label': label,
        'intercept': intercept,
        'n_dev_folds': n_dev_folds,
        'average': average,
        'seed': seed,
        'alpha_min': alpha_min,
        'alpha_max': alpha_max,
        'n_alphas': n_alphas,
        'sample_labels': sample_labels,
        'group_identical': group_identical,
        'annotated_subset': annotated_subset,
        'n_terms': n_terms,
        'nonlinearity': nonlinearity,
        'init_lr': init_lr,
        'min_epochs': min_epochs,
        'max_epochs': max_epochs,
        'patience': patience,
        'tol': tol,
        'early_stopping': early_stopping
    }
    model_basename = make_model_basename(log)

    stage1_model_basename = ''
    if stage1_logfile is not None:
        stage1_log = fh.read_json(stage1_logfile)
        stage1_model_basename = make_model_basename(stage1_log)

    # save the experiment parameters to a log file
    logfile = os.path.join(dirs.dir_logs(project_dir), model_basename + '.json')
    fh.makedirs(dirs.dir_logs(project_dir))
    fh.write_to_json(log, logfile)

    # load the features specified in the config file
    config = fh.read_json(config_file)
    feature_defs = []
    for f in config['feature_defs']:
        feature_defs.append(features.parse_feature_string(f))

    # load the file that contains metadata about each item
    metadata_file = os.path.join(dirs.dir_subset(project_dir, subset), 'metadata.csv')
    metadata = fh.read_csv_to_df(metadata_file)
    field_vals = list(set(metadata['year'].values))
    field_vals.sort()
    print("Splitting data according to :", field_vals)

    # DEBUG: field_vals = ['2009']

    for target_year in field_vals:
        if int(target_year) >= first_year:
            print("\nTesting on %s" % target_year)
            model_name = model_basename + '_' + str(target_year)
            stage1_model_name = stage1_model_basename + '_' + str(target_year)

            # first, split into training and non-train data based on the field of interest
            ## DEBUG!
            test_selector_all = metadata['year'] >= int(target_year)
            test_subset_all = metadata[test_selector_all]
            test_items_all = test_subset_all.index.tolist()
            n_test_all = len(test_items_all)

            train_selector_all = metadata['year'] < int(target_year)
            train_subset_all = metadata[train_selector_all]
            train_items_all = list(train_subset_all.index)
            n_train_all = len(train_items_all)

            print("Test year: %d Train: %d, Test: %d (labeled and unlabeled)" %
                  (int(target_year), n_train_all, n_test_all))

            # load all labels
            label_dir = dirs.dir_labels(project_dir, subset)
            labels_df = fh.read_csv_to_df(os.path.join(label_dir, label + '.csv'), index_col=0, header=0)
            n_items, n_classes = labels_df.shape

            vocab = None
            if stage1_logfile is not None:
                fightin_lexicon = None
                if annotated_subset is not None:
                    print("Determining fightin' words")
                    fightin_words.find_most_annotated_features(project_dir, annotated_subset, subset, config_file,
                                                               items_to_use=train_items_all, remove_stopwords=False)
                    fightin_lexicon, scores = fightin_words.load_from_config_files(project_dir, annotated_subset,
                                                                                   subset, config_file,
                                                                                   items_to_use=train_items_all,
                                                                                   n=n_terms, remove_stopwords=True)
                    fightin_lexicon_test, scores = fightin_words.load_from_config_files(project_dir, annotated_subset,
                                                                                        subset, config_file,
                                                                                        items_to_use=test_items_all,
                                                                                        n=n_terms, remove_stopwords=True)
                    print(fightin_lexicon)
                    #print(fightin_lexicon_test)
                    #vocab = list(fightin_lexicon)
                    #vocab.sort()

                print("Loading features from stage 1")
                # load features from previous model
                top_features = get_top_features.get_top_features(
                    os.path.join(dirs.dir_models(project_dir), stage1_model_name), n_terms)
                lr_features, weights = zip(*top_features)
                vocab = list(lr_features)

                #if annotated_subset is not None:
                #    print("\nTaking intersection:")
                #    intersection = set(lr_features).intersection(set(fightin_lexicon))
                #    vocab = list(intersection)
                #    vocab.sort()
                #    for w in vocab:
                #        print(w)

                #vocab = [w for w in vocab if w not in stopwords]
                for w in vocab:
                    print(w)
                vocab.sort()

                #if annotated_subset is not None:
                #    print("Missing:")
                #    print(set(fightin_lexicon_test) - set(vocab))

            # add in a stage to eliminate items with no labels
            print("Subsetting items with labels")
            label_sums_df = labels_df.sum(axis=1)
            labeled_item_selector = label_sums_df > 0
            labels_df = labels_df[labeled_item_selector]
            n_items, n_classes = labels_df.shape
            labeled_items = set(labels_df.index)

            train_items = [i for i in train_items_all if i in labeled_items]
            test_items = [i for i in test_items_all if i in labeled_items]
            n_train = len(train_items)
            n_test = len(test_items)

            weights_df = None
            if weights_df is not None:
                weights_df = weights_df[labeled_item_selector]

            print("Labeled train: %d, test: %d" % (n_train, n_test))

            # create a data frame to hold a summary of the results
            output_df = pd.DataFrame([], columns=['N', 'training data', 'test data', 'cal', 'estimate', 'MAE',
                                                  '95lcl', '95ucl', 'contains_test'])

            test_labels_df = labels_df.loc[test_items]

            # if instructed, sample labels in proportion to annotations (to simulate having one label per item)
            if sample_labels:
                print("Sampling labels")
                # normalize the labels
                temp = labels_df.values / np.array(labels_df.values.sum(axis=1).reshape((n_items, 1)), dtype=float)
                samples = np.zeros([n_items, n_classes], dtype=int)
                for i in range(n_items):
                    index = np.random.choice(np.arange(n_classes), size=1, p=temp[i, :])
                    samples[i, index] = 1
                sampled_labels_df = pd.DataFrame(samples, index=labels_df.index, columns=labels_df.columns)
            else:
                sampled_labels_df = labels_df

            train_labels_df = sampled_labels_df.loc[train_items].copy()

            # get the true proportion of labels in the test OR non-training data (calibration and test combined)
            target_props, target_estimate, target_std = get_estimate_and_std(test_labels_df, use_n_annotations=True)
            output_df.loc['target'] = [n_test, 'test', 'test', 'n/a', target_estimate, 0,
                                       target_estimate - 2 * target_std, target_estimate + 2 * target_std, np.nan]

            # get the same estimate from training data
            train_props, train_estimate, train_std = get_estimate_and_std(train_labels_df, use_n_annotations=True)
            # compute the error of this estimate
            train_rmse = np.sqrt((train_estimate - target_estimate)**2)
            train_contains_test = target_estimate > train_estimate - 2 * train_std and target_estimate < train_estimate + 2 * train_std
            output_df.loc['train'] = [n_train, 'train', 'test', 'n/a', train_estimate, train_rmse,
                                      train_estimate - 2 * train_std, train_estimate + 2 * train_std,
                                      train_contains_test]

            #print("target proportions: (%0.3f, %0.3f); train proportions: %0.3f" % (target_estimate - 2 * target_std, target_estimate + 2 * target_std, train_estimate))

            if train_estimate > 0.5:
                pos_label = 0
            else:
                pos_label = 1
            print("Using %d as the positive label" % pos_label)

            results_df = pd.DataFrame([], columns=['f1', 'acc', 'mae', 'estimated calibration'])

            # Now train a model on the training data, saving the calibration data for calibration
            print("Training a model")
            model, dev_f1, dev_acc, dev_cal_mae, dev_cal_est = train.train_model_with_labels(
                project_dir, model_type, loss, model_name, subset, sampled_labels_df, feature_defs,
                weights_df=weights_df, items_to_use=train_items, penalty='l2', alpha_min=alpha_min,
                alpha_max=alpha_max, n_alphas=n_alphas, intercept=intercept, objective=objective,
                n_dev_folds=n_dev_folds, do_ensemble=do_ensemble, dh=dh, seed=seed, pos_label=pos_label,
                vocab=vocab, group_identical=group_identical, nonlinearity=nonlinearity, init_lr=init_lr,
                min_epochs=min_epochs, max_epochs=max_epochs, patience=patience, tol=tol,
                early_stopping=early_stopping, verbose=verbose)
            results_df.loc['cross_val'] = [dev_f1, dev_acc, dev_cal_mae, dev_cal_est]

            # predict on test data
            force_dense = False
            if model_type == 'MLP':
                force_dense = True
            test_predictions_df, test_pred_probs_df, test_pred_proportions = predict.predict(
                project_dir, model, model_name, subset, label, items_to_use=test_items, verbose=verbose,
                force_dense=force_dense, group_identical=group_identical)
            f1_test, acc_test = evaluate_predictions.evaluate_predictions(test_labels_df, test_predictions_df,
                                                                          test_pred_probs_df, pos_label=pos_label,
                                                                          average=average)
            true_test_vector = np.argmax(test_labels_df.as_matrix(), axis=1)
            #test_cal_mae = evaluation.eval_proportions_mae(test_labels_df.as_matrix(), test_pred_probs_df.as_matrix())
            test_cal_est = evaluation.evaluate_calibration_rmse(true_test_vector, test_pred_probs_df.as_matrix(),
                                                                min_bins=1, max_bins=1)
            test_cc_estimate, test_pcc_estimate, test_acc_estimate_internal, test_pvc_estimate_internal = test_pred_proportions

            test_cc_mae = np.mean(np.abs(test_cc_estimate[1] - target_estimate))
            test_pcc_mae = np.mean(np.abs(test_pcc_estimate[1] - target_estimate))
            results_df.loc['test'] = [f1_test, acc_test, test_pcc_mae, test_cal_est]

            output_df.loc['CC_test'] = [n_train, 'train', 'test', 'n/a', test_cc_estimate[1], test_cc_mae,
                                        np.nan, np.nan, np.nan]
            output_df.loc['PCC_test'] = [n_train, 'train', 'test', 'n/a', test_pcc_estimate[1], test_pcc_mae,
                                         np.nan, np.nan, np.nan]

            test_acc_rmse_internal = np.sqrt((test_acc_estimate_internal[1] - target_estimate)**2)
            test_pvc_rmse_internal = np.sqrt((test_pvc_estimate_internal[1] - target_estimate)**2)
            output_df.loc['ACC_internal'] = [n_train, 'train', 'test', 'n/a', test_acc_estimate_internal[1],
                                             test_acc_rmse_internal, np.nan, np.nan, np.nan]
            output_df.loc['PVC_internal'] = [n_train, 'train', 'nontrain', 'predicted', test_pvc_estimate_internal[1],
                                             test_pvc_rmse_internal, np.nan, np.nan, np.nan]

            """
            if DL:
                print("Training a model")
                model_type = 'DL'
                DL_model_name = model_name + '_DL'
                model, _, _, _, _ = train.train_model_with_labels(project_dir, model_type, loss, DL_model_name, subset, sampled_labels_df, feature_defs, weights_df=weights_df, items_to_use=train_items, penalty='l2', alpha_min=alpha_min, alpha_max=alpha_max, n_alphas=n_alphas, intercept=intercept, objective=objective, n_dev_folds=n_dev_folds, do_ensemble=do_ensemble, dh=dh, seed=seed, pos_label=pos_label, vocab=vocab, group_identical=group_identical, nonlinearity=nonlinearity, init_lr=init_lr, min_epochs=min_epochs, max_epochs=max_epochs, patience=patience, tol=tol, early_stopping=early_stopping, verbose=verbose)

                # predict on test data
                force_dense = False
                if model_type == 'MLP':
                    force_dense = True
                test_predictions_df, test_pred_probs_df, test_pred_proportions = predict.predict(project_dir, model, DL_model_name, subset, label, items_to_use=test_items, verbose=verbose, force_dense=force_dense, group_identical=group_identical)
                f1_test, acc_test = evaluate_predictions.evaluate_predictions(test_labels_df, test_predictions_df, test_pred_probs_df, pos_label=pos_label, average=average)
                true_test_vector = np.argmax(test_labels_df.as_matrix(), axis=1)
                #test_cal_mae = evaluation.eval_proportions_mae(test_labels_df.as_matrix(), test_pred_probs_df.as_matrix())
                test_cal_est = evaluation.evaluate_calibration_rmse(true_test_vector, test_pred_probs_df.as_matrix(), min_bins=1, max_bins=1)
                test_cc_estimate, test_pcc_estimate, test_acc_estimate_internal, test_pvc_estimate_internal = test_pred_proportions

                test_cc_mae = np.mean(np.abs(test_cc_estimate[1] - target_estimate))
                test_pcc_mae = np.mean(np.abs(test_pcc_estimate[1] - target_estimate))
                output_df.loc['CC_test_DL'] = [n_train, 'train', 'test', 'n/a', test_cc_estimate[1], test_cc_mae, np.nan, np.nan, np.nan]
                output_df.loc['PCC_test_DL'] = [n_train, 'train', 'test', 'n/a', test_pcc_estimate[1], test_pcc_mae, np.nan, np.nan, np.nan]
            """

            results_df.to_csv(os.path.join(dirs.dir_models(project_dir), model_name, 'accuracy.csv'))
            output_df.to_csv(os.path.join(dirs.dir_models(project_dir), model_name, 'results.csv'))
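
# make_model_basename is called by both test_over_time variants but is not
# defined in this section. This is a hypothetical sketch only, modeled on the
# manual basename construction in cross_train_and_eval; the real helper likely
# uses a different or larger set of fields from the log dict.
def _make_model_basename_sketch(log):
    parts = [log['subset'], log['label'], log['model_type']]
    if log['model_type'] == 'MLP':
        parts.append(str(log['dh']))
    parts.append(str(log['objective']))
    return '_'.join(parts) + log.get('suffix', '')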
def make_random_split(input_file, field_name, calib_percent, overwrite=False, sampling='proportional'): """ Split a dataset into multiple overlapping datasets based on some metadata variable (such as year) The idea is to create subsets to test domain adaptation / covariate shift For each value of the variable, create three datasets: train = all those items that don't have that value (training data) calib = random subset of items that do have that value (calibration data) test = remaining items that do have that value (evaluation data) :param input_file: :param field_name: :param calib_percent: :param overwrite: :param sampling: :return: """ basedir = os.path.dirname(input_file) data = fh.read_json(input_file) field_vals = set([data[k][field_name] for k in data.keys()]) if sampling == 'proportional': for val in field_vals: print(val) train = { k: v for k, v in data.items() if data[k][field_name] != val } subset = { k: v for k, v in data.items() if data[k][field_name] == val } keys = list(subset.keys()) random.shuffle(keys) n_items = len(keys) print("Loaded %d items" % n_items) n_calib = int(n_items * calib_percent) calib = {k: data[k] for k in keys[:n_calib]} test = {k: data[k] for k in keys[n_calib:]} print( "Creating train, calibration, and test sets of sizes %d, %d and %d, respectively" % (len(train), len(calib), len(test))) output_file = os.path.join( basedir, field_name + '_' + str(val) + '_train.json') if os.path.exists(output_file) and not overwrite: sys.exit("Error: output file %s exists" % output_file) fh.write_to_json(train, output_file) output_file = os.path.join( basedir, field_name + '_' + str(val) + '_calib.json') if os.path.exists(output_file) and not overwrite: sys.exit("Error: output file %s exists" % output_file) fh.write_to_json(calib, output_file) output_file = os.path.join( basedir, field_name + '_' + str(val) + '_test.json') if os.path.exists(output_file) and not overwrite: sys.exit("Error: output file %s exists" % output_file) fh.write_to_json(test, output_file) else: keys = list(data.keys()) random.shuffle(keys) n_items = len(keys) print("Loaded %d items" % n_items) n_calib = int(n_items * calib_percent) calib = {k: data[k] for k in keys[:n_calib]} test = {k: data[k] for k in keys[n_calib:]} for val in field_vals: print(val) train = { k: v for k, v in data.items() if data[k][field_name] != val } calib_subset = { k: v for k, v in calib.items() if calib[k][field_name] == val } test_subset = { k: v for k, v in test.items() if test[k][field_name] == val } print( "Creating train, calibration, and test sets of sizes %d, %d and %d, respectively" % (len(train), len(calib_subset), len(test_subset))) output_file = os.path.join( basedir, field_name + '_' + str(val) + '_train.json') if os.path.exists(output_file) and not overwrite: sys.exit("Error: output file %s exists" % output_file) fh.write_to_json(train, output_file) output_file = os.path.join( basedir, field_name + '_' + str(val) + '_calib.json') if os.path.exists(output_file) and not overwrite: sys.exit("Error: output file %s exists" % output_file) fh.write_to_json(calib_subset, output_file) output_file = os.path.join( basedir, field_name + '_' + str(val) + '_test.json') if os.path.exists(output_file) and not overwrite: sys.exit("Error: output file %s exists" % output_file) fh.write_to_json(test_subset, output_file)
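# A toy sketch of the calibration/test split performed inside make_random_split, assuming a dict of
# items keyed by id; split_calib_test and the seed argument are illustrative names, not part of the
# function above.
import random

def split_calib_test(subset, calib_percent, seed=42):
    keys = sorted(subset.keys())
    random.Random(seed).shuffle(keys)
    n_calib = int(len(keys) * calib_percent)
    calib = {k: subset[k] for k in keys[:n_calib]}
    test = {k: subset[k] for k in keys[n_calib:]}
    return calib, test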
def extract_story_elements(): min_head_vocab = 5 min_role_vocab = 4 min_tuples = 3 ATTRIBUTE = 0 AGENT_ROLE = 1 PATIENT_ROLE = 2 SURFACE_FORM = 3 parsed_dir = os.path.join(dirs.data_stanford_dir, 'parsed') parsed_files = glob.glob(os.path.join(parsed_dir, '*.json')) dependencies_file = os.path.join(dirs.data_stanford_dir, 'dependency_tuple_ids.json') dependencies = fh.read_json(dependencies_file) coref_file = os.path.join(dirs.data_stanford_dir, 'coref_heads.json') coref_heads = fh.read_json(coref_file) supersense_tags = fh.read_json(os.path.join(dirs.data_amalgram_dir, 'all_tags.json')) heads = defaultdict(int) tokens = defaultdict(int) attributes = defaultdict(int) agent_roles = defaultdict(int) patient_roles = defaultdict(int) story_elements = {} print "Extracting story elements" for f_i, f in enumerate(parsed_files): sentences = fh.read_json(f) basename = fh.get_basename_wo_ext(f) element_list = extract_story_elements_from_article(sentences, dependencies[basename], coref_heads[basename], supersense_tags[basename], basename) story_elements[basename] = element_list for element in element_list: for h in element.head_words: heads[h] += 1 for t in element.attributes: attributes[t] += 1 for t in element.agent_roles: agent_roles[t] += 1 for t in element.patient_roles: patient_roles[t] += 1 print "Finding most common tokens" common_heads = [(v, k) for k, v in heads.items()] common_heads.sort() common_heads.reverse() output_filename = os.path.join(dirs.lda_dir, 'common_heads.json') fh.write_to_json(common_heads, output_filename, sort_keys=False) """ common_tokens = [(v, k) for k, v in tokens.items()] common_tokens.sort() common_tokens.reverse() output_filename = os.path.join(dirs.lda_dir, 'common_tokens.json') fh.write_to_json(common_tokens, output_filename, sort_keys=False) """ common_attributes = [(v, k) for k, v in attributes.items()] common_attributes.sort() common_attributes.reverse() output_filename = os.path.join(dirs.lda_dir, 'common_attributes.json') fh.write_to_json(common_attributes, output_filename, sort_keys=False) common_agent_roles = [(v, k) for k, v in agent_roles.items()] common_agent_roles.sort() common_agent_roles.reverse() output_filename = os.path.join(dirs.lda_dir, 'common_agent_roles.json') fh.write_to_json(common_agent_roles, output_filename, sort_keys=False) common_patient_roles = [(v, k) for k, v in patient_roles.items()] common_patient_roles.sort() common_patient_roles.reverse() output_filename = os.path.join(dirs.lda_dir, 'common_patient_roles.json') fh.write_to_json(common_patient_roles, output_filename, sort_keys=False) print pronoun_list #most_common_heads = {k: v for v, k in common_heads if v >= min_head_vocab and k not in pronoun_list} most_common_attributes = {k: v for v, k in common_attributes if (v >= min_role_vocab and k not in pronoun_list)} most_common_agent_roles = {k: v for v, k in common_agent_roles if (v >= min_role_vocab and k not in pronoun_list and k not in stopwords)} most_common_patient_roles = {k: v for v, k in common_patient_roles if (v >= min_role_vocab and k not in pronoun_list and k not in stopwords)} output_filename = os.path.join(dirs.lda_dir, 'most_common_attributes.json') fh.write_to_json(most_common_attributes, output_filename, sort_keys=False) output_filename = os.path.join(dirs.lda_dir, 'most_common_agent_roles.json') fh.write_to_json(most_common_agent_roles, output_filename, sort_keys=False) output_filename = os.path.join(dirs.lda_dir, 'most_common_patient_roles.json') fh.write_to_json(most_common_patient_roles, output_filename, 
sort_keys=False) print len(most_common_attributes) print len(most_common_agent_roles) print len(most_common_patient_roles) print "Filtering tuples" valid_elements = defaultdict(list) for basename, element_list in story_elements.items(): for se in element_list: se.valid_heads = [h for h in se.head_words if h not in pronoun_list] se.valid_phrases = [h for h in se.phrases if h not in pronoun_list] if len(se.valid_heads) > 0: se.valid_attributes = [t for t in se.attributes if t in most_common_attributes] se.valid_agent_roles = [t for t in se.agent_roles if t in most_common_agent_roles] se.valid_patient_roles = [t for t in se.patient_roles if t in most_common_patient_roles] se.tuples = [(ATTRIBUTE, t) for t in se.valid_attributes] + \ [(AGENT_ROLE, t) for t in se.valid_agent_roles] + \ [(PATIENT_ROLE, t) for t in se.valid_patient_roles] #[(SURFACE_FORM, t) for t in se.valid_heads] if len(se.tuples) >= min_tuples: valid_elements[basename].append(se) print "Constructing vocabulary" n_tuples = 0 vocab = VocabWithCounts('', add_oov=False) n_entities = 0 for basename, element_list in valid_elements.items(): for se in element_list: tokens = [token for role, token in se.tuples] vocab.add_tokens(tokens) n_tuples += len(tokens) n_entities += 1 head_word_vocab = VocabWithCounts('', add_oov=False) for basename, element_list in valid_elements.items(): for se in element_list: tokens = [token for token in se.valid_heads] head_word_vocab.add_tokens(tokens) head_phrase_vocab = VocabWithCounts('', add_oov=False) for basename, element_list in valid_elements.items(): for se in element_list: tokens = [token for token in se.valid_phrases] head_phrase_vocab.add_tokens(tokens) print "Building indices" tuple_vocab = np.zeros(n_tuples, dtype=int) # vocab index of the ith word tuple_entity = np.zeros(n_tuples, dtype=int) tuple_role = [] entity_doc = np.zeros(n_entities, dtype=int) # topic of the ith word docs = valid_elements.keys() docs.sort() vocab_counts = np.zeros(len(vocab), dtype=int) article_mapping = [] entity_index = 0 head_word_vocab_list = [] head_word_entity_list = [] head_phrase_vocab_list = [] head_phrase_entity_list = [] t_i = 0 for d_i, d in enumerate(docs): element_list = valid_elements[d] for se in element_list: entity_doc[entity_index] = d_i for role, token in se.tuples: tuple_entity[t_i] = entity_index tuple_role.append(role) vocab_index = vocab.get_index(token) tuple_vocab[t_i] = vocab_index vocab_counts[vocab_index] += 1 t_i += 1 for token in se.valid_heads: head_word_vocab_index = head_word_vocab.get_index(token) head_word_vocab_list.append(head_word_vocab_index) head_word_entity_list.append(entity_index) for token in se.valid_phrases: head_phrase_vocab_index = head_phrase_vocab.get_index(token) head_phrase_vocab_list.append(head_phrase_vocab_index) head_phrase_entity_list.append(entity_index) article_mapping.append(str(entity_index) + ':' + d + ':' + ','.join(se.head_words) + ':' + ','.join(se.valid_attributes) + ':' + ','.join(se.valid_agent_roles) + ':' + ','.join(se.valid_patient_roles)) entity_index += 1 print len(docs), "valid documents" print entity_index, "entities" print t_i, "tuples" print len(vocab), "word types" print np.min(vocab_counts), np.max(vocab_counts), np.sum(vocab_counts) output_filename = os.path.join(dirs.lda_dir, 'tuple_vocab.json') fh.write_to_json(list(tuple_vocab), output_filename, sort_keys=False) output_filename = os.path.join(dirs.lda_dir, 'tuple_role.json') fh.write_to_json(list(tuple_role), output_filename, sort_keys=False) output_filename = 
os.path.join(dirs.lda_dir, 'tuple_entity.json') fh.write_to_json(list(tuple_entity), output_filename, sort_keys=False) output_filename = os.path.join(dirs.lda_dir, 'entity_doc.json') fh.write_to_json(list(entity_doc), output_filename, sort_keys=False) output_filename = os.path.join(dirs.lda_dir, 'vocab.json') fh.write_to_json(vocab.index2token, output_filename, sort_keys=False) output_filename = os.path.join(dirs.lda_dir, 'docs.json') fh.write_to_json(list(docs), output_filename, sort_keys=False) output_filename = os.path.join(dirs.lda_dir, 'article_map.json') fh.write_to_json(list(article_mapping), output_filename, sort_keys=False) output_filename = os.path.join(dirs.lda_dir, 'head_word_vocab.json') fh.write_to_json(head_word_vocab.index2token, output_filename, sort_keys=False) output_filename = os.path.join(dirs.lda_dir, 'head_phrase_vocab.json') fh.write_to_json(head_phrase_vocab.index2token, output_filename, sort_keys=False) output_filename = os.path.join(dirs.lda_dir, 'head_word_vocab_list.json') fh.write_to_json(head_word_vocab_list, output_filename, sort_keys=False) output_filename = os.path.join(dirs.lda_dir, 'head_word_entity_list.json') fh.write_to_json(head_word_entity_list, output_filename, sort_keys=False) output_filename = os.path.join(dirs.lda_dir, 'head_phrase_vocab_list.json') fh.write_to_json(head_phrase_vocab_list, output_filename, sort_keys=False) output_filename = os.path.join(dirs.lda_dir, 'head_phrase_entity_list.json') fh.write_to_json(head_phrase_entity_list, output_filename, sort_keys=False)
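# A sketch of how the index arrays written above line up: tuple i belongs to entity tuple_entity[i],
# has role tuple_role[i] and word vocab[tuple_vocab[i]], and entity e comes from docs[entity_doc[e]],
# where vocab is the index-to-token list saved to vocab.json. The reconstruction itself is illustrative.
def decode_entity_tuples(tuple_vocab, tuple_role, tuple_entity, entity_doc, vocab, docs):
    entities = [[] for _ in range(len(entity_doc))]
    for v_idx, role, e_idx in zip(tuple_vocab, tuple_role, tuple_entity):
        entities[e_idx].append((role, vocab[v_idx]))
    return [(docs[entity_doc[e]], tuples) for e, tuples in enumerate(entities)]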
def cross_train_and_eval(project_dir, subset, field_name, config_file, calib_prop=0.33, train_prop=1.0, prefix=None, max_folds=None, min_val=None, max_val=None, model_type='LR', loss='log', do_ensemble=False, dh=0, label='label', penalty='l1', cshift=None, intercept=True, n_dev_folds=5, repeats=1, verbose=False, pos_label=1, average='micro', objective='f1', seed=None, use_calib_pred=False, exclude_calib=False, alpha_min=0.01, alpha_max=1000, sample_labels=False): model_basename = subset + '_' + field_name if prefix is not None: model_basename = prefix + '_' + model_basename # save the experiment parameters to a log file logfile = os.path.join(dirs.dir_logs(project_dir), model_basename + '.json') fh.makedirs(dirs.dir_logs(project_dir)) log = { 'project': project_dir, 'subset': subset, 'field_name': field_name, 'config_file': config_file, 'calib_prop': calib_prop, 'train_prop': train_prop, 'prefix': prefix, 'max_folds': max_folds, 'model_type': model_type, 'loss': loss, 'dh': dh, 'alpha_min': alpha_min, 'alpha_max': alpha_max, 'do_ensemble': do_ensemble, 'label': label, 'penalty': penalty, 'cshift': cshift, 'intercept': intercept, 'objective': objective, 'n_dev_folds': n_dev_folds, 'repeats': repeats, 'pos_label': pos_label, 'average': average, 'use_calib_pred': use_calib_pred, 'exclude_calib': exclude_calib } fh.write_to_json(log, logfile) # load the features specified in the config file config = fh.read_json(config_file) feature_defs = [] for f in config['feature_defs']: feature_defs.append(features.parse_feature_string(f)) # load the file that contains metadata about each item metadata_file = os.path.join(dirs.dir_subset(project_dir, subset), 'metadata.csv') metadata = fh.read_csv_to_df(metadata_file) field_vals = list(set(metadata[field_name].values)) field_vals.sort() print(field_vals) # exclude certain values of the partition if desired if min_val is not None: field_vals = [v for v in field_vals if v >= float(min_val)] if max_val is not None: field_vals = [v for v in field_vals if v <= float(max_val)] if max_folds is None: max_folds = len(field_vals) # repeat the following value for each fold of the partition of interest (up to max_folds, if given) for v_i, v in enumerate(field_vals[:max_folds]): print("\nTesting on %s" % v) # first, split into training and non-train data based on the field of interest train_selector = metadata[field_name] != v train_subset = metadata[train_selector] train_items = list(train_subset.index) n_train = len(train_items) non_train_selector = metadata[field_name] == v non_train_subset = metadata[non_train_selector] non_train_items = non_train_subset.index.tolist() n_non_train = len(non_train_items) print("Train: %d, non-train: %d" % (n_train, n_non_train)) # load all labels label_dir = dirs.dir_labels(project_dir, subset) labels_df = fh.read_csv_to_df(os.path.join(label_dir, label + '.csv'), index_col=0, header=0) n_items, n_classes = labels_df.shape train_labels = labels_df.loc[train_items] # if desired, attempt to learn weights for the training data using techniques for covariate shift if cshift is not None: print("Training a classifier for covariate shift") # start by learning to discriminate train from non-train data train_test_labels = np.zeros((n_items, 2), dtype=int) train_test_labels[train_selector, 0] = 1 train_test_labels[non_train_selector, 1] = 1 train_test_labels_df = pd.DataFrame(train_test_labels, index=labels_df.index, columns=[0, 1]) # create a cshift model using the same specifiction as our model below (e.g. LR/MLP, etc.) 
model_name = model_basename + '_' + str(v) + '_' + 'cshift' model, dev_f1, dev_acc, dev_cal, _, _ = train.train_model_with_labels(project_dir, model_type, loss, model_name, subset, train_test_labels_df, feature_defs, penalty=penalty, alpha_min=alpha_min, alpha_max=alpha_max, intercept=intercept, n_dev_folds=n_dev_folds, save_model=True, do_ensemble=do_ensemble, dh=dh, seed=seed, verbose=False) print("cshift results: %0.4f f1, %0.4f acc" % (dev_f1, dev_acc)) # take predictions from model on the training data train_test_pred_df, train_test_probs_df = predict.predict(project_dir, model, model_name, subset, label, verbose=verbose) # display the min and max probs print("Min: %0.4f" % train_test_probs_df[1].min()) print("Max: %0.4f" % train_test_probs_df[1].max()) # use the estimated probability of each item being a training item to compute item weights weights = n_train / float(n_non_train) * (1.0/train_test_probs_df[0].values - 1) # print a summary of the weights from just the training items print("Min weight: %0.4f" % weights[train_selector].min()) print("Ave weight: %0.4f" % weights[train_selector].mean()) print("Max weight: %0.4f" % weights[train_selector].max()) # print a summary of all weights print("Min weight: %0.4f" % weights.min()) print("Ave weight: %0.4f" % weights.mean()) print("Max weight: %0.4f" % weights.max()) # create a data frame with this information weights_df = pd.DataFrame(weights, index=labels_df.index) else: weights_df = None # repeat the following process multiple times with different random splits of train / calibration / test data for r in range(repeats): # next, take a random subset of the training data (and ignore the rest), to simulate fewer annotated items if train_prop < 1.0: np.random.shuffle(train_items) train_items_r = np.random.choice(train_items, size=int(n_train * train_prop), replace=False) n_train_r = len(train_items_r) # create a data frame to hold a summary of the results output_df = pd.DataFrame([], columns=['N', 'estimate', 'RMSE', '95lcl', '95ucl', 'contains_test']) # create a unique name ofr this model model_name = model_basename + '_' + str(v) + '_' + str(r) # now, divide the non-train data into a calibration and a test set n_calib = int(calib_prop * n_non_train) np.random.shuffle(non_train_items) calib_items = non_train_items[:n_calib] test_items = non_train_items[n_calib:] n_test = len(test_items) print("%d %d %d" % (n_train_r, n_calib, n_test)) test_labels_df = labels_df.loc[test_items] non_train_labels_df = labels_df.loc[non_train_items] # if instructed, sample labels in proportion to annotations (to simulate having one label per item) if sample_labels: print("Sampling labels") # normalize the labels temp = labels_df.values / np.array(labels_df.values.sum(axis=1).reshape((n_items, 1)), dtype=float) samples = np.zeros([n_items, n_classes], dtype=int) for i in range(n_items): index = np.random.choice(np.arange(n_classes), size=1, p=temp[i, :]) samples[i, index] = 1 sampled_labels_df = pd.DataFrame(samples, index=labels_df.index, columns=labels_df.columns) else: sampled_labels_df = labels_df train_labels_r_df = sampled_labels_df.loc[train_items_r].copy() calib_labels_df = sampled_labels_df.loc[calib_items].copy() # get the true proportion of labels in the test OR non-training data (calibration and test combined) if exclude_calib: test_props, test_estimate, test_std = get_estimate_and_std(test_labels_df) else: test_props, test_estimate, test_std = get_estimate_and_std(non_train_labels_df) output_df.loc['test'] = [n_test, test_estimate, 0, 
test_estimate - 2 * test_std, test_estimate + 2 * test_std, 1] # get the same estimate from training data train_props, train_estimate, train_std = get_estimate_and_std(train_labels_r_df) # compute the error of this estimate train_rmse = np.sqrt((train_estimate - test_estimate)**2) train_contains_test = test_estimate > train_estimate - 2 * train_std and test_estimate < train_estimate + 2 * train_std output_df.loc['train'] = [n_train_r, train_estimate, train_rmse, train_estimate - 2 * train_std, train_estimate + 2 * train_std, train_contains_test] # repeat for calibration data calib_props, calib_estimate, calib_std = get_estimate_and_std(calib_labels_df) calib_rmse = np.sqrt((calib_estimate - test_estimate)**2) # check if the test estimate is within 2 standard deviations of the calibration estimate calib_contains_test = test_estimate > calib_estimate - 2 * calib_std and test_estimate < calib_estimate + 2 * calib_std output_df.loc['calibration'] = [n_calib, calib_estimate, calib_rmse, calib_estimate - 2 * calib_std, calib_estimate + 2 * calib_std, calib_contains_test] results_df = pd.DataFrame([], columns=['f1', 'acc', 'cal']) print("Training model on all labeled data") # first train a model on the training and calibration data combined calib_and_train_items_r = np.array(list(calib_items) + list(train_items_r)) model, dev_f1, dev_acc, dev_cal, acc_cfm, pvc_cfm = train.train_model_with_labels(project_dir, model_type, loss, model_name, subset, sampled_labels_df, feature_defs, weights_df=weights_df, items_to_use=calib_and_train_items_r, penalty=penalty, alpha_min=alpha_min, alpha_max=alpha_max, intercept=intercept, objective=objective, n_dev_folds=n_dev_folds, do_ensemble=do_ensemble, dh=dh, seed=seed, verbose=verbose) results_df.loc['cross_val_all'] = [dev_f1, dev_acc, dev_cal] # get predictions on the test data test_predictions_df, test_pred_probs_df = predict.predict(project_dir, model, model_name, subset, label, items_to_use=test_items, verbose=verbose) f1_test, acc_test = evaluate_predictions.evaluate_predictions(test_labels_df, test_predictions_df, test_pred_probs_df, pos_label=pos_label, average=average) results_df.loc['test_all'] = [f1_test, acc_test, 0.0] # combine the predictions on the test and calibration data (unless excluding calibration data from this) if exclude_calib: test_predictions = test_predictions_df.values test_pred_probs = test_pred_probs_df.values else: # get predictions on the calibration data if use_calib_pred: calib_predictions_df, calib_pred_probs_df = predict.predict(project_dir, model, model_name, subset, label, items_to_use=calib_items, verbose=verbose) else: calib_predictions_df = pd.DataFrame(np.argmax(calib_labels_df.values, axis=1), index=calib_labels_df.index) # normalize labels to get (questionable) estimates of probabilities calib_pred_probs_df = pd.DataFrame(calib_labels_df.values / np.array(np.sum(calib_labels_df.values, axis=1).reshape((n_calib, 1)), dtype=float), index=calib_labels_df.index) test_predictions = np.r_[test_predictions_df.values, calib_predictions_df.values] test_pred_probs = np.vstack([test_pred_probs_df.values, calib_pred_probs_df.values]) # get the basic error estimates for this model cc_estimate = np.mean(test_predictions) cc_rmse = np.sqrt((cc_estimate - test_estimate)**2) # average the predicted probabilities for the positive label (assuming binary labels) pcc_estimate = np.mean(test_pred_probs[:, 1]) pcc_rmse = np.sqrt((pcc_estimate - test_estimate)**2) output_df.loc['CC_all'] = [n_test, cc_estimate, cc_rmse, np.nan, np.nan, np.nan] output_df.loc['PCC_all']
= [n_test, pcc_estimate, pcc_rmse, np.nan, np.nan, np.nan] # Now repeat for a model trained on the training data, saving the calibration data for calibration print("Training model on training data only") model, dev_f1, dev_acc, dev_cal, acc_cfm, pvc_cfm = train.train_model_with_labels(project_dir, model_type, loss, model_name, subset, sampled_labels_df, feature_defs, weights_df=weights_df, items_to_use=train_items_r, penalty=penalty, alpha_min=alpha_min, alpha_max=alpha_max, intercept=intercept, objective=objective, n_dev_folds=n_dev_folds, do_ensemble=do_ensemble, dh=dh, seed=seed, verbose=verbose) results_df.loc['cross_val'] = [dev_f1, dev_acc, dev_cal] # predict on calibration data calib_predictions_df, calib_pred_probs_df = predict.predict(project_dir, model, model_name, subset, label, items_to_use=calib_items, verbose=verbose) f1_cal, acc_cal = evaluate_predictions.evaluate_predictions(calib_labels_df, calib_predictions_df, calib_pred_probs_df, pos_label=pos_label, average=average, verbose=False) results_df.loc['calibration'] = [f1_cal, acc_cal, calib_rmse] # predict on test data test_predictions_df, test_pred_probs_df = predict.predict(project_dir, model, model_name, subset, label, items_to_use=test_items, verbose=verbose) f1_test, acc_test = evaluate_predictions.evaluate_predictions(test_labels_df, test_predictions_df, test_pred_probs_df, pos_label=pos_label, average=average) results_df.loc['test'] = [f1_test, acc_test, 0.0] results_df.to_csv(os.path.join(dirs.dir_models(project_dir), model_name, 'results.csv')) # combine the predictions on the test and calibration data (unless excluding calibration data from this) if exclude_calib: test_predictions = test_predictions_df.values test_pred_probs = test_pred_probs_df.values else: if not use_calib_pred: calib_predictions_df = pd.DataFrame(np.argmax(calib_labels_df.values, axis=1), index=calib_labels_df.index) # normalize labels to get (questionable) estimates of probabilities calib_pred_probs_df = pd.DataFrame(calib_labels_df.values / np.array(np.sum(calib_labels_df.values, axis=1).reshape((n_calib, 1)), dtype=float), index=calib_labels_df.index) test_predictions = np.r_[test_predictions_df.values, calib_predictions_df.values] test_pred_probs = np.vstack([test_pred_probs_df.values, calib_pred_probs_df.values]) # now evaluate in terms of predicted proportions # average the predictions (assuming binary labels) cc_estimate = np.mean(test_predictions) cc_rmse = np.sqrt((cc_estimate - test_estimate)**2) # average the predicted probabilities for the positive label (assuming binary labels) pcc_estimate = np.mean(test_pred_probs[:, 1]) pcc_rmse = np.sqrt((pcc_estimate - test_estimate)**2) pcc_calib_estimate = np.mean(calib_pred_probs_df.values[:, 1]) pcc_calib_rmse = np.sqrt((pcc_calib_estimate - calib_estimate)**2) output_df.loc['PCC_cal'] = [n_calib, pcc_calib_estimate, pcc_calib_rmse, np.nan, np.nan, np.nan] output_df.loc['CC'] = [n_test, cc_estimate, cc_rmse, np.nan, np.nan, np.nan] output_df.loc['PCC'] = [n_test, pcc_estimate, pcc_rmse, np.nan, np.nan, np.nan] # expand the data so as to only have singly-labeled, weighted items _, calib_labels, calib_weights, calib_predictions = train.prepare_data(np.zeros([n_calib, 2]), calib_labels_df.values, predictions=calib_predictions_df.values) # do some sort of calibration here (ACC, PACC, PVC) print("ACC correction") #calib_labels_expanded, calib_weights_expanded, calib_predictions_expanded = expand_labels(calib_labels.values, calib_predictions.values) acc = calibration.compute_acc(calib_labels, 
calib_predictions, n_classes, weights=calib_weights) acc_corrected = calibration.apply_acc_binary(test_predictions, acc) acc_estimate = acc_corrected[1] acc_rmse = np.sqrt((acc_estimate - test_estimate) ** 2) output_df.loc['ACC'] = [n_calib, acc_estimate, acc_rmse, np.nan, np.nan, np.nan] print("ACC internal") acc_corrected = calibration.apply_acc_binary(test_predictions, acc_cfm) acc_estimate = acc_corrected[1] acc_rmse = np.sqrt((acc_estimate - test_estimate) ** 2) output_df.loc['ACC_int'] = [n_calib, acc_estimate, acc_rmse, np.nan, np.nan, np.nan] print("PVC correction") pvc = calibration.compute_pvc(calib_labels, calib_predictions, n_classes, weights=calib_weights) pvc_corrected = calibration.apply_pvc(test_predictions, pvc) pvc_estimate = pvc_corrected[1] pvc_rmse = np.sqrt((pvc_estimate - test_estimate) ** 2) output_df.loc['PVC'] = [n_calib, pvc_estimate, pvc_rmse, np.nan, np.nan, np.nan] print("PVC internal") pvc_corrected = calibration.apply_pvc(test_predictions, pvc_cfm) pvc_estimate = pvc_corrected[1] pvc_rmse = np.sqrt((pvc_estimate - test_estimate) ** 2) output_df.loc['PVC_int'] = [n_calib, pvc_estimate, pvc_rmse, np.nan, np.nan, np.nan] print("Venn") test_pred_ranges, calib_pred_ranges = ivap.estimate_probs_from_labels(project_dir, model, model_name, subset, subset, sampled_labels_df, calib_items, test_items, weights_df=None) if not exclude_calib: test_pred_ranges = np.vstack([test_pred_ranges, calib_pred_ranges]) combo = test_pred_ranges[:, 1] / (1.0 - test_pred_ranges[:, 0] + test_pred_ranges[:, 1]) pred_range = np.mean(test_pred_ranges, axis=0) venn_estimate = np.mean(combo) venn_rmse = np.sqrt((venn_estimate - test_estimate)**2) venn_contains_test = pred_range[0] < test_estimate < pred_range[1] output_df.loc['Venn'] = [n_calib, venn_estimate, venn_rmse, pred_range[0], pred_range[1], venn_contains_test] output_filename = os.path.join(dirs.dir_models(project_dir), model_name, field_name + '_' + str(v) + '.csv') output_df.to_csv(output_filename)
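# A standalone sketch of the covariate-shift weights computed in the cshift branch above:
# w(x) = (n_train / n_non_train) * (1 / p(train | x) - 1), i.e. proportional to p(nontrain|x) / p(train|x).
# p_train here stands for the domain classifier's predicted probability of the "train" class; the
# function name and eps clipping are illustrative additions.
import numpy as np

def covariate_shift_weights(p_train, n_train, n_non_train, eps=1e-6):
    p_train = np.clip(np.asarray(p_train, dtype=float), eps, 1.0)  # guard against division by zero
    return n_train / float(n_non_train) * (1.0 / p_train - 1.0)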
def cross_train_and_eval(project_dir, subset, config_file, n_train=500, suffix='', model_type='LR', loss='log', do_ensemble=True, dh=100, label='label', penalty='l1', intercept=True, n_dev_folds=5, repeats=1, verbose=False, average='micro', objective='f1', seed=None, alpha_min=0.01, alpha_max=1000.0, sample_labels=False, run_all=False): field_name = 'nosplit' model_basename = subset + '_' + label + '_' + field_name + '_' + model_type + '_' + penalty if model_type == 'MLP': model_basename += '_' + str(dh) model_basename += '_' + str(n_train) + '_' + objective if sample_labels: model_basename += '_sampled' model_basename += suffix # save the experiment parameters to a log file logfile = os.path.join(dirs.dir_logs(project_dir), model_basename + '.json') fh.makedirs(dirs.dir_logs(project_dir)) log = { 'project': project_dir, 'subset': subset, 'field_name': 'nosplit', 'config_file': config_file, 'n_train': n_train, 'suffix': suffix, 'model_type': model_type, 'loss': loss, 'dh': dh, 'alpha_min': alpha_min, 'alpha_max': alpha_max, 'do_ensemble': do_ensemble, 'label': label, 'penalty': penalty, 'intercept': intercept, 'objective': objective, 'n_dev_folds': n_dev_folds, 'repeats': repeats, 'average': average, #'use_calib_pred': use_calib_pred, #'exclude_calib': exclude_calib, 'sample_labels': sample_labels } fh.write_to_json(log, logfile) # load the features specified in the config file config = fh.read_json(config_file) feature_defs = [] for f in config['feature_defs']: feature_defs.append(features.parse_feature_string(f)) # load all labels label_dir = dirs.dir_labels(project_dir, subset) labels_df = fh.read_csv_to_df(os.path.join(label_dir, label + '.csv'), index_col=0, header=0) n_items, n_classes = labels_df.shape weights_df = None # eliminate items with no labels print("Subsetting items with labels") label_sums_df = labels_df.sum(axis=1) labeled_item_selector = label_sums_df > 0 labels_df = labels_df[labeled_item_selector] n_items, n_classes = labels_df.shape labeled_items = list(set(labels_df.index)) print("Starting repeats") # repeat the following process multiple times with different random splits of train / calibration / test data for r in range(repeats): print("* Repetition %d *" % r) # take a random subset of the training data np.random.shuffle(labeled_items) train_items = labeled_items[:n_train] test_items = labeled_items[n_train:] n_test = len(test_items) n_calib = 0 # create a data frame to hold a summary of the results output_df = pd.DataFrame([], columns=['N', 'training data', 'test data', 'cal', 'estimate', 'RMSE', '95lcl', '95ucl', 'contains_test']) # create a unique name ofr this model model_name = model_basename + '_' + 'nosplit' + '_' + str(r) print("Train: %d, calibration: %d, test: %d" % (n_train, n_calib, n_test)) test_labels_df = labels_df.loc[test_items] # if instructed, sample labels in proportion to annotations (to simulate having one label per item) if sample_labels: print("Sampling labels") # normalize the labels temp = labels_df.values / np.array(labels_df.values.sum(axis=1).reshape((n_items, 1)), dtype=float) samples = np.zeros([n_items, n_classes], dtype=int) for i in range(n_items): index = np.random.choice(np.arange(n_classes), size=1, p=temp[i, :]) samples[i, index] = 1 sampled_labels_df = pd.DataFrame(samples, index=labels_df.index, columns=labels_df.columns) else: sampled_labels_df = labels_df train_labels_df = sampled_labels_df.loc[train_items].copy() # get the true proportion of labels in the test OR non-training data (calibration and test combined) 
target_props, target_estimate, target_std = combo.get_estimate_and_std(labels_df) output_df.loc['target'] = [n_test, 'n/a', 'all', 'given', target_estimate, 0, target_estimate - 2 * target_std, target_estimate + 2 * target_std, np.nan] # get the same estimate from training data train_props, train_estimate, train_std = combo.get_estimate_and_std(train_labels_df) # compute the error of this estimate train_rmse = np.sqrt((train_estimate - target_estimate)**2) train_contains_test = target_estimate > train_estimate - 2 * train_std and target_estimate < train_estimate + 2 * train_std output_df.loc['train'] = [n_train, 'train', 'train', 'n/a', train_estimate, train_rmse, train_estimate - 2 * train_std, train_estimate + 2 * train_std, train_contains_test] # do a test using the number of annotations rather than the number of items train_props2, train_estimate2, train_std2 = combo.get_estimate_and_std(train_labels_df, use_n_annotations=True) # compute the error of this estimate train_rmse2 = np.sqrt((train_estimate2 - target_estimate)**2) train_contains_test2 = target_estimate > train_estimate2 - 2 * train_std2 and target_estimate < train_estimate2 + 2 * train_std2 output_df.loc['train_n_annotations'] = [n_train, 'train', 'train', 'n/a', train_estimate2, train_rmse2, train_estimate2 - 2 * train_std2, train_estimate2 + 2 * train_std2, train_contains_test2] print("target proportions: (%0.3f, %0.3f); train proportions: %0.3f" % (target_estimate - 2 * target_std, target_estimate + 2 * target_std, train_estimate)) if train_estimate > 0.5: pos_label = 0 else: pos_label = 1 print("Using %d as the positive label" % pos_label) results_df = pd.DataFrame([], columns=['f1', 'acc', 'calibration', 'calib overall']) # Now train a model on the training data, saving the calibration data for calibration print("Training model on training data only") model, dev_f1, dev_acc, dev_cal, dev_cal_overall = train.train_model_with_labels(project_dir, model_type, loss, model_name, subset, sampled_labels_df, feature_defs, weights_df=weights_df, items_to_use=train_items, penalty=penalty, alpha_min=alpha_min, alpha_max=alpha_max, intercept=intercept, objective=objective, n_dev_folds=n_dev_folds, do_ensemble=do_ensemble, dh=dh, seed=seed, pos_label=pos_label, verbose=verbose) results_df.loc['cross_val'] = [dev_f1, dev_acc, dev_cal, dev_cal_overall] # predict on test data test_predictions_df, test_pred_probs_df, test_pred_proportions = predict.predict(project_dir, model, model_name, subset, label, items_to_use=test_items, verbose=verbose) f1_test, acc_test = evaluate_predictions.evaluate_predictions(test_labels_df, test_predictions_df, test_pred_probs_df, pos_label=pos_label, average=average) true_test_vector = np.argmax(test_labels_df.as_matrix(), axis=1) test_cal_rmse = evaluation.evaluate_calibration_rmse(true_test_vector, test_pred_probs_df.as_matrix()) test_cal_rmse_overall = evaluation.evaluate_calibration_rmse(true_test_vector, test_pred_probs_df.as_matrix(), min_bins=1, max_bins=1) results_df.loc['test'] = [f1_test, acc_test, test_cal_rmse, test_cal_rmse_overall] test_cc_estimate, test_pcc_estimate, test_acc_estimate_internal, test_pvc_estimate_internal = test_pred_proportions # predict on calibration and test data combined all_predictions_df, all_pred_probs_df, all_pred_proportions = predict.predict(project_dir, model, model_name, subset, label, items_to_use=labeled_items, verbose=verbose) all_cc_estimate, all_pcc_estimate, all_acc_estimate_internal, all_pvc_estimate_internal = all_pred_proportions cc_rmse = 
np.sqrt((all_cc_estimate[1] - target_estimate)**2) pcc_rmse = np.sqrt((all_pcc_estimate[1] - target_estimate)**2) output_df.loc['CC_all'] = [n_items, 'train', 'all', 'predicted', all_cc_estimate[1], cc_rmse, np.nan, np.nan, np.nan] output_df.loc['PCC_all'] = [n_items, 'train', 'all', 'predicted', all_pcc_estimate[1], pcc_rmse, np.nan, np.nan, np.nan] averaged_cc_estimate = (test_cc_estimate[1] * n_test + train_estimate * n_train) / float(n_test + n_train) averaged_pcc_estimate = (test_pcc_estimate[1] * n_test + train_estimate * n_train) / float(n_test + n_train) averaged_cc_rmse = np.sqrt((averaged_cc_estimate - target_estimate)**2) averaged_pcc_rmse = np.sqrt((averaged_pcc_estimate - target_estimate)**2) output_df.loc['CC_nontrain_averaged'] = [n_items, 'train', 'all', 'given', averaged_cc_estimate, averaged_cc_rmse, np.nan, np.nan, np.nan] output_df.loc['PCC_nontrain_averaged'] = [n_items, 'train', 'all', 'given', averaged_pcc_estimate, averaged_pcc_rmse, np.nan, np.nan, np.nan] all_acc_rmse_internal = np.sqrt((all_acc_estimate_internal[1] - target_estimate) ** 2) all_pvc_rmse_internal = np.sqrt((all_pvc_estimate_internal[1] - target_estimate) ** 2) output_df.loc['ACC_internal'] = [n_items, 'train', 'all', 'predicted', all_acc_estimate_internal[1], all_acc_rmse_internal, np.nan, np.nan, np.nan] output_df.loc['PVC_internal'] = [n_items, 'train', 'all', 'predicted', all_pvc_estimate_internal[1], all_pvc_rmse_internal, np.nan, np.nan, np.nan] print("Venn internal all") all_pred_ranges_internal, all_preds_internal = ivap.estimate_probs_from_labels_internal(project_dir, model, model_name, subset, labeled_items, plot=False) pred_range = np.mean(all_pred_ranges_internal, axis=0) venn_estimate = np.mean(all_preds_internal) venn_rmse = np.sqrt((venn_estimate - target_estimate)**2) venn_contains_test = pred_range[0] < target_estimate < pred_range[1] output_df.loc['Venn_internal'] = [n_items, 'train', 'all', 'predicted', venn_estimate, venn_rmse, pred_range[0], pred_range[1], venn_contains_test] print("Venn internal test") test_pred_ranges_internal, test_preds_internal = ivap.estimate_probs_from_labels_internal(project_dir, model, model_name, subset, test_items) pred_range = np.mean(test_pred_ranges_internal, axis=0) venn_estimate = (np.mean(test_preds_internal) * n_test + train_estimate * n_train) / float(n_test + n_train) venn_rmse = np.sqrt((venn_estimate - target_estimate)**2) averaged_lower = (pred_range[0] * n_test + (train_estimate - 2 * train_std) * n_train) / float(n_test + n_train) averaged_upper = (pred_range[1] * n_test + (train_estimate + 2 * train_std) * n_train) / float(n_test + n_train) venn_contains_test = averaged_lower < target_estimate < averaged_upper output_df.loc['Venn_internal_averaged'] = [n_items, 'train', 'all', 'given', venn_estimate, venn_rmse, averaged_lower, averaged_upper, venn_contains_test] results_df.to_csv(os.path.join(dirs.dir_models(project_dir), model_name, 'accuracy.csv')) output_df.to_csv(os.path.join(dirs.dir_models(project_dir), model_name, 'results.csv'))
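# A minimal sketch of the sample_labels step used above: draw one label per item in proportion to its
# annotation counts, producing a one-hot matrix with the same shape as the label counts. Rows are
# assumed to have at least one annotation (items without labels are filtered out earlier).
import numpy as np

def sample_one_label_per_item(label_counts, seed=None):
    rng = np.random.RandomState(seed)
    n_items, n_classes = label_counts.shape
    probs = label_counts / label_counts.sum(axis=1, keepdims=True).astype(float)
    samples = np.zeros((n_items, n_classes), dtype=int)
    for i in range(n_items):
        samples[i, rng.choice(n_classes, p=probs[i])] = 1
    return samples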
def cross_train_and_eval(project_dir, subset, field_name, config_file, calib_prop=0.33, nontest_prop=1.0, prefix=None, max_folds=None, model_type='LR', label='label', penalty='l2', cshift=None, intercept=True, n_dev_folds=5, repeats=1, verbose=False, pos_label=1, average='micro', objective='f1'): model_basename = subset + '_' + field_name if prefix is not None: model_basename = prefix + '_' + model_basename logfile = os.path.join(dirs.dir_logs(project_dir), model_basename + '.json') fh.makedirs(dirs.dir_logs(project_dir)) log = { 'project': project_dir, 'subset': subset, 'field_name': field_name, 'config_file': config_file, 'calib_prop': calib_prop, 'train_prop': nontest_prop, 'prefix': prefix, 'max_folds': max_folds, 'model_type': model_type, 'label': label, 'penalty': penalty, 'cshift': cshift, 'intercept': intercept, 'objective': objective, 'n_dev_folds': n_dev_folds, 'repeats': repeats, 'pos_label': pos_label, 'average': average } fh.write_to_json(log, logfile) config = fh.read_json(config_file) feature_defs = [] for f in config['feature_defs']: feature_defs.append(features.parse_feature_string(f)) weights_file = None metadata_file = os.path.join(dirs.dir_subset(project_dir, subset), 'metadata.csv') metadata = fh.read_csv_to_df(metadata_file) field_vals = list(set(metadata[field_name].values)) field_vals.sort() print(field_vals) if max_folds is None: max_folds = len(field_vals) for v_i, v in enumerate(field_vals[:max_folds]): print("\nTesting on %s" % v) nontest_selector = metadata[field_name] != v nontest_subset = metadata[nontest_selector] nontest_items = list(nontest_subset.index) n_nontest = len(nontest_items) test_selector = metadata[field_name] == v test_subset = metadata[test_selector] test_items = test_subset.index.tolist() n_test = len(test_items) # load all labels label_dir = dirs.dir_labels(project_dir, subset) labels_df = fh.read_csv_to_df(os.path.join(label_dir, label + '.csv'), index_col=0, header=0) n_items, n_classes = labels_df.shape # subsample the non-test items if desired if nontest_prop < 1.0: np.random.shuffle(nontest_items) nontest_items = np.random.choice(nontest_items, size=int(n_nontest * nontest_prop), replace=False) n_nontest = len(nontest_items) nontest_labels = labels_df.loc[nontest_items] if cshift is not None: print("Training a classifier for covariate shift") # start by learning to discriminate test from non-test data train_test_labels = np.zeros((n_items, 2), dtype=int) train_test_labels[nontest_selector, 0] = 1 train_test_labels[test_selector, 1] = 1 train_test_labels_df = pd.DataFrame(train_test_labels, index=labels_df.index, columns=[0, 1]) model_name = model_basename + '_' + str(v) + '_' + 'cshift' model, dev_f1, dev_cal, _, _ = train.train_model_with_labels( project_dir, model_type, model_name, subset, train_test_labels_df, feature_defs, penalty=penalty, intercept=intercept, n_dev_folds=n_dev_folds, verbose=False) train_test_pred_df, train_test_probs_df = predict.predict( project_dir, model, model_name, subset, label, verbose=verbose) print("Min: %0.4f" % train_test_probs_df[1].min()) print("Max: %0.4f" % train_test_probs_df[1].max()) # base the weights on the probability of each item being a training item weights = n_nontest / float(n_test) * ( 1.0 / train_test_probs_df[0].values - 1) print("Min weight: %0.4f" % weights[nontest_selector].min()) print("Ave weight: %0.4f" % weights[nontest_selector].mean()) print("Max weight: %0.4f" % weights[nontest_selector].max()) print("Min weight: %0.4f" % weights.min()) print("Ave weight: %0.4f" % 
weights.mean()) print("Max weight: %0.4f" % weights.max()) weights_df = pd.DataFrame(weights, index=labels_df.index) else: weights_df = None # repeat the following process multiple times with different random splits of calibration / test data for r in range(repeats): output_df = pd.DataFrame([], columns=[ 'N', 'estimate', 'RMSE', '95lcl', '95ucl', 'contains_test' ]) model_name = model_basename + '_' + str(v) + '_' + str(r) # split the non-test items into train and calibration n_calib = int(n_nontest * calib_prop) np.random.shuffle(nontest_items) calib_items = nontest_items[:n_calib] train_items = nontest_items[n_calib:] train_labels = labels_df.loc[train_items] calib_labels = labels_df.loc[calib_items] test_labels = labels_df.loc[test_items] # get the label proportions from the test and non-test data test_props, test_estimate, test_std = get_estimate_and_std( test_labels) output_df.loc['test'] = [ n_test, test_estimate, 0, test_estimate - 2 * test_std, test_estimate + 2 * test_std, 1 ] nontest_props, nontest_estimate, nontest_std = get_estimate_and_std( nontest_labels) nontest_rmse = np.sqrt((nontest_estimate - test_estimate)**2) nontest_contains_test = test_estimate > nontest_estimate - 2 * nontest_std and test_estimate < nontest_estimate + 2 * nontest_std output_df.loc['nontest'] = [ n_nontest, nontest_estimate, nontest_rmse, nontest_estimate - 2 * nontest_std, nontest_estimate + 2 * nontest_std, nontest_contains_test ] # train a model print("Doing training") model, dev_f1, dev_cal, acc_cfm, pvc_cfm = train.train_model_with_labels( project_dir, model_type, model_name, subset, labels_df, feature_defs, weights_df=weights_df, items_to_use=train_items, penalty=penalty, intercept=intercept, objective=objective, n_dev_folds=n_dev_folds, verbose=verbose) # predict on the calibration and test sets print("Doing prediction on calibration items") calib_predictions, calib_pred_probs = predict.predict( project_dir, model, model_name, subset, label, items_to_use=calib_items, verbose=verbose) print("Doing prediction on test items") test_predictions, test_pred_probs = predict.predict( project_dir, model, model_name, subset, label, items_to_use=test_items, verbose=verbose) # evaluate the model on the calibration and test data print("Doing evaluation") f1_cal, acc_cal = evaluate_predictions.evaluate_predictions( calib_labels, calib_predictions, pos_label=pos_label, average=average) f1_test, acc_test = evaluate_predictions.evaluate_predictions( test_labels, test_predictions, pos_label=pos_label, average=average) results_df = pd.DataFrame([], columns=['f1', 'acc']) results_df.loc['calibration'] = [f1_cal, acc_cal] results_df.loc['test'] = [f1_test, acc_test] results_df.to_csv( os.path.join(dirs.dir_models(project_dir), model_name, 'results.csv')) # first check results without any correction # average the preditions (assuming binary labels) cc_estimate = np.mean(test_predictions[label].values) cc_rmse = np.sqrt((cc_estimate - test_estimate)**2) # average the predicted probabilities for the positive label (assuming binary labels) pcc_estimate = np.mean(test_pred_probs[1].values) pcc_rmse = np.sqrt((pcc_estimate - test_estimate)**2) output_df.loc['CC'] = [n_test, cc_estimate, cc_rmse, 0, 1, np.nan] output_df.loc['PCC'] = [ n_test, pcc_estimate, pcc_rmse, 0, 1, np.nan ] # do the two basic corrections, based on the calibration data print("ACC internal") calib_labels_expanded, calib_weights_expanded, calib_predictions_expanded = expand_labels( calib_labels.values, calib_predictions.values) acc = 
calibration.compute_acc(calib_labels_expanded, calib_predictions_expanded, n_classes, calib_weights_expanded) acc_corrected = calibration.apply_acc_binary( test_predictions.values, acc) acc_estimate = acc_corrected[1] acc_rmse = np.sqrt((acc_estimate - test_estimate)**2) output_df.loc['ACC_int'] = [ n_calib, acc_estimate, acc_rmse, 0, 1, np.nan ] print("PVC internal") pvc = calibration.compute_pvc(calib_labels_expanded, calib_predictions_expanded, n_classes, weights=calib_weights_expanded) pvc_corrected = calibration.apply_pvc(test_predictions.values, pvc) pvc_estimate = pvc_corrected[1] pvc_rmse = np.sqrt((pvc_estimate - test_estimate)**2) output_df.loc['PVC_int'] = [ n_calib, pvc_estimate, pvc_rmse, 0, 1, np.nan ] # do IVAP for calibration print("Venn") test_pred_ranges = ivap.estimate_probs_from_labels( project_dir, model, model_name, subset, subset, labels_df, calib_items, test_items, weights_df=weights_df) combo = test_pred_ranges[:, 1] / (1.0 - test_pred_ranges[:, 0] + test_pred_ranges[:, 1]) pred_range = np.mean(test_pred_ranges, axis=0) venn_estimate = np.mean(combo) venn_rmse = np.sqrt((venn_estimate - test_estimate)**2) venn_contains_test = pred_range[0] < test_estimate < pred_range[1] output_df.loc['Venn'] = [ n_calib, venn_estimate, venn_rmse, pred_range[0], pred_range[1], venn_contains_test ] output_filename = os.path.join(dirs.dir_models(project_dir), model_name, field_name + '_' + str(v) + '.csv') output_df.to_csv(output_filename)
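# The calibration module itself is not shown in this file; the following is a sketch of the standard
# binary adjusted-classify-and-count (ACC) correction it presumably implements: estimate the true- and
# false-positive rates on the calibration set, then map the raw CC estimate through (cc - fpr) / (tpr - fpr).
# All names here are illustrative.
import numpy as np

def acc_correct_binary(calib_true, calib_pred, test_pred):
    calib_true = np.asarray(calib_true)
    calib_pred = np.asarray(calib_pred)
    tpr = np.mean(calib_pred[calib_true == 1] == 1)
    fpr = np.mean(calib_pred[calib_true == 0] == 1)
    cc = np.mean(np.asarray(test_pred) == 1)
    if tpr == fpr:
        return cc  # correction undefined; fall back to the raw CC estimate
    return float(np.clip((cc - fpr) / (tpr - fpr), 0.0, 1.0))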
def identify_rnn_targets(output_data_filename): min_head_vocab = 5 min_role_vocab = 4 min_tuples = 3 ATTRIBUTE = 0 AGENT_ROLE = 1 PATIENT_ROLE = 2 SURFACE_FORM = 3 parsed_dir = os.path.join(dirs.data_stanford_dir, 'parsed') parsed_files = glob.glob(os.path.join(parsed_dir, '*.json')) dependencies_file = os.path.join(dirs.data_stanford_dir, 'dependency_tuple_ids.json') dependencies = fh.read_json(dependencies_file) coref_file = os.path.join(dirs.data_stanford_dir, 'coref_heads.json') coref_heads = fh.read_json(coref_file) supersense_tags = fh.read_json(os.path.join(dirs.data_amalgram_dir, 'all_tags.json')) heads = defaultdict(int) tokens = defaultdict(int) attributes = defaultdict(int) agent_roles = defaultdict(int) patient_roles = defaultdict(int) story_elements = {} print "Extracting story elements" for f_i, f in enumerate(parsed_files): sentences = fh.read_json(f) basename = fh.get_basename_wo_ext(f) print f element_list = extract_story_elements_from_article(sentences, dependencies[basename], coref_heads[basename], supersense_tags[basename], basename) story_elements[basename] = element_list for element in element_list: for h in element.head_words: heads[h] += 1 for t in element.attributes: attributes[t] += 1 for t in element.agent_roles: agent_roles[t] += 1 for t in element.patient_roles: patient_roles[t] += 1 print "Finding most common tokens" common_heads = [(v, k) for k, v in heads.items()] common_heads.sort() common_heads.reverse() output_filename = os.path.join(dirs.lda_dir, 'common_heads.json') fh.write_to_json(common_heads, output_filename, sort_keys=False) """ common_tokens = [(v, k) for k, v in tokens.items()] common_tokens.sort() common_tokens.reverse() output_filename = os.path.join(dirs.lda_dir, 'common_tokens.json') fh.write_to_json(common_tokens, output_filename, sort_keys=False) """ common_attributes = [(v, k) for k, v in attributes.items()] common_attributes.sort() common_attributes.reverse() output_filename = os.path.join(dirs.lda_dir, 'common_attributes.json') fh.write_to_json(common_attributes, output_filename, sort_keys=False) common_agent_roles = [(v, k) for k, v in agent_roles.items()] common_agent_roles.sort() common_agent_roles.reverse() output_filename = os.path.join(dirs.lda_dir, 'common_agent_roles.json') fh.write_to_json(common_agent_roles, output_filename, sort_keys=False) common_patient_roles = [(v, k) for k, v in patient_roles.items()] common_patient_roles.sort() common_patient_roles.reverse() output_filename = os.path.join(dirs.lda_dir, 'common_patient_roles.json') fh.write_to_json(common_patient_roles, output_filename, sort_keys=False) print pronoun_list #most_common_heads = {k: v for v, k in common_heads if v >= min_head_vocab and k not in pronoun_list} most_common_attributes = {k: v for v, k in common_attributes if (v >= min_role_vocab and k not in pronoun_list)} most_common_agent_roles = {k: v for v, k in common_agent_roles if (v >= min_role_vocab and k not in pronoun_list and k not in stopwords)} most_common_patient_roles = {k: v for v, k in common_patient_roles if (v >= min_role_vocab and k not in pronoun_list and k not in stopwords)} output_filename = os.path.join(dirs.lda_dir, 'most_common_attributes.json') fh.write_to_json(most_common_attributes, output_filename, sort_keys=False) output_filename = os.path.join(dirs.lda_dir, 'most_common_agent_roles.json') fh.write_to_json(most_common_agent_roles, output_filename, sort_keys=False) output_filename = os.path.join(dirs.lda_dir, 'most_common_patient_roles.json') 
fh.write_to_json(most_common_patient_roles, output_filename, sort_keys=False) print len(most_common_attributes) print len(most_common_agent_roles) print len(most_common_patient_roles) print "Filtering tuples" valid_elements = defaultdict(list) for basename, element_list in story_elements.items(): for se in element_list: # need at least one head word that is not a pronoun se.valid_heads = [h for h in se.head_words if h not in pronoun_list] if len(se.valid_heads) > 0: se.valid_attributes = [t for t in se.attributes if t in most_common_attributes] se.valid_agent_roles = [t for t in se.agent_roles if t in most_common_agent_roles] se.valid_patient_roles = [t for t in se.patient_roles if t in most_common_patient_roles] se.tuples = [(ATTRIBUTE, t) for t in se.valid_attributes] + \ [(AGENT_ROLE, t) for t in se.valid_agent_roles] + \ [(PATIENT_ROLE, t) for t in se.valid_patient_roles] #[(SURFACE_FORM, t) for t in se.valid_heads] if len(se.tuples) >= min_tuples: valid_elements[basename].append(se) output_data = [] for basename, element_list in valid_elements.items(): used_sentences = set() for se in element_list: for i in range(len(se.head_indices)): assert se.head_indices[i] < len(se.sentences[i].split()) if se.head_words[i] not in pronoun_list: if se.sentences[i] not in used_sentences: output_data.append((se.head_indices[i], se.sentences[i], basename)) # THIS IS TRYING SOMETHING NEW... used_sentences.add(se.sentences[i]) with codecs.open(output_data_filename, 'w', encoding='utf-8') as output_file: json.dump(output_data, output_file, indent=2, sort_keys=False)
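# A compact sketch of the target-selection loop above: keep one (head index, sentence, doc) triple per
# sentence, skipping pronoun heads. select_rnn_targets is an illustrative name; the story-element
# attributes (head_indices, head_words, sentences) are the ones used in identify_rnn_targets.
def select_rnn_targets(elements, doc_id, pronouns):
    targets, used = [], set()
    for se in elements:
        for idx, word, sent in zip(se.head_indices, se.head_words, se.sentences):
            if word not in pronouns and sent not in used:
                targets.append((idx, sent, doc_id))
                used.add(sent)
    return targets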
def cluster_entities(entities_file): # read the entities grouped by coref groups = fh.read_json(entities_file) print len(groups) # remove pronouns pronoun_list = ['he', 'his', 'it', 'they', 'their', 'He', 'It', 'I', 'them', 'him', 'its', 'her', 'she', 'They', 'That', 'His', 'we', 'We', 'that', 'she', 'my', 'me', 'our', 'himself', 'This', 'themselves', 'Her', 'Their', 'us', 'My', 'you', 'itself', 'this', 'Its', 'Our', 'herself', 'myself', 'You', 'These', 'those', 'your', "'s'", 'She', 'i'] pronoun_list.sort() for p in pronoun_list: if p in groups: groups.pop(p) for subgroup in groups.values(): if p in subgroup: subgroup.pop(p) # count the number of corefs for each group group_sizes = {k: np.sum(v.values()) for k, v in groups.items()} #keys = refs.keys() #vals = refs.values() # sort by the number of corefs for each type #order = np.argsort(vals).tolist() #order.reverse() # assign each entity to the group that it corefs with the most times entities = {} for group, corefs in groups.items(): for name, count in corefs.items(): if name not in entities: entities[name] = (group, count) elif count > entities[name][1]: entities[name] = (group, count) #names = entities.keys() #counts = [c for r, c in entities.values()] #order = np.argsort(counts).tolist() #order.reverse() #for i in range(40): # name = names[order[i]] # print name, entities[name] # for each entity in a document, map it to the group that it corefs with the most # choose the group that has the largest total counts # replace all references with that group name sample_document = os.path.join(dirs.data_stanford_dir, 'xml', 'Immigration1.0-24.txt.xml') sentences, _, _, _, _, doc_groups= stanford.parse_xml_output(sample_document) replacements = [] for doc_group in doc_groups: counts = {} for ref, sentence, start, end in doc_group: ref = ref.lower() if ref in entities: group, count = entities[ref] if group in counts: counts[group] += count else: counts[group] = count group_list = counts.keys() count_list = counts.values() order = np.argsort(count_list).tolist() order.reverse() replacements.append(group_list[order[0]]) print doc_group, counts
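# A small sketch of the assignment rule used in cluster_entities: each surface name is mapped to the
# coref group it co-occurs with most often; `groups` is the {group: {name: count}} dict loaded above,
# and the helper name is illustrative.
def assign_names_to_groups(groups):
    best = {}
    for group, corefs in groups.items():
        for name, count in corefs.items():
            if name not in best or count > best[name][1]:
                best[name] = (group, count)
    return {name: group for name, (group, count) in best.items()}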