def main():
    usage = "%prog results_file"
    parser = OptionParser(usage=usage)
    (options, args) = parser.parse_args()

    results_file = args[0]
    results = pd.DataFrame(columns=('masked', 'test', 'valid', 'dir'))
    lines = fh.read_text(results_file)
    for i, line in enumerate(lines):
        # skip the header line
        if i > 0:
            # each line is expected to contain: date time name masked test [valid]
            parts = line.split()
            date = parts[0]
            time = parts[1]
            name = parts[2]
            masked = float(parts[3])
            test = float(parts[4])
            name_parts = name.split('_')
            run_num = int(name_parts[-1])
            if run_num < 41:
                if test > 0:
                    valid = parts[5][1:-1]
                else:
                    valid = masked
                #results.loc[run_num, 'iteration'] = run_num
                results.loc[run_num, 'masked'] = masked
                results.loc[run_num, 'test'] = test
                results.loc[run_num, 'valid'] = valid
                results.loc[run_num, 'dir'] = name

    results.to_csv(results_file + 'results.csv', columns=results.columns)

    sorted_results = results.sort('masked')
    print sorted_results
    print "best by masked"
    print sorted_results.values[-1, :]
    print "best by valid"
    sorted_results = results.sort('valid')
    print sorted_results.values[-1, :]
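
# A minimal sketch of the input main() expects, based only on how it splits each line:
# a header row followed by rows of the form "date time name masked test [valid]",
# where name ends in "_<run_num>". The values below are made up purely for illustration.
def _example_results_line():
    line = "2016-01-01 12:00:00 model_run_3 0.412 0.387 [0.401]"  # hypothetical row
    parts = line.split()
    masked, test = float(parts[3]), float(parts[4])
    # mirror main(): strip the first/last characters (e.g. surrounding brackets) from valid
    valid = parts[5][1:-1] if test > 0 else masked
    run_num = int(parts[2].split('_')[-1])
    return run_num, masked, test, valid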
def parse_xml_output(xml_filename):
    raw_xml = fh.read_text(xml_filename)

    sentences = []
    sentiments = []
    dependencies = []
    dependency_tuples = []
    entities = []
    coref = []
    groups = []
    detailed_groups = []
    coref_indices = []
    coref_heads = []

    line_index = 0
    sentence_id = 0
    while line_index < len(raw_xml):
        line = raw_xml[line_index].lstrip()
        match = re.search(r'<sentence id="(\d+)" line="\d+" sentimentValue="(\d+)" sentiment="(.*)"', line)
        if match is not None:
            line_index, sentence, sent_dependencies, sent_dependency_tuples,\
                sent_entities = parse_sentence(raw_xml, line_index)
            sentences.append(sentence)
            sentiments.append([match.group(3)])
            dependencies.append(sent_dependencies)
            dependency_tuples.append(sent_dependency_tuples)
            entities.append(sent_entities)
        if re.search('<coreference>', line) is not None:
            line_index, coref, groups, detailed_groups, coref_indices,\
                coref_heads = parse_coref(raw_xml, line_index, len(sentences))
        line_index += 1

    # deal with the cases where there are no entities
    if len(coref) == 0:
        for s in sentences:
            coref.append([])

    return sentences, sentiments, dependencies, dependency_tuples, entities, coref, groups, detailed_groups,\
        coref_indices, coref_heads
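
# A hedged usage sketch: parse_xml_output() takes the path to a CoreNLP-style XML file
# containing <sentence ...> and <coreference> blocks. The filename below is a hypothetical
# placeholder, and the printed summary is just for illustration.
def _example_parse_xml_output():
    xml_file = 'example.txt.xml'  # hypothetical path
    sentences, sentiments, dependencies, dependency_tuples, entities, coref, \
        groups, detailed_groups, coref_indices, coref_heads = parse_xml_output(xml_file)
    print len(sentences), "sentences parsed"
    print sentiments[:1]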
def test_over_time(project_dir, subset, config_file, model_type, field, train_start, train_end, test_start, test_end,
                   n_train=None, n_calib=0, penalty='l2', suffix='', loss='log', objective='f1', do_ensemble=True,
                   dh=300, label='label', intercept=True, n_dev_folds=5, average='micro', seed=None,
                   alpha_min=0.01, alpha_max=1000.0, n_alphas=8, sample_labels=False, group_identical=False,
                   annotated_subset=None, nonlinearity='tanh', init_lr=1e-2, min_epochs=2, max_epochs=50,
                   patience=5, tol=1e-4, list_size=1, repeats=1, oracle=False, lower=None, interactive=False,
                   stoplist_file=None, cshift=False, n_cshift=None, do_cfm=True, do_platt=True, dropout=0.0,
                   min_test=None, test_prop=None, verbose=False):
    # Just run a regular model, one per year, training on the past, and save the results

    if seed is not None:
        seed = int(seed)
        np.random.seed(seed)

    log = {
        'project': project_dir,
        'subset': subset,
        'config_file': config_file,
        'model_type': model_type,
        'field': field,
        'train_start': train_start,
        'train_end': train_end,
        'test_start': test_start,
        'test_end': test_end,
        'n_train': n_train,
        'n_calib': n_calib,
        'penalty': penalty,
        'cshift': cshift,
        'n_cshift': n_cshift,
        'suffix': suffix,
        'loss': loss,
        'objective': objective,
        'do_ensemble': do_ensemble,
        'dh': dh,
        'label': label,
        'intercept': intercept,
        'n_dev_folds': n_dev_folds,
        'average': average,
        'seed': seed,
        'alpha_min': alpha_min,
        'alpha_max': alpha_max,
        'n_alphas': n_alphas,
        'sample_labels': sample_labels,
        'group_identical': group_identical,
        'annotated_subset': annotated_subset,
        'nonlinearity': nonlinearity,
        'init_lr': init_lr,
        'min_epochs': min_epochs,
        'max_epochs': max_epochs,
        'patience': patience,
        'tol': tol,
        'interactive': interactive,
        'stoplist_file': stoplist_file,
        'list_size': list_size
    }

    model_basename = make_model_basename(log)

    # save the experiment parameters to a log file
    logfile = os.path.join(dirs.dir_logs(project_dir), model_basename + '.json')
    fh.makedirs(dirs.dir_logs(project_dir))
    fh.write_to_json(log, logfile)

    # load the features specified in the config file
    config = fh.read_json(config_file)
    feature_defs = []
    for f in config['feature_defs']:
        feature_defs.append(features.parse_feature_string(f))

    # load the file that contains metadata about each item
    metadata_file = os.path.join(dirs.dir_subset(project_dir, subset), 'metadata.csv')
    metadata = fh.read_csv_to_df(metadata_file)
    field_vals = list(set(metadata[field].values))
    field_vals.sort()
    print("Splitting data according to %s" % field)
    print("Values:", field_vals)

    print("\nTesting on %s to %s" % (test_start, test_end))

    # first, split into training and non-train data based on the field of interest
    all_items = list(metadata.index)
    test_selector_all = (metadata[field] >= int(test_start)) & (metadata[field] <= int(test_end))
    test_subset_all = metadata[test_selector_all]
    test_items_all = test_subset_all.index.tolist()
    n_test_all = len(test_items_all)

    if min_test is not None:
        if n_test_all < min_test:
            print("Not enough test samples; exiting")
            return

    if train_end is None:
        if train_start is None:
            train_selector_all = metadata[field] < int(test_start)
        else:
            train_selector_all = (metadata[field] < int(test_start)) & (metadata[field] >= int(train_start))
    else:
        if train_start is None:
            train_selector_all = metadata[field] <= int(train_end)
        else:
            train_selector_all = (metadata[field] <= int(train_end)) & (metadata[field] >= int(train_start))

    train_subset_all = metadata[train_selector_all]
    train_items_all = list(train_subset_all.index)
    n_train_all = len(train_items_all)
    # only keep the items in the train and test sets
    all_items = train_items_all + test_items_all

    print("Train: %d, Test: %d (labeled and unlabeled)" % (n_train_all, n_test_all))

    # load all labels
    label_dir = dirs.dir_labels(project_dir, subset)
    labels_df = fh.read_csv_to_df(os.path.join(label_dir, label + '.csv'), index_col=0, header=0)
    labels_df = labels_df.loc[all_items]

    # if desired, attempt to learn weights for the training data using techniques for covariate shift
    if cshift:
        print("Training a classifier for covariate shift")
        # start by learning to discriminate train from non-train data
        # label items based on whether they come from train or test
        train_test_labels = np.zeros((len(all_items), 2), dtype=int)
        train_test_labels[:n_train_all, 0] = 1
        train_test_labels[n_train_all:, 1] = 1
        # use the minority class as the positive label
        if np.sum(train_test_labels[:, 0]) < np.sum(train_test_labels[:, 1]):
            cshift_pos_label = 0
        else:
            cshift_pos_label = 1
        train_test_labels_df = pd.DataFrame(train_test_labels, index=all_items, columns=[0, 1])

        if n_cshift is not None and len(all_items) >= n_cshift:
            print("Taking a random sample of %d items for reweighting" % n_cshift)
            #np.random.shuffle(all_items)
            cshift_items = np.random.choice(all_items, size=n_cshift, replace=False)
        else:
            print("Using all train items")
            cshift_items = all_items

        print(train_test_labels_df.loc[cshift_items].mean(axis=0))

        # create a cshift model using the same specification as our model below (e.g. LR/MLP, etc.)
        model_name = model_basename + '_' + str(test_start) + '-' + str(test_end) + 'cshift'
        model, dev_f1, dev_acc, dev_cal, dev_cal_overall = train.train_model_with_labels(
            project_dir, model_type, loss, model_name, subset, train_test_labels_df, feature_defs,
            items_to_use=cshift_items, penalty=penalty, alpha_min=alpha_min, alpha_max=alpha_max,
            n_alphas=n_alphas, intercept=intercept, n_dev_folds=n_dev_folds, save_model=True,
            do_ensemble=False, dh=dh, seed=seed, pos_label=cshift_pos_label, verbose=False)
        print("cshift results: %0.4f f1, %0.4f acc" % (dev_f1, dev_acc))

        # predict the probability of being a training item for every item
        #X_cshift, features_concat = predict.load_data(project_dir, model_name, subset, items_to_use=all_items)
        X_cshift, features_concat = predict.load_data(project_dir, model_name, subset, items_to_use=all_items)
        cshift_pred_probs = model.predict_probs(X_cshift)
        f_items = features_concat.get_items()
        assert len(f_items) == len(all_items)
        for i in range(len(all_items)):
            assert all_items[i] == f_items[i]

        cshift_pred_probs_df = pd.DataFrame(cshift_pred_probs, index=features_concat.get_items(), columns=range(2))

        # display the min and max probs
        print("Min: %0.6f" % cshift_pred_probs_df[1].values[:n_train_all].min())
        print("Mean: %0.6f" % cshift_pred_probs_df[1].values[:n_train_all].mean())
        print("Max: %0.6f" % cshift_pred_probs_df[1].values[:n_train_all].max())

        # HACK: need to prevent 0s in prob(y=0|x)
        p_train_values = cshift_pred_probs_df[0].values
        threshold = 0.01
        p_train_values[p_train_values < threshold] = threshold
        print("After thresholding")
        print("Min: %0.6f" % p_train_values[:n_train_all].min())
        print("Mean: %0.6f" % p_train_values[:n_train_all].mean())
        print("Max: %0.6f" % p_train_values[:n_train_all].max())

        # use the estimated probability of each item being a training item to compute item weights
        weights = n_train_all / float(n_test_all) * (1.0 / p_train_values - 1)
        weights_df_all = pd.DataFrame(weights, index=all_items)

        # print a summary of the weights from just the training items
        print("Min weight: %0.4f" % weights[:n_train_all].min())
        print("Ave weight: %0.4f" % weights[:n_train_all].mean())
        print("Max weight: %0.4f" % weights[:n_train_all].max())
        # print a summary of all weights
        #print("Min weight: %0.4f" % weights.min())
        #print("Ave weight: %0.4f" % weights.mean())
        #print("Max weight: %0.4f" % weights.max())
        # create a data frame with this information
    else:
        weights_df_all = None

    # find the labeled items
    print("Subsetting items with labels")
    label_sums_df = labels_df.sum(axis=1)
    labeled_item_selector = label_sums_df > 0
    labels_df = labels_df[labeled_item_selector]
    n_labeled_items, n_classes = labels_df.shape
    print("%d labeled items" % n_labeled_items)
    labeled_items = set(labels_df.index)

    train_items_labeled = [i for i in train_items_all if i in labeled_items]
    test_items = [i for i in test_items_all if i in labeled_items]
    #n_train = len(train_items)
    n_test = len(test_items)

    for r in range(repeats):
        # set the seed very explicitly here to make sure experiments are comparable
        if seed is not None:
            seed += 1
            np.random.seed(seed)

        print("* Starting repetition %d *" % r)
        model_name = model_basename + '_' + str(test_start) + '-' + str(test_end) + '_' + str(r)

        if n_train is not None and len(train_items_labeled) >= n_train:
            np.random.shuffle(train_items_labeled)
            train_items = np.random.choice(train_items_labeled, size=n_train, replace=False)
        else:
            print("Using all train items")
            train_items = train_items_labeled
        n_train_r = len(train_items)

        # now, choose a calibration set
        if n_calib > 0 and n_test >= n_calib:
            np.random.shuffle(test_items)
            calib_items = np.random.choice(test_items, size=n_calib, replace=False)
        elif n_test < n_calib:
            print("Error: Only %d labeled test instances available" % n_test)
            calib_items = test_items
        else:
            calib_items = []

        if weights_df_all is not None:
            weights_df = weights_df_all[labeled_item_selector]
        else:
            weights_df = None

        print("Labeled train: %d, test: %d" % (n_train_r, n_test))

        # create a data frame to hold a summary of the results
        output_df = pd.DataFrame([], columns=['N', 'training data', 'test data', 'cal', 'estimate', 'MAE', '95lcl', '95ucl', 'contains_test'])

        test_labels_df = labels_df.loc[test_items]

        # do a fake adjustment of the test label proportions
        if test_prop is not None:
            test_prop = float(test_prop)
            test_label_values = test_labels_df.values
            test_label_props = test_label_values[:, 1] / (test_label_values[:, 1] + test_label_values[:, 0])
            order = list(np.argsort(test_label_props))
            true_prop = np.mean(test_label_props)
            if test_prop < true_prop:
                i = 0
                running = test_label_props[order[i]]
                new_test_items = [test_items[order[i]]]
                i += 1
                while (running / i) <= test_prop:
                    running += test_label_props[order[i]]
                    new_test_items.append(test_items[order[i]])
                    i += 1
                print("Taking %d test_items" % len(new_test_items))
                test_items = new_test_items[:]
            else:
                order.reverse()
                i = 0
                running = test_label_props[order[i]]
                new_test_items = [test_items[order[i]]]
                i += 1
                while (running / i) >= test_prop:
                    running += test_label_props[order[i]]
                    new_test_items.append(test_items[order[i]])
                    i += 1
                print("Taking %d test_items" % len(new_test_items))
                test_items = new_test_items[:]

            test_labels_df = labels_df.loc[test_items]
            test_label_values = test_labels_df.values
            test_label_props = test_label_values[:, 1] / (test_label_values[:, 1] + test_label_values[:, 0])
            print("New props = %0.3f" % np.mean(test_label_props))

        # if instructed, sample labels in proportion to annotations (to simulate having one label per item)
        if sample_labels:
            print("Sampling labels")
            # normalize the labels
            temp = labels_df.values / np.array(labels_df.values.sum(axis=1).reshape((n_labeled_items, 1)), dtype=float)
            samples = np.zeros([n_labeled_items, n_classes], dtype=int)
            for i in range(n_labeled_items):
                index = np.random.choice(np.arange(n_classes), size=1, p=temp[i, :])
                samples[i, index] = 1
            sampled_labels_df = pd.DataFrame(samples, index=labels_df.index, columns=labels_df.columns)
        else:
            sampled_labels_df = labels_df

        train_labels_df = sampled_labels_df.loc[train_items].copy()
        if n_calib > 0:
            calib_labels_df = sampled_labels_df.loc[calib_items].copy()
        else:
            calib_labels_df = None

        # get the true proportion of labels in the test OR non-training data (calibration and test combined)
        target_props, target_estimate, target_std = get_estimate_and_std(test_labels_df, use_n_annotations=True)
        output_df.loc['target'] = [n_test, 'test', 'test', 'n/a', target_estimate, 0, target_estimate - 2 * target_std, target_estimate + 2 * target_std, np.nan]

        # get the same estimate from training data
        train_props, train_estimate, train_std = get_estimate_and_std(train_labels_df, use_n_annotations=True)
        print("Train props:", train_props, train_estimate)
        train_rmse = np.abs(train_estimate - target_estimate)
        train_contains_test = target_estimate > train_estimate - 2 * train_std and target_estimate < train_estimate + 2 * train_std
        output_df.loc['train'] = [n_train_r, 'train', 'test', 'n/a', train_estimate, train_rmse, train_estimate - 2 * train_std, train_estimate + 2 * train_std, train_contains_test]

        # get the same estimate from the calibration data
        if n_calib > 0:
            calib_props, calib_estimate, calib_std = get_estimate_and_std(calib_labels_df, use_n_annotations=True)
            # compute the error of this estimate
            calib_rmse = np.abs(calib_estimate - target_estimate)
            calib_contains_test = target_estimate > calib_estimate - 2 * calib_std and target_estimate < calib_estimate + 2 * calib_std
            output_df.loc['calib'] = [n_calib, 'calib', 'test', 'n/a', calib_estimate, calib_rmse, calib_estimate - 2 * calib_std, calib_estimate + 2 * calib_std, calib_contains_test]
        else:
            calib_estimate = 0.0
            calib_std = 1.0
            output_df.loc['calib'] = [n_calib, 'calib', 'test', 'n/a', np.nan, np.nan, np.nan, np.nan, np.nan]

        if train_estimate > 0.5:
            pos_label = 0
        else:
            pos_label = 1
        print("Using %d as the positive label" % pos_label)

        results_df = pd.DataFrame([], columns=['f1', 'acc', 'mae', 'estimated calibration'])

        # Now train a model on the training data, saving the calibration data for calibration
        if stoplist_file is not None:
            stoplist = fh.read_text(stoplist_file)
            stoplist = {s.strip() for s in stoplist}
            print(stoplist)
        else:
            stoplist = None

        print("Training a LR model")
        model, dev_f1, dev_acc, dev_cal_mae, dev_cal_est = train.train_model_with_labels(
            project_dir, model_type, 'log', model_name, subset, sampled_labels_df, feature_defs,
            weights_df=weights_df, items_to_use=train_items, penalty=penalty, alpha_min=alpha_min,
            alpha_max=alpha_max, n_alphas=n_alphas, intercept=intercept, objective=objective,
            n_dev_folds=n_dev_folds, do_ensemble=do_ensemble, dh=dh, seed=seed, pos_label=pos_label,
            vocab=None, group_identical=group_identical, nonlinearity=nonlinearity, init_lr=init_lr,
            min_epochs=min_epochs, max_epochs=max_epochs, patience=patience, do_cfm=do_cfm,
            do_platt=do_platt, lower=lower, stoplist=stoplist, dropout=dropout, verbose=verbose)
        results_df.loc['cross_val'] = [dev_f1, dev_acc, dev_cal_mae, dev_cal_est]

        X_test, features_concat = predict.load_data(project_dir, model_name, subset, items_to_use=test_items)
        test_predictions = model.predict(X_test)
        test_predictions_df = pd.DataFrame(test_predictions, index=features_concat.get_items(), columns=[label])
        test_pred_probs = model.predict_probs(X_test)
        _, n_labels = test_pred_probs.shape
        test_pred_probs_df = pd.DataFrame(test_pred_probs, index=features_concat.get_items(), columns=range(n_labels))

        f1_test, acc_test = evaluate_predictions.evaluate_predictions(test_labels_df, test_predictions_df, test_pred_probs_df, pos_label=pos_label, average=average)
        true_test_vector = np.argmax(test_labels_df.as_matrix(), axis=1)
        test_cal_est = evaluation.evaluate_calibration_rmse(true_test_vector, test_pred_probs_df.as_matrix(), min_bins=1, max_bins=1)
        test_cc_estimate, test_pcc_estimate = model.predict_proportions(X_test)

        test_cc_mae = np.mean(np.abs(test_cc_estimate[1] - target_estimate))
        test_pcc_mae = np.mean(np.abs(test_pcc_estimate[1] - target_estimate))

        results_df.loc['test'] = [f1_test, acc_test, test_pcc_mae, test_cal_est]

        output_df.loc['CC'] = [n_train_r, 'train', 'test', 'n/a', test_cc_estimate[1], test_cc_mae, np.nan, np.nan, np.nan]
        output_df.loc['PCC'] = [n_train_r, 'train', 'test', 'n/a', test_pcc_estimate[1], test_pcc_mae, np.nan, np.nan, np.nan]

        test_acc_estimate_internal, test_acc_ms_estimate_internal = model.predict_proportions(X_test, do_cfm=do_cfm)

        test_acc_rmse_internal = np.abs(test_acc_estimate_internal[1] - target_estimate)
        test_acc_ms_rmse_internal = np.abs(test_acc_ms_estimate_internal[1] - target_estimate)

        output_df.loc['ACC_internal'] = [n_train_r, 'train', 'test', 'n/a', test_acc_estimate_internal[1], test_acc_rmse_internal, np.nan, np.nan, np.nan]
        output_df.loc['MS_internal'] = [n_train_r, 'train', 'nontrain', 'predicted', test_acc_ms_estimate_internal[1], test_acc_ms_rmse_internal, np.nan, np.nan, np.nan]

        test_platt1_estimate, test_platt2_estimate = model.predict_proportions(X_test, do_platt=do_platt)

        test_platt1_rmse = np.abs(test_platt1_estimate[1] - target_estimate)
        test_platt2_rmse = np.abs(test_platt2_estimate[1] - target_estimate)

        output_df.loc['PCC_platt1'] = [n_train_r, 'train', 'test', 'n/a', test_platt1_estimate[1], test_platt1_rmse, np.nan, np.nan, np.nan]
        output_df.loc['PCC_platt2'] = [n_train_r, 'train', 'nontrain', 'predicted', test_platt2_estimate[1], test_platt2_rmse, np.nan, np.nan, np.nan]

        if n_calib > 0:
            cc_plus_cal_estimate = (test_cc_estimate[1] + calib_estimate) / 2.0
            pcc_plus_cal_estimate = (test_pcc_estimate[1] + calib_estimate) / 2.0
            cc_plus_cal_mae = np.mean(np.abs(cc_plus_cal_estimate - target_estimate))
            pcc_plus_cal_mae = np.mean(np.abs(pcc_plus_cal_estimate - target_estimate))
            #output_df.loc['CC_plus_cal'] = [n_train, 'train', 'test', 'n/a', cc_plus_cal_estimate, cc_plus_cal_mae, np.nan, np.nan, np.nan]
            output_df.loc['PCC_plus_cal'] = [n_train_r, 'train', 'test', 'n/a', pcc_plus_cal_estimate, pcc_plus_cal_mae, np.nan, np.nan, np.nan]

        results_df.to_csv(os.path.join(dirs.dir_models(project_dir), model_name, 'accuracy.csv'))
        output_df.to_csv(os.path.join(dirs.dir_models(project_dir), model_name, 'results.csv'))

"""
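
# A hedged usage sketch of test_over_time(): train on documents from years up to train_end and
# test on a later year, with covariate-shift reweighting and a small calibration set. The project
# path, subset name, config file, and model type below are hypothetical placeholders, not values
# taken from this code. Note on the cshift reweighting above: each training item receives
#   w_i = (n_train / n_test) * (1 / p(train | x_i) - 1),
# so items that look more like the test data (low p(train | x_i)) get larger weights.
def _example_test_over_time():
    test_over_time('projects/example_project', 'example_subset', 'config/example.json', 'LR', 'year',
                   train_start=1990, train_end=2008, test_start=2009, test_end=2009,
                   n_calib=100, cshift=True, repeats=3)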
def parse_xml_files(xml_filelist_filename, output_dir):
    filelist = fh.read_text(xml_filelist_filename)
    parsed_files = {}
    sentiments = {}
    dependencies = {}
    dependency_tuples = {}
    entities = {}
    coref = {}
    coref_entities = {}
    coref_heads = {}
    all_groups = {}
    jk_grams = {}
    amalgram_pairs = {}
    for file in filelist:
        file = file.rstrip('\n')
        print file
        # peel off both .txt and .xml
        basename = fh.get_basename_wo_ext(fh.get_basename_wo_ext(file))
        sentences, doc_sentiments, doc_dependencies, doc_dependency_tuples, doc_entities, doc_coref, groups, _,\
            doc_coref_entities, doc_coref_heads = parse_xml_output(file)

        parsed_files[basename] = sentences
        sentiments[basename] = doc_sentiments
        dependencies[basename] = doc_dependencies
        dependency_tuples[basename] = doc_dependency_tuples
        entities[basename] = doc_entities
        coref[basename] = doc_coref
        coref_entities[basename] = doc_coref_entities
        coref_heads[basename] = doc_coref_heads

        doc_jk_grams, doc_jk_indices = find_jk_grams(sentences)
        jk_grams[basename] = doc_jk_grams

        # output documents to amalgram format
        #amalgram_dir = os.path.join(dirs.data_amalgram_dir, 'input')
        #if not os.path.exists(amalgram_dir):
        #    os.makedirs(amalgram_dir)
        tagged_sents = ['\n'.join([t['word'] + '\t' + t['POS'] for t in s]) + '\n' for s in sentences]

        # save word/tag pairs for amalgram
        tagged_sents = [[(t['word'], t['POS']) for t in s] for s in sentences]
        amalgram_pairs[basename] = tagged_sents

        # uncomment for extracting story elements...
        parsed_dir = os.path.join(output_dir, 'parsed')
        if not os.path.exists(parsed_dir):
            os.makedirs(parsed_dir)
        parsed_filename = os.path.join(parsed_dir, basename + '.json')
        fh.write_to_json(sentences, parsed_filename, sort_keys=False)

    # write the accumulated per-document results to JSON
    sentiment_filename = fh.make_filename(output_dir, 'sentiments', 'json')
    fh.write_to_json(sentiments, sentiment_filename, sort_keys=False)

    dependencies_filename = fh.make_filename(output_dir, 'dependency_tuple_ids', 'json')
    fh.write_to_json(dependency_tuples, dependencies_filename, sort_keys=False)

    coref_filename = fh.make_filename(output_dir, 'entities', 'json')
    fh.write_to_json(coref, coref_filename, sort_keys=False)

    jkgrams_filename = fh.make_filename(output_dir, 'jkgrams', 'json')
    fh.write_to_json(jk_grams, jkgrams_filename, sort_keys=False)

    coref_heads_filename = fh.make_filename(output_dir, 'coref_heads', 'json')
    fh.write_to_json(coref_heads, coref_heads_filename, sort_keys=False)

    amalgram_keys = amalgram_pairs.keys()
    amalgram_keys.sort()

    # write all documents to a single word/tag file for amalgram
    amalgram_data_file = os.path.join(dirs.data_amalgram_dir, 'input.txt')
    with codecs.open(amalgram_data_file, 'w', encoding='utf-8') as output_file:
        for k in amalgram_keys:
            sents = amalgram_pairs[k]
            for s in sents:
                for p in s:
                    output_file.write(p[0] + '\t' + p[1] + '\n')
                output_file.write('\n')

    # also write one word/tag file per document
    for k in amalgram_keys:
        amalgram_data_file = os.path.join(dirs.data_amalgram_dir, k + '.txt')
        with codecs.open(amalgram_data_file, 'w', encoding='utf-8') as output_file:
            sents = amalgram_pairs[k]
            for s in sents:
                for p in s:
                    output_file.write(p[0] + '\t' + p[1] + '\n')
                output_file.write('\n')

    # write an index mapping each sentence back to its document
    amalgram_index_file = os.path.join(dirs.data_amalgram_dir, 'index.txt')
    with codecs.open(amalgram_index_file, 'w', encoding='utf-8') as output_file:
        for k in amalgram_keys:
            sents = amalgram_pairs[k]
            for s in sents:
                output_file.write(k + '\n')

    #all_groups_filename = fh.make_filename(output_dir, 'all_groups', 'json')
    #fh.write_to_json(all_groups, all_groups_filename)

    return parsed_files, dependencies
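
# A hedged usage sketch: parse_xml_files() takes a file that lists one CoreNLP XML path per line
# and a directory where the JSON outputs (sentiments, entities, coref heads, etc.) are written.
# Both paths below are hypothetical placeholders.
def _example_parse_xml_files():
    filelist = 'xml_files.txt'    # hypothetical: one .txt.xml path per line
    output_dir = 'parsed_output'  # hypothetical output directory
    parsed_files, dependencies = parse_xml_files(filelist, output_dir)
    print len(parsed_files), "documents parsed"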