import os
import copy
import codecs

import numpy as np
import pandas as pd

# Project-local modules used below; the exact import paths are assumptions about
# this repo's layout (fh is the repo's file_handling helper module).
import dirs
import features
import train
import predict
import evaluation
import evaluate_predictions
import calibration
import ivap
import file_handling as fh


def assemble_annotated_output(sentences, doc_entities, doc_coref_entities, doc_jk_indices, output_dir, basename):
    # reassemble an annotated version of the file
    # (doc_jk_indices was referenced but missing from the signature; added here)
    temp = copy.deepcopy(sentences)

    # add entity markers (append '*' to the last token of each entity span)
    for sent_index, sent in enumerate(doc_entities):
        for e in sent:
            end = int(e['end'])
            temp[sent_index][end - 1]['word'] += '*'

    # add NER annotations: bracket each tagged span as [ ... ]_TAG
    last_tag = 'O'
    for si, s in enumerate(temp):
        for ti, t in enumerate(s):
            ner = t['NER']
            if ner != last_tag:
                if last_tag != 'O':
                    temp[si][ti - 1]['word'] += ']_' + last_tag
                if ner != 'O':
                    temp[si][ti]['word'] = '[' + temp[si][ti]['word']
            last_tag = ner
    # close a span left open at the end of the document
    if last_tag != 'O':
        temp[si][-1]['word'] += ']_' + last_tag

    # add coref annotations: mark each mention as < ... >_id
    for e in doc_coref_entities:
        sent_index = int(e['sentence'])
        start = int(e['start'])
        end = int(e['end'])
        id = int(e['id'])
        temp[sent_index][start]['word'] = '<' + temp[sent_index][start]['word']
        temp[sent_index][end - 1]['word'] += '>_' + str(id)

    # add jk_gram annotations: mark each span as { ... }
    for jk in doc_jk_indices:
        sent_index = int(jk['sentence'])
        start = int(jk['start'])
        end = int(jk['end'])
        temp[sent_index][start]['word'] = '{' + temp[sent_index][start]['word']
        temp[sent_index][end - 1]['word'] += '}'

    annotated_dir = os.path.join(output_dir, 'annotated')
    if not os.path.exists(annotated_dir):
        os.makedirs(annotated_dir)
    output_filename = os.path.join(annotated_dir, basename + '.txt')
    annotated_sents = [' '.join([t['word'] for t in s]) for s in temp]
    fh.write_list_to_text(annotated_sents, output_filename)
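
# A minimal usage sketch for assemble_annotated_output. The token and span dicts
# below are illustrative assumptions inferred from how the function indexes its
# inputs (tokens need 'word' and 'NER'; spans need 'sentence'/'start'/'end'/'id');
# the real structures come from the upstream parser.
#
#   sentences = [[{'word': 'John', 'NER': 'PERSON'}, {'word': 'slept', 'NER': 'O'}]]
#   doc_entities = [[{'start': 0, 'end': 1, 'id': 0, 'type': 'PERSON'}]]
#   doc_coref_entities = [{'sentence': 0, 'start': 0, 'end': 1, 'id': 3}]
#   doc_jk_indices = [{'sentence': 0, 'start': 0, 'end': 2}]
#   assemble_annotated_output(sentences, doc_entities, doc_coref_entities,
#                             doc_jk_indices, 'output', 'example')
#
# would write "{<[John*]_PERSON>_3 slept}" to output/annotated/example.txt.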
def split_into_files(input_filename, output_dir):
    data = fh.read_json(input_filename)
    # dict views have no .sort() in Python 3, so use sorted()
    keys = sorted(data.keys())
    filelist = []
    for key in keys:
        key = key.rstrip('\n')
        line = data[key].rstrip('\n')
        normalized_filename = os.path.join(output_dir, key + '.txt')
        filelist.append(normalized_filename)
        with codecs.open(normalized_filename, 'w', encoding='utf-8') as output_file:
            output_file.write(line)
    filelist_filename = fh.make_filename(output_dir, 'filelist', 'txt')
    fh.write_list_to_text(filelist, filelist_filename)
    return filelist_filename
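
# A minimal sketch of how split_into_files is expected to be called, assuming the
# input JSON maps document keys to raw text strings (fh.read_json and
# fh.write_list_to_text are this repo's file-handling helpers):
#
#   corpus.json: {"doc01": "First document text.", "doc02": "Second document text."}
#   split_into_files('corpus.json', 'normalized')
#
# would write normalized/doc01.txt and normalized/doc02.txt, plus
# normalized/filelist.txt listing one output path per line, and return the path
# to filelist.txt.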
def cross_train_and_eval(project_dir, subset, field_name, config_file, n_calib=0, n_train=100,
                         suffix='', model_type='LR', loss='log', do_ensemble=True, dh=100,
                         label='label', penalty='l1', cshift=None, intercept=True,
                         n_dev_folds=5, repeats=1, verbose=False, average='micro',
                         objective='f1', seed=None, alpha_min=0.01, alpha_max=1000.0,
                         sample_labels=False, run_all=False):

    model_basename = subset + '_' + label + '_' + field_name + '_' + model_type + '_' + penalty
    if model_type == 'MLP':
        model_basename += '_' + str(dh)
    model_basename += '_' + str(n_train) + '_' + str(n_calib) + '_' + objective
    if cshift is not None:
        model_basename += '_cshift'
    if sample_labels:
        model_basename += '_sampled'
    model_basename += suffix

    # save the experiment parameters to a log file
    logfile = os.path.join(dirs.dir_logs(project_dir), model_basename + '.json')
    fh.makedirs(dirs.dir_logs(project_dir))
    log = {
        'project': project_dir,
        'subset': subset,
        'field_name': field_name,
        'config_file': config_file,
        'n_calib': n_calib,
        'n_train': n_train,
        'suffix': suffix,
        'model_type': model_type,
        'loss': loss,
        'dh': dh,
        'alpha_min': alpha_min,
        'alpha_max': alpha_max,
        'do_ensemble': do_ensemble,
        'label': label,
        'penalty': penalty,
        'cshift': cshift,
        'intercept': intercept,
        'objective': objective,
        'n_dev_folds': n_dev_folds,
        'repeats': repeats,
        'average': average,
        'sample_labels': sample_labels
    }
    fh.write_to_json(log, logfile)

    # load the features specified in the config file
    config = fh.read_json(config_file)
    feature_defs = []
    for f in config['feature_defs']:
        feature_defs.append(features.parse_feature_string(f))

    # load the file that contains metadata about each item
    metadata_file = os.path.join(dirs.dir_subset(project_dir, subset), 'metadata.csv')
    metadata = fh.read_csv_to_df(metadata_file)
    field_vals = sorted(set(metadata[field_name].values))
    print("Splitting data according to:", field_vals)

    # repeat the following for each value of the field of interest
    for v_i, v in enumerate(field_vals):
        print("\nTesting on %s" % v)

        # first, split into train and non-train data based on the field of interest
        train_selector = metadata[field_name] != v
        train_subset = metadata[train_selector]
        train_items = list(train_subset.index)
        n_train_cshift = len(train_items)

        non_train_selector = metadata[field_name] == v
        non_train_subset = metadata[non_train_selector]
        non_train_items = non_train_subset.index.tolist()
        n_non_train_cshift = len(non_train_items)
        print("Train: %d, non-train: %d" % (n_train_cshift, n_non_train_cshift))

        # load all labels
        label_dir = dirs.dir_labels(project_dir, subset)
        labels_df = fh.read_csv_to_df(os.path.join(label_dir, label + '.csv'), index_col=0, header=0)
        n_items, n_classes = labels_df.shape

        # if desired, attempt to learn weights for the training data using techniques for covariate shift
        if cshift is not None:
            print("Training a classifier for covariate shift")
            # start by learning to discriminate train from non-train data
            train_test_labels = np.zeros((n_items, 2), dtype=int)
            train_test_labels[train_selector, 0] = 1
            train_test_labels[non_train_selector, 1] = 1
            # use the minority class as the positive label
            if np.sum(train_test_labels[:, 0]) < np.sum(train_test_labels[:, 1]):
                cshift_pos_label = 0
            else:
                cshift_pos_label = 1
            train_test_labels_df = pd.DataFrame(train_test_labels, index=labels_df.index, columns=[0, 1])

            # create a cshift model using the same specification as the main model below (e.g. LR/MLP, etc.)
            model_name = model_basename + '_' + str(v) + '_' + 'cshift'
            model, dev_f1, dev_acc, dev_cal, dev_cal_overall = train.train_model_with_labels(
                project_dir, model_type, loss, model_name, subset, train_test_labels_df,
                feature_defs, penalty=penalty, alpha_min=alpha_min, alpha_max=alpha_max,
                intercept=intercept, n_dev_folds=n_dev_folds, save_model=True,
                do_ensemble=do_ensemble, dh=dh, seed=seed, pos_label=cshift_pos_label,
                verbose=False)
            print("cshift results: %0.4f f1, %0.4f acc" % (dev_f1, dev_acc))

            # take predictions from the model on the training data
            train_test_pred_df, train_test_probs_df, _ = predict.predict(
                project_dir, model, model_name, subset, label, verbose=verbose)

            # display the min and max probs
            print("Min: %0.4f" % train_test_probs_df[1].min())
            print("Max: %0.4f" % train_test_probs_df[1].max())

            # use the estimated probability of each item being a training item to compute item weights:
            # weight = (n_train / n_non_train) * (1 - p) / p, where p = P(item is train)
            weights = n_train_cshift / float(n_non_train_cshift) * (1.0 / train_test_probs_df[0].values - 1)

            # print a summary of the weights from just the training items
            print("Min weight: %0.4f" % weights[train_selector].min())
            print("Ave weight: %0.4f" % weights[train_selector].mean())
            print("Max weight: %0.4f" % weights[train_selector].max())
            # print a summary of all weights
            print("Min weight: %0.4f" % weights.min())
            print("Ave weight: %0.4f" % weights.mean())
            print("Max weight: %0.4f" % weights.max())
            # create a data frame with this information
            weights_df = pd.DataFrame(weights, index=labels_df.index)
        else:
            weights_df = None

        # eliminate items with no labels
        print("Subsetting items with labels")
        label_sums_df = labels_df.sum(axis=1)
        labeled_item_selector = label_sums_df > 0
        labels_df = labels_df[labeled_item_selector]
        n_items, n_classes = labels_df.shape
        labeled_items = set(labels_df.index)
        train_items = [i for i in train_items if i in labeled_items]
        non_train_items = [i for i in non_train_items if i in labeled_items]
        n_non_train = len(non_train_items)
        if weights_df is not None:
            weights_df = weights_df[labeled_item_selector]

        print("Starting repeats")
        # repeat the following process multiple times with different random splits of train / calibration / test data
        for r in range(repeats):
            print("* Repetition %d *" % r)

            # next, take a random subset of the training data (and ignore the rest), to simulate fewer annotated items
            if n_train > 0:
                np.random.shuffle(train_items)
                train_items_r = np.random.choice(train_items, size=n_train, replace=False)
            else:
                train_items_r = train_items
            n_train_r = len(train_items_r)

            # create a data frame to hold a summary of the results
            output_df = pd.DataFrame([], columns=['N', 'training data', 'test data', 'cal', 'estimate', 'RMSE', '95lcl', '95ucl', 'contains_test'])

            # create a unique name for this model
            model_name = model_basename + '_' + str(v) + '_' + str(r)

            # now, divide the non-train data into a calibration and a test set
            np.random.shuffle(non_train_items)
            if n_calib > n_non_train:
                n_calib = int(n_non_train / 2)
                print("Warning: only %d non-train items; using 1/2 for calibration" % n_non_train)
            calib_items = non_train_items[:n_calib]
            test_items = non_train_items[n_calib:]
            n_test = len(test_items)
            print("Train: %d, calibration: %d, test: %d" % (n_train_r, n_calib, n_test))
            test_labels_df = labels_df.loc[test_items]
            non_train_labels_df = labels_df.loc[non_train_items]

            # if instructed, sample labels in proportion to annotations (to simulate having one label per item)
            if sample_labels:
                print("Sampling labels")
                # normalize the labels to get a distribution over classes for each item
                temp = labels_df.values / np.array(labels_df.values.sum(axis=1).reshape((n_items, 1)), dtype=float)
                samples = np.zeros([n_items, n_classes], dtype=int)
                for i in range(n_items):
                    index = np.random.choice(np.arange(n_classes), size=1, p=temp[i, :])
                    samples[i, index] = 1
                sampled_labels_df = pd.DataFrame(samples, index=labels_df.index, columns=labels_df.columns)
            else:
                sampled_labels_df = labels_df

            train_labels_r_df = sampled_labels_df.loc[train_items_r].copy()
            calib_labels_df = sampled_labels_df.loc[calib_items].copy()

            # get the true proportion of labels in the non-training data (calibration and test combined)
            target_props, target_estimate, target_std = get_estimate_and_std(non_train_labels_df)
            output_df.loc['target'] = [n_test, 'nontrain', 'nontrain', 'given', target_estimate, 0, target_estimate - 2 * target_std, target_estimate + 2 * target_std, np.nan]

            # get the same estimate from the training data
            train_props, train_estimate, train_std = get_estimate_and_std(train_labels_r_df)
            # compute the error of this estimate
            train_rmse = np.sqrt((train_estimate - target_estimate) ** 2)
            train_contains_test = train_estimate - 2 * train_std < target_estimate < train_estimate + 2 * train_std
            output_df.loc['train'] = [n_train_r, 'train', 'train', 'n/a', train_estimate, train_rmse, np.nan, np.nan, np.nan]

            print("target proportions: (%0.3f, %0.3f); train proportions: %0.3f" % (target_estimate - 2 * target_std, target_estimate + 2 * target_std, train_estimate))

            # use the minority class (according to the training data) as the positive label
            if train_estimate > 0.5:
                pos_label = 0
            else:
                pos_label = 1
            print("Using %d as the positive label" % pos_label)

            # repeat for labeled calibration data
            if n_calib > 0:
                calib_props, calib_estimate, calib_std = get_estimate_and_std(calib_labels_df)
                calib_rmse = np.sqrt((calib_estimate - target_estimate) ** 2)
                # check if the target estimate is within 2 standard deviations of the calibration estimate
                # (the original compared calib_estimate to itself, which was always true)
                calib_contains_test = calib_estimate - 2 * calib_std < target_estimate < calib_estimate + 2 * calib_std
                output_df.loc['calibration'] = [n_calib, 'calibration', 'nontrain', 'given', calib_estimate, calib_rmse, calib_estimate - 2 * calib_std, calib_estimate + 2 * calib_std, calib_contains_test]

                # repeat the test using the number of annotations rather than the number of items
                calib_props2, calib_estimate2, calib_std2 = get_estimate_and_std(calib_labels_df, use_n_annotations=True)
                calib_rmse2 = np.sqrt((calib_estimate2 - target_estimate) ** 2)
                calib_contains_test2 = calib_estimate2 - 2 * calib_std2 < target_estimate < calib_estimate2 + 2 * calib_std2
                output_df.loc['calibration_n_annotations'] = [n_calib, 'calibration', 'nontrain', 'given', calib_estimate2, calib_rmse2, calib_estimate2 - 2 * calib_std2, calib_estimate2 + 2 * calib_std2, calib_contains_test2]

            results_df = pd.DataFrame([], columns=['f1', 'acc', 'calibration', 'calib overall'])

            # now train a model on the training data, holding out the calibration data for calibration
            print("Training model on training data only")
            model, dev_f1, dev_acc, dev_cal, dev_cal_overall = train.train_model_with_labels(
                project_dir, model_type, loss, model_name, subset, sampled_labels_df,
                feature_defs, weights_df=weights_df, items_to_use=train_items_r,
                penalty=penalty, alpha_min=alpha_min, alpha_max=alpha_max,
                intercept=intercept, objective=objective, n_dev_folds=n_dev_folds,
                do_ensemble=do_ensemble, dh=dh, seed=seed, pos_label=pos_label,
                verbose=verbose)
            results_df.loc['cross_val'] = [dev_f1, dev_acc, dev_cal, dev_cal_overall]

            # predict on calibration data
            if n_calib > 0:
                calib_predictions_df, calib_pred_probs_df, calib_pred_proportions = predict.predict(
                    project_dir, model, model_name, subset, label,
                    items_to_use=calib_items, verbose=verbose)
                calib_cc, calib_pcc, calib_acc, calib_pvc = calib_pred_proportions
                f1_cal, acc_cal = evaluate_predictions.evaluate_predictions(
                    calib_labels_df, calib_predictions_df, calib_pred_probs_df,
                    pos_label=pos_label, average=average, verbose=False)
                # .values replaces the deprecated DataFrame.as_matrix()
                true_calib_vector = np.argmax(calib_labels_df.values, axis=1)
                calib_cal_rmse = evaluation.evaluate_calibration_rmse(true_calib_vector, calib_pred_probs_df.values)
                calib_cal_rmse_overall = evaluation.evaluate_calibration_rmse(true_calib_vector, calib_pred_probs_df.values, min_bins=1, max_bins=1)
                results_df.loc['calibration'] = [f1_cal, acc_cal, calib_cal_rmse, calib_cal_rmse_overall]

            # predict on test data
            test_predictions_df, test_pred_probs_df, test_pred_proportions = predict.predict(
                project_dir, model, model_name, subset, label,
                items_to_use=test_items, verbose=verbose)
            f1_test, acc_test = evaluate_predictions.evaluate_predictions(
                test_labels_df, test_predictions_df, test_pred_probs_df,
                pos_label=pos_label, average=average)
            true_test_vector = np.argmax(test_labels_df.values, axis=1)
            test_cal_rmse = evaluation.evaluate_calibration_rmse(true_test_vector, test_pred_probs_df.values)
            test_cal_rmse_overall = evaluation.evaluate_calibration_rmse(true_test_vector, test_pred_probs_df.values, min_bins=1, max_bins=1)
            results_df.loc['test'] = [f1_test, acc_test, test_cal_rmse, test_cal_rmse_overall]
            test_cc_estimate, test_pcc_estimate, test_acc_estimate_internal, test_pvc_estimate_internal = test_pred_proportions

            # predict on calibration and test data combined
            nontrain_predictions_df, nontrain_pred_probs_df, nontrain_pred_proportions = predict.predict(
                project_dir, model, model_name, subset, label,
                items_to_use=non_train_items, verbose=verbose)
            nontrain_cc_estimate, nontrain_pcc_estimate, nontrain_acc_estimate_internal, nontrain_pvc_estimate_internal = nontrain_pred_proportions

            if n_calib > 0:
                cc_calib_rmse = np.sqrt((calib_cc[1] - calib_estimate) ** 2)
                output_df.loc['CC_cal'] = [n_non_train, 'train', 'calibration', 'predicted', calib_cc[1], cc_calib_rmse, np.nan, np.nan, np.nan]
                pcc_calib_rmse = np.sqrt((calib_pcc[1] - calib_estimate) ** 2)
                output_df.loc['PCC_cal'] = [n_non_train, 'train', 'calibration', 'predicted', calib_pcc[1], pcc_calib_rmse, np.nan, np.nan, np.nan]

            cc_rmse = np.sqrt((nontrain_cc_estimate[1] - target_estimate) ** 2)
            pcc_rmse = np.sqrt((nontrain_pcc_estimate[1] - target_estimate) ** 2)
            output_df.loc['CC_nontrain'] = [n_non_train, 'train', 'nontrain', 'predicted', nontrain_cc_estimate[1], cc_rmse, np.nan, np.nan, np.nan]
            output_df.loc['PCC_nontrain'] = [n_non_train, 'train', 'nontrain', 'predicted', nontrain_pcc_estimate[1], pcc_rmse, np.nan, np.nan, np.nan]

            if n_calib > 0:
                # average the test-set estimates with the known calibration proportions, weighted by size
                averaged_cc_estimate = (test_cc_estimate[1] * n_test + calib_estimate * n_calib) / float(n_test + n_calib)
                averaged_pcc_estimate = (test_pcc_estimate[1] * n_test + calib_estimate * n_calib) / float(n_test + n_calib)
                averaged_cc_rmse = np.sqrt((averaged_cc_estimate - target_estimate) ** 2)
                averaged_pcc_rmse = np.sqrt((averaged_pcc_estimate - target_estimate) ** 2)
                output_df.loc['CC_nontrain_averaged'] = [n_non_train, 'train', 'nontrain', 'given', averaged_cc_estimate, averaged_cc_rmse, np.nan, np.nan, np.nan]
                output_df.loc['PCC_nontrain_averaged'] = [n_non_train, 'train', 'nontrain', 'given', averaged_pcc_estimate, averaged_pcc_rmse, np.nan, np.nan, np.nan]
            nontrain_acc_rmse_internal = np.sqrt((nontrain_acc_estimate_internal[1] - target_estimate) ** 2)
            nontrain_pvc_rmse_internal = np.sqrt((nontrain_pvc_estimate_internal[1] - target_estimate) ** 2)
            output_df.loc['ACC_internal'] = [n_non_train, 'train', 'nontrain', 'predicted', nontrain_acc_estimate_internal[1], nontrain_acc_rmse_internal, np.nan, np.nan, np.nan]
            output_df.loc['PVC_internal'] = [n_non_train, 'train', 'nontrain', 'predicted', nontrain_pvc_estimate_internal[1], nontrain_pvc_rmse_internal, np.nan, np.nan, np.nan]

            if n_calib > 0:
                averaged_acc_estimate_internal = (test_acc_estimate_internal[1] * n_test + calib_estimate * n_calib) / float(n_test + n_calib)
                averaged_pvc_estimate_internal = (test_pvc_estimate_internal[1] * n_test + calib_estimate * n_calib) / float(n_test + n_calib)
                averaged_acc_rmse_internal = np.sqrt((averaged_acc_estimate_internal - target_estimate) ** 2)
                averaged_pvc_rmse_internal = np.sqrt((averaged_pvc_estimate_internal - target_estimate) ** 2)
                output_df.loc['ACC_internal_averaged'] = [n_non_train, 'train', 'nontrain', 'given', averaged_acc_estimate_internal, averaged_acc_rmse_internal, np.nan, np.nan, np.nan]
                output_df.loc['PVC_internal_averaged'] = [n_non_train, 'train', 'nontrain', 'given', averaged_pvc_estimate_internal, averaged_pvc_rmse_internal, np.nan, np.nan, np.nan]

            # do calibration here using calibration data
            if n_calib > 0:
                # expand the data so as to only have singly-labeled, weighted items
                _, calib_labels, calib_weights, calib_predictions = train.prepare_data(
                    np.zeros([n_calib, 2]), calib_labels_df.values,
                    predictions=calib_predictions_df.values)

                acc = calibration.compute_acc(calib_labels, calib_predictions, n_classes, weights=calib_weights)
                acc_corrected = calibration.apply_acc_binary(nontrain_predictions_df.values, acc)
                acc_estimate = acc_corrected[1]
                acc_rmse = np.sqrt((acc_estimate - target_estimate) ** 2)
                output_df.loc['ACC'] = [n_non_train, 'train', 'nontrain', 'predicted', acc_estimate, acc_rmse, np.nan, np.nan, np.nan]

                pvc = calibration.compute_pvc(calib_labels, calib_predictions, n_classes, weights=calib_weights)
                pvc_corrected = calibration.apply_pvc(nontrain_predictions_df.values, pvc)
                pvc_estimate = pvc_corrected[1]
                pvc_rmse = np.sqrt((pvc_estimate - target_estimate) ** 2)
                output_df.loc['PVC'] = [n_non_train, 'train', 'nontrain', 'predicted', pvc_estimate, pvc_rmse, np.nan, np.nan, np.nan]

                acc_corrected = calibration.apply_acc_binary(test_predictions_df.values, acc)
                acc_estimate = acc_corrected[1]
                averaged_acc_estimate = (acc_estimate * n_test + calib_estimate * n_calib) / float(n_test + n_calib)
                # use the averaged estimate for the RMSE (the original used the unaveraged one)
                averaged_acc_rmse = np.sqrt((averaged_acc_estimate - target_estimate) ** 2)
                output_df.loc['ACC_averaged'] = [n_non_train, 'train', 'nontrain', 'given', averaged_acc_estimate, averaged_acc_rmse, np.nan, np.nan, np.nan]

                pvc_corrected = calibration.apply_pvc(test_predictions_df.values, pvc)
                pvc_estimate = pvc_corrected[1]
                averaged_pvc_estimate = (pvc_estimate * n_test + calib_estimate * n_calib) / float(n_test + n_calib)
                averaged_pvc_rmse = np.sqrt((averaged_pvc_estimate - target_estimate) ** 2)
                output_df.loc['PVC_averaged'] = [n_non_train, 'train', 'nontrain', 'given', averaged_pvc_estimate, averaged_pvc_rmse, np.nan, np.nan, np.nan]

            print("Venn internal nontrain")
            nontrain_pred_ranges_internal, nontrain_preds_internal = ivap.estimate_probs_from_labels_internal(
                project_dir, model, model_name, subset, non_train_items)
            pred_range = np.mean(nontrain_pred_ranges_internal, axis=0)
            venn_estimate = np.mean(nontrain_preds_internal)
            venn_rmse = np.sqrt((venn_estimate - target_estimate) ** 2)
            venn_contains_test = pred_range[0] < target_estimate < pred_range[1]
            output_df.loc['Venn_internal'] = [n_non_train, 'train', 'nontrain', 'predicted', venn_estimate, venn_rmse, pred_range[0], pred_range[1], venn_contains_test]

            if n_calib > 0:
                print("Venn internal test")
                test_pred_ranges_internal, test_preds_internal = ivap.estimate_probs_from_labels_internal(
                    project_dir, model, model_name, subset, test_items)
                pred_range = np.mean(test_pred_ranges_internal, axis=0)
                venn_estimate = (np.mean(test_preds_internal) * n_test + calib_estimate * n_calib) / float(n_test + n_calib)
                venn_rmse = np.sqrt((venn_estimate - target_estimate) ** 2)
                averaged_lower = (pred_range[0] * n_test + (calib_estimate - 2 * calib_std) * n_calib) / float(n_test + n_calib)
                averaged_upper = (pred_range[1] * n_test + (calib_estimate + 2 * calib_std) * n_calib) / float(n_test + n_calib)
                venn_contains_test = averaged_lower < target_estimate < averaged_upper
                output_df.loc['Venn_internal_averaged'] = [n_non_train, 'train', 'nontrain', 'given', venn_estimate, venn_rmse, averaged_lower, averaged_upper, venn_contains_test]

                # Venn prediction using proper calibration data
                print("Venn calibration")
                calib_pred_ranges, calib_preds, calib_props_in_range, list_of_n_levels = ivap.estimate_probs_from_labels_cv(
                    project_dir, model, model_name, sampled_labels_df, subset,
                    calib_items=calib_items)
                print("Venn test")
                test_pred_ranges, test_preds = ivap.estimate_probs_from_labels(
                    project_dir, model, model_name, sampled_labels_df, subset, subset,
                    calib_items=calib_items, test_items=test_items)

                nontrain_pred_ranges = np.vstack([calib_pred_ranges, test_pred_ranges])
                nontrain_preds = np.r_[calib_preds, test_preds]
                nontrain_pred_range = np.mean(nontrain_pred_ranges, axis=0)
                nontrain_venn_estimate = np.mean(nontrain_preds)
                nontrain_venn_rmse = np.sqrt((nontrain_venn_estimate - target_estimate) ** 2)
                nontrain_contains_test = nontrain_pred_range[0] < target_estimate < nontrain_pred_range[1]
                output_df.loc['Venn'] = [n_non_train, 'train', 'nontrain', 'predicted', nontrain_venn_estimate, nontrain_venn_rmse, nontrain_pred_range[0], nontrain_pred_range[1], nontrain_contains_test]

                test_pred_range = np.mean(test_pred_ranges, axis=0)
                averaged_venn_estimate = (np.mean(test_preds) * n_test + calib_estimate * n_calib) / float(n_test + n_calib)
                averaged_venn_rmse = np.sqrt((averaged_venn_estimate - target_estimate) ** 2)
                averaged_lower = (test_pred_range[0] * n_test + (calib_estimate - 2 * calib_std) * n_calib) / float(n_test + n_calib)
                averaged_upper = (test_pred_range[1] * n_test + (calib_estimate + 2 * calib_std) * n_calib) / float(n_test + n_calib)
                venn_contains_test = averaged_lower < target_estimate < averaged_upper
                output_df.loc['Venn_averaged'] = [n_non_train, 'train', 'nontrain', 'given', averaged_venn_estimate, averaged_venn_rmse, averaged_lower, averaged_upper, venn_contains_test]

                fh.write_list_to_text(calib_props_in_range, os.path.join(dirs.dir_models(project_dir), model_name, 'venn_calib_props_in_range.csv'))
                fh.write_list_to_text(list_of_n_levels, os.path.join(dirs.dir_models(project_dir), model_name, 'list_of_n_levels.csv'))

            results_df.to_csv(os.path.join(dirs.dir_models(project_dir), model_name, 'accuracy.csv'))

            # now train a model on the training and calibration data combined
            if run_all:
                print("Training model on all labeled data")
                calib_and_train_items_r = np.array(list(calib_items) + list(train_items_r))
                model, dev_f1, dev_acc, dev_cal, dev_cal_overall = train.train_model_with_labels(
                    project_dir, model_type, loss, model_name, subset, sampled_labels_df,
                    feature_defs, weights_df=weights_df, items_to_use=calib_and_train_items_r,
                    penalty=penalty, alpha_min=alpha_min, alpha_max=alpha_max,
                    intercept=intercept, objective=objective, n_dev_folds=n_dev_folds,
                    do_ensemble=do_ensemble, dh=dh, seed=seed, pos_label=pos_label,
                    verbose=verbose)
                results_df.loc['cross_val_all'] = [dev_f1, dev_acc, dev_cal, dev_cal_overall]

                # get predictions for the test data
                test_predictions_df, test_pred_probs_df, test_pred_proportions = predict.predict(
                    project_dir, model, model_name, subset, label,
                    items_to_use=test_items, verbose=verbose)
                f1_test, acc_test = evaluate_predictions.evaluate_predictions(
                    test_labels_df, test_predictions_df, test_pred_probs_df,
                    pos_label=pos_label, average=average)
                test_cc_estimate, test_pcc_estimate, test_acc_estimate_internal, test_pvc_estimate_internal = test_pred_proportions
                true_test_vector = np.argmax(test_labels_df.values, axis=1)
                test_cal_rmse = evaluation.evaluate_calibration_rmse(true_test_vector, test_pred_probs_df.values)
                # record only under 'test_all' (the original also overwrote the 'test' row with the same values)
                results_df.loc['test_all'] = [f1_test, acc_test, test_cal_rmse, 0]

                nontrain_predictions_df, nontrain_pred_probs_df, nontrain_pred_proportions = predict.predict(
                    project_dir, model, model_name, subset, label,
                    items_to_use=non_train_items, verbose=verbose)
                nontrain_cc_estimate, nontrain_pcc_estimate, nontrain_acc_estimate_internal, nontrain_pvc_estimate_internal = nontrain_pred_proportions

                cc_rmse = np.sqrt((nontrain_cc_estimate[1] - target_estimate) ** 2)
                pcc_rmse = np.sqrt((nontrain_pcc_estimate[1] - target_estimate) ** 2)
                output_df.loc['CC_nontrain_all'] = [n_non_train, 'nontest', 'nontrain', 'predicted', nontrain_cc_estimate[1], cc_rmse, np.nan, np.nan, np.nan]
                output_df.loc['PCC_nontrain_all'] = [n_non_train, 'nontest', 'nontrain', 'predicted', nontrain_pcc_estimate[1], pcc_rmse, np.nan, np.nan, np.nan]

                if n_calib > 0:
                    averaged_cc_estimate = (test_cc_estimate[1] * n_test + calib_estimate * n_calib) / float(n_test + n_calib)
                    averaged_pcc_estimate = (test_pcc_estimate[1] * n_test + calib_estimate * n_calib) / float(n_test + n_calib)
                    averaged_cc_rmse = np.sqrt((averaged_cc_estimate - target_estimate) ** 2)
                    averaged_pcc_rmse = np.sqrt((averaged_pcc_estimate - target_estimate) ** 2)
                    output_df.loc['CC_nontrain_averaged_all'] = [n_non_train, 'nontest', 'nontrain', 'given', averaged_cc_estimate, averaged_cc_rmse, np.nan, np.nan, np.nan]
                    output_df.loc['PCC_nontrain_averaged_all'] = [n_non_train, 'nontest', 'nontrain', 'given', averaged_pcc_estimate, averaged_pcc_rmse, np.nan, np.nan, np.nan]

                nontrain_acc_rmse_internal = np.sqrt((nontrain_acc_estimate_internal[1] - target_estimate) ** 2)
                nontrain_pvc_rmse_internal = np.sqrt((nontrain_pvc_estimate_internal[1] - target_estimate) ** 2)
                output_df.loc['ACC_internal_all'] = [n_non_train, 'nontest', 'nontrain', 'predicted', nontrain_acc_estimate_internal[1], nontrain_acc_rmse_internal, np.nan, np.nan, np.nan]
                output_df.loc['PVC_internal_all'] = [n_non_train, 'nontest', 'nontrain', 'predicted', nontrain_pvc_estimate_internal[1], nontrain_pvc_rmse_internal, np.nan, np.nan, np.nan]

                if n_calib > 0:
                    averaged_acc_estimate_internal = (test_acc_estimate_internal[1] * n_test + calib_estimate * n_calib) / float(n_test + n_calib)
                    averaged_pvc_estimate_internal = (test_pvc_estimate_internal[1] * n_test + calib_estimate * n_calib) / float(n_test + n_calib)
                    averaged_acc_rmse_internal = np.sqrt((averaged_acc_estimate_internal - target_estimate) ** 2)
                    averaged_pvc_rmse_internal = np.sqrt((averaged_pvc_estimate_internal - target_estimate) ** 2)
                    output_df.loc['ACC_internal_averaged_all'] = [n_non_train, 'nontest', 'nontrain', 'given', averaged_acc_estimate_internal, averaged_acc_rmse_internal, np.nan, np.nan, np.nan]
                    output_df.loc['PVC_internal_averaged_all'] = [n_non_train, 'nontest', 'nontrain', 'given', averaged_pvc_estimate_internal, averaged_pvc_rmse_internal, np.nan, np.nan, np.nan]

                print("Venn internal nontrain")
                nontrain_pred_ranges_internal, nontrain_preds_internal = ivap.estimate_probs_from_labels_internal(
                    project_dir, model, model_name, subset, non_train_items)
                pred_range = np.mean(nontrain_pred_ranges_internal, axis=0)
                venn_estimate = np.mean(nontrain_preds_internal)
                venn_rmse = np.sqrt((venn_estimate - target_estimate) ** 2)
                venn_contains_test = pred_range[0] < target_estimate < pred_range[1]
                output_df.loc['Venn_internal_all'] = [n_non_train, 'nontest', 'nontrain', 'predicted', venn_estimate, venn_rmse, pred_range[0], pred_range[1], venn_contains_test]

                if n_calib > 0:
                    print("Venn internal test")
                    test_pred_ranges_internal, test_preds_internal = ivap.estimate_probs_from_labels_internal(
                        project_dir, model, model_name, subset, test_items)
                    pred_range = np.mean(test_pred_ranges_internal, axis=0)
                    venn_estimate = (np.mean(test_preds_internal) * n_test + calib_estimate * n_calib) / float(n_test + n_calib)
                    venn_rmse = np.sqrt((venn_estimate - target_estimate) ** 2)
                    averaged_lower = (pred_range[0] * n_test + (calib_estimate - 2 * calib_std) * n_calib) / float(n_test + n_calib)
                    averaged_upper = (pred_range[1] * n_test + (calib_estimate + 2 * calib_std) * n_calib) / float(n_test + n_calib)
                    venn_contains_test = averaged_lower < target_estimate < averaged_upper
                    output_df.loc['Venn_internal_averaged_all'] = [n_non_train, 'nontest', 'nontrain', 'given', venn_estimate, venn_rmse, averaged_lower, averaged_upper, venn_contains_test]

            # save the per-repetition summaries
            results_df.to_csv(os.path.join(dirs.dir_models(project_dir), model_name, 'accuracy.csv'))
            output_df.to_csv(os.path.join(dirs.dir_models(project_dir), model_name, 'results.csv'))
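

# get_estimate_and_std is called throughout cross_train_and_eval but is defined
# elsewhere in this repo. The sketch below is only an assumption inferred from
# the call sites: it appears to return the class proportions of a labels
# DataFrame, the estimated proportion of the second (index 1) class, and the
# standard error of that estimate, optionally counting annotations rather than
# items as observations.
def get_estimate_and_std_sketch(labels_df, use_n_annotations=False):
    labels = labels_df.values
    if use_n_annotations:
        # treat every annotation as one observation
        n = labels.sum()
        props = labels.sum(axis=0) / float(n)
    else:
        # treat every item as one observation, averaging its annotations
        n = labels.shape[0]
        props = (labels / labels.sum(axis=1, keepdims=True).astype(float)).mean(axis=0)
    estimate = props[1]
    # standard error of a proportion estimated from n observations
    std = np.sqrt(estimate * (1.0 - estimate) / float(n))
    return props, estimate, std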