def evaluate(mode):
    """Score the stored predictions of layers 1-6 and print the results.

    For each layer the label-to-index mapping is read from ``output_dir``,
    the prediction matrix from ``bert_output`` and the gold labels from
    ``output_dir``.  The gold labels are turned into rows with ``multihot``
    and compared against the predictions with ``mean_ap``; its two return
    values are printed next to the layer number (presumably mean average
    precision and precision@1 -- confirm against ``mean_ap``).

    Args:
        mode: ``"dev"`` reads ``eval_results.tsv``; ``"test"`` reads
            ``test_results.tsv``.  Any other value only loads the label
            mapping of each layer and prints nothing.
    """
    # The two modes differ only in which predictions file is scored.
    prediction_files = {
        "dev": "/eval_results.tsv",
        "test": "/test_results.tsv",
    }
    prediction_file = prediction_files.get(mode)

    for layer in range(1, 7):
        label2int = read.read_from_json(
            os.path.join(output_dir, str(layer) + "/label2int"))
        if prediction_file is None:
            # Unsupported mode: nothing to score for this layer.
            continue

        predictions = read.read_from_tsv(
            os.path.join(bert_output, str(layer) + prediction_file))
        predictions = np.asarray(predictions, np.float32)

        labels = read.read_from_json(
            os.path.join(output_dir, str(layer) + "/label_" + mode))
        gold = np.asarray(
            [multihot(label_q, label2int) for label_q in labels])

        # Named so that the builtin ``map`` is not shadowed.
        mean_avg_precision, pre_1 = mean_ap(gold, predictions)
        print("layer " + str(layer), mean_avg_precision, pre_1)
def generate_dist(mode):
    """Write one class-distribution file per layer (1-6) for ``mode``.

    For each layer the label-to-index mapping and the prediction matrix are
    loaded and handed, together with the question ids, to
    ``generate_dist_file``, which receives the output path inside
    ``bert_output_distribution``.

    Args:
        mode: ``"dev"`` reads ``eval_results.tsv`` and writes below ``dev/``;
            ``"test"`` reads ``test_results.tsv`` and writes below ``test/``.
            Any other value only loads the label mapping of each layer and
            writes nothing.
    """
    # (predictions file, output file prefix) for each supported mode.
    mode_files = {
        "dev": ("/eval_results.tsv", "dev/BERT-Base-DEV.L"),
        "test": ("/test_results.tsv", "test/BERT-Base-TEST.L"),
    }
    files = mode_files.get(mode)
    question_id = None

    for layer in range(1, 7):
        label2int = read.read_from_json(
            os.path.join(output_dir, str(layer) + "/label2int"))
        if files is None:
            # Unsupported mode: nothing to write for this layer.
            continue
        prediction_file, output_prefix = files

        if question_id is None:
            # The id list does not depend on the layer, so load it once.
            question_id = read.read_from_json(
                os.path.join(output_dir, "question_id_" + mode))

        predictions = read.read_from_tsv(
            os.path.join(bert_output, str(layer) + prediction_file))
        predictions = np.asarray(predictions, np.float32)

        generate_dist_file(
            question_id, predictions, label2int,
            os.path.join(
                bert_output_distribution,
                output_prefix + str(layer) + ".classdist.txt"))
def data2bert():
    """Produce the BERT input TSV files for layers 1 and 2.

    For each layer the label-to-index mapping is read from ``bert_input``.
    Then, for every entry of ``modes``, the layer-specific labels and the
    question texts are read from ``processed_dir`` and passed to
    ``generate_tsv`` along with the target path
    ``<bert_input>/<layer>/trec_<mode>_<layer>.tsv``.
    """
    for layer in range(1, 3):
        layer_name = str(layer)
        label2int = read.read_from_json(
            os.path.join(bert_input, layer_name + "/label2int"))

        for mode in modes:
            label_path = os.path.join(
                processed_dir, layer_name + "/" + "label_" + mode)
            labels = read.read_from_json(label_path)

            question_path = os.path.join(processed_dir, "question_" + mode)
            question_only_texts = read.read_from_json(question_path)

            tsv_path = os.path.join(
                bert_input,
                layer_name + "/trec_" + mode + "_" + layer_name + ".tsv")
            generate_tsv(
                question_only_texts, labels, label2int, tsv_path, mode)
def split_label(level, mode):
    """Reduce the labels of ``mode`` to ``level`` and save the result.

    Reads ``<processed_dir>/label_<mode>``, maps every label through
    ``get_label_at_level(label, level)`` and stores the resulting list at
    ``<processed_dir>/<level>/label_<mode>``.

    Args:
        level: Level passed on to ``get_label_at_level``; also the name of
            the sub-directory the result is written to.
        mode: Suffix that selects which label file is read and written.
    """
    source_path = os.path.join(processed_dir, "label_" + mode)
    target_path = os.path.join(
        processed_dir, str(level) + "/" + "label_" + mode)

    labels_level = [
        get_label_at_level(full_label, level)
        for full_label in read.read_from_json(source_path)
    ]
    read.save_in_json(target_path, labels_level)
def label_dict():
    """Build label counts and the label-to-index mapping for layers 1 and 2.

    For every entry of ``modes`` the layer-specific label file is first
    (re)generated with ``split_label`` and then read back; stripped labels
    are counted across all modes.  Each distinct label gets an index in
    order of first appearance.  The counts are saved to
    ``<processed_dir>/<layer>/label_counts`` and the mapping to
    ``<bert_input>/<layer>/label2int``.
    """
    for layer in range(1, 3):
        layer_name = str(layer)
        counts = defaultdict(float)

        for mode in modes:
            split_label(layer, mode)
            label_path = os.path.join(
                processed_dir, layer_name + "/" + "label_" + mode)
            for raw_label in read.read_from_json(label_path):
                counts[raw_label.strip()] += 1.0

        # Indices follow the insertion order of the counting dictionary.
        label2int = {name: index for index, name in enumerate(counts)}

        read.save_in_json(
            os.path.join(processed_dir, layer_name + "/label_counts"),
            counts)
        read.save_in_json(
            os.path.join(bert_input, layer_name + "/label2int"),
            label2int)
def generate_cv():
    """Create the cross-validation splits and their BERT input files.

    Questions and labels come from ``read_questions(data_path)``.  The
    question indices are passed to ``k_fold_cross_validation`` (with ``10``
    and ``randomize=True``), which yields a (train, validation, test)
    triple of index lists per fold.  For every fold and every entry of
    ``modes`` (paired positionally with train / validation / test) three
    artifacts are written:

    * ``<processed_dir>/index_label_<mode>_<fold>``: the labels of the split
      converted to indices through ``label2int``;
    * ``<processed_dir>/question_<mode>_<fold>``: the (question, label)
      pairs of the split;
    * ``<bert_input>/lat_<mode>_<fold>.tsv``: produced by ``generate_tsv``.
    """
    questions, all_labels = read_questions(data_path)
    label2int = read.read_from_json(os.path.join(bert_input, "label2int"))

    data_set = list(zip(questions, all_labels))
    question_idx = list(range(len(questions)))
    folds = k_fold_cross_validation(question_idx, 10, randomize=True)

    for fold, (train_idx, validation_idx, test_idx) in enumerate(folds):
        splits = [
            [data_set[question_id] for question_id in train_idx],
            [data_set[question_id] for question_id in validation_idx],
            [data_set[question_id] for question_id in test_idx],
        ]

        for idx, mode in enumerate(modes):
            split = splits[idx]
            texts = [dataitem[0] for dataitem in split]
            labels = [dataitem[1] for dataitem in split]
            idx_label = [
                [label2int[label_single] for label_single in label_set]
                for label_set in labels
            ]

            suffix = mode + "_" + str(fold)
            read.save_in_json(
                os.path.join(processed_dir, "index_label_" + suffix),
                idx_label)
            read.save_in_json(
                os.path.join(processed_dir, "question_" + suffix),
                split)
            # Pass the labels of THIS split.  The previous code passed the
            # label list of the whole data set, which does not line up with
            # the (shuffled) questions of the split.
            generate_tsv(
                texts, labels, label2int,
                os.path.join(bert_input, "lat_" + suffix + ".tsv"),
                mode)