def finetune(input_path, output_model_dir, text_col, label_col, model_dir, gpu, with_header):
    '''Train a new model or fine-tune an existing model with labels; output the fine-tuned model'''
    # assign GPU
    os.environ['CUDA_VISIBLE_DEVICES'] = gpu
    dataset = csv_handler.csv_readlines(input_path)
    header = None
    if with_header:
        header = dataset[0]
        dataset = dataset[1:]
    print("Loading source model from %s ...\n" % (model_dir))
    model = BertModel(model_dir)
    # convert 1-based column arguments to 0-based indices
    text_col = text_col - 1
    label_col = label_col - 1
    labels = transform.map_func(
        range(len(dataset)),
        lambda i: [i, dataset[i][text_col], dataset[i][label_col]])
    print("Fine-tuning with input labels")
    model.train(labels)
    model.checkpoint(output_model_dir)
    print("Finished. Fine-tuned model is ready at " + output_model_dir)

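# Usage sketch (paths and column numbers are assumptions, not from the source):
# fine-tune a base checkpoint on labels.csv, whose 2nd column holds the sentence
# and 3rd column the 0/1 label; column arguments are 1-based as expected above.
# finetune('./labels.csv', './tuned_model', 2, 3, './cache/base_model', '0', True)
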
def transform(input_path, output_path):
    dataset = csv_handler.csv_readlines(input_path)
    dataset = dataset[1:]
    dataset = transformer.map_func(
        range(len(dataset)),
        lambda i: (i, dataset[i][0], 1 if dataset[i][1] == "True" else 0))
    csv_handler.csv_writelines(output_path, dataset)

def extractor(anno_dir, id_to_file, paper_id):
    file_path = anno_dir + id_to_file[paper_id]
    label_sent_dataset = csv_handler.csv_readlines(file_path, delimit='\t')
    indexed_result = transformer.indexleft_func(label_sent_dataset)
    final = transformer.map_func(
        indexed_result,
        lambda p: (paper_id + "_" + str(p[0]), p[1][1], p[1][0]))
    return final

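# Usage sketch (directory layout and mapping are assumptions): read the
# tab-separated (label, sentence) rows annotated for paper P1 and emit
# (P1_<line_index>, sentence, label) triplets.
# rows = extractor('./annotations/', {'P1': 'P1.tsv'}, 'P1')
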
def bert_estimate(input_path, text_col, output_path, model_dir, gpu, with_header):
    # assign GPU
    os.environ['CUDA_VISIBLE_DEVICES'] = gpu
    # load model
    if model_dir == '':
        model_dir = os.path.join(os.environ['FACTMINE_HOME'], 'cache/company_model')
    print("Loading model from %s ...\n" % (model_dir))
    model = BertModel(model_dir)
    print("Calculating negative score, positive score, and argmax class...\n")
    dataset = csv_handler.csv_readlines(input_path)
    header = None
    if with_header:
        header = dataset[0]
        dataset = dataset[1:]
    # convert the 1-based column argument to a 0-based index
    text_col = text_col - 1
    texts = transform.map_func(dataset, lambda row: row[text_col])
    preds = model.predict(texts)
    assert (len(dataset) == len(preds))
    # apply sigmoid
    preds = transform.map_func(
        preds,
        lambda quad: [util.sigmoid(quad[0]), util.sigmoid(quad[1]), quad[2], quad[3]])
    for i in range(len(dataset)):
        dataset[i].append(preds[i][0])
        dataset[i].append(preds[i][1])
        dataset[i].append(preds[i][2])
    if with_header:
        header.append('score_0')
        header.append('score_1')
        header.append('arg_class')
        dataset = [header] + dataset
    csv_handler.csv_writelines(output_path, dataset)
    if output_path != "":
        print("Finished. Results are ready at %s " % (output_path))

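# Usage sketch (file names assumed): score the text in the 1st column of
# input.csv with the default company model (model_dir='') and append the
# score_0, score_1, and arg_class columns.
# bert_estimate('./input.csv', 1, './scored.csv', '', '0', True)
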
def extract(data, typename):
    dataset = csv_handler.csv_readlines("./dataset/" + data + "set.csv")
    dataset = transformer.indexleft_func(dataset)
    dataset = transformer.map_func(dataset,
                                   lambda row: (row[0], row[1][1], row[1][2]))
    output_path = "./" + data + ".csv"

    def e_func(triplet):
        label = 0
        if triplet[2] == typename:
            label = 1
        return (triplet[0], triplet[1], label)

    final = transformer.map_func(dataset, e_func)
    csv_handler.csv_writelines(output_path, final)

def max_balancer(input_csv_path, output_csv_path='./output.csv'):
    dataset = csv_handler.csv_readlines(input_csv_path)
    pos_dataset = transform.filter_func(dataset, lambda row: row[2] == '1')
    neg_dataset = transform.filter_func(dataset, lambda row: row[2] == '0')
    assert (len(pos_dataset) <= len(neg_dataset))
    sampler = Sampler()
    # downsample negatives to match the number of positives
    neg_dataset = sampler.sample_rows(neg_dataset, len(pos_dataset))
    pos_ids = transform.map_func(pos_dataset, lambda row: row[0])
    neg_ids = transform.map_func(neg_dataset, lambda row: row[0])
    select_id_set = set(pos_ids + neg_ids)
    final = transform.filter_func(dataset, lambda row: row[0] in select_id_set)
    csv_handler.csv_writelines(output_csv_path, final)

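# Usage sketch (file names assumed): given (id, sentence, label) rows with
# fewer '1' rows than '0' rows, write a class-balanced subsample.
# max_balancer('./train_unbalanced.csv', './train_balanced.csv')
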
def extract(data, typename):
    dataset = csv_handler.csv_readlines("./dataset/" + data + "_raw.csv")
    output_path = "./" + data + ".csv"

    def func_1(triplet):
        label = 0
        if triplet[2] == typename:
            label = 1
        return (triplet[0], triplet[1], label)

    e_func = func_1

    def func_2(triplet):
        label = 1
        if triplet[2] == typename:
            label = 0
        return (triplet[0], triplet[1], label)

    if typename == 'NoArgument':
        e_func = func_2

    final = transformer.map_func(dataset, e_func)
    csv_handler.csv_writelines(output_path, final)

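# Usage sketch (inputs assumed): binarize ./dataset/train_raw.csv against one
# argument type; with typename='NoArgument' the mapping is inverted so that
# label 1 still marks argumentative sentences.
# extract('train', 'NoArgument')
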
def evaluate(input_path, col_true, col_pred, metric, output_path, with_header):
    '''evaluate the quality of predictions with a metric (f1 by default), and output the metric scores'''
    result = []
    dataset = csv_handler.csv_readlines(input_path)
    if with_header:
        dataset = dataset[1:]
    # convert 1-based column arguments to 0-based indices
    col_true = int(col_true) - 1
    col_pred = int(col_pred) - 1
    y_true = transform.map_func(dataset, lambda row: int(row[col_true]))
    y_pred = transform.map_func(dataset, lambda row: int(row[col_pred]))

    def check_validity(class_array):
        for cls in class_array:
            assert (cls == 0 or cls == 1)

    check_validity(y_true)
    check_validity(y_pred)
    support_set = {'f1', 'accuracy', 'cohen', 'quad'}
    if metric not in support_set:
        sys.exit('please specify a valid metric: f1, accuracy, cohen, or quad (precision_recall_fscore_support)')
    elif metric == 'f1':
        result.append(['f1'])
        result.append([f1_score(y_true, y_pred)])
    elif metric == 'accuracy':
        result.append(['accuracy'])
        result.append([accuracy_score(y_true, y_pred)])
    elif metric == 'cohen':
        result.append(['cohen'])
        result.append([cohen_kappa_score(y_true, y_pred)])
    elif metric == 'quad':
        (precision, recall, fscore, support) = precision_recall_fscore_support(y_true, y_pred)
        result.append(['class', 'precision', 'recall', 'fscore', 'support'])
        result.append([0, precision[0], recall[0], fscore[0], support[0]])
        result.append([1, precision[1], recall[1], fscore[1], support[1]])
    csv_handler.csv_writelines(output_path, result)

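# Usage sketch (file name and columns are assumptions): compare the 3rd (gold)
# and 6th (predicted) 1-based columns and write per-class precision, recall,
# fscore, and support.
# evaluate('./scored.csv', 3, 6, 'quad', './metrics.csv', True)
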
def write_preds(ori_preds, labels, dev_path, output_path="./tmp.csv"):
    eval_examples = csv_handler.csv_readlines(dev_path)
    assert (len(ori_preds) == len(labels))
    assert (len(ori_preds) == len(eval_examples))
    # append header
    header = ['id', 'sent', 'label']
    for i in range(len(ori_preds[0])):
        header.append('p' + str(i))
    header.append('pred')
    final = []
    final.append(header)
    # append data
    for i in range(len(labels)):
        row = []
        sent_id = eval_examples[i][0]
        sent = eval_examples[i][1]
        sent_label = int(eval_examples[i][2])
        assert (sent_label == labels[i])
        row.append(sent_id)
        row.append(sent)
        row.append(sent_label)
        for j in range(len(ori_preds[i])):
            row.append(ori_preds[i][j])
        pred_label = np.argmax(ori_preds[i])
        row.append(pred_label)
        final.append(row)
    csv_handler.csv_writelines(output_path, final)

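# Usage sketch (values assumed): ori_preds holds one row of per-class scores
# per dev example; labels holds the gold classes, which must match the 3rd
# column of dev.csv.
# write_preds([[0.1, 0.9], [0.8, 0.2]], [1, 0], './dev.csv', './preds.csv')
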
from sklearn.feature_extraction.text import TfidfTransformer
import os
file_path = os.path.dirname(os.path.abspath(__file__))
import sys
sys.path.insert(0, '../../pyfunctor')
import csv_handler as csv_handler
import transform as transformer
from sklearn import metrics
import time
import gensim
from pytorch_pretrained_bert import BertTokenizer

start_time = time.time()
train_set = csv_handler.csv_readlines(sys.argv[1])
dev_set = csv_handler.csv_readlines(sys.argv[2])
output_path = sys.argv[3]
if os.path.exists(output_path):
    os.remove(output_path)
seed = int(sys.argv[4])

def sep(dataset):
    sents = transformer.map_func(dataset, lambda triplet: triplet[1])
    labels = transformer.map_func(dataset, lambda triplet: int(triplet[2]))
    return (sents, labels)

import sys
sys.path.insert(0, "../../pyfunctor")
import csv_handler as csv_handler
import transform as transformer
import csv

class CSV_Split(csv_handler.CSV_Handler):
    def __init__(self, dataset, seed=0):
        self.seed = seed
        self.dataset = dataset

input_dir = "./DEEPTip/dataset/"
# paragraph
sent_n = csv_handler.csv_readlines(input_dir + "sent_tip.neg",
                                   delimit='\t', quoter=csv.QUOTE_NONE)
sent_n = transformer.map_func(sent_n, lambda p: (p[0], p[1], '0'))
sent_p = csv_handler.csv_readlines(input_dir + "sent_tip.pos",
                                   delimit='\t', quoter=csv.QUOTE_NONE)
sent_p = transformer.map_func(sent_p, lambda p: (p[0], p[1], '1'))
sent = sent_n + sent_p
for row in sent:
    assert (len(row) == 3)
# ids must be unique across the merged dataset
ids = set(transformer.map_func(sent, lambda p: p[0]))
assert (len(ids) == len(sent))
splitter = CSV_Split(sent)

# fragment: `root` is an XML tree root and `gold_path` a gold-label TSV,
# both produced in the elided part of this script
sents = []
for child in root:
    tid = child.attrib['id']
    sentence = ""
    num_words = len(child)
    for i in range(num_words):
        sentence += child[i].text
        # skip the space between the last two tokens
        # (the final token is typically trailing punctuation)
        if i < num_words - 2:
            sentence += " "
    sents.append((tid, sentence))

import csv_handler as csv_handler
import transform as transformer

golds = csv_handler.csv_readlines(gold_path, delimit='\t')
assert (len(sents) == len(golds))
for i in range(len(sents)):
    assert (sents[i][0] == golds[i][0])
final = transformer.map_func(range(len(sents)),
                             lambda i: (sents[i][0], sents[i][1], golds[i][1]))

class CSV_Split(csv_handler.CSV_Handler):
    def __init__(self, dataset, seed=0):
        self.seed = seed
        self.dataset = dataset

splitter = CSV_Split(final)
splitter.csv_split(0.2, "dev.csv", "train.csv")

import sys
import transform as transform
import csv_handler as csv_handler

if __name__ == "__main__":
    result = []
    first_file = sys.argv[1]
    first_idx = sys.argv[2].split(',')
    first_idx = transform.map_func(first_idx, lambda idx: int(idx))
    second_file = sys.argv[3]
    second_idx = sys.argv[4].split(',')
    second_idx = transform.map_func(second_idx, lambda idx: int(idx))
    output_csv_file = sys.argv[5]
    first_dataset = csv_handler.csv_readlines(first_file)
    result = transform.map_func(first_dataset,
                                lambda row: [row[idx] for idx in first_idx])
    second_dataset = csv_handler.csv_readlines(second_file)
    second_result = transform.map_func(
        second_dataset, lambda row: [row[idx] for idx in second_idx])
    assert (len(first_dataset) == len(second_dataset))
    # materialize the zip iterator in case map_func needs its length
    final = transform.map_func(list(zip(result, second_result)),
                               lambda p: p[0] + p[1])
    csv_handler.csv_writelines(output_csv_file, final)

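# Example invocation (script name is an assumption): keep 0-based columns 0,1
# of a.csv and column 2 of b.csv, writing the concatenated rows to out.csv.
# python join_columns.py a.csv 0,1 b.csv 2 out.csv
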
import sys
root_directory = '../../'
sys.path.insert(0, root_directory + "pyfunctor")
import transform as transformer
import csv_handler as csv_handler
import json_handler as json_handler

review_input_path = "../FUNNY/yelp_academic_dataset_review.csv"
review_output_path = "./all.csv"
dataset = csv_handler.csv_readlines(review_input_path)

# get idx for review_id, text, and funny_count
def get_triplet_idx(header):
    idx_id = header.index('review_id')
    idx_text = header.index('text')
    idx_count = header.index('funny')
    return (idx_id, idx_text, idx_count)

def selector(row, idx_id, idx_text, idx_count):
    r_id = row[idx_id]
    text = row[idx_text]
    funny_count = int(row[idx_count])
    is_funny = None
    if funny_count >= 5:
        is_funny = 1
    elif funny_count == 0:
        # assumed completion of the truncated branch: reviews with no funny
        # votes count as negatives (the rest of this script is elided)
        is_funny = 0

import sys
sys.path.insert(0, '../../pyfunctor')
import transform as transformer
import csv_handler as csv_handler

input_file = sys.argv[1]
output_file = sys.argv[2]
dataset = csv_handler.csv_readlines(input_file)
dataset = transformer.indexleft_func(dataset)
final = transformer.map_func(dataset,
                             lambda p: (p[0], p[1][2], int(p[1][0]) - 1))
csv_handler.csv_writelines(output_file, final)

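# Example invocation (script name is an assumption): each output row is
# (row_index, 3rd column of the input row, 1st column as a 0-based int label).
# python reindex_labels.py input.csv output.csv
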
def distill(input_path):
    dataset = csv_handler.csv_readlines(input_path, delimit='\t')
    return dataset

# __init__ of a dataset-weighting class (the enclosing class is elided in the
# source); weight_class builds a class-weight map from the label column
def __init__(self, csv_file_path):
    dataset = csv_handler.csv_readlines(csv_file_path)
    labels = transform.map_func(dataset, lambda t: t[2])
    self.weight_map = weight_class(labels)

import csv
import csv_handler as csv_handler
import transform as transformer

data_root = "./dataset"
data_dir = data_root + "/data/complete/"
topics = [
    'abortion', 'cloning', 'death_penalty', 'gun_control',
    'marijuana_legalization', 'minimum_wage', 'nuclear_energy',
    'school_uniforms'
]
dataset = []
for tp in topics:
    file_path = data_dir + tp + ".tsv"
    records = csv_handler.csv_readlines(file_path, delimit='\t',
                                        quoter=csv.QUOTE_NONE)
    records = records[1:]

    def row_functor(i, records):
        assert (i < len(records))
        row = records[i]
        rid = row[0] + "_" + str(i)
        sent = row[4]
        label = row[5]
        split = row[6]
        return (rid, sent, label, split)

    # assumed completion of the truncated call: index each record
    # through row_functor
    records = transformer.map_func(range(len(records)),
                                   lambda i: row_functor(i, records))

csv_input_path = sys.argv[1]
y_true_col = int(sys.argv[2])
y_pred_col = int(sys.argv[3])
num_threds = int(sys.argv[4])
csv_output_path = sys.argv[5]
print_header = '1'
if len(sys.argv) > 6:
    print_header = sys.argv[6]
assert (print_header == '1' or print_header == '0')
thred_method = "min_max_even"
if os.path.exists(csv_output_path):
    os.remove(csv_output_path)
csv_dataset = csv_handler.csv_readlines(csv_input_path)
y_true = transform.map_func(csv_dataset, lambda row: int(row[y_true_col]))
y_pred_score = transform.map_func(csv_dataset, lambda row: float(row[y_pred_col]))
#y_pred_score = transform.map_func(y_pred_score, lambda score : 1 / (1 + math.exp(-score)))
thred_col = []
if thred_method == "min_max_even":
    thred_col = get_threds_by_min_max_even(y_pred_score, num_threds)
else:
    # sorted score even slot
    thred_col = get_threds_by_sorted_score_equal_length(y_pred_score, num_threds)

# The enclosing class and method header are elided in the source; the header
# below is a reconstruction (name assumed) to keep the fragment valid. The
# method wraps each raw text of a batch as an InputExample with a dummy label.
def batch_to_examples(self, batch):
    examples = []
    for (i, txt) in enumerate(batch):
        guid = "%s" % (i)
        text_a = txt
        label = self.dummy_label
        examples.append(
            InputExample(guid=guid, text_a=text_a, text_b=None, label=label))
    return examples

if __name__ == "__main__":
    data_path = sys.argv[1]
    model_dir = sys.argv[2]
    output_path = sys.argv[3]
    # load test dataset
    raw_dataset = csv_handler.csv_readlines(data_path)
    ids = transform.map_func(raw_dataset, lambda row: row[0])
    texts = transform.map_func(raw_dataset, lambda row: row[1])
    # load model
    model = BertModel(model_dir)
    pred = model.predict(texts, 100)
    assert (len(ids) == len(pred))
    output = transform.map_func(range(len(ids)),
                                lambda idx: [ids[idx]] + pred[idx])
    csv_handler.csv_writelines(output_path, output)

def distill(input_path, label_map):
    dataset = csv_handler.csv_readlines(input_path, delimit='\t')
    dataset = transform.filter_func(dataset, lambda row: len(row) >= 5)
    token_label = transform.map_func(
        dataset,
        lambda row: [row[0].strip(), label_map[row[4].strip()[0]]])
    return token_label
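
# Usage sketch (label_map is an assumption): the map is keyed by the first
# character of the 5th column's tag, so BIO-style tags collapse to 0/1,
# e.g. 'B-CLAIM' and 'I-CLAIM' -> 1, 'O' -> 0.
# pairs = distill('./tokens.tsv', {'B': 1, 'I': 1, 'O': 0})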