def predict(self, all_texts, batch_size=32, text_col=1):
    """Predict scores and the argmax class for a list of texts.

    all_texts may hold bare strings, or row tuples/lists with the text at
    index `text_col`.  Returns one list per input text: the model scores plus
    argmax class (from __get_class_from_pred) with the source text appended.
    """
    # Accept full rows as well as bare strings.
    # Fixes: isinstance instead of type()==; guard against empty input,
    # which previously crashed on all_texts[0].
    if all_texts and isinstance(all_texts[0], (tuple, list)):
        all_texts = [row[text_col] for row in all_texts]
    output = []
    total = len(all_texts)
    for start in range(0, total, batch_size):
        print("start processing {} / {}".format(start, total))
        batch_texts = all_texts[start:start + batch_size]
        examples = self.__get_examples(batch_texts)
        pred = self.__predict_batch(examples)
        pred_and_class = self.__get_class_from_pred(pred)
        assert (len(pred_and_class) == len(batch_texts))
        # attach the originating text to each prediction row
        # (plain loop instead of a side-effecting map_func call)
        for row, text in zip(pred_and_class, batch_texts):
            row.append(text)
        output += pred_and_class
    return output
def __get_class_from_pred(self, pred):
    """Convert a model output into per-row [score_0, score_1, argmax_class].

    pred[1] is expected to be a 2-column tensor of class scores; each
    returned row is the score list with the argmax label appended.
    """
    scores = pred[1].detach().cpu().numpy()
    # argmax over the class axis gives the predicted label per row
    labels = numpy.argmax(scores, axis=1).tolist()
    rows = scores.tolist()
    # plain loop instead of a side-effecting map_func call
    for row, label in zip(rows, labels):
        row.append(label)
    return rows
def bert_estimate(input_path, text_col, output_path, model_dir, gpu, with_header):
    """Score every row of the input CSV with a BERT model.

    Appends score_0, score_1 (sigmoid of the two logits) and the argmax
    class to each row, then writes the augmented dataset to output_path.
    """
    # pin the process to the requested GPU
    os.environ['CUDA_VISIBLE_DEVICES'] = gpu
    # fall back to the cached default model when none was supplied
    if model_dir == '':
        model_dir = os.path.join(os.environ['FACTMINE_HOME'],
                                 'cache/company_model')
    print("Loading model from %s ...\n" % (model_dir))
    model = BertModel(model_dir)
    print("Calculating negative score, positive score, and argmax class...\n")
    dataset = csv_handler.csv_readlines(input_path)
    header = None
    if with_header == True:
        header = dataset[0]
        dataset = dataset[1:]
    col = text_col - 1  # CLI columns are 1-based
    texts = [row[col] for row in dataset]
    preds = model.predict(texts)
    assert (len(dataset) == len(preds))
    # squash the two raw logits into probabilities; keep class and text as-is
    preds = [[util.sigmoid(q[0]), util.sigmoid(q[1]), q[2], q[3]]
             for q in preds]
    for row, pred in zip(dataset, preds):
        row.append(pred[0])
        row.append(pred[1])
        row.append(pred[2])
    if with_header == True:
        header.extend(['score_0', 'score_1', 'arg_class'])
        dataset = [header] + dataset
    csv_handler.csv_writelines(output_path, dataset)
    if output_path != "":
        print("Finished, Results are ready at %s " % (output_path))
def extract(data, typename):
    """Build a binary-labelled CSV for `typename` from ./dataset/<data>set.csv."""
    dataset = csv_handler.csv_readlines("./dataset/" + data + "set.csv")
    dataset = transformer.indexleft_func(dataset)
    # keep (running_index, sentence, original_label)
    dataset = transformer.map_func(dataset,
                                   lambda row: (row[0], row[1][1], row[1][2]))
    output_path = "./" + data + ".csv"

    def relabel(triplet):
        # 1 when the row's label matches the requested type, else 0
        return (triplet[0], triplet[1], 1 if triplet[2] == typename else 0)

    final = transformer.map_func(dataset, relabel)
    csv_handler.csv_writelines(output_path, final)
def project_batch(self, texts):
    """Encode a batch of texts and return one embedding per text — the
    hidden state at token position 0 (the [CLS] slot)."""
    token_ids = [self.__tokenize(t, pad_to_max_length=True) for t in texts]
    inputs = torch.tensor(token_ids).to(self.device)
    self.model.eval()
    hidden = self.model(inputs)[0].detach().cpu().numpy()
    # first token of every sequence
    return [seq[0] for seq in hidden]
def finetune(input_path, output_model_dir, text_col, label_col, model_dir, gpu, with_header):
    '''Train a new model or finetune an existing model with labels, output fine-tuned model'''
    # pin the process to the requested GPU
    os.environ['CUDA_VISIBLE_DEVICES'] = gpu
    dataset = csv_handler.csv_readlines(input_path)
    header = None
    if with_header == True:
        header = dataset[0]
        dataset = dataset[1:]
    print("Loading source model from %s ...\n" % (model_dir))
    model = BertModel(model_dir)
    t_col = text_col - 1    # 1-based CLI column -> 0-based index
    l_col = label_col - 1
    # rows for training: [running_index, text, label]
    labels = [[i, row[t_col], row[l_col]] for i, row in enumerate(dataset)]
    print("Fine-tuning with input labels")
    model.train(labels)
    model.checkpoint(output_model_dir)
    print("Finished. Fine-tuned model is ready at " + output_model_dir)
def log_to_csv_with_auc_accuracy(y_true, y_pred, y_score, csv_log_file_path, identity_info="dataset"):
    """Append one CSV row holding per-class P/R/F/support plus AUC and accuracy."""
    labels = [0, 1]
    prfs = precision_recall_fscore_support(y_true, y_pred)
    row = [identity_info]
    # one group of four metrics per class, each followed by a spacer cell
    for cls in (0, 1):
        row.append('label ' + str(labels[cls]) + ":")
        for metric in prfs:
            row.append(metric[cls])
        row.append(' ')
    # AUC uses the positive-class score only
    pos_scores = [p[1] for p in y_score]
    row.append(metrics.roc_auc_score(y_true, pos_scores))
    row.append(metrics.accuracy_score(y_true, y_pred))
    csv_handler.append_row(csv_log_file_path, row)
def sample_rows(self, dataset, num_samples):
    """Return num_samples rows drawn without replacement, in dataset order."""
    assert (num_samples < len(dataset))
    # distinct random indices, sorted so the output keeps the original order
    chosen = sorted(self.distinct_ints(num_samples, 0, len(dataset) - 1))
    return [dataset[i] for i in chosen]
def max_balancer(input_csv_path, output_csv_path='./output.csv'):
    """Downsample the negative rows so the output is class-balanced."""
    dataset = csv_handler.csv_readlines(input_csv_path)
    positives = transform.filter_func(dataset, lambda row: row[2] == '1')
    negatives = transform.filter_func(dataset, lambda row: row[2] == '0')
    # negatives must be the majority class for downsampling to make sense
    assert (len(positives) <= len(negatives))
    negatives = Sampler().sample_rows(negatives, len(positives))
    # collect the ids of every surviving row, then filter the original file
    keep_ids = {row[0] for row in positives}
    keep_ids.update(row[0] for row in negatives)
    balanced = transform.filter_func(dataset, lambda row: row[0] in keep_ids)
    csv_handler.csv_writelines(output_csv_path, balanced)
def transform(input_path, output_path):
    """Rewrite (text, "True"/"False") rows as (index, text, 1/0) rows."""
    rows = csv_handler.csv_readlines(input_path)
    rows = rows[1:]  # drop the header row
    converted = [(i, row[0], 1 if row[1] == "True" else 0)
                 for i, row in enumerate(rows)]
    csv_handler.csv_writelines(output_path, converted)
def csv_split(self, percentage, first_output_path, second_output_path):
    """Randomly split the dataset between two CSV files; `percentage`
    (strictly between 0 and 1) of the rows go to the first file."""
    assert (percentage > 0)
    assert (percentage < 1)
    # NOTE(review): the sample size is left as a float, exactly as before --
    # assumes __sampled_idx copes with a non-integer count; confirm.
    first_size = percentage * len(self.dataset)
    first_idx = self.__sampled_idx(first_size)
    # first output
    csv_writelines(first_output_path, [self.dataset[i] for i in first_idx])
    # second output: everything not picked for the first
    picked = set(first_idx)
    rest = [i for i in range(len(self.dataset)) if i not in picked]
    csv_writelines(second_output_path, [self.dataset[i] for i in rest])
def extractor(anno_dir, id_to_file, paper_id):
    """Load one annotation file and return (sentence_id, sentence, label) rows.

    Annotation rows are stored as (label, sentence); ids are built as
    "<paper_id>_<row_index>".
    """
    file_path = anno_dir + id_to_file[paper_id]
    rows = csv_handler.csv_readlines(file_path, delimit='\t')
    indexed = transformer.indexleft_func(rows)
    return [(paper_id + "_" + str(i), row[1], row[0]) for i, row in indexed]
def evaluate(input_path, col_true, col_pred, metric, output_path, with_header):
    '''evaluate the quality of predictions with a metric (f1 by default), and output the metric scores'''
    dataset = csv_handler.csv_readlines(input_path)
    if with_header == True:
        dataset = dataset[1:]
    ct = int(col_true) - 1   # 1-based CLI columns -> 0-based indices
    cp = int(col_pred) - 1
    y_true = [int(row[ct]) for row in dataset]
    y_pred = [int(row[cp]) for row in dataset]

    # both columns must be strictly binary
    for cls in y_true:
        assert (cls == 0 or cls == 1)
    for cls in y_pred:
        assert (cls == 0 or cls == 1)

    result = []
    supported = {'f1', 'accuracy', 'cohen', 'quad'}
    if metric not in supported:
        sys.exit('please specify a valid metric in terms of f1, accuracy, cohen, or quad (i.e. precision_recall_fscore_support)')
    elif metric == 'f1':
        result.append(['f1'])
        result.append([f1_score(y_true, y_pred)])
    elif metric == 'accuracy':
        result.append(['accuracy'])
        result.append([accuracy_score(y_true, y_pred)])
    elif metric == 'cohen':
        # NOTE: unlike the other metrics, no header row is emitted here --
        # kept as-is to preserve the existing output format
        result.append([cohen_kappa_score(y_true, y_pred)])
    elif metric == 'quad':
        (precision, recall, fscore, support) = precision_recall_fscore_support(y_true, y_pred)
        result.append(['class', 'precision', 'recall', 'fscore', 'support'])
        result.append([0, precision[0], recall[0], fscore[0], support[0]])
        result.append([1, precision[1], recall[1], fscore[1], support[1]])
    csv_handler.csv_writelines(output_path, result)
def extract(data, typename):
    """Binarize ./dataset/<data>_raw.csv labels against `typename`.

    Rows matching `typename` get label 1, others 0 -- except for
    'NoArgument', where the polarity is inverted (match -> 0, else 1).
    """
    dataset = csv_handler.csv_readlines("./dataset/" + data + "_raw.csv")
    output_path = "./" + data + ".csv"

    if typename == 'NoArgument':
        def relabel(triplet):
            return (triplet[0], triplet[1], 0 if triplet[2] == typename else 1)
    else:
        def relabel(triplet):
            return (triplet[0], triplet[1], 1 if triplet[2] == typename else 0)

    final = transformer.map_func(dataset, relabel)
    csv_handler.csv_writelines(output_path, final)
def csv_shuf(self, num_samples, output_path):
    """Write num_samples randomly chosen rows of the dataset to output_path."""
    assert (len(self.dataset) >= num_samples)
    picked = self.__sampled_idx(num_samples)
    csv_writelines(output_path, [self.dataset[i] for i in picked])
def sep(dataset):
    """Split labelled triplets into parallel (sentences, labels) lists.

    Each row is (id, sentence, label); labels are converted to int.
    Replaces the C-style `(int)(...)` cast and the map_func wrappers with
    plain comprehensions.
    """
    sents = [row[1] for row in dataset]
    labels = [int(row[2]) for row in dataset]
    return (sents, labels)
# Time the training run, predict on the dev split, and append a metrics row
# (run name, pos-class P/R/F, AUC, accuracy) to the log CSV.
train_duration = train_finish_time - start_time
print("train time is " + str(train_finish_time - start_time))
print("predicting...")
predicted = text_clf.predict(X_dev)
predicted_proba = text_clf.predict_proba(X_dev)
assert (len(predicted_proba) == len(X_dev))
assert (len(X_dev) == len(y_dev))
print("logging...")
(precision, recall, fscore, support) = metrics.precision_recall_fscore_support(y_dev, predicted)
# AUC is computed from the positive-class probability
pos_proba = [p[1] for p in predicted_proba]
row = [
    sys.argv[1],  # run identifier
    precision[1],
    recall[1],
    fscore[1],
    metrics.roc_auc_score(y_dev, pos_proba),
    metrics.accuracy_score(y_dev, predicted),
]
csv_handler.append_row(log_file_path, row)
# NOTE(review): flat script section -- `root` (an XML element tree root) and
# `gold_path` are defined earlier in the file, outside this view.
root.tag  # no-op expression; presumably leftover from debugging
sents = []
for child in root:
    tid = child.attrib['id']
    sentence = ""
    num_words = len(child)
    for i in range(num_words):
        sentence += child[i].text
        # space after every token except the last two -- appears deliberate,
        # so the final punctuation token is not preceded by a space;
        # TODO confirm against the corpus tokenization
        if i < num_words - 2:
            sentence += " "
    sents.append((tid, sentence))

import csv_handler as csv_handler

golds = csv_handler.csv_readlines(gold_path, delimit='\t')

import transform as transformer

# sentences and gold labels must align one-to-one by id
assert(len(sents) == len(golds))
for i in range(len(sents)):
    assert(sents[i][0] == golds[i][0])
# merge into (id, sentence, gold_label) rows
final = transformer.map_func(
    range(len(sents)),
    lambda i : (sents[i][0], sents[i][1], golds[i][1]))

import csv_handler as csv_handler


class CSV_Split(csv_handler.CSV_Handler):
    # Thin subclass that seeds the inherited handler with an in-memory dataset.
    def __init__(self, dataset, seed = 0):
        self.seed = seed
        self.dataset = dataset


splitter = CSV_Split(final)
# 20% of rows to dev.csv, the remainder to train.csv
splitter.csv_split(0.2, "dev.csv", "train.csv")
def select(split, dataset):
    """Return (id, sentence, label) triples for rows tagged with `split`.

    Rows are (id, sentence, label, split); the split column is dropped.
    Replaces the filter_func/map_func pair with one comprehension.
    """
    return [(row[0], row[1], row[2]) for row in dataset if row[3] == split]
# NOTE(review): this first chunk is the tail of a method whose `def` line is
# outside this view (it reads `batch` and `self.dummy_label`); kept verbatim.
examples = []
for (i, txt) in enumerate(batch):
    guid = "%s" % (i)
    text_a = txt
    label = self.dummy_label  # placeholder label: prediction-time examples
    examples.append(
        InputExample(guid=guid, text_a=text_a, text_b=None, label=label))
return examples


if __name__ == "__main__":
    # CLI: <data_csv> <model_dir> <output_csv>
    data_path = sys.argv[1]
    model_dir = sys.argv[2]
    output_path = sys.argv[3]
    # load test dataset
    raw_dataset = csv_handler.csv_readlines(data_path)
    ids = transform.map_func(raw_dataset, lambda row : row[0])
    texts = transform.map_func(raw_dataset, lambda row : row[1])
    # load model
    model = BertModel(model_dir)
    pred = model.predict(texts, 100)  # batch size 100
    assert(len(ids) == len(pred))
    # prepend each row id to its prediction row before writing
    output = transform.map_func(range(len(ids)),
                                lambda idx : [ids[idx]] + pred[idx])
    csv_handler.csv_writelines(output_path, output)
def get_weights(self, targets):
    """Map each target label to its class weight via self.weight_map.

    Replaces the map_func wrapper with a plain comprehension.
    """
    return [self.weight_map[label] for label in targets]
# get idx for review_id, text, and funny_count
def get_triplet_idx(header):
    """Locate the review_id / text / funny columns in the CSV header."""
    idx_id = header.index('review_id')
    idx_text = header.index('text')
    idx_count = header.index('funny')
    return (idx_id, idx_text, idx_count)


def selector(row, idx_id, idx_text, idx_count):
    """Reduce a raw row to (id, text, is_funny).

    is_funny is 1 for >=5 funny votes, 0 for none, and None for the
    ambiguous middle band (1-4 votes) so those rows can be dropped.
    """
    r_id = row[idx_id]
    text = row[idx_text]
    funny_count = int(row[idx_count])
    is_funny = None
    if (funny_count >= 5):
        is_funny = 1
    elif (funny_count == 0):
        is_funny = 0
    return (r_id, text, is_funny)


(idx_id, idx_text, idx_count) = get_triplet_idx(dataset[0])
selected_datasets = transformer.map_func(
    dataset[1:], lambda line: selector(line, idx_id, idx_text, idx_count))
# keep only the unambiguous rows
# (fix: identity test `is not None` instead of `!= None`)
final_datasets = transformer.filter_func(selected_datasets,
                                         lambda row: row[2] is not None)
csv_handler.csv_writelines(review_output_path, final_datasets)
# NOTE(review): flat CLI section -- `csv_input_path`, `y_true_col`, the
# get_threds_by_* helpers, and the body of the trailing `if` live outside
# this view; the span ends mid-statement.
y_pred_col = int(sys.argv[3])
num_threds = int(sys.argv[4])
csv_output_path = sys.argv[5]
print_header = '1'  # optional 6th arg toggles the header row
if len(sys.argv) > 6:
    print_header = sys.argv[6]
assert (print_header == '1' or print_header == '0')
thred_method = "min_max_even"
# start from a clean output file
if os.path.exists(csv_output_path):
    os.remove(csv_output_path)
csv_dataset = csv_handler.csv_readlines(csv_input_path)
y_true = transform.map_func(csv_dataset, lambda row: int(row[y_true_col]))
y_pred_score = transform.map_func(csv_dataset,
                                  lambda row: float(row[y_pred_col]))
#y_pred_score = transform.map_func(y_pred_score, lambda score : 1 / (1 + math.exp(-score)))
thred_col = []
if thred_method == "min_max_even":
    # thresholds evenly spaced between the min and max predicted score
    thred_col = get_threds_by_min_max_even(y_pred_score, num_threds)
else:  # sorted score even slot
    thred_col = get_threds_by_sorted_score_equal_length(
        y_pred_score, num_threds)
if print_header == '1':
def __init__(self, csv_file_path):
    """Load the labelled CSV and precompute per-class weights from column 2."""
    rows = csv_handler.csv_readlines(csv_file_path)
    self.weight_map = weight_class([row[2] for row in rows])
# NOTE(review): this span opens mid-call -- the csv_readlines(...) call these
# keyword arguments belong to starts above this view.
                               delimit='\t',
                               quoter=csv.QUOTE_NONE)
records = records[1:]  # drop the header row


def row_functor(i, records):
    # Build (rid, sent, label, split) from row i; rid is made unique by
    # suffixing the running index.
    assert (i < len(records))
    row = records[i]
    rid = row[0] + "_" + str(i)
    sent = row[4]
    label = row[5]
    split = row[6]
    return (rid, sent, label, split)


records = transformer.map_func(range(len(records)),
                               lambda i: row_functor(i, records))
print(len(records))
# NOTE(review): `records[1:]` here drops the FIRST DATA row -- the header was
# already removed above; looks suspicious, confirm intent.
dataset = dataset + records[1:]
print(len(dataset))
print(dataset[0])


def select(split, dataset):
    # Keep rows tagged with `split` and drop the split column.
    final = transformer.filter_func(dataset, lambda row: row[3] == split)
    final = transformer.map_func(final, lambda row: (row[0], row[1], row[2]))
    return final


train_set = select("train", dataset)
print(len(train_set))
def evaluate(self, dev_dataset):
    """Predict on dev_dataset and return the per-class F-scores, using
    column 2 of each row as the ground-truth label."""
    predictions = self.predict(dev_dataset)
    truth = [int(row[2]) for row in dev_dataset]
    (precision, recall, fscore, support) = precision_recall_fscore_support(
        truth, predictions)
    return fscore
# NOTE(review): flat script section -- `train_id_dataset`, `test_id_path`,
# `anno_dir` come from earlier in the file; `extractor` is truncated at the
# end of this view.
test_id_dataset = open(test_id_path, "r").read().splitlines()

from os import listdir
from os.path import isfile, join

# only the per-paper "rating" annotation files are of interest
files = listdir(anno_dir)
files = transformer.filter_func(files, lambda name: 'rating' in name)


def get_id(file_name):
    # paper id is everything before the "_rating..." suffix
    idx = file_name.index("rating")
    return file_name[:idx - 1]


# check completeness
anno_ids = transformer.map_func(files, lambda file_name: get_id(file_name))
anno_ids.sort()
train_test_ids = train_id_dataset + test_id_dataset
train_test_ids.sort()
# every annotated paper must appear in exactly the train+test id lists
assert (anno_ids == train_test_ids)
id_to_file = transformer.map_func(files, lambda file: (get_id(file), file))
id_to_file = dict(id_to_file)


def extractor(anno_dir, id_to_file, paper_id):
    # (definition continues past this view)
    file_path = anno_dir + id_to_file[paper_id]
    label_sent_dataset = csv_handler.csv_readlines(file_path, delimit='\t')
    indexed_result = transformer.indexleft_func(label_sent_dataset)
def train(self,
          labeled_dataset,
          train_batch_size=32,
          num_epoch=5,
          adam_lr=2e-5,
          adam_epsilon=1e-8,
          scheduler_warmup_steps=0,
          text_col=1,
          label_col=2):
    """Fine-tune the underlying model on labelled rows and return it.

    Each row of labeled_dataset carries the text at `text_col` and the
    binary label ("0"/"1") at `label_col`.
    """
    # prepare training data
    texts = [row[text_col] for row in labeled_dataset]
    labels = [row[label_col] for row in labeled_dataset]
    train_examples = self.__get_examples(texts, labels)
    train_dataset = self.__get_inputs(train_examples, ["0", "1"])
    train_dataset = TensorDataset(train_dataset['input_ids'],
                                  train_dataset['attention_mask'],
                                  train_dataset['token_type_ids'],
                                  train_dataset['labels'])
    train_sampler = RandomSampler(train_dataset)
    train_dataloader = DataLoader(train_dataset,
                                  sampler=train_sampler,
                                  batch_size=train_batch_size)
    # prepare optimizer
    optimizer = AdamW(self.model.parameters(), lr=adam_lr, eps=adam_epsilon)
    # prepare scheduler
    t_total = len(train_dataloader) * num_epoch
    # FIX: scheduler_warmup_steps was accepted but ignored -- 0 was
    # hard-coded here; the parameter default keeps the old behaviour.
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=scheduler_warmup_steps,
        num_training_steps=t_total)
    # start training
    self.model.zero_grad()
    for _ in trange(0, num_epoch, desc="Training Epoch"):
        num_step_per_epoch = len(train_dataloader)
        for step, batch in enumerate(
                tqdm(train_dataloader, desc="Iteration")):
            self.model.train()
            batch = tuple(t.to(self.device) for t in batch)
            inputs = {
                "input_ids": batch[0],
                "attention_mask": batch[1],
                "labels": batch[3]
            }
            outputs = self.model(**inputs)
            loss = outputs[0]
            # report the loss four times per epoch
            if step in {
                    int(num_step_per_epoch / 4),
                    int(num_step_per_epoch * 2 / 4),
                    int(num_step_per_epoch * 3 / 4), num_step_per_epoch - 1
            }:
                print("\n training loss is " + str(loss.item()))
            loss.backward()
            optimizer.step()
            scheduler.step()
            self.model.zero_grad()
    # return trained model
    return self.model
import sys

sys.path.insert(0, "../../pyfunctor")
import transform as transform
import csv_handler as csv_handler

if __name__ == "__main__":
    # Join selected columns of two row-aligned CSV files side by side.
    # CLI: <first_csv> <comma_sep_col_idx> <second_csv> <comma_sep_col_idx> <out_csv>
    result = []
    first_file = sys.argv[1]
    first_idx = sys.argv[2].split(',')
    first_idx = transform.map_func(first_idx, lambda idx: int(idx))
    second_file = sys.argv[3]
    second_idx = sys.argv[4].split(',')
    second_idx = transform.map_func(second_idx, lambda idx: int(idx))
    output_csv_file = sys.argv[5]
    first_dataset = csv_handler.csv_readlines(first_file)
    result = transform.map_func(first_dataset,
                                lambda row: [row[idx] for idx in first_idx])
    second_dataset = csv_handler.csv_readlines(second_file)
    second_result = transform.map_func(
        second_dataset, lambda row: [row[idx] for idx in second_idx])
    # files must be row-aligned before zipping
    assert (len(first_dataset) == len(second_dataset))
    final = transform.map_func(zip(result, second_result),
                               lambda p: p[0] + p[1])
    # NOTE(review): `final` is never written in this view -- the write to
    # output_csv_file presumably happens past the end of this span; confirm.
# NOTE(review): flat script; `format_func` is truncated at the end of this
# view (its body continues past this span).
root_directory = '../../'
sys.path.insert(0, root_directory + "pyfunctor")
import transform as transformer
import csv_handler as csv_handler

input_path = "./goodreads_reviews_spoiler.json"
output_path = "spoiler.csv"

# one JSON record per line
fin = open(input_path, "r")  # NOTE(review): file handle never closed here
dataset = []
for row in fin.readlines():
    dataset.append(row)
json_dataset = transformer.map_func(dataset, lambda line: json.loads(line))


def format_func(line):
    # Quote the bare JSON booleans of "has_spoiler" so downstream code sees
    # the strings "true"/"false"; exactly one occurrence expected per line.
    num_true = line.count('"has_spoiler": true,')
    num_false = line.count('"has_spoiler": false,')
    if num_true == 0:
        assert (num_false == 1)
        line = line.replace('"has_spoiler": false,',
                            '"has_spoiler": "false",')
    elif num_true == 1:
        assert (num_false == 0)
        line = line.replace('"has_spoiler": true,',
                            '"has_spoiler": "true",')
    else:
        assert (False)
    line = json.loads(line)