def compute_metrics_token(p):
    """Token-wise and offset-wise F1 for a HuggingFace-style eval loop.

    Relies on the module-level globals `validation_spans` and
    `validation_offsets_mapping`, plus the metrics `f1` and `f1_score`.
    """
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)  # (batch_size, seq_length)

    # Offset-wise F1: map every token predicted toxic (label 1) back to its
    # character offsets and compare against the gold character spans.
    offset_wise_scores = []
    for i, prediction in enumerate(predictions):
        ground_spans = eval(validation_spans[i])  # spans stored as stringified lists
        predicted_spans = []
        for j, tokenwise_prediction in enumerate(
                prediction[:len(validation_offsets_mapping[i])]):
            if tokenwise_prediction == 1:
                predicted_spans += list(
                    range(
                        validation_offsets_mapping[i][j][0],
                        validation_offsets_mapping[i][j][1],
                    ))
        offset_wise_scores.append(f1(predicted_spans, ground_spans))
    results_offset = np.mean(offset_wise_scores)

    # Token-wise F1: drop positions labelled -100 (special/padding tokens).
    true_predictions = [[pred for (pred, lab) in zip(preds, labs) if lab != -100]
                        for preds, labs in zip(predictions, labels)]
    true_labels = [[lab for (pred, lab) in zip(preds, labs) if lab != -100]
                   for preds, labs in zip(predictions, labels)]
    results = np.mean([
        f1_score(true_label, true_preds)
        for true_label, true_preds in zip(true_labels, true_predictions)
    ])

    return {"Token-Wise F1": results, "Offset-Wise F1": results_offset}
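# compute_metrics_token reads two module-level globals that are built
# elsewhere in the repo. A minimal sketch of that glue, assuming a pandas
# DataFrame with "spans"/"text" columns and a HuggingFace *fast* tokenizer;
# the names `validation_df` and `build_validation_globals` are illustrative,
# not from the original code. `return_offsets_mapping` is real HF API.
def build_validation_globals(validation_df, tokenizer):
    """Sketch: produce `validation_spans` and `validation_offsets_mapping`."""
    validation_spans = validation_df["spans"].tolist()  # stringified offset lists
    validation_offsets_mapping = [
        tokenizer(text, truncation=True,
                  return_offsets_mapping=True)["offset_mapping"]
        for text in validation_df["text"]
    ]
    return validation_spans, validation_offsets_mapping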
def calculate_f1(preds_file, ground_file, out_file):
    """Average character-offset F1 between a predictions file and a gold CSV."""
    ground_spans = pd.read_csv(ground_file)["spans"].apply(eval).values
    pred_spans = []
    with open(preds_file, "r") as f:
        for line in f:  # each line: "<index>\t<stringified span list>"
            line_split = line.split("\t")
            pred_spans.append(eval(line_split[1]))
    # Renamed from `f1_score` to avoid shadowing the token-level metric.
    mean_f1 = np.mean(
        [f1(pred, gold) for pred, gold in zip(pred_spans, ground_spans)])
    with open(out_file, "w") as f:
        f.write(str(mean_f1))
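# The `f1` used throughout this file is the character-offset F1 of
# SemEval-2021 Task 5: a Dice-style overlap between two lists of character
# indices. A reference sketch consistent with how it is called here; named
# `char_f1_reference` so it does not shadow the imported metric.
def char_f1_reference(predictions, gold):
    """Sketch: 2*|P∩G| / (|P|+|G|), with the empty-gold convention."""
    if len(gold) == 0:
        return 1.0 if len(predictions) == 0 else 0.0
    if len(predictions) == 0:
        return 0.0
    predictions_set = set(predictions)
    gold_set = set(gold)
    return 2 * len(predictions_set & gold_set) / (
        len(predictions_set) + len(gold_set))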
def predict(train_file,
            dev_file,
            test_files,
            max_length,
            save_dir,
            max_epochs=100):
    """Train an RNNSL tagger, tune its threshold on dev, and predict on test."""
    train = read_datafile(train_file)
    dev = read_datafile(dev_file)

    # Word-level labels plus the character offsets of every word.
    train_token_labels, train_offset_mapping = list(
        zip(*[
            convert_spans_to_token_labels(text, spans) for spans, text in train
        ]))
    dev_token_labels, dev_offset_mapping = list(
        zip(*[
            convert_spans_to_token_labels(text, spans) for spans, text in dev
        ]))

    # Lowercase and strip punctuation from every word.
    train_tokens = [[
        word.lower().translate(str.maketrans("", "", string.punctuation))
        for word in text.split()
    ] for spans, text in train]
    dev_tokens = [[
        word.lower().translate(str.maketrans("", "", string.punctuation))
        for word in text.split()
    ] for spans, text in dev]

    train_token_labels_oh = [
        to_categorical(train_token_label, num_classes=3)
        for train_token_label in train_token_labels
    ]
    dev_token_labels_oh = [
        to_categorical(dev_token_label, num_classes=3)
        for dev_token_label in dev_token_labels
    ]

    rnnsl = RNNSL(max_epochs=max_epochs)
    run_df = rnnsl.fit(
        train_tokens,
        train_token_labels_oh,
        validation_data=(dev_tokens, dev_token_labels_oh),
    )
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)
    run_df.to_csv(os.path.join(save_dir, "RNNSL_Run.csv"), index=False)

    # Tune the decision threshold on dev.
    val_data = (dev_tokens, dev_token_labels)
    rnnsl.tune_threshold(val_data, f1_score)
    print("=" * 80)
    print("Threshold: ", rnnsl.threshold)
    with open(os.path.join(save_dir, "thresh.txt"), "w") as f:
        f.write(str(rnnsl.threshold))

    token_predictions = rnnsl.get_toxic_offsets(val_data[0])  # word-level predictions
    print("=" * 80)
    print(
        "F1_score Word Wise on Dev Tokens :",
        np.mean([
            f1_score(token_predictions[i], val_data[1][i][:max_length])
            for i in range(len(val_data[1]))
        ]),
    )
    print("=" * 80)

    # Map toxic word indices back to character offsets via dev_offset_mapping.
    offset_predictions = []
    for example in range(len(dev_tokens)):
        offset_predictions.append([])
        for token in range(len(dev_tokens[example][:max_length])):
            if token_predictions[example][token] == rnnsl.toxic_label:
                offset_predictions[-1] += list(
                    range(
                        dev_offset_mapping[example][token][0],
                        dev_offset_mapping[example][token][1],
                    ))

    dev_spans = [spans for spans, text in dev]
    dev_texts = [text for spans, text in dev]
    new_offset_predictions = [
        clean_predicted_text(text, offsets)
        for offsets, text in zip(offset_predictions, dev_texts)
    ]

    avg_dice_score = np.mean([
        f1(preds, gold)
        for preds, gold in zip(new_offset_predictions, dev_spans)
    ])
    print("=" * 80)
    print("Avg Dice Score on Dev: ", avg_dice_score)
    print("=" * 80)

    rnnsl.model.save(os.path.join(save_dir, "model"))

    # Test predictions.
    for test_file in test_files:
        print(f"Predicting on {test_file}")
        test = read_datafile(test_file)
        test_token_labels, test_offset_mapping = list(
            zip(*[
                convert_spans_to_token_labels(text, spans)
                for spans, text in test
            ]))
        test_tokens = [[
            word.lower().translate(str.maketrans("", "", string.punctuation))
            for word in text.split()
        ] for spans, text in test]
        test_spans = [spans for spans, text in test]
        test_texts = [text for spans, text in test]
        check_for_mismatch(test_tokens, test_texts, test_offset_mapping)

        final_token_predictions = rnnsl.get_toxic_offsets(test_tokens)
        print("=" * 80)
        print(
            f"F1_score Word Wise on {test_file} Tokens :",
            np.mean([
                f1_score(final_token_predictions[i],
                         test_token_labels[i][:max_length])
                for i in range(len(test_token_labels))
            ]),
        )
        print("=" * 80)

        final_offset_predictions = []
        for example in range(len(test_tokens)):
            final_offset_predictions.append([])
            for token in range(len(test_tokens[example][:max_length])):
                if final_token_predictions[example][token] == rnnsl.toxic_label:
                    final_offset_predictions[-1] += list(
                        range(
                            test_offset_mapping[example][token][0],
                            test_offset_mapping[example][token][1],
                        ))
        new_final_offset_predictions = [
            clean_predicted_text(text, offsets)
            for offsets, text in zip(final_offset_predictions, test_texts)
        ]

        avg_dice_score = np.mean([
            f1(preds, gold)
            for preds, gold in zip(new_final_offset_predictions, test_spans)
        ])
        print("=" * 80)
        print(f"Avg Dice Score on {test_file}: ", avg_dice_score)
        print("=" * 80)

        test_name = os.path.splitext(os.path.basename(test_file))[0]
        with open(os.path.join(save_dir, f"eval_scores_{test_name}.txt"),
                  "w") as f:
            f.write(str(avg_dice_score))
        with open(os.path.join(save_dir, f"spans-pred-{test_name}.txt"),
                  "w") as f:
            for i, spans in enumerate(new_final_offset_predictions):
                f.write(f"{i}\t{str(spans)}\n")
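# Hypothetical driver for predict(). The data paths follow the ./data/tsd_*.csv
# layout used by dev() below; save_dir is illustrative, and 192 matches the
# maximum sequence length used throughout this file.
def run_predict_example():
    """Sketch: invoke predict() with illustrative arguments."""
    predict(
        train_file="./data/tsd_train.csv",
        dev_file="./data/tsd_trial.csv",
        test_files=["./data/tsd_test.csv"],
        max_length=192,
        save_dir="./rnnsl_out",  # illustrative output directory
        max_epochs=100,
    )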
def dev():
    """Train on train minus dev, tune the threshold, and report dev scores."""
    train_file = "./data/tsd_train.csv"
    dev_file = "./data/tsd_trial.csv"
    train = read_datafile(train_file)
    dev = read_datafile(dev_file)

    # Drop training examples that also appear in dev.
    reduced_train = [example for example in train if example not in dev]

    # Word-level labels plus the character offsets of every word.
    reduced_train_token_labels, reduced_train_offset_mapping = list(
        zip(*[
            convert_spans_to_token_labels(text, spans)
            for spans, text in reduced_train
        ]))
    dev_token_labels, dev_offset_mapping = list(
        zip(*[
            convert_spans_to_token_labels(text, spans) for spans, text in dev
        ]))

    # Lowercase and strip punctuation from every word.
    reduced_train_tokens = [[
        word.lower().translate(str.maketrans("", "", string.punctuation))
        for word in text.split()
    ] for spans, text in reduced_train]
    dev_tokens = [[
        word.lower().translate(str.maketrans("", "", string.punctuation))
        for word in text.split()
    ] for spans, text in dev]

    reduced_train_token_labels_oh = [
        to_categorical(train_token_label, num_classes=3)
        for train_token_label in reduced_train_token_labels
    ]
    dev_token_labels_oh = [
        to_categorical(dev_token_label, num_classes=3)
        for dev_token_label in dev_token_labels
    ]

    rnnsl = RNNSL()
    run_df = rnnsl.fit(
        reduced_train_tokens,
        reduced_train_token_labels_oh,
        validation_data=(dev_tokens, dev_token_labels_oh),
    )
    run_df.to_csv("RNNSL_Run.csv", index=False)

    # Tune the decision threshold on dev.
    val_data = (dev_tokens, dev_token_labels)
    rnnsl.tune_threshold(val_data, f1_score)
    print("=" * 80)
    print("Threshold: ", rnnsl.threshold)

    token_predictions = rnnsl.get_toxic_offsets(val_data[0])  # word-level predictions
    print("=" * 80)
    print(
        "F1_score Word Wise on Dev Tokens :",
        np.mean([
            f1_score(token_predictions[i], val_data[1][i][:192])  # 192 = max sequence length
            for i in range(len(val_data[1]))
        ]),
    )
    print("=" * 80)

    # Map toxic word indices back to character offsets via dev_offset_mapping.
    offset_predictions = []
    for example in range(len(dev_tokens)):
        offset_predictions.append([])
        for token in range(len(dev_tokens[example][:192])):
            if token_predictions[example][token] == rnnsl.toxic_label:
                offset_predictions[-1] += list(
                    range(
                        dev_offset_mapping[example][token][0],
                        dev_offset_mapping[example][token][1],
                    ))

    dev_spans = [spans for spans, text in dev]
    dev_texts = [text for spans, text in dev]
    new_offset_predictions = [
        clean_predicted_text(text, offsets)
        for offsets, text in zip(offset_predictions, dev_texts)
    ]

    # Spot-check the first 20 examples.
    for i in range(20):
        ground_offsets = dev_spans[i]
        old_offsets = offset_predictions[i]
        new_offsets = new_offset_predictions[i]
        text = dev_texts[i]
        print("Text: ", text)
        print("Ground: ", get_text_spans(text, ground_offsets))
        print("Preds: ", get_text_spans(text, old_offsets))
        print("Clean Preds: ", get_text_spans(text, new_offsets))

    avg_dice_score = np.mean([
        f1(preds, gold)
        for preds, gold in zip(new_offset_predictions, dev_spans)
    ])
    print("=" * 80)
    print("Avg Dice Score on Dev: ", avg_dice_score)
    print("=" * 80)
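# Both predict() and dev() rely on convert_spans_to_token_labels, which is
# defined elsewhere in the repo. A plausible minimal sketch, assuming label 1
# marks a toxic word, 0 a clean one, and the third class passed to
# to_categorical(num_classes=3) is reserved for padding; this is an
# assumption, not the original helper.
def convert_spans_to_token_labels_sketch(text, spans):
    """Sketch: per-word toxic labels plus each word's character offsets."""
    span_set = set(spans)
    labels, offset_mapping = [], []
    cursor = 0
    for word in text.split():
        start = text.index(word, cursor)  # character offset of this word
        end = start + len(word)
        cursor = end
        offset_mapping.append((start, end))
        labels.append(1 if span_set.intersection(range(start, end)) else 0)
    return labels, offset_mapping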
def main():
    """Train and eval a spacy named entity tagger for toxic spans."""
    # Read training data
    print('loading training data')
    train = read_datafile('data/tsd_train.csv')

    # Read test data
    print('loading test data')
    test = read_datafile('data/tsd_test.csv')

    # Convert training data to Spacy Entities
    nlp = spacy.load("en_core_web_sm")
    print('preparing training data')
    training_data = []
    for n, (spans, text) in enumerate(train):
        doc = nlp(text)
        ents = spans_to_ents(doc, set(spans), 'TOXIC')
        training_data.append((doc.text, {'entities': ents}))

    # Build a blank pipeline with a single-label NER component.
    toxic_tagging = spacy.blank('en')
    toxic_tagging.vocab.strings.add('TOXIC')
    ner = toxic_tagging.create_pipe("ner")  # create from the pipeline it joins
    toxic_tagging.add_pipe(ner, last=True)
    ner.add_label('TOXIC')
    pipe_exceptions = ["ner", "trf_wordpiecer", "trf_tok2vec"]
    unaffected_pipes = [
        pipe for pipe in toxic_tagging.pipe_names if pipe not in pipe_exceptions
    ]

    print('training')
    with toxic_tagging.disable_pipes(*unaffected_pipes):
        toxic_tagging.begin_training()
        for iteration in range(30):
            random.shuffle(training_data)
            losses = {}
            batches = spacy.util.minibatch(
                training_data, size=spacy.util.compounding(4.0, 32.0, 1.001))
            for batch in batches:
                texts, annotations = zip(*batch)
                toxic_tagging.update(texts, annotations, drop=0.5, losses=losses)
            print("Losses", losses)

    # Score on test data
    print('evaluation')
    scores = []
    for spans, text in test:
        pred_spans = []
        doc = toxic_tagging(text)
        for ent in doc.ents:
            pred_spans.extend(range(ent.start_char, ent.end_char))
        scores.append(f1(pred_spans, spans))
    test_f1 = pd.DataFrame({'spacy_f1': scores})
    print(
        f"Spacy tagging baseline F1 = {test_f1.spacy_f1.mean():.2f} ± {sem(test_f1.spacy_f1):.2f}"
    )
    test_f1.to_csv('spacy_f1.csv')
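# main() calls a spans_to_ents helper defined elsewhere in the repo. A minimal
# sketch consistent with its call site (a spaCy Doc, a set of toxic character
# offsets, and a label string): group consecutive overlapping tokens into
# (start_char, end_char, label) triples. An assumption, not the original code.
def spans_to_ents_sketch(doc, spans, label):
    """Sketch: merge tokens whose characters fall inside `spans` into ents."""
    started = False
    left, right, ents = 0, 0, []
    for token in doc:
        if token.is_space:
            continue
        if spans.intersection(range(token.idx, token.idx + len(token.text))):
            if not started:
                left, started = token.idx, True
            right = token.idx + len(token.text)
        elif started:  # left a toxic run; close the entity
            ents.append((left, right, label))
            started = False
    if started:
        ents.append((left, right, label))
    return ents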
                    # Fragment of a larger evaluation routine: `preds`, `k`,
                    # `offsets`, `predicted_spans`, `temp_dataset`,
                    # `eval_config`, and `key` are defined in the enclosing scope.
                    break
                else:
                    if k >= len(preds):
                        break
                    if preds[k] == 1:  # token k predicted toxic
                        predicted_spans[-1] += list(
                            range(offsets[0], offsets[1]))
                    k += 1

        # Gold spans are stored as stringified lists in the dataset.
        spans = [
            eval(temp_dataset[i]["spans"]) for i in range(len(temp_dataset))
        ]
        avg_f1_score = np.mean([
            f1(preds, ground)
            for preds, ground in zip(predicted_spans, spans)
        ])
        with open(os.path.join(eval_config.save_dir, f"spans-pred-{key}.txt"),
                  "w") as f:
            for i, pred in enumerate(predicted_spans):
                # No trailing newline on the final line (was len(preds) - 1,
                # which compared against the wrong list).
                if i == len(predicted_spans) - 1:
                    f.write(f"{i}\t{str(pred)}")
                else:
                    f.write(f"{i}\t{str(pred)}\n")
        with open(os.path.join(eval_config.save_dir, f"eval_scores_{key}.txt"),
                  "w") as f:
            f.write(str(avg_f1_score))
    else:
def main():
    """Train and eval a spacy named entity tagger for toxic spans."""
    # Read the train/trial/test splits.
    print("loading training data")
    datasets = {}
    datasets["tsd_train"] = read_datafile("../data/tsd_train.csv")
    datasets["tsd_trial"] = read_datafile("../data/tsd_trial.csv")
    datasets["tsd_test"] = read_datafile("../data/tsd_test_spans.csv")

    # Convert the training split to Spacy Entities.
    nlp = spacy.load("en_core_web_sm")
    print("preparing training data")
    training_data = []
    for n, (spans, text) in enumerate(datasets["tsd_train"]):
        doc = nlp(text)
        ents = spans_to_ents(doc, set(spans), "TOXIC")
        training_data.append((doc.text, {"entities": ents}))

    # Build a blank pipeline with a single-label NER component.
    toxic_tagging = spacy.blank("en")
    toxic_tagging.vocab.strings.add("TOXIC")
    ner = toxic_tagging.create_pipe("ner")  # create from the pipeline it joins
    toxic_tagging.add_pipe(ner, last=True)
    ner.add_label("TOXIC")
    pipe_exceptions = ["ner", "trf_wordpiecer", "trf_tok2vec"]
    unaffected_pipes = [
        pipe for pipe in toxic_tagging.pipe_names if pipe not in pipe_exceptions
    ]

    print("training")
    with toxic_tagging.disable_pipes(*unaffected_pipes):
        toxic_tagging.begin_training()
        for iteration in range(30):
            random.shuffle(training_data)
            losses = {}
            batches = spacy.util.minibatch(
                training_data, size=spacy.util.compounding(4.0, 32.0, 1.001))
            for batch in batches:
                texts, annotations = zip(*batch)
                toxic_tagging.update(texts, annotations, drop=0.5, losses=losses)
            print("Losses", losses)

    # Score every split and dump per-example predictions.
    print("evaluation")
    for dataset in datasets.keys():
        scores = []
        with open(f"spans-pred-{dataset}.txt", "w") as f:
            for i, (spans, text) in enumerate(datasets[dataset]):
                pred_spans = []
                doc = toxic_tagging(text)
                for ent in doc.ents:
                    pred_spans.extend(range(ent.start_char, ent.end_char))
                f.write(f"{i}\t{str(pred_spans)}\n")
                scores.append(semeval2021.f1(pred_spans, spans))
        with open(f"eval_scores_{dataset}.txt", "w") as f:
            f.write(str(statistics.mean(scores)))
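# Every entry point above loads the task CSVs through read_datafile, defined
# elsewhere in the repo. A minimal sketch, assuming each row has a "spans"
# column holding a Python-literal list of character offsets and a "text"
# column (the column names match calculate_f1 above).
def read_datafile_sketch(filename):
    """Sketch: read a toxic-spans CSV into a list of (spans, text) pairs."""
    import ast  # local imports keep the sketch self-contained
    import csv
    data = []
    with open(filename, newline="", encoding="utf-8") as csv_file:
        for row in csv.DictReader(csv_file):
            spans = ast.literal_eval(row["spans"])  # e.g. "[3, 4, 5]" -> [3, 4, 5]
            data.append((spans, row["text"]))
    return data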