def _evaluate_with_span_F1(self, data_loader, embedding_storage_mode, mini_batch_size, out_path):
    eval_loss = 0
    batch_no: int = 0

    metric = Metric("Evaluation", beta=self.beta)

    lines: List[str] = []

    y_true = []
    y_pred = []

    for batch in data_loader:

        # predict for batch
        loss = self.predict(batch,
                            embedding_storage_mode=embedding_storage_mode,
                            mini_batch_size=mini_batch_size,
                            label_name='predicted',
                            return_loss=True)
        eval_loss += loss
        batch_no += 1

        for sentence in batch:

            # make list of gold tags
            gold_spans = sentence.get_spans(self.tag_type)
            gold_tags = [(span.tag, repr(span)) for span in gold_spans]

            # make list of predicted tags
            predicted_spans = sentence.get_spans("predicted")
            predicted_tags = [(span.tag, repr(span)) for span in predicted_spans]

            # check for true positives, false positives and false negatives
            for tag, prediction in predicted_tags:
                if (tag, prediction) in gold_tags:
                    metric.add_tp(tag)
                else:
                    metric.add_fp(tag)

            for tag, gold in gold_tags:
                if (tag, gold) not in predicted_tags:
                    metric.add_fn(tag)

            tags_gold = []
            tags_pred = []

            # also write to file in BIO format to use old conlleval script
            if out_path:
                for token in sentence:
                    # check if in gold spans
                    gold_tag = 'O'
                    for span in gold_spans:
                        if token in span:
                            gold_tag = 'B-' + span.tag if token == span[0] else 'I-' + span.tag
                    tags_gold.append(gold_tag)

                    # check if in predicted spans
                    predicted_tag = 'O'
                    for span in predicted_spans:
                        if token in span:
                            predicted_tag = 'B-' + span.tag if token == span[0] else 'I-' + span.tag
                    tags_pred.append(predicted_tag)

                    lines.append(f'{token.text} {gold_tag} {predicted_tag}\n')
                lines.append('\n')

            y_true.append(tags_gold)
            y_pred.append(tags_pred)

    if out_path:
        with open(Path(out_path), "w", encoding="utf-8") as outfile:
            outfile.write("".join(lines))

    eval_loss /= batch_no

    detailed_result = (
        "\nResults:"
        f"\n- F1-score (micro) {metric.micro_avg_f_score():.4f}"
        f"\n- F1-score (macro) {metric.macro_avg_f_score():.4f}"
        '\n\nBy class:')

    for class_name in metric.get_classes():
        detailed_result += (
            f"\n{class_name:<10} tp: {metric.get_tp(class_name)} - fp: {metric.get_fp(class_name)} - "
            f"fn: {metric.get_fn(class_name)} - precision: "
            f"{metric.precision(class_name):.4f} - recall: {metric.recall(class_name):.4f} - "
            f"f1-score: "
            f"{metric.f_score(class_name):.4f}")

    result = Result(
        main_score=metric.micro_avg_f_score(),
        log_line=f"{metric.precision():.4f}\t{metric.recall():.4f}\t{metric.micro_avg_f_score():.4f}",
        log_header="PRECISION\tRECALL\tF1",
        detailed_results=detailed_result,
    )

    return result, eval_loss
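# The span matching above reduces to exact comparison of (tag, span) pairs
# between gold and prediction. A minimal self-contained sketch of the same
# tp/fp/fn counting, using plain tuples instead of flair Span objects
# (the example data below is made up for illustration):
def span_f1_counts(gold_tags, predicted_tags):
    """Count tp/fp/fn for exact-match span evaluation."""
    tp = sum(1 for prediction in predicted_tags if prediction in gold_tags)
    fp = len(predicted_tags) - tp
    fn = sum(1 for gold in gold_tags if gold not in predicted_tags)
    return tp, fp, fn


gold = [('PER', 'Span [1,2]: "Barack Obama"'), ('LOC', 'Span [5]: "Hawaii"')]
pred = [('PER', 'Span [1,2]: "Barack Obama"'), ('LOC', 'Span [6]: "in"')]
assert span_f1_counts(gold, pred) == (1, 1, 1)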
def evaluate(self, evaluation: List[Sentence], out_path=None,
             evaluation_method: str = 'F1', eval_batch_size: int = 32,
             embeddings_in_memory: bool = True):
    batch_no: int = 0
    batches = [
        evaluation[x:x + eval_batch_size]
        for x in range(0, len(evaluation), eval_batch_size)
    ]

    metric = Metric('')

    lines: List[str] = []

    for batch in batches:
        batch_no += 1

        scores, tag_seq = self.model._predict_scores_batch(batch)
        predicted_ids = tag_seq
        all_tokens = []
        for sentence in batch:
            all_tokens.extend(sentence.tokens)

        for (token, score, predicted_id) in zip(all_tokens, scores, predicted_ids):
            token: Token = token
            # get the predicted tag
            predicted_value = self.model.tag_dictionary.get_item_for_index(
                predicted_id)
            token.add_tag('predicted', predicted_value, score)

        for sentence in batch:
            # add predicted tags
            for token in sentence.tokens:
                predicted_tag: Label = token.get_tag('predicted')

                # append both to file for evaluation
                eval_line = '{} {} {}\n'.format(
                    token.text,
                    token.get_tag(self.model.tag_type).value,
                    predicted_tag.value)
                lines.append(eval_line)
            lines.append('\n')

            # make list of gold tags
            gold_tags = [
                str(tag) for tag in sentence.get_spans(self.model.tag_type)
            ]

            # make list of predicted tags
            predicted_tags = [
                str(tag) for tag in sentence.get_spans('predicted')
            ]

            # check for true positives, false positives and false negatives
            for prediction in predicted_tags:
                if prediction in gold_tags:
                    metric.tp()
                else:
                    metric.fp()

            for gold in gold_tags:
                if gold not in predicted_tags:
                    metric.fn()

        if not embeddings_in_memory:
            self.clear_embeddings_in_batch(batch)

    if out_path is not None:
        test_tsv = os.path.join(out_path, "test.tsv")
        with open(test_tsv, "w", encoding='utf-8') as outfile:
            outfile.write(''.join(lines))

    if evaluation_method == 'accuracy':
        score = metric.accuracy()
        return score, metric

    if evaluation_method == 'F1':
        score = metric.f_score()
        return score, metric
def evaluate(
    self,
    data_loader: DataLoader,
    out_path: Path = None,
    embedding_storage_mode: str = "none",
) -> (Result, float):

    with torch.no_grad():
        eval_loss = 0

        metric = Metric("Evaluation")

        lines: List[str] = []
        batch_count: int = 0
        for batch in data_loader:
            batch_count += 1

            labels, loss = self.forward_labels_and_loss(batch)
            eval_loss += loss

            sentences_for_batch = [sent.to_plain_string() for sent in batch]
            confidences_for_batch = [[label.score for label in sent_labels]
                                     for sent_labels in labels]
            predictions_for_batch = [[label.value for label in sent_labels]
                                     for sent_labels in labels]
            true_values_for_batch = [
                sentence.get_label_names() for sentence in batch
            ]
            available_labels = self.label_dictionary.get_items()

            for sentence, confidence, prediction, true_value in zip(
                    sentences_for_batch,
                    confidences_for_batch,
                    predictions_for_batch,
                    true_values_for_batch,
            ):
                eval_line = "{}\t{}\t{}\t{}\n".format(
                    sentence, true_value, prediction, confidence)
                lines.append(eval_line)

            for predictions_for_sentence, true_values_for_sentence in zip(
                    predictions_for_batch, true_values_for_batch):
                for label in available_labels:
                    if (label in predictions_for_sentence
                            and label in true_values_for_sentence):
                        metric.add_tp(label)
                    elif (label in predictions_for_sentence
                          and label not in true_values_for_sentence):
                        metric.add_fp(label)
                    elif (label not in predictions_for_sentence
                          and label in true_values_for_sentence):
                        metric.add_fn(label)
                    else:
                        metric.add_tn(label)

            store_embeddings(batch, embedding_storage_mode)

        eval_loss /= batch_count

        detailed_result = (
            f"\nMICRO_AVG: acc {metric.micro_avg_accuracy()} - f1-score {metric.micro_avg_f_score()}"
            f"\nMACRO_AVG: acc {metric.macro_avg_accuracy()} - f1-score {metric.macro_avg_f_score()}"
        )
        for class_name in metric.get_classes():
            detailed_result += (
                f"\n{class_name:<10} tp: {metric.get_tp(class_name)} - fp: {metric.get_fp(class_name)} - "
                f"fn: {metric.get_fn(class_name)} - tn: {metric.get_tn(class_name)} - precision: "
                f"{metric.precision(class_name):.4f} - recall: {metric.recall(class_name):.4f} - "
                f"accuracy: {metric.accuracy(class_name):.4f} - f1-score: "
                f"{metric.f_score(class_name):.4f}")

        result = Result(
            main_score=metric.micro_avg_f_score(),
            log_line=f"{metric.precision()}\t{metric.recall()}\t{metric.micro_avg_f_score()}",
            log_header="PRECISION\tRECALL\tF1",
            detailed_results=detailed_result,
        )

        if out_path is not None:
            with open(out_path, "w", encoding="utf-8") as outfile:
                outfile.write("".join(lines))

        return result, eval_loss
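# The label-by-label branching above is a standard multi-label confusion
# count: for each label in the dictionary, every sentence contributes exactly
# one of tp/fp/fn/tn. A minimal sketch with plain sets (labels are made up;
# this is independent of flair's Metric class):
def confusion_counts(available_labels, predicted, true):
    counts = {}
    for label in available_labels:
        if label in predicted and label in true:
            counts[label] = 'tp'
        elif label in predicted:
            counts[label] = 'fp'
        elif label in true:
            counts[label] = 'fn'
        else:
            counts[label] = 'tn'
    return counts


assert confusion_counts(['pos', 'neg', 'neutral'],
                        predicted={'pos'},
                        true={'neg'}) == {'pos': 'fp', 'neg': 'fn', 'neutral': 'tn'}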
def evaluate(self, data_loader: DataLoader, out_path: Path = None,
             embeddings_storage_mode: str = 'cpu') -> (Result, float):
    with torch.no_grad():
        eval_loss = 0

        metric = Metric('Evaluation')

        lines = []
        batch_count = 0
        for batch in data_loader:
            batch_count += 1

            labels, loss = self.forward_labels_and_loss(batch)
            eval_loss += loss

            sentences_for_batch = [sent.to_plain_string() for sent in batch]
            confidences_for_batch = [[label.score for label in sent_labels]
                                     for sent_labels in labels]
            predictions_for_batch = [[label.value for label in sent_labels]
                                     for sent_labels in labels]
            true_values_for_batch = [
                sentence.get_label_names() for sentence in batch
            ]
            available_labels = self.label_dictionary.get_items()

            for sentence, confidence, prediction, true_value in zip(
                    sentences_for_batch, confidences_for_batch,
                    predictions_for_batch, true_values_for_batch):
                eval_line = '{}\t{}\t{}\t{}\n'.format(sentence, true_value,
                                                      prediction, confidence)
                lines.append(eval_line)

            for predictions_for_sentence, true_values_for_sentence in zip(
                    predictions_for_batch, true_values_for_batch):
                for label in available_labels:
                    if (label in predictions_for_sentence
                            and label in true_values_for_sentence):
                        metric.add_tp(label)
                    elif (label in predictions_for_sentence
                          and label not in true_values_for_sentence):
                        metric.add_fp(label)
                    elif (label not in predictions_for_sentence
                          and label in true_values_for_sentence):
                        metric.add_fn(label)
                    else:
                        metric.add_tn(label)

            store_embeddings(batch, embeddings_storage_mode)

        eval_loss /= batch_count

        detailed_result = (
            f'\nMICRO_AVG: acc {metric.micro_avg_accuracy()} - f1-score {metric.micro_avg_f_score()}'
            f'\nMACRO_AVG: acc {metric.macro_avg_accuracy()} - f1-score {metric.macro_avg_f_score()}'
        )
        for class_name in metric.get_classes():
            detailed_result += (
                f'\n{class_name:<10} tp: {metric.get_tp(class_name)} - fp: {metric.get_fp(class_name)} - '
                f'fn: {metric.get_fn(class_name)} - tn: {metric.get_tn(class_name)} - precision: '
                f'{metric.precision(class_name):.4f} - recall: {metric.recall(class_name):.4f} - '
                f'accuracy: {metric.accuracy(class_name):.4f} - f1-score: '
                f'{metric.f_score(class_name):.4f}')

        result = Result(
            main_score=metric.micro_avg_f_score(),
            log_line=f'{metric.precision()}\t{metric.recall()}\t{metric.micro_avg_f_score()}',
            log_header='PRECISION\tRECALL\tF1',
            detailed_results=detailed_result)

        if out_path is not None:
            with open(out_path, 'w', encoding='utf-8') as outfile:
                outfile.write(''.join(lines))

        return result, eval_loss
def evaluate(
    self,
    sentences: Dataset,
    eval_mini_batch_size: int = 32,
    embeddings_in_memory: bool = True,
    out_path: Path = None,
) -> (Result, float):

    with torch.no_grad():
        eval_loss = 0
        batch_no: int = 0

        batch_loader = torch.utils.data.DataLoader(
            sentences,
            batch_size=eval_mini_batch_size,
            shuffle=False,
            num_workers=4,
            collate_fn=list,
        )

        metric = Metric("Evaluation")

        lines: List[str] = []

        for batch in batch_loader:
            batch_no += 1

            with torch.no_grad():
                features = self.forward(batch)
                loss = self._calculate_loss(features, batch)
                tags, _ = self._obtain_labels(features, batch)

            eval_loss += loss

            for (sentence, sent_tags) in zip(batch, tags):
                for (token, tag) in zip(sentence.tokens, sent_tags):
                    token: Token = token
                    token.add_tag_label("predicted", tag)

                    # append both to file for evaluation
                    eval_line = "{} {} {} {}\n".format(
                        token.text,
                        token.get_tag(self.tag_type).value,
                        tag.value,
                        tag.score,
                    )
                    lines.append(eval_line)
                lines.append("\n")

            for sentence in batch:
                # make list of gold tags
                gold_tags = [(tag.tag, str(tag))
                             for tag in sentence.get_spans(self.tag_type)]
                # make list of predicted tags
                predicted_tags = [(tag.tag, str(tag))
                                  for tag in sentence.get_spans("predicted")]

                # check for true positives, false positives and false negatives
                for tag, prediction in predicted_tags:
                    if (tag, prediction) in gold_tags:
                        metric.add_tp(tag)
                    else:
                        metric.add_fp(tag)

                for tag, gold in gold_tags:
                    if (tag, gold) not in predicted_tags:
                        metric.add_fn(tag)
                    else:
                        metric.add_tn(tag)

            clear_embeddings(
                batch, also_clear_word_embeddings=not embeddings_in_memory)

        eval_loss /= batch_no

        if out_path is not None:
            with open(out_path, "w", encoding="utf-8") as outfile:
                outfile.write("".join(lines))

        detailed_result = (
            f"\nMICRO_AVG: acc {metric.micro_avg_accuracy()} - f1-score {metric.micro_avg_f_score()}"
            f"\nMACRO_AVG: acc {metric.macro_avg_accuracy()} - f1-score {metric.macro_avg_f_score()}"
        )
        for class_name in metric.get_classes():
            detailed_result += (
                f"\n{class_name:<10} tp: {metric.get_tp(class_name)} - fp: {metric.get_fp(class_name)} - "
                f"fn: {metric.get_fn(class_name)} - tn: {metric.get_tn(class_name)} - precision: "
                f"{metric.precision(class_name):.4f} - recall: {metric.recall(class_name):.4f} - "
                f"accuracy: {metric.accuracy(class_name):.4f} - f1-score: "
                f"{metric.f_score(class_name):.4f}")

        result = Result(
            main_score=metric.micro_avg_f_score(),
            log_line=f"{metric.precision()}\t{metric.recall()}\t{metric.micro_avg_f_score()}",
            log_header="PRECISION\tRECALL\tF1",
            detailed_results=detailed_result,
        )

        return result, eval_loss
def train(self,
          base_path: str,
          learning_rate: float = 0.1,
          mini_batch_size: int = 32,
          max_epochs: int = 50,
          anneal_factor: float = 0.5,
          patience: int = 5,
          save_model: bool = True,
          embeddings_in_memory: bool = False,
          train_with_dev: bool = False,
          eval_on_train: bool = True):
    """
    Trains the model using the training data of the corpus.
    :param patience: number of 'bad' epochs before the learning rate gets decreased
    :param anneal_factor: learning rate will be decreased by this factor
    :param base_path: the directory to which any results should be written
    :param learning_rate: the learning rate
    :param mini_batch_size: the mini batch size
    :param max_epochs: the maximum number of epochs to train
    :param save_model: whether the model should be saved
    :param embeddings_in_memory: whether embeddings should be kept in memory
    :param train_with_dev: whether the dev data set should be used for training
    :param eval_on_train: whether evaluation metrics should be computed on the training data set
    """
    loss_txt = init_output_file(base_path, 'loss.tsv')
    with open(loss_txt, 'a') as f:
        f.write('EPOCH\tTIMESTAMP\tTRAIN_LOSS\t{}\tDEV_LOSS\t{}\tTEST_LOSS\t{}\n'.format(
            Metric.tsv_header('TRAIN'), Metric.tsv_header('DEV'),
            Metric.tsv_header('TEST')))

    weight_extractor = WeightExtractor(base_path)

    optimizer = torch.optim.SGD(self.model.parameters(), lr=learning_rate)

    anneal_mode = 'min' if train_with_dev else 'max'
    scheduler: ReduceLROnPlateau = ReduceLROnPlateau(optimizer,
                                                     factor=anneal_factor,
                                                     patience=patience,
                                                     mode=anneal_mode)

    train_data = self.corpus.train

    # if training also uses dev data, include in training set
    if train_with_dev:
        train_data.extend(self.corpus.dev)

    # At any point you can hit Ctrl + C to break out of training early.
    try:
        # record overall best dev scores and best loss
        best_score = 0

        for epoch in range(max_epochs):
            log.info('-' * 100)

            if not self.test_mode:
                random.shuffle(train_data)

            # slice train_data (not self.corpus.train) so that shuffling and
            # the optional dev data are reflected in the batches
            batches = [
                train_data[x:x + mini_batch_size]
                for x in range(0, len(train_data), mini_batch_size)
            ]

            self.model.train()

            current_loss: float = 0
            seen_sentences = 0
            modulo = max(1, int(len(batches) / 10))

            for group in optimizer.param_groups:
                learning_rate = group['lr']

            for batch_no, batch in enumerate(batches):
                scores = self.model.forward(batch)
                loss = self.model.calculate_loss(scores, batch)

                optimizer.zero_grad()
                loss.backward()
                torch.nn.utils.clip_grad_norm_(self.model.parameters(), 5.0)
                optimizer.step()

                seen_sentences += len(batch)
                current_loss += loss.item()

                clear_embeddings(
                    batch, also_clear_word_embeddings=not embeddings_in_memory)

                if batch_no % modulo == 0:
                    log.info("epoch {0} - iter {1}/{2} - loss {3:.8f}".format(
                        epoch + 1, batch_no, len(batches),
                        current_loss / seen_sentences))
                    iteration = epoch * len(batches) + batch_no
                    weight_extractor.extract_weights(self.model.state_dict(),
                                                     iteration)

            current_loss /= len(train_data)

            self.model.eval()

            log.info('-' * 100)
            log.info("EPOCH {0}: lr {1:.4f} - bad epochs {2}".format(
                epoch + 1, learning_rate, scheduler.num_bad_epochs))

            dev_metric = train_metric = None
            dev_loss = '_'
            train_loss = current_loss

            if eval_on_train:
                train_metric, train_loss = self._calculate_evaluation_results_for(
                    'TRAIN', self.corpus.train, embeddings_in_memory,
                    mini_batch_size)

            if not train_with_dev:
                dev_metric, dev_loss = self._calculate_evaluation_results_for(
                    'DEV', self.corpus.dev, embeddings_in_memory,
                    mini_batch_size)

            with open(loss_txt, 'a') as f:
                train_metric_str = train_metric.to_tsv(
                ) if train_metric is not None else Metric.to_empty_tsv()
                dev_metric_str = dev_metric.to_tsv(
                ) if dev_metric is not None else Metric.to_empty_tsv()
                f.write('{}\t{:%H:%M:%S}\t{}\t{}\t{}\t{}\t{}\t{}\n'.format(
                    epoch, datetime.datetime.now(), train_loss,
                    train_metric_str, dev_loss, dev_metric_str, '_',
                    Metric.to_empty_tsv()))

            # anneal against train loss if training with dev, otherwise anneal against dev score
            if train_with_dev:
                scheduler.step(current_loss)
            else:
                scheduler.step(dev_metric.f_score())

            is_best_model_so_far: bool = False
            current_score = dev_metric.f_score(
            ) if not train_with_dev else train_metric.f_score()

            if current_score >= best_score:
                best_score = current_score
                is_best_model_so_far = True

            if is_best_model_so_far:
                if save_model:
                    self.model.save(base_path + "/model.pt")

        self.model.save(base_path + "/final-model.pt")

        if save_model:
            self.model = TextClassifier.load_from_file(base_path + "/model.pt")

        log.info('-' * 100)
        log.info('Testing using best model ...')

        self.model.eval()

        test_metrics, test_loss = self.evaluate(
            self.corpus.test,
            mini_batch_size=mini_batch_size,
            eval_class_metrics=True,
            embeddings_in_memory=embeddings_in_memory)

        for metric in test_metrics.values():
            metric.print()

        self.model.train()

        log.info('-' * 100)

    except KeyboardInterrupt:
        log.info('-' * 100)
        log.info('Exiting from training early.')
        log.info('Saving model ...')
        with open(base_path + "/final-model.pt", 'wb') as model_save_file:
            torch.save(self.model, model_save_file, pickle_protocol=4)
        log.info('Done.')
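# Both training loops in this file clip gradients to a maximum norm of 5.0
# before each optimizer step. A minimal sketch of what clip_grad_norm_ does
# to an oversized gradient (throwaway parameter, not the trainer's model):
import torch

p = torch.nn.Parameter(torch.zeros(3))
p.grad = torch.tensor([30.0, 40.0, 0.0])  # gradient norm is 50, above the cap
total_norm = torch.nn.utils.clip_grad_norm_([p], max_norm=5.0)
assert abs(float(total_norm) - 50.0) < 1e-4  # returns the norm before clipping
assert torch.allclose(p.grad, torch.tensor([3.0, 4.0, 0.0]), atol=1e-4)  # rescaled to norm 5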
def test_metric_with_classes():
    metric = Metric('Test')

    metric.add_tp('class-1')
    metric.add_tn('class-1')
    metric.add_tn('class-1')
    metric.add_fp('class-1')

    metric.add_tp('class-2')
    metric.add_tn('class-2')
    metric.add_tn('class-2')
    metric.add_fp('class-2')

    for i in range(0, 10):
        metric.add_tp('class-3')
    for i in range(0, 90):
        metric.add_fp('class-3')

    metric.add_tp('class-4')
    metric.add_tn('class-4')
    metric.add_tn('class-4')
    metric.add_fp('class-4')

    assert metric.precision('class-1') == 0.5
    assert metric.precision('class-2') == 0.5
    assert metric.precision('class-3') == 0.1
    assert metric.precision('class-4') == 0.5

    assert metric.recall('class-1') == 1
    assert metric.recall('class-2') == 1
    assert metric.recall('class-3') == 1
    assert metric.recall('class-4') == 1

    assert metric.accuracy() == metric.micro_avg_accuracy()
    assert metric.f_score() == metric.micro_avg_f_score()

    assert metric.f_score('class-1') == 0.6667
    assert metric.f_score('class-2') == 0.6667
    assert metric.f_score('class-3') == 0.1818
    assert metric.f_score('class-4') == 0.6667

    assert metric.accuracy('class-1') == 0.75
    assert metric.accuracy('class-2') == 0.75
    assert metric.accuracy('class-3') == 0.1
    assert metric.accuracy('class-4') == 0.75

    assert metric.micro_avg_f_score() == 0.2184
    assert metric.macro_avg_f_score() == 0.5714

    assert metric.micro_avg_accuracy() == 0.1696
    assert metric.macro_avg_accuracy() == 0.5875

    assert metric.precision() == 0.1226
    assert metric.recall() == 1
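# The expected values in the test above follow from two different
# aggregations: the micro averages pool raw counts across classes before
# computing ratios, while the macro F1 asserted here (0.5714) is derived from
# the macro-averaged precision and recall rather than by averaging per-class
# F1 scores (which would give 0.5455). A standalone sketch, assuming
# four-decimal rounding as the asserts imply:
counts = {
    'class-1': dict(tp=1, fp=1, fn=0, tn=2),
    'class-2': dict(tp=1, fp=1, fn=0, tn=2),
    'class-3': dict(tp=10, fp=90, fn=0, tn=0),
    'class-4': dict(tp=1, fp=1, fn=0, tn=2),
}

def precision(c):
    return round(c['tp'] / (c['tp'] + c['fp']), 4)

def recall(c):
    return round(c['tp'] / (c['tp'] + c['fn']), 4)

def f_score(p, r):
    return round(2 * p * r / (p + r), 4)

# micro average: pool the counts first, then compute the ratios
pooled = {key: sum(c[key] for c in counts.values()) for key in ('tp', 'fp', 'fn', 'tn')}
assert precision(pooled) == 0.1226 and recall(pooled) == 1
assert f_score(precision(pooled), recall(pooled)) == 0.2184

# macro average: average the per-class ratios, then derive F1 from them
macro_p = round(sum(precision(c) for c in counts.values()) / len(counts), 4)
macro_r = round(sum(recall(c) for c in counts.values()) / len(counts), 4)
assert f_score(macro_p, macro_r) == 0.5714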
def evaluate(self, data_loader: DataLoader, out_path: Path = None,
             embeddings_storage_mode: str = 'cpu') -> (Result, float):
    with torch.no_grad():
        eval_loss = 0
        batch_no = 0

        metric = Metric('Evaluation')

        lines = []
        for batch in data_loader:
            batch_no += 1

            with torch.no_grad():
                features = self.forward(batch)
                loss = self._calculate_loss(features, batch)
                tags, _ = self._obtain_labels(features, batch)

            eval_loss += loss

            for sentence, sent_tags in zip(batch, tags):
                for token, tag in zip(sentence.tokens, sent_tags):
                    token: Token = token
                    token.add_tag_label('predicted', tag)

                    # append both to file for evaluation
                    eval_line = '{} {} {} {}\n'.format(
                        token.text,
                        token.get_tag(self.tag_type).value, tag.value,
                        tag.score)
                    lines.append(eval_line)
                lines.append('\n')

            for sentence in batch:
                # make list of gold tags
                gold_tags = [(tag.tag, str(tag))
                             for tag in sentence.get_spans(self.tag_type)]
                # make list of predicted tags
                predicted_tags = [(tag.tag, str(tag))
                                  for tag in sentence.get_spans('predicted')]

                # check for true positives, false positives and false negatives
                for tag, prediction in predicted_tags:
                    if (tag, prediction) in gold_tags:
                        metric.add_tp(tag)
                    else:
                        metric.add_fp(tag)

                for tag, gold in gold_tags:
                    if (tag, gold) not in predicted_tags:
                        metric.add_fn(tag)
                    else:
                        metric.add_tn(tag)

            store_embeddings(batch, embeddings_storage_mode)

        eval_loss /= batch_no

        if out_path is not None:
            with open(out_path, 'w', encoding='utf-8') as outfile:
                outfile.write(''.join(lines))

        detailed_result = (
            f'\nMICRO_AVG: acc {metric.micro_avg_accuracy()} - f1-score {metric.micro_avg_f_score()}'
            f'\nMACRO_AVG: acc {metric.macro_avg_accuracy()} - f1-score {metric.macro_avg_f_score()}'
        )
        for class_name in metric.get_classes():
            detailed_result += (
                f'\n{class_name:<10} tp: {metric.get_tp(class_name)} - fp: {metric.get_fp(class_name)} - '
                f'fn: {metric.get_fn(class_name)} - tn: {metric.get_tn(class_name)} - precision: '
                f'{metric.precision(class_name):.4f} - recall: {metric.recall(class_name):.4f} - '
                f'accuracy: {metric.accuracy(class_name):.4f} - f1-score: '
                f'{metric.f_score(class_name):.4f}')

        result = Result(
            main_score=metric.micro_avg_f_score(),
            log_line=f'{metric.precision()}\t{metric.recall()}\t{metric.micro_avg_f_score()}',
            log_header='PRECISION\tRECALL\tF1',
            detailed_results=detailed_result)

        return result, eval_loss
def evaluate(
    self,
    data_loader: DataLoader,
    out_path: Path = None,
    embedding_storage_mode: str = "none",
) -> (Result, float):

    if type(out_path) == str:
        out_path = Path(out_path)

    metric = Metric("Evaluation", beta=self.beta)
    parsing_metric = ParsingMetric()

    lines: List[str] = []

    eval_loss_arc = 0
    eval_loss_rel = 0

    for batch_idx, batch in enumerate(data_loader):

        with torch.no_grad():
            score_arc, score_rel = self.forward(batch)
            loss_arc, loss_rel = self._calculate_loss(score_arc, score_rel, batch)
            arc_prediction, relation_prediction = self._obtain_labels_(
                score_arc, score_rel)

        parsing_metric(arc_prediction, relation_prediction, batch)

        eval_loss_arc += loss_arc
        eval_loss_rel += loss_rel

        for (sentence, arcs, sent_tags) in zip(batch, arc_prediction,
                                               relation_prediction):
            for (token, arc, tag) in zip(sentence.tokens, arcs, sent_tags):
                token: Token = token
                token.add_tag_label("predicted", Label(tag))
                token.add_tag_label("predicted_head_id", Label(str(arc)))

                # append both to file for evaluation
                eval_line = "{} {} {} {} {}\n".format(
                    token.text,
                    token.tags['dependency'].value,
                    str(token.head_id),
                    tag,
                    str(arc),
                )
                lines.append(eval_line)
            lines.append("\n")

        for sentence in batch:

            # make list of gold tags
            gold_tags = [
                token.tags['dependency'].value for token in sentence.tokens
            ]
            # make list of predicted tags
            predicted_tags = [tag.tag for tag in sentence.get_spans("predicted")]

            # check for true positives, false positives and false negatives,
            # attributing each count to the relation tag being compared
            for tag_indx, predicted_tag in enumerate(predicted_tags):
                if predicted_tag == gold_tags[tag_indx]:
                    metric.add_tp(predicted_tag)
                else:
                    metric.add_fp(predicted_tag)

            for tag_indx, label_tag in enumerate(gold_tags):
                if label_tag != predicted_tags[tag_indx]:
                    metric.add_fn(label_tag)
                else:
                    metric.add_tn(label_tag)

        store_embeddings(batch, embedding_storage_mode)

    eval_loss_arc /= len(data_loader)
    eval_loss_rel /= len(data_loader)

    if out_path is not None:
        with open(out_path, "w", encoding="utf-8") as outfile:
            outfile.write("".join(lines))

    detailed_result = (
        f"\nUAS : {parsing_metric.get_uas():.4f} - LAS : {parsing_metric.get_las():.4f}"
        f"\neval loss rel : {eval_loss_rel:.4f} - eval loss arc : {eval_loss_arc:.4f}"
        f"\nMICRO_AVG: acc {metric.micro_avg_accuracy()} - f1-score {metric.micro_avg_f_score()}"
        f"\nMACRO_AVG: acc {metric.macro_avg_accuracy()} - f1-score {metric.macro_avg_f_score()}"
    )
    for class_name in metric.get_classes():
        detailed_result += (
            f"\n{class_name:<10} tp: {metric.get_tp(class_name)} - fp: {metric.get_fp(class_name)} - "
            f"fn: {metric.get_fn(class_name)} - tn: {metric.get_tn(class_name)} - precision: "
            f"{metric.precision(class_name):.4f} - recall: {metric.recall(class_name):.4f} - "
            f"accuracy: {metric.accuracy(class_name):.4f} - f1-score: "
            f"{metric.f_score(class_name):.4f}")

    result = Result(
        main_score=metric.micro_avg_f_score(),
        log_line=f"{metric.precision()}\t{metric.recall()}\t{metric.micro_avg_f_score()}",
        log_header="PRECISION\tRECALL\tF1",
        detailed_results=detailed_result,
    )

    return result, eval_loss_arc + eval_loss_rel
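# ParsingMetric above tracks UAS/LAS. By the usual definitions, UAS is the
# fraction of tokens whose predicted head is correct, and LAS additionally
# requires the correct relation label. A minimal sketch over parallel lists
# (hypothetical data, independent of flair's ParsingMetric):
def uas_las(gold_heads, gold_rels, pred_heads, pred_rels):
    total = len(gold_heads)
    head_correct = sum(g == p for g, p in zip(gold_heads, pred_heads))
    both_correct = sum(
        gh == ph and gr == pr
        for gh, gr, ph, pr in zip(gold_heads, gold_rels, pred_heads, pred_rels)
    )
    return head_correct / total, both_correct / total


uas, las = uas_las([2, 0, 2], ['nsubj', 'root', 'obj'],
                   [2, 0, 1], ['nsubj', 'root', 'obj'])
assert (uas, las) == (2 / 3, 2 / 3)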
def eval_flair_spans(data, predicted_list, batch_size, out_path=None):
    metric = Metric('Evaluation')

    mini_batch_size = batch_size
    batches = [
        data[x:x + mini_batch_size]
        for x in range(0, len(data), mini_batch_size)
    ]

    lines: List[str] = []
    word_counter = 0

    for batch in batches:
        for sentence in batch:
            for token in sentence.tokens:
                tag = Label(predicted_list[word_counter])
                word_counter += 1
                token.add_tag_label('predicted', tag)

                # append both to file for evaluation
                eval_line = '{} {} {} {}\n'.format(token.text,
                                                   token.get_tag('ner').value,
                                                   tag.value, tag.score)
                lines.append(eval_line)
            lines.append('\n')

        for sentence in batch:
            # make list of gold tags
            gold_tags = [(tag.tag, str(tag))
                         for tag in sentence.get_spans('ner')]
            # make list of predicted tags
            predicted_tags = [(tag.tag, str(tag))
                              for tag in sentence.get_spans('predicted')]

            # check for true positives, false positives and false negatives
            for tag, prediction in predicted_tags:
                if (tag, prediction) in gold_tags:
                    metric.add_tp(tag)
                else:
                    metric.add_fp(tag)

            for tag, gold in gold_tags:
                if (tag, gold) not in predicted_tags:
                    metric.add_fn(tag)
                else:
                    metric.add_tn(tag)

    # add metric scores at the beginning of the file
    lines.insert(0, str(metric) + "\n\n")

    if out_path is not None:
        # create folder for json and corresponding output
        if not os.path.exists(os.path.dirname(out_path)):
            try:
                os.makedirs(os.path.dirname(out_path))
            except OSError as exc:
                # guard against race condition
                if exc.errno != errno.EEXIST:
                    raise
        with open(out_path, "w", encoding='utf-8') as outfile:
            outfile.write(''.join(lines))

    return metric
def _evaluate_text_classifier(model,
                              sentences,
                              eval_mini_batch_size=32,
                              embeddings_in_memory=False,
                              out_path=None):
    with torch.no_grad():
        eval_loss = 0

        batches = [
            sentences[x:x + eval_mini_batch_size]
            for x in range(0, len(sentences), eval_mini_batch_size)
        ]

        metric = Metric('Evaluation')
        lines = []

        for batch in batches:
            labels, loss = model.forward_labels_and_loss(batch)

            clear_embeddings(
                batch, also_clear_word_embeddings=not embeddings_in_memory)

            eval_loss += loss

            sentences_for_batch = [sent.to_plain_string() for sent in batch]
            confidences_for_batch = [[label.score for label in sent_labels]
                                     for sent_labels in labels]
            predictions_for_batch = [[label.value for label in sent_labels]
                                     for sent_labels in labels]
            true_values_for_batch = [
                sentence.get_label_names() for sentence in batch
            ]
            available_labels = model.label_dictionary.get_items()

            for sentence, confidence, prediction, true_value in zip(
                    sentences_for_batch, confidences_for_batch,
                    predictions_for_batch, true_values_for_batch):
                eval_line = '{}\t{}\t{}\t{}\n'.format(sentence, true_value,
                                                      prediction, confidence)
                lines.append(eval_line)

            for predictions_for_sentence, true_values_for_sentence in zip(
                    predictions_for_batch, true_values_for_batch):
                ModelTrainer._evaluate_sentence_for_text_classification(
                    metric, available_labels, predictions_for_sentence,
                    true_values_for_sentence)

        eval_loss /= len(sentences)

        if out_path is not None:
            with open(out_path, "w", encoding='utf-8') as outfile:
                outfile.write(''.join(lines))

        return metric, eval_loss
def _evaluate_sequence_tagger(model,
                              sentences,
                              eval_mini_batch_size=32,
                              embeddings_in_memory=True,
                              out_path=None):
    with torch.no_grad():
        eval_loss = 0
        batch_no = 0

        batches = [
            sentences[x:x + eval_mini_batch_size]
            for x in range(0, len(sentences), eval_mini_batch_size)
        ]

        metric = Metric('Evaluation')
        lines = []

        for batch in batches:
            batch_no += 1

            tags, loss = model.forward_labels_and_loss(batch)
            eval_loss += loss

            for sentence, sent_tags in zip(batch, tags):
                for token, tag in zip(sentence.tokens, sent_tags):
                    token: Token = token
                    token.add_tag_label('predicted', tag)

                    # append both to file for evaluation
                    eval_line = '{} {} {} {}\n'.format(
                        token.text,
                        token.get_tag(model.tag_type).value, tag.value,
                        tag.score)
                    lines.append(eval_line)
                lines.append('\n')

            for sentence in batch:
                # make list of gold tags
                gold_tags = [(tag.tag, str(tag))
                             for tag in sentence.get_spans(model.tag_type)]
                # make list of predicted tags
                predicted_tags = [(tag.tag, str(tag))
                                  for tag in sentence.get_spans('predicted')]

                # check for true positives, false positives and false negatives
                for tag, prediction in predicted_tags:
                    if (tag, prediction) in gold_tags:
                        metric.add_tp(tag)
                    else:
                        metric.add_fp(tag)

                for tag, gold in gold_tags:
                    if (tag, gold) not in predicted_tags:
                        metric.add_fn(tag)
                    else:
                        metric.add_tn(tag)

            clear_embeddings(
                batch, also_clear_word_embeddings=not embeddings_in_memory)

        eval_loss /= len(sentences)

        if out_path is not None:
            with open(out_path, "w", encoding='utf-8') as outfile:
                outfile.write(''.join(lines))

        return metric, eval_loss
def train(self,
          base_path,
          evaluation_metric=EvaluationMetric.MICRO_F1_SCORE,
          learning_rate=0.1,
          mini_batch_size=32,
          eval_mini_batch_size=None,
          max_epochs=100,
          anneal_factor=0.5,
          patience=3,
          anneal_against_train_loss=True,
          train_with_dev=False,
          monitor_train=False,
          embeddings_in_memory=True,
          checkpoint=False,
          save_final_model=True,
          anneal_with_restarts=False,
          test_mode=False,
          param_selection_mode=False,
          **kwargs):

    if eval_mini_batch_size is None:
        eval_mini_batch_size = mini_batch_size

    # cast string to Path
    if type(base_path) is str:
        base_path = Path(base_path)

    add_file_handler(log, base_path / 'training.log')

    log_line(log)
    log.info(f'Evaluation method: {evaluation_metric.name}')

    if not param_selection_mode:
        loss_txt = init_output_file(base_path, 'loss.tsv')
        with open(loss_txt, 'a') as f:
            f.write(
                f'EPOCH\tTIMESTAMP\tBAD_EPOCHS\tLEARNING_RATE\tTRAIN_LOSS\t{Metric.tsv_header("TRAIN")}\tDEV_LOSS\t{Metric.tsv_header("DEV")}'
                f'\tTEST_LOSS\t{Metric.tsv_header("TEST")}\n')

        weight_extractor = WeightExtractor(base_path)

    optimizer = self.optimizer(self.model.parameters(), lr=learning_rate, **kwargs)
    if self.optimizer_state is not None:
        optimizer.load_state_dict(self.optimizer_state)

    # annealing scheduler
    anneal_mode = 'min' if anneal_against_train_loss else 'max'
    if isinstance(optimizer, (AdamW, SGDW)):
        scheduler = ReduceLRWDOnPlateau(optimizer,
                                        factor=anneal_factor,
                                        patience=patience,
                                        mode=anneal_mode,
                                        verbose=True)
    else:
        scheduler = ReduceLROnPlateau(optimizer,
                                      factor=anneal_factor,
                                      patience=patience,
                                      mode=anneal_mode,
                                      verbose=True)
    if self.scheduler_state is not None:
        scheduler.load_state_dict(self.scheduler_state)

    train_data = self.corpus.train

    # if training also uses dev data, include in training set
    if train_with_dev:
        train_data.extend(self.corpus.dev)

    dev_score_history = []
    dev_loss_history = []
    train_loss_history = []

    # At any point you can hit Ctrl + C to break out of training early.
    try:
        previous_learning_rate = learning_rate

        for epoch in range(0 + self.epoch, max_epochs + self.epoch):
            log_line(log)

            try:
                bad_epochs = scheduler.num_bad_epochs
            except AttributeError:
                bad_epochs = 0
            for group in optimizer.param_groups:
                learning_rate = group['lr']

            # reload last best model if annealing with restarts is enabled
            if (learning_rate != previous_learning_rate
                    and anneal_with_restarts
                    and (base_path / 'best-model.pt').exists()):
                log.info('resetting to best model')
                self.model.load_from_file(base_path / 'best-model.pt')

            previous_learning_rate = learning_rate

            # stop training if learning rate becomes too small
            if learning_rate < 0.0001:
                log_line(log)
                log.info('learning rate too small - quitting training!')
                log_line(log)
                break

            if not test_mode:
                random.shuffle(train_data)

            batches = [
                train_data[x:x + mini_batch_size]
                for x in range(0, len(train_data), mini_batch_size)
            ]

            self.model.train()

            train_loss = 0
            seen_sentences = 0
            modulo = max(1, int(len(batches) / 10))

            for batch_no, batch in enumerate(batches):
                loss = self.model.forward_loss(batch)

                optimizer.zero_grad()
                loss.backward()
                torch.nn.utils.clip_grad_norm_(self.model.parameters(), 5.0)
                optimizer.step()

                seen_sentences += len(batch)
                train_loss += loss.item()

                clear_embeddings(
                    batch, also_clear_word_embeddings=not embeddings_in_memory)

                if batch_no % modulo == 0:
                    log.info(
                        f'epoch {epoch + 1} - iter {batch_no}/{len(batches)} - loss '
                        f'{train_loss / seen_sentences:.8f}')
                    iteration = epoch * len(batches) + batch_no
                    if not param_selection_mode:
                        weight_extractor.extract_weights(
                            self.model.state_dict(), iteration)

            train_loss /= len(train_data)

            self.model.eval()

            log_line(log)
            log.info(
                f'EPOCH {epoch + 1} done: loss {train_loss:.4f} - lr {learning_rate:.4f} - bad epochs {bad_epochs}'
            )

            dev_metric = None
            dev_loss = '_'

            train_metric = None
            test_metric = None
            if monitor_train:
                train_metric, train_loss = self._calculate_evaluation_results_for(
                    'TRAIN', self.corpus.train, evaluation_metric,
                    embeddings_in_memory, eval_mini_batch_size)

            if not train_with_dev:
                dev_metric, dev_loss = self._calculate_evaluation_results_for(
                    'DEV', self.corpus.dev, evaluation_metric,
                    embeddings_in_memory, eval_mini_batch_size)

            if not param_selection_mode and self.corpus.test:
                test_metric, test_loss = self._calculate_evaluation_results_for(
                    'TEST', self.corpus.test, evaluation_metric,
                    embeddings_in_memory, eval_mini_batch_size,
                    base_path / 'test.tsv')

            if not param_selection_mode:
                with open(loss_txt, 'a') as f:
                    train_metric_str = train_metric.to_tsv(
                    ) if train_metric is not None else Metric.to_empty_tsv()
                    dev_metric_str = dev_metric.to_tsv(
                    ) if dev_metric is not None else Metric.to_empty_tsv()
                    test_metric_str = test_metric.to_tsv(
                    ) if test_metric is not None else Metric.to_empty_tsv()
                    f.write(
                        f'{epoch}\t{datetime.datetime.now():%H:%M:%S}\t{bad_epochs}\t{learning_rate:.4f}\t'
                        f'{train_loss}\t{train_metric_str}\t{dev_loss}\t{dev_metric_str}\t_\t{test_metric_str}\n'
                    )

            # calculate scores using dev data if available
            dev_score = 0.
            if not train_with_dev:
                if evaluation_metric == EvaluationMetric.MACRO_ACCURACY:
                    dev_score = dev_metric.macro_avg_accuracy()
                elif evaluation_metric == EvaluationMetric.MICRO_ACCURACY:
                    dev_score = dev_metric.micro_avg_accuracy()
                elif evaluation_metric == EvaluationMetric.MACRO_F1_SCORE:
                    dev_score = dev_metric.macro_avg_f_score()
                else:
                    dev_score = dev_metric.micro_avg_f_score()

                # append dev score to score history
                dev_score_history.append(dev_score)
                dev_loss_history.append(dev_loss.item())

            # anneal against train loss if training with dev, otherwise anneal against dev score
            current_score = train_loss if anneal_against_train_loss else dev_score
            scheduler.step(current_score)

            train_loss_history.append(train_loss)

            # if checkpointing is enabled, save model at each epoch
            if checkpoint and not param_selection_mode:
                self.model.save_checkpoint(base_path / 'checkpoint.pt',
                                           optimizer.state_dict(),
                                           scheduler.state_dict(), epoch + 1,
                                           train_loss)

            # if we use dev data, remember best model based on dev evaluation score
            if (not train_with_dev and not param_selection_mode
                    and current_score == scheduler.best):
                self.model.save(base_path / 'best-model.pt')

        # if we do not use dev data for model selection, save final model
        if save_final_model and not param_selection_mode:
            self.model.save(base_path / 'final-model.pt')

    except KeyboardInterrupt:
        log_line(log)
        log.info('Exiting from training early.')
        if not param_selection_mode:
            log.info('Saving model ...')
            self.model.save(base_path / 'final-model.pt')
            log.info('Done.')

    # test best model if test data is present
    if self.corpus.test:
        final_score = self.final_test(base_path, embeddings_in_memory,
                                      evaluation_metric, eval_mini_batch_size)
    else:
        final_score = 0
        log.info('Test data not provided, setting final score to 0')

    return {
        'test_score': final_score,
        'dev_score_history': dev_score_history,
        'train_loss_history': train_loss_history,
        'dev_loss_history': dev_loss_history
    }
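# The annealing in the trainers above hinges on ReduceLROnPlateau's `mode`:
# 'min' when stepping with a training loss (lower is better), 'max' when
# stepping with a dev score. A minimal sketch of the plateau behaviour with a
# throwaway parameter (the score values are made up):
import torch
from torch.optim.lr_scheduler import ReduceLROnPlateau

param = torch.nn.Parameter(torch.zeros(1))
optimizer = torch.optim.SGD([param], lr=0.1)
scheduler = ReduceLROnPlateau(optimizer, mode='max', factor=0.5, patience=2)

for dev_score in [0.70, 0.71, 0.71, 0.71, 0.71]:  # score plateaus after epoch 2
    scheduler.step(dev_score)

# patience ran out: the learning rate was halved from 0.1 to 0.05
assert abs(optimizer.param_groups[0]['lr'] - 0.05) < 1e-12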
def train(
    self,
    base_path: str,
    learning_rate: float = 0.1,
    mini_batch_size: int = 32,
    max_epochs: int = 100,
    anneal_factor: float = 0.5,
    patience: int = 4,
    train_with_dev: bool = False,
    embeddings_in_memory: bool = True,
    checkpoint: bool = False,
    save_final_model: bool = True,
    anneal_with_restarts: bool = False,
):
    evaluation_method = 'F1'
    if self.model.tag_type in ['pos', 'upos']:
        evaluation_method = 'accuracy'
    log.info('Evaluation method: {}'.format(evaluation_method))

    loss_txt = init_output_file(base_path, 'loss.tsv')
    with open(loss_txt, 'a') as f:
        f.write('EPOCH\tTIMESTAMP\tTRAIN_LOSS\t{}\tDEV_LOSS\t{}\tTEST_LOSS\t{}\n'.format(
            Metric.tsv_header('TRAIN'), Metric.tsv_header('DEV'),
            Metric.tsv_header('TEST')))

    weight_extractor = WeightExtractor(base_path)

    optimizer = torch.optim.SGD(self.model.parameters(), lr=learning_rate)

    # annealing scheduler
    anneal_mode = 'min' if train_with_dev else 'max'
    scheduler = ReduceLROnPlateau(optimizer,
                                  factor=anneal_factor,
                                  patience=patience,
                                  mode=anneal_mode,
                                  verbose=True)

    train_data = self.corpus.train

    # if training also uses dev data, include in training set
    if train_with_dev:
        train_data.extend(self.corpus.dev)

    # At any point you can hit Ctrl + C to break out of training early.
    try:
        previous_learning_rate = learning_rate

        for epoch in range(0, max_epochs):
            bad_epochs = scheduler.num_bad_epochs
            for group in optimizer.param_groups:
                learning_rate = group['lr']

            # reload last best model if annealing with restarts is enabled
            if learning_rate != previous_learning_rate and anneal_with_restarts:
                log.info('resetting to best model')
                self.model.load_from_file(base_path + "/best-model.pt")

            previous_learning_rate = learning_rate

            # stop training if learning rate becomes too small
            if learning_rate < 0.001:
                log.info('learning rate too small - quitting training!')
                break

            if not self.test_mode:
                random.shuffle(train_data)

            batches = [
                train_data[x:x + mini_batch_size]
                for x in range(0, len(train_data), mini_batch_size)
            ]

            self.model.train()

            current_loss: float = 0
            seen_sentences = 0
            modulo = max(1, int(len(batches) / 10))

            for batch_no, batch in enumerate(batches):
                batch: List[Sentence] = batch

                optimizer.zero_grad()

                # Step 4. Compute the loss, gradients, and update the
                # parameters by calling optimizer.step()
                loss = self.model.neg_log_likelihood(batch)

                current_loss += loss.item()
                seen_sentences += len(batch)

                loss.backward()
                torch.nn.utils.clip_grad_norm_(self.model.parameters(), 5.0)
                optimizer.step()

                if not embeddings_in_memory:
                    self.clear_embeddings_in_batch(batch)

                if batch_no % modulo == 0:
                    log.info("epoch {0} - iter {1}/{2} - loss {3:.8f}".format(
                        epoch + 1, batch_no, len(batches),
                        current_loss / seen_sentences))
                    iteration = epoch * len(batches) + batch_no
                    weight_extractor.extract_weights(self.model.state_dict(),
                                                     iteration)

            current_loss /= len(train_data)

            # switch to eval mode
            self.model.eval()

            # if checkpointing is enabled, save model at each epoch
            if checkpoint:
                self.model.save(base_path + "/checkpoint.pt")

            log.info('-' * 100)

            dev_score = dev_metric = None
            if not train_with_dev:
                dev_score, dev_metric = self.evaluate(
                    self.corpus.dev,
                    base_path,
                    evaluation_method=evaluation_method,
                    embeddings_in_memory=embeddings_in_memory)

            test_score, test_metric = self.evaluate(
                self.corpus.test,
                base_path,
                evaluation_method=evaluation_method,
                embeddings_in_memory=embeddings_in_memory)

            # anneal against train loss if training with dev, otherwise anneal against dev score
            if train_with_dev:
                scheduler.step(current_loss)
            else:
                scheduler.step(dev_score)

            # logging info
            log.info("EPOCH {0}: lr {1:.4f} - bad epochs {2}".format(
                epoch + 1, learning_rate, bad_epochs))
            if not train_with_dev:
                log.info(
                    "{0:<4}: f-score {1:.4f} - acc {2:.4f} - tp {3} - fp {4} - fn {5} - tn {6}"
                    .format('DEV', dev_metric.f_score(), dev_metric.accuracy(),
                            dev_metric._tp, dev_metric._fp, dev_metric._fn,
                            dev_metric._tn))
            log.info(
                "{0:<4}: f-score {1:.4f} - acc {2:.4f} - tp {3} - fp {4} - fn {5} - tn {6}"
                .format('TEST', test_metric.f_score(), test_metric.accuracy(),
                        test_metric._tp, test_metric._fp, test_metric._fn,
                        test_metric._tn))

            with open(loss_txt, 'a') as f:
                dev_metric_str = dev_metric.to_tsv(
                ) if dev_metric is not None else Metric.to_empty_tsv()
                f.write('{}\t{:%H:%M:%S}\t{}\t{}\t{}\t{}\t{}\t{}\n'.format(
                    epoch, datetime.datetime.now(), '_',
                    Metric.to_empty_tsv(), '_', dev_metric_str, '_',
                    test_metric.to_tsv()))

            # if we use dev data, remember best model based on dev evaluation score
            if not train_with_dev and dev_score == scheduler.best:
                self.model.save(base_path + "/best-model.pt")

        # if we do not use dev data for model selection, save final model
        if save_final_model:
            if train_with_dev:
                self.model.save(base_path + "/final-model.pt")

    except KeyboardInterrupt:
        log.info('-' * 100)
        log.info('Exiting from training early.')
        log.info('Saving model ...')
        self.model.save(base_path + "/final-model.pt")
        log.info('Done.')
def test_metric_get_classes():
    metric = Metric('Test')

    metric.add_fn('class-1')
    metric.add_fn('class-3')
    metric.add_tn('class-1')
    metric.add_tp('class-2')

    assert 3 == len(metric.get_classes())
    assert 'class-1' in metric.get_classes()
    assert 'class-2' in metric.get_classes()
    assert 'class-3' in metric.get_classes()
def evaluate(
    self,
    data_loader: DataLoader,
    out_path: Path = None,
    embedding_storage_mode: str = "none",
) -> (Result, float):

    if type(out_path) == str:
        out_path = Path(out_path)

    with torch.no_grad():
        eval_loss = 0
        batch_no: int = 0

        metric = Metric("Evaluation", beta=self.beta)

        lines: List[str] = []

        if self.use_crf:
            transitions = self.transitions.detach().cpu().numpy()
        else:
            transitions = None

        for batch in data_loader:
            batch_no += 1

            with torch.no_grad():
                features = self.forward(batch)
                loss = self._calculate_loss(features, batch)
                tags, _ = self._obtain_labels(
                    feature=features,
                    batch_sentences=batch,
                    transitions=transitions,
                    get_all_tags=False,
                )

            eval_loss += loss

            for (sentence, sent_tags) in zip(batch, tags):
                for (token, tag) in zip(sentence.tokens, sent_tags):
                    token: Token = token
                    token.add_tag("predicted", tag.value, tag.score)

                    # append both to file for evaluation
                    eval_line = "{} {} {} {}\n".format(
                        token.text,
                        token.get_tag(self.tag_type).value,
                        tag.value,
                        tag.score,
                    )
                    lines.append(eval_line)
                lines.append("\n")

            for sentence in batch:
                # make list of gold tags
                gold_tags = [(tag.tag, str(tag))
                             for tag in sentence.get_spans(self.tag_type)]
                # make list of predicted tags
                predicted_tags = [(tag.tag, str(tag))
                                  for tag in sentence.get_spans("predicted")]

                # check for true positives, false positives and false negatives
                for tag, prediction in predicted_tags:
                    if (tag, prediction) in gold_tags:
                        metric.add_tp(tag)
                    else:
                        metric.add_fp(tag)

                for tag, gold in gold_tags:
                    if (tag, gold) not in predicted_tags:
                        metric.add_fn(tag)
                    else:
                        metric.add_tn(tag)

            store_embeddings(batch, embedding_storage_mode)

        eval_loss /= batch_no

        if out_path is not None:
            with open(out_path, "w", encoding="utf-8") as outfile:
                outfile.write("".join(lines))

        detailed_result = (
            f"\nMICRO_AVG: acc {metric.micro_avg_accuracy()} - f1-score {metric.micro_avg_f_score()}"
            f"\nMACRO_AVG: acc {metric.macro_avg_accuracy()} - f1-score {metric.macro_avg_f_score()}"
        )
        for class_name in metric.get_classes():
            detailed_result += (
                f"\n{class_name:<10} tp: {metric.get_tp(class_name)} - fp: {metric.get_fp(class_name)} - "
                f"fn: {metric.get_fn(class_name)} - tn: {metric.get_tn(class_name)} - precision: "
                f"{metric.precision(class_name):.4f} - recall: {metric.recall(class_name):.4f} - "
                f"accuracy: {metric.accuracy(class_name):.4f} - f1-score: "
                f"{metric.f_score(class_name):.4f}")

        result = Result(
            main_score=metric.micro_avg_f_score(),
            log_line=f"{metric.precision()}\t{metric.recall()}\t{metric.micro_avg_f_score()}",
            log_header="PRECISION\tRECALL\tF1",
            detailed_results=detailed_result,
        )

        return result, eval_loss
def evaluate(
    self,
    sentences: List[Sentence],
    eval_mini_batch_size: int = 32,
    embeddings_in_memory: bool = False,
    out_path: Path = None,
) -> (Result, float):

    with torch.no_grad():
        eval_loss = 0

        batches = [
            sentences[x:x + eval_mini_batch_size]
            for x in range(0, len(sentences), eval_mini_batch_size)
        ]

        metric = Metric("Evaluation")
        lines: List[str] = []

        for batch in batches:
            labels, loss = self.forward_labels_and_loss(batch)

            clear_embeddings(
                batch, also_clear_word_embeddings=not embeddings_in_memory)

            eval_loss += loss

            sentences_for_batch = [sent.to_plain_string() for sent in batch]
            confidences_for_batch = [[label.score for label in sent_labels]
                                     for sent_labels in labels]
            predictions_for_batch = [[label.value for label in sent_labels]
                                     for sent_labels in labels]
            true_values_for_batch = [
                sentence.get_label_names() for sentence in batch
            ]
            available_labels = self.label_dictionary.get_items()

            for sentence, confidence, prediction, true_value in zip(
                    sentences_for_batch,
                    confidences_for_batch,
                    predictions_for_batch,
                    true_values_for_batch,
            ):
                eval_line = "{}\t{}\t{}\t{}\n".format(
                    sentence, true_value, prediction, confidence)
                lines.append(eval_line)

            for predictions_for_sentence, true_values_for_sentence in zip(
                    predictions_for_batch, true_values_for_batch):
                for label in available_labels:
                    if (label in predictions_for_sentence
                            and label in true_values_for_sentence):
                        metric.add_tp(label)
                    elif (label in predictions_for_sentence
                          and label not in true_values_for_sentence):
                        metric.add_fp(label)
                    elif (label not in predictions_for_sentence
                          and label in true_values_for_sentence):
                        metric.add_fn(label)
                    else:
                        metric.add_tn(label)

        eval_loss /= len(sentences)

        detailed_result = (
            f"\nMICRO_AVG: acc {metric.micro_avg_accuracy()} - f1-score {metric.micro_avg_f_score()}"
            f"\nMACRO_AVG: acc {metric.macro_avg_accuracy()} - f1-score {metric.macro_avg_f_score()}"
        )
        for class_name in metric.get_classes():
            detailed_result += (
                f"\n{class_name:<10} tp: {metric.get_tp(class_name)} - fp: {metric.get_fp(class_name)} - "
                f"fn: {metric.get_fn(class_name)} - tn: {metric.get_tn(class_name)} - precision: "
                f"{metric.precision(class_name):.4f} - recall: {metric.recall(class_name):.4f} - "
                f"accuracy: {metric.accuracy(class_name):.4f} - f1-score: "
                f"{metric.f_score(class_name):.4f}")

        result = Result(
            main_score=metric.micro_avg_f_score(),
            log_line=f"{metric.precision()}\t{metric.recall()}\t{metric.micro_avg_f_score()}",
            log_header="PRECISION\tRECALL\tF1",
            detailed_results=detailed_result,
        )

        if out_path is not None:
            with open(out_path, "w", encoding="utf-8") as outfile:
                outfile.write("".join(lines))

        return result, eval_loss
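# All classifier `evaluate` variants above write one row per sentence:
# text, gold labels, predicted labels, confidences, separated by tabs
# (format taken from the eval_line construction). A tiny sketch that splits
# such a row back into its four fields (the row itself is made up):
row = "a great movie\t['POSITIVE']\t['POSITIVE']\t[0.98]\n"
sentence, true_value, prediction, confidence = row.rstrip("\n").split("\t")
assert sentence == "a great movie"
assert prediction == "['POSITIVE']"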