def load_corpus(data_folder, tag_column, attr, downsample_perc):
    """Build a ``ColumnCorpus`` from ``<data_folder>/<attr>`` and optionally downsample it.

    :param data_folder: base directory; ``attr`` is joined onto it
    :param tag_column: index of the column that is read as the ``'ner'`` tag
    :param attr: sub-path under ``data_folder`` that holds train.txt, dev.txt and test.txt
    :param downsample_perc: when greater than 0, forwarded to ``corpus.downsample``
        as ``percentage`` with ``only_downsample_train=True``
    :return: the corpus, or the value returned by ``corpus.downsample`` when it ran
    """
    corpus_dir = Path(data_folder) / attr
    column_format = {0: 'text', tag_column: 'ner'}
    corpus = ColumnCorpus(
        corpus_dir,
        column_format,
        train_file='train.txt',
        test_file='test.txt',
        dev_file='dev.txt',
    )

    # Guard clause: nothing more to do unless a positive percentage was requested.
    if not downsample_perc > 0:
        return corpus

    print('Downsampling: ', downsample_perc)
    return corpus.downsample(percentage=downsample_perc, only_downsample_train=True)
def __init__(self, dataset_name, output_path="/kaggle/working"):
    """Load the column corpus of a Kaggle input dataset and derive its tag dictionary.

    :param dataset_name: name of the directory under ``/kaggle/input`` to read from
    :param output_path: stored on the instance as ``self.output_path``
    """
    self.dataset_name = dataset_name
    self.output_path = output_path
    self.tag_type = "ner"

    kaggle_input_dir = f"/kaggle/input/{dataset_name}"
    self.corpus = ColumnCorpus(
        data_folder=kaggle_input_dir,
        column_format={0: "text", 1: "ner"},
    )
    self.tag_dictionary = self.corpus.make_tag_dictionary(tag_type=self.tag_type)
def build_conll03en_corpus(base_path: str):
    """Create a four-column corpus (text, pos, np, ner) plus its NER tag dictionary.

    :param base_path: folder containing train.txt, dev.txt and test.txt
    :return: tuple ``(corpus, tag_type, tag_dictionary)`` where ``tag_type`` is ``"ner"``
    """
    tag_type = "ner"

    # With the flag off, no document separator token is handed to the corpus;
    # switching it on would pass "-DOCSTART-" instead.
    document_as_sequence = False
    separator_token = "-DOCSTART-" if document_as_sequence else None

    corpus = ColumnCorpus(
        base_path,
        column_format={0: "text", 1: "pos", 2: "np", 3: "ner"},
        train_file="train.txt",
        dev_file="dev.txt",
        test_file="test.txt",
        tag_to_bioes="ner",
        document_separator_token=separator_token,
    )
    tag_dictionary = corpus.make_tag_dictionary(tag_type=tag_type)
    return corpus, tag_type, tag_dictionary
def parse_arguments(dataset, model):
    """Tag the test split of a dataset and print one "token gold predicted" line per token.

    The corpus files are read from ``./data`` and are named
    ``enp_DE.<dataset>.mr.tok.{train,dev,test}.bio``. Every printed tag is
    passed through ``iobes_to_iob`` first, and a blank line is printed after
    each sentence.

    :param dataset: dataset key, either ``"lft"`` or ``"onb"``
    :param model: value handed to ``SequenceTagger.load``
    :raises ValueError: if ``dataset`` is not a supported key (previously this
        surfaced later as an unbound ``corpus`` variable)
    """
    # Adjust logging level
    logging.getLogger("flair").setLevel(level="ERROR")

    supported_datasets = ("lft", "onb")
    if dataset not in supported_datasets:
        raise ValueError(
            f"Unknown dataset {dataset!r}; expected one of {supported_datasets}"
        )

    columns = {0: "text", 1: "ner"}
    file_stem = f"./enp_DE.{dataset}.mr.tok"
    corpus: ColumnCorpus = ColumnCorpus(
        Path("./data"),
        columns,
        train_file=f"{file_stem}.train.bio",
        dev_file=f"{file_stem}.dev.bio",
        test_file=f"{file_stem}.test.bio",
        tag_to_bioes="ner",
    )

    tagger: SequenceTagger = SequenceTagger.load(model)

    for test_sentence in corpus.test:
        tokens = test_sentence.tokens
        # Capture the gold tag values before prediction: the tagged sentence
        # below is given the very same token objects.
        gold_tags = [token.tags["ner"].value for token in tokens]

        tagged_sentence = Sentence()
        tagged_sentence.tokens = tokens

        # Tag sentence with model
        tagger.predict(tagged_sentence)
        predicted_tags = [token.tags["ner"].value for token in tagged_sentence.tokens]

        # Internal sanity checks on the alignment of the three sequences.
        assert len(tokens) == len(gold_tags)
        assert len(gold_tags) == len(predicted_tags)

        for token, gold_tag, predicted_tag in zip(tokens, gold_tags, predicted_tags):
            gold_tag = iobes_to_iob(gold_tag)
            predicted_tag = iobes_to_iob(predicted_tag)
            print(f"{token.text} {gold_tag} {predicted_tag}")

        print("")
def create_flair_corpus(data_folder, column_format, percentage=1, only_downsample_train=True):
    """
    Build a ColumnCorpus from the 'train', 'dev' and 'test' files of a folder.

    :param data_folder: base folder with the task data
    :param column_format: a map specifying the column format
    :param percentage: forwarded to ``corpus.downsample`` when it is below 1
    :param only_downsample_train: forwarded to ``corpus.downsample``
    :return: the corpus built with ``in_memory=False``
    """
    corpus: Corpus = ColumnCorpus(
        data_folder,
        column_format,
        train_file='train',
        test_file='test',
        dev_file='dev',
        in_memory=False,
    )

    wants_subset = percentage < 1
    if wants_subset:
        # The return value is ignored here; the same corpus object is returned below.
        corpus.downsample(
            percentage=percentage,
            only_downsample_train=only_downsample_train,
        )
    return corpus
def onto_ner_mapped():
    """Load the corpus under ``resources/tasks/onto-ner`` with a label-name map applied.

    The map sends each listed source label to one of PER, LOC, MISC or O and is
    passed to the corpus as ``label_name_map``.

    :return: the resulting corpus
    """
    label_name_map = {
        'NORP': 'MISC',
        'FAC': 'LOC',
        'GPE': 'LOC',
        'CARDINAL': 'O',
        'DATE': 'O',
        'EVENT': 'MISC',
        'LANGUAGE': 'MISC',
        'LAW': 'MISC',
        'MONEY': 'O',
        'ORDINAL': 'O',
        'PERCENT': 'O',
        'PRODUCT': 'MISC',
        'QUANTITY': 'O',
        'TIME': 'O',
        'WORK_OF_ART': 'MISC',
        'LOC': 'LOC',
        'PERSON': 'PER',
    }
    column_format = {0: "text", 1: "pos", 2: "upos", 3: "ner"}

    # tag_to_bioes="ner" is intentionally not passed.
    corpus_mapped: Corpus = ColumnCorpus(
        "resources/tasks/onto-ner",
        column_format=column_format,
        label_name_map=label_name_map,
    )
    return corpus_mapped
def train():
    """Train a CRF sequence tagger for the 'pos' column.

    File names and the output location are read from the module-level ``args``
    object (``args.train``, ``args.test``, ``args.dev``, ``args.model``).
    """
    tag_type = 'pos'

    # Corpus: data folder is the empty string, so the file arguments are used as given.
    corpus: Corpus = ColumnCorpus(
        '',
        {0: 'text', 1: 'pos'},
        train_file=args.train,
        test_file=args.test,
        dev_file=args.dev,
    )
    tag_dictionary = corpus.make_tag_dictionary(tag_type=tag_type)

    # Character embeddings stacked with forward and backward flair embeddings.
    embeddings: StackedEmbeddings = StackedEmbeddings(
        embeddings=[
            CharacterEmbeddings(),
            FlairEmbeddings('news-forward'),
            FlairEmbeddings('news-backward'),
        ]
    )

    tagger: SequenceTagger = SequenceTagger(
        hidden_size=256,
        embeddings=embeddings,
        tag_dictionary=tag_dictionary,
        tag_type=tag_type,
        use_crf=True,
    )

    trainer: ModelTrainer = ModelTrainer(tagger, corpus)
    trainer.train(
        args.model,
        learning_rate=0.1,
        mini_batch_size=32,
        max_epochs=150,
    )
def read_corpus(data_folder) -> Corpus:
    """Load the three-column (text, pos, ner) corpus stored in ``data_folder``.

    The validation file is used as the dev split and the test file as the test
    split. The previous version passed them the other way round, which made the
    held-out test file act as the dev split.

    :param data_folder: folder containing flair_train.txt, flair_val.txt and flair_test.txt
    :return: the loaded corpus
    """
    columns = {0: 'text', 1: 'pos', 2: 'ner'}
    corpus: Corpus = ColumnCorpus(
        data_folder,
        columns,
        train_file='flair_train.txt',
        dev_file='flair_val.txt',
        test_file='flair_test.txt',
    )
    return corpus
def test(self, test_data):
    """Evaluate the saved best model on ``test_data`` and collect its scores.

    The data is first passed through ``self.convert_format``; the result is used
    as both the train and the test file of a corpus rooted at ".". As a side
    effect, ``self.losses`` is filled from the TRAIN_LOSS column of the model's
    loss.tsv.

    :param test_data: input handed to ``self.convert_format``
    :return: dict with "model_name", "precision", "recall", "f_score" (each
        multiplied by 100) and "score_by_label" (per-label 'p', 'r', 'f')
    """
    path = "./src/tmp/"
    model_file = path + self.model_name + '/best-model.pt'

    converted = self.convert_format(test_data)
    model = SequenceTagger.load(model_file)
    corpus: Corpus = ColumnCorpus(
        ".",
        {0: 'text', 1: 'ner'},
        train_file=converted,
        test_file=converted,
    )
    result, eval_loss = model.evaluate(corpus.test)

    # The log line is tab separated; its first three fields are read as
    # precision, recall and f-score.
    overall = result.log_line.split("\t")
    precision = float(overall[0]) * 100
    recall = float(overall[1]) * 100
    f_score = float(overall[2]) * 100

    # Per-label lines start at the seventh line of the detailed report; the
    # values are picked by whitespace-separated position.
    score_by_labels = {}
    for line in result.detailed_results.split("\n")[6:]:
        # NOTE(review): "/s" looks like it was meant to be whitespace ("\s");
        # kept unchanged, split() below handles whitespace anyway.
        fields = line.replace("/s", " ").split()
        score_by_labels[fields[0]] = {
            'p': float(fields[11]) * 100,
            'r': float(fields[14]) * 100,
            'f': float(fields[17]) * 100,
        }

    res = {
        "model_name": self.model_name,
        "precision": precision,
        "recall": recall,
        "f_score": f_score,
        "score_by_label": score_by_labels,
    }

    loss_table = pd.read_csv(path + self.model_name + "/loss.tsv", sep="\t")
    self.losses = loss_table["TRAIN_LOSS"].tolist()
    return res
def predict(args):
    """Tag every sentence of the test file and write "token<TAB>tag" lines to a file.

    Each sentence is written as one block followed by an empty line, and the
    output is flushed after every sentence. The output file is now opened in a
    ``with`` block so the handle is closed even if prediction fails part-way.

    :param args: object providing ``model_dir``, ``model_file``, ``data_dir``,
        ``test_file``, ``comment_symbol`` and ``output_file``
    """
    model = SequenceTagger.load(os.path.join(args.model_dir, args.model_file))
    logger.info(f'Model: "{model}"')

    corpus: Corpus = ColumnCorpus(
        args.data_dir,
        column_format=model.column_format,
        test_file=args.test_file,
        comment_symbol=args.comment_symbol,
    )

    with io.open(args.output_file, "w", encoding="utf-8", errors="ignore") as fout:
        logger.info("Saving to %s", args.output_file)
        start_time = time.time()

        # Index-based access is kept on purpose: only __len__ and __getitem__
        # of the test split are relied upon.
        for i in range(len(corpus.test)):
            sentence = corpus.test[i]
            model.predict(sentence)

            lines = [
                f"{token.text}\t{token.get_tag(model.tag_type).value}\n"
                for token in sentence.tokens
            ]
            lines.append("\n")
            fout.write("".join(lines))
            fout.flush()

    logger.info("End of prediction: time %.1f min", (time.time() - start_time) / 60)
def test_load_universal_dependencies_conllu_corpus(tasks_base_path):
    """
    Load one CoNLL-U file as train, dev and test split and check each split.

    This test only covers basic universal dependencies datasets.
    For example, multi-word tokens or the "deps" column sentence annotations are not supported yet.
    """
    conllu_file = "universal_dependencies.conllu"

    # Here, we use the default token annotation fields.
    token_fields = {
        1: "text",
        2: "lemma",
        3: "upos",
        4: "pos",
        5: "feats",
        6: "head",
        7: "deprel",
        8: "deps",
        9: "misc",
    }

    corpus = ColumnCorpus(
        tasks_base_path / "conllu",
        train_file=conllu_file,
        dev_file=conllu_file,
        test_file=conllu_file,
        column_format=token_fields,
    )

    # Every split is expected to hold exactly one sentence.
    assert len(corpus.train) == 1
    assert len(corpus.dev) == 1
    assert len(corpus.test) == 1

    _assert_universal_dependencies_conllu_dataset(corpus.train)
def train_NER(self, data_folder, train_file, dev_file, test_file, train_dict, is_gpu=False):
    """Train a CRF sequence tagger on a two-column (text, ner) corpus.

    The model is written below ``self.download_dir`` in
    ``/resources/taggers/saved-models``. The tag type and the embeddings are
    taken from ``self.tag_type`` and ``self.embeddings``.

    :param data_folder: folder containing the three corpus files
    :param train_file: name of the training file
    :param dev_file: name of the dev file
    :param test_file: name of the test file
    :param train_dict: hyper-parameters; the keys "hidden_size", "lr",
        "batch_size" and "epochs" are read
    :param is_gpu: when truthy, ``flair.device`` is set to ``cuda:0``
    """
    columns = {0: 'text', 1: 'ner'}

    # Plain truthiness check instead of comparing against True.
    if is_gpu:
        flair.device = torch.device("cuda:0")

    corpus: Corpus = ColumnCorpus(
        data_folder,
        columns,
        train_file=train_file,
        test_file=test_file,
        dev_file=dev_file,
    )
    tag_dictionary = corpus.make_tag_dictionary(tag_type=self.tag_type)

    tagger: SequenceTagger = SequenceTagger(
        hidden_size=train_dict["hidden_size"],
        embeddings=self.embeddings,
        tag_dictionary=tag_dictionary,
        tag_type=self.tag_type,
        use_crf=True,
    )

    trainer: ModelTrainer = ModelTrainer(tagger, corpus)
    trainer.train(
        self.download_dir + '/resources/taggers/saved-models',
        learning_rate=train_dict["lr"],
        mini_batch_size=train_dict["batch_size"],
        max_epochs=train_dict["epochs"],
    )
def process_file(tagger: SequenceTagger,
                 file_path: Union[str, Path],
                 out_path: Union[str, Path],
                 print_corpus=None):
    """Load one column file as a corpus and tag it with ``tag_corpus``.

    When ``print_corpus`` is given, the tagger is also evaluated on all
    sentences and the detailed results are printed. If no file exists at
    ``print_corpus`` yet, the non-"O" spans of the training split are written
    there with ``print_spans_in_brat_format``.

    :param tagger: tagger used for evaluation and tagging
    :param file_path: path of the file to process (columns: text, begin, end, ner)
    :param out_path: forwarded to ``tag_corpus``
    :param print_corpus: optional path for the span export
    :return: the result of ``tag_corpus``, or 0 when the corpus is empty or an
        IndexError occurred
    """
    try:
        data_folder, file_name = os.path.split(file_path)
        corpus: ColumnCorpus = ColumnCorpus(
            data_folder=data_folder,
            train_file=file_name,
            column_format={
                0: 'text',
                1: 'begin',
                2: 'end',
                3: 'ner'
            })

        sentences = corpus.get_all_sentences()
        if len(sentences) == 0:
            return 0

        if print_corpus is not None:
            data_loader = DataLoader(sentences)
            result, _ = tagger.evaluate(data_loader)
            print(result.detailed_results)

            if not os.path.isfile(print_corpus):
                # `tag_type` is not defined in this function; presumably a
                # module-level name - verify against the rest of the file.
                # Compare by value: `is not "O"` tested object identity.
                results: List[Span] = [
                    span
                    for sentence in corpus.train
                    for span in sentence.get_spans(tag_type)
                    if span.tag != "O"
                ]
                print_spans_in_brat_format(results, print_corpus)

        return tag_corpus(corpus, file_path, out_path, tagger)
    except IndexError:
        log.error(f'IndexError in file: "{file_path}"!')
        return 0
def train():
    """Generate the datasets, then fine-tune a roberta-base CRF tagger on the 'tag' column.

    The corpus is read from ``../content/data`` with 'dev-labelled.txt' passed
    as the test file and no dev file given. The model is written to
    ``resources/taggers/task-TC``.
    """
    generate_datasets()

    tag_type = 'tag'
    corpus: Corpus = ColumnCorpus(
        '../content/data',
        {0: 'text', 1: 'pos', 2: 'tag'},
        train_file='train-labelled.txt',
        test_file='dev-labelled.txt',
        in_memory=False,
    )
    # A length filter on the sentences (MAX_TOKENS = 500) was tried here
    # earlier and is currently not applied.

    tag_dictionary = corpus.make_tag_dictionary(tag_type=tag_type)
    print(tag_dictionary)

    embeddings = TransformerWordEmbeddings('roberta-base', layers='-4', fine_tune=True)

    # No explicit dropout value is passed to the tagger.
    tagger: SequenceTagger = SequenceTagger(
        hidden_size=128,
        embeddings=embeddings,
        tag_dictionary=tag_dictionary,
        tag_type=tag_type,
        use_crf=True,
    )

    trainer: ModelTrainer = ModelTrainer(tagger, corpus)
    trainer.train(
        'resources/taggers/task-TC',
        learning_rate=0.2,
        mini_batch_size=64,
        max_epochs=100,
        embeddings_storage_mode='gpu',
    )
def flairInfer(model_path, test_or_train):
    """Evaluate ``<model_path>/final-model.pt`` on one split of the 1500_data corpus.

    Predictions are written to ``train.tsv`` or ``test.tsv`` inside the data
    directory and the loss and scores are printed.

    An earlier attempt to load the data with ``CONLL_03`` inside a bare
    ``except: pass`` was removed: its result was always overwritten by the
    ``ColumnCorpus`` below, so it had no effect on the outcome.

    :param model_path: directory that contains ``final-model.pt``
    :param test_or_train: ``'train'`` evaluates the training split; any other
        value evaluates the test split
    """
    from flair.datasets import ColumnCorpus
    from flair.models import SequenceTagger

    model = SequenceTagger.load(model_path + '/final-model.pt')

    data_dir = '../GmbDataExperimentation/processed_data/1500_data'
    columns = {0: 'text', 1: 'ner'}
    corpus = ColumnCorpus(data_dir, columns)

    if test_or_train == 'train':
        testdata = corpus.train
        result_file = data_dir + '/train.tsv'
    else:
        testdata = corpus.test
        result_file = data_dir + '/test.tsv'

    test_result, test_loss = model.evaluate(testdata, out_path=result_file)
    result_line = f"\t{test_loss}\t{test_result.log_line}"
    print(
        f"TEST   : loss {test_loss} - score {round(test_result.main_score, 4)}")
    print(f"TEST RESULT : {result_line}")
def prepare_flair_train_dev_corpus(
    spacy_model: Language, data_folder: str, dev_size: float, nb_segment: Optional[int], segment: Optional[int]
) -> Corpus:
    """Split the ``.txt`` files of a folder into train and dev sets and build a corpus.

    Two modes are supported:

    * ``nb_segment`` and ``segment`` both ``None``: the file list is shuffled
      and the first ``int(len(files) * dev_size)`` files become the dev set.
    * both given: the file list is cut into ``nb_segment`` parts with
      ``np.array_split`` and part number ``segment`` becomes the dev set
      (no shuffling).

    Both sets are exported with ``export_data_set_flair_format``; the corpus is
    read from the system temp directory using the base names of the exported
    paths. The dev file is also used as the test file.

    :param spacy_model: forwarded to ``export_data_set_flair_format``
    :param data_folder: folder scanned for ``.txt`` files
    :param dev_size: fraction of files used for dev in shuffle mode
    :param nb_segment: number of segments, or ``None`` for shuffle mode
    :param segment: index of the dev segment, or ``None`` for shuffle mode
    :return: the corpus over the exported files
    :raises ValueError: if only one of ``nb_segment`` / ``segment`` is given, or
        if ``segment`` is not smaller than ``nb_segment``
    """
    # Validate explicitly: a half-specified pair used to fail with a TypeError
    # on the comparison, and the range check was an assert (stripped under -O).
    if (nb_segment is None) != (segment is None):
        raise ValueError("nb_segment and segment must be given together or both be None")
    if segment is not None and segment >= nb_segment:
        raise ValueError(f"segment ({segment}) must be smaller than nb_segment ({nb_segment})")

    all_annotated_files: List[str] = [
        os.path.join(data_folder, filename) for filename in os.listdir(data_folder) if filename.endswith(".txt")
    ]

    if nb_segment is None:
        random.shuffle(all_annotated_files)
        nb_doc_dev_set: int = int(len(all_annotated_files) * dev_size)
        dev_file_names = all_annotated_files[0:nb_doc_dev_set]
    else:
        all_segments = np.array_split(all_annotated_files, nb_segment)
        dev_file_names = list(all_segments[segment])

    print(f"dev set file names: {dev_file_names}")

    # Set membership keeps the train/dev filtering linear in the number of
    # files; names are normalised to plain str because the segment branch
    # yields numpy string scalars.
    dev_lookup = {str(name) for name in dev_file_names}
    train_file_names = [file for file in all_annotated_files if file not in dev_lookup]

    train_path = export_data_set_flair_format(spacy_model, train_file_names)
    dev_path = export_data_set_flair_format(spacy_model, dev_file_names)

    corpus: Corpus = ColumnCorpus(
        data_folder=tempfile.gettempdir(),
        column_format={0: "text", 1: "ner"},
        train_file=os.path.basename(train_path),
        dev_file=os.path.basename(dev_path),
        test_file=os.path.basename(dev_path),
    )
    return corpus
def train(self):
    """Train a CRF NER tagger on ``self.training_data`` and mark the instance as ready.

    ``self.training_data`` is replaced by the output of ``self.convert_format``
    and then used as the train file of a corpus rooted at ".". The model is
    saved under ``./src/tmp/<model_name>``. The learning rate, batch size, epoch
    count and embeddings storage mode are read from ``self.learning_rate``,
    ``self.batch_size``, ``self.nb_iter`` and ``self.mode``. On completion
    ``self.is_ready`` is set to 1.
    """
    path = "./src/tmp/"
    tag_type = 'ner'

    self.training_data = self.convert_format(self.training_data)
    corpus: Corpus = ColumnCorpus(
        ".",
        {0: 'text', 1: 'ner'},
        train_file=self.training_data,
    )
    tag_dictionary = corpus.make_tag_dictionary(tag_type=tag_type)

    # 'fr' word embeddings stacked with 'fr-forward' / 'fr-backward' flair embeddings.
    embeddings: StackedEmbeddings = StackedEmbeddings(
        embeddings=[
            WordEmbeddings('fr'),
            FlairEmbeddings('fr-forward'),
            FlairEmbeddings('fr-backward'),
        ]
    )

    tagger: SequenceTagger = SequenceTagger(
        hidden_size=256,
        embeddings=embeddings,
        tag_dictionary=tag_dictionary,
        tag_type=tag_type,
        use_crf=True,
    )

    self.trainer = ModelTrainer(tagger, corpus)
    self.trainer.train(
        path + self.model_name,
        learning_rate=self.learning_rate,
        mini_batch_size=self.batch_size,
        max_epochs=self.nb_iter,
        embeddings_storage_mode=self.mode,
    )
    self.is_ready = 1
def prepare_flair_train_test_corpus(spacy_model: Language, data_folder: str, dev_size: float) -> Corpus:
    """Randomly split the ``.txt`` files of a folder into train and dev sets and build a corpus.

    The file list is shuffled; the first ``int(len(files) * dev_size)`` files
    form the dev set and the rest the train set. Both sets are exported with
    ``export_data_set_flair_format`` and read back from the system temp
    directory. The dev file is also used as the test file.

    :param spacy_model: forwarded to ``export_data_set_flair_format``
    :param data_folder: folder scanned for ``.txt`` files
    :param dev_size: fraction of files placed in the dev set
    :return: the corpus over the exported files
    """
    all_annotated_files: List[str] = []
    for filename in os.listdir(data_folder):
        if filename.endswith(".txt"):
            all_annotated_files.append(os.path.join(data_folder, filename))

    random.shuffle(all_annotated_files)

    dev_count: int = int(len(all_annotated_files) * dev_size)
    dev_file_names = all_annotated_files[:dev_count]
    dev_lookup = set(dev_file_names)
    train_file_names = [name for name in all_annotated_files if name not in dev_lookup]

    train_path = export_data_set_flair_format(spacy_model, train_file_names)
    dev_path = export_data_set_flair_format(spacy_model, dev_file_names)

    dev_basename = os.path.basename(dev_path)
    corpus: Corpus = ColumnCorpus(
        data_folder=tempfile.gettempdir(),
        column_format={0: 'text', 1: 'ner'},
        train_file=os.path.basename(train_path),
        dev_file=dev_basename,
        test_file=dev_basename,
    )
    return corpus
def create_corpus(self) -> Corpus:
    """Build the two-column (text, ner) corpus for ``self.entity_type``.

    Files are read from ``<self.data_folder>/<self.entity_type>/`` and are
    named ``<entity_type>_{train,test,dev}.conll``.

    :return: the loaded corpus
    """
    entity = self.entity_type
    corpus: Corpus = ColumnCorpus(
        data_folder=f'{self.data_folder}/{entity}/',
        column_format={0: 'text', 1: 'ner'},
        train_file=f'{entity}_train.conll',
        test_file=f'{entity}_test.conll',
        dev_file=f'{entity}_dev.conll',
    )
    return corpus
def get_data(self, data_dir, tag_type='ner', train_file='train.txt', dev_file='dev.txt', test_file='test.txt'):
    """Load a four-column (text, pos, deprel, ner) corpus and build a tag dictionary.

    Corpus statistics and the first training sentence (tagged with 'ner') are
    printed along the way.

    :param data_dir: folder containing the corpus files
    :param tag_type: tag type the dictionary is built for
    :param train_file: name of the training file
    :param dev_file: name of the dev file
    :param test_file: name of the test file
    :return: tuple ``(corpus, tag_dictionary)``
    """
    # Custom data format, see
    # https://github.com/zalandoresearch/flair/blob/master/resources/docs/TUTORIAL_6_CORPUS.md#reading-your-own-sequence-labeling-dataset
    column_format = {0: 'text', 1: 'pos', 2: 'deprel', 3: 'ner'}
    corpus: Corpus = ColumnCorpus(
        data_dir,
        column_format,
        train_file=train_file,
        dev_file=dev_file,
        test_file=test_file,
    )

    # Tag dictionary derived from the corpus.
    tag_dictionary = corpus.make_tag_dictionary(tag_type=tag_type)

    # (TODO: TEST IF WORKS) Dataset statistics.
    print(corpus.obtain_statistics())

    # Show the annotations of the first training sentence.
    first_sentence = corpus.train[0]
    print(first_sentence.to_tagged_string('ner'))

    return corpus, tag_dictionary
def testModel(model_dir, test_sent=None, test_file_dir=None): """ model_dir: directory contains 'final_model.pt' test_sent: one sentence to test test_file: one file of sentences to test """ if test_sent and test_file_dir: raise Exception("Argument conflicts, only one type of testing method is allowed.") elif not test_sent and not test_file_dir: raise Exception("Argument invalid, at least one testing method is required") model_path = model_dir + '/final-model.pt' model = SequenceTagger.load(model_path) if test_sent: print('Predicting in singular mode') test_sent = Sentence(test_sent) model.predict(test_sent) print(test_sent.to_tagged_string()) if test_file_dir: print('Predicting in plural mode') try: columns = {0: 'text', 1: 'ner'} corpus = ColumnCorpus(test_file_dir, columns) test_data = corpus.test except: raise Exception('Directory must contain `test.txt` file, one column `text` the other `ner`') test_result, test_loss = model.evaluate(test_data, out_path=test_file_dir + '/test.tsv') result_line = f"\t{test_loss}\t{test_result.log_line}" print(f"TEST : loss {test_loss} - score {round(test_result.main_score, 4)}") print(f"TEST RESULT : {result_line}") print('end')
def main():
    """Train an NER tagger (or run a learning-rate search) on the corpus named by the CLI args.

    Arguments come from ``parse_args()``. With ``args.learning_rate_find`` set,
    only ``trainer.find_learning_rate`` is run; otherwise the tagger is trained
    into 'temp' and the test split is tagged with ``tag_and_output``.

    :raises Exception: if ``args.data_dir`` does not exist
    """
    args = parse_args()
    if not os.path.exists(args.data_dir):
        raise Exception(f'Path does not exist: {args.data_dir}')

    # 1. Build corpus
    corpus: Corpus = ColumnCorpus(
        args.data_dir,
        {0: 'text', 1: 'ner'},
        train_file=args.train_file,
        dev_file=args.dev_file,
        test_file=args.test_file,
    )
    print(corpus)
    print(corpus.obtain_statistics())

    # 2. What tag do we want to predict?
    tag_type = 'ner'

    # 3. Build tag dictionary
    tag_dictionary = corpus.make_tag_dictionary(tag_type=tag_type)

    # 4. Initialize embeddings
    embeddings: StackedEmbeddings = StackedEmbeddings(
        embeddings=[
            WordEmbeddings('crawl'),
            FlairEmbeddings(args.forward_flair_embeddings),
            FlairEmbeddings(args.backward_flair_embeddings),
        ]
    )

    # 5. Initialize sequence tagger
    tagger: SequenceTagger = SequenceTagger(
        hidden_size=HIDDEN_SIZE,
        embeddings=embeddings,
        tag_dictionary=tag_dictionary,
        tag_type=tag_type,
    )

    # 6. Initialize trainer
    trainer: ModelTrainer = ModelTrainer(tagger, corpus)

    if args.learning_rate_find:
        # 7a. Find learning rate
        print('***** Plotting learning rate')
        trainer.find_learning_rate(
            'temp', 'learning_rate.tsv', mini_batch_size=MINI_BATCH_SIZE)
        return

    # 7b. Run Training
    print('***** Running train')
    trainer.train(
        'temp',
        learning_rate=0.1,
        mini_batch_size=MINI_BATCH_SIZE,
        # it's a big dataset so embeddings are not kept in memory
        embeddings_storage_mode='none')

    # NOTE(review): tagging of the test split is tied to the training path
    # here - confirm it is not also expected after the learning-rate search.
    tag_and_output(corpus.test, tagger,
                   os.path.join(args.data_dir, args.test_output_file),
                   tag_type)
def read_in_CADEC_prev():
    """Load the seven-column CADEC corpus from ``data/CADEC/NER/`` and print a preview.

    The corpus itself and the first training sentence (tagged once with 'pos'
    and once with 'ner') are printed.

    :return: the loaded corpus
    """
    # Column layout of the CoNLL files.
    column_names = ('text', 'text_lower', 'pos', 'ner', 'text_start', 'text_end', 'SNOMEDCT_id')
    columns = dict(enumerate(column_names))

    # Folder in which the train, test and dev files reside.
    data_folder = 'data/CADEC/NER/'

    corpus: Corpus = ColumnCorpus(
        data_folder,
        columns,
        train_file='corpus-conll-train.txt',
        test_file='corpus-conll-test.txt',
        dev_file='corpus-conll-dev.txt',
    )

    print(corpus)
    first_sentence = corpus.train[0]
    for tag_name in ('pos', 'ner'):
        print(first_sentence.to_tagged_string(tag_name))
    return corpus
def read_in_Micromed():
    """Load the two-column (text, ner) Micromed corpus from ``data/Micromed/`` and print a preview.

    The first training sentence is printed twice, tagged with 'pos' and with
    'ner'. Only 'text' and 'ner' are declared in the column format.

    :return: the loaded corpus
    """
    # Sample source record:
    # avelox-51c3e5a853785f584a9a8c01 76 93 ADR connective tissue avelox avelox
    column_format = {0: 'text', 1: 'ner'}

    # Folder in which the train, test and dev files reside.
    micromed_dir = 'data/Micromed/'

    corpus: Corpus = ColumnCorpus(
        micromed_dir,
        column_format,
        train_file='Micromed_train.csv',
        test_file='Micromed_test.csv',
        dev_file='Micromed_dev.csv',
    )

    first_sentence = corpus.train[0]
    print(first_sentence.to_tagged_string('pos'))
    print(first_sentence.to_tagged_string('ner'))
    return corpus


# read_in_CADEC()
# read_in_Micromed()
# read_in_AMT()
def run_splits(embedding_types, embeddings_name):
    """Train one CRF NER tagger per data split (splits 1 to 5) with shared embeddings.

    For split ``i`` the corpus is read from ``<path_to_splits>/split_<i>/`` and
    the model is saved below that folder in ``/ner_<embeddings_name>``.

    :param embedding_types: list of embeddings that is stacked once and reused
        for every split
    :param embeddings_name: suffix of the output directory name
    """
    from flair.trainers import ModelTrainer

    embeddings: StackedEmbeddings = StackedEmbeddings(embeddings=embedding_types)

    # Column layout and target tag are the same for every split.
    column_names = ('text', 'pos', 'ner', 'event', 'when', 'who', 'core', 'eventtype')
    columns = dict(enumerate(column_names))
    tag_type = 'ner'
    banner = '##########'

    for split_no in range(1, 6):
        print(banner)
        print('Split', str(split_no))
        print(banner)

        # Directory where the data of this split resides.
        data_folder = '<path_to_splits>/split_' + str(split_no) + '/'

        corpus: Corpus = ColumnCorpus(
            data_folder,
            columns,
            train_file='ner_train.csv',
            test_file='ner_test.csv',
            dev_file='ner_dev.csv',
        )
        tag_dictionary = corpus.make_tag_dictionary(tag_type=tag_type)

        tagger: SequenceTagger = SequenceTagger(
            hidden_size=256,
            embeddings=embeddings,
            tag_dictionary=tag_dictionary,
            tag_type=tag_type,
            use_crf=True,
        )
        print(tagger)

        trainer: ModelTrainer = ModelTrainer(tagger, corpus)
        trainer.train(
            data_folder + '/ner_' + embeddings_name,
            learning_rate=0.1,
            mini_batch_size=32,
            max_epochs=150,
        )
def main():
    """Evaluate ``<model_dir>/final-model.pt`` on one split of a two-column (text, ner) corpus.

    Command-line arguments: ``--data_dir``, ``--model_dir``, ``--result_file``
    and ``--eval_on`` (``dev``, ``test`` or ``train``). Predictions are written
    to the result file, and the loss, the scores and the detailed results are
    printed.

    An earlier attempt to load the data with ``CONLL_03`` inside a bare
    ``except: pass`` was removed: its result was always overwritten by the
    ``ColumnCorpus`` below, so it had no effect on the outcome.

    :raises ValueError: if ``--eval_on`` is not one of dev, test or train
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--data_dir",
        default='./',
        type=str,
        help="The parent dir of input data, should include folder named `conll_03` ")
    parser.add_argument(
        "--model_dir",
        default=None,
        type=str,
        required=True,
        help="The model directory where model chekpoints stored")
    parser.add_argument(
        "--result_file",
        default='dev.tsv',
        type=str,
        required=True,
        help="The name of prediction file, default is in the same dir of script file")
    parser.add_argument(
        "--eval_on",
        default='dev',
        type=str,
        required=True,
        help="Whether to eval on dev set or test set")
    args = parser.parse_args()

    model_path = args.model_dir
    data_dir = args.data_dir

    model = SequenceTagger.load(model_path + '/final-model.pt')

    columns = {0: 'text', 1: 'ner'}
    corpus: Corpus = ColumnCorpus(data_dir, columns)

    if args.eval_on == 'dev':
        testdata = corpus.dev
    elif args.eval_on == 'test':
        testdata = corpus.test
    elif args.eval_on == 'train':
        print('You are evaluating on training set!')
        testdata = corpus.train
    else:
        raise ValueError("Invalid argument, must specify evaluation on dev or test")

    test_result, test_loss = model.evaluate(testdata, out_path=args.result_file)
    result_line = f"\t{test_loss}\t{test_result.log_line}"
    # The printed score is test_result.main_score; the result line carries the
    # loss followed by test_result.log_line.
    print(f"TEST   : loss {test_loss} - score {round(test_result.main_score, 4)}")
    print(f"TEST RESULT: {result_line}")
    print(test_result.detailed_results)
def train(self, training_dir=None):
    """Train the CRF tagger on the sent_{train,dev,test}.txt files and keep it on the instance.

    Data is read from ``flair_splitter_dep_dir + "data"`` with "-DOCSTART-" as
    the document separator token. The trained tagger is stored in
    ``self.model``.

    :param training_dir: output directory for training; defaults to
        ``flair_splitter_dep_dir`` when ``None``
    """
    from flair.trainers import ModelTrainer

    output_dir = flair_splitter_dep_dir if training_dir is None else training_dir

    # Folder in which the train, test and dev files reside.
    data_folder = flair_splitter_dep_dir + "data"

    # Note that training data should be unescaped, i.e. tokens like "&", not "&amp;".
    corpus: Corpus = ColumnCorpus(
        data_folder,
        {0: "text", 1: "ner"},
        train_file="sent_train.txt",
        test_file="sent_test.txt",
        dev_file="sent_dev.txt",
        document_separator_token="-DOCSTART-",
    )
    print(corpus)

    tag_type = "ner"
    tag_dictionary = corpus.make_tag_dictionary(tag_type=tag_type)
    print(tag_dictionary)

    # Active embeddings: character embeddings plus an ELECTRA transformer.
    # Glove word embeddings, flair news-forward/news-backward embeddings and
    # distilbert were alternatives that are currently switched off.
    embeddings: StackedEmbeddings = StackedEmbeddings(
        embeddings=[
            CharacterEmbeddings(),
            TransformerWordEmbeddings('google/electra-base-discriminator'),
        ]
    )

    tagger: SequenceTagger = SequenceTagger(
        hidden_size=128,
        embeddings=embeddings,
        tag_dictionary=tag_dictionary,
        tag_type=tag_type,
        use_crf=True,
    )

    trainer: ModelTrainer = ModelTrainer(tagger, corpus)
    trainer.train(
        output_dir,
        learning_rate=0.1,
        mini_batch_size=16,
        max_epochs=50,
    )
    self.model = tagger
def test_train_load_use_classifier(results_base_path, tasks_base_path):
    """Train a RelationExtractor, reload it from disk and check one prediction.

    The same ``train.conllup`` file serves as train, dev and test split. After
    reloading, the model must predict "founded_by" as the first relation label
    of a hand-built sentence.
    """
    conllup_file = "train.conllup"
    corpus = ColumnCorpus(
        data_folder=tasks_base_path / "conllu",
        train_file=conllup_file,
        dev_file=conllup_file,
        test_file=conllup_file,
        column_format={1: "text", 2: "pos", 3: "ner"},
    )

    relation_label_dict = corpus.make_label_dictionary(label_type="relation")

    model: RelationExtractor = RelationExtractor(
        embeddings=TransformerWordEmbeddings(),
        label_dictionary=relation_label_dict,
        label_type="relation",
        entity_label_type="ner",
        train_on_gold_pairs_only=True,
    )

    # Short training run, written to the results folder.
    trainer: ModelTrainer = ModelTrainer(model, corpus)
    trainer.train(
        results_base_path,
        learning_rate=0.1,
        mini_batch_size=2,
        max_epochs=3,
        shuffle=False,
    )

    # Drop everything from the training phase before reloading from disk.
    del trainer, model, relation_label_dict, corpus

    loaded_model: RelationExtractor = RelationExtractor.load(results_base_path / "final-model.pt")
    loaded_model.train_on_gold_pairs_only = False

    words = ["Apple", "was", "founded", "by", "Steve", "Jobs", "."]
    sentence = Sentence(words)
    sentence[0:1].add_label("ner", "ORG")
    sentence[4:6].add_label("ner", "PER")

    loaded_model.predict(sentence)

    first_relation = sentence.get_labels("relation")[0]
    assert "founded_by" == first_relation.value

    del loaded_model
def __init__(self,
             path: Union[Path, str],
             columns: dict = None,
             tag_types: List = None,
             corpus: Corpus = None):
    """Index the spans of a column corpus for analysis.

    :param path: existing path of the corpus; strings are converted to ``Path``
    :param columns: column format handed to ``ColumnCorpus`` when no corpus is given
    :param tag_types: tag types passed to ``ColumnCorpusAnalysis.index_spans``;
        defaults to ``['ner']``. A ``None`` sentinel replaces the former mutable
        list default so the default list is not shared between calls.
    :param corpus: a corpus to use instead of loading one from ``path``
    :raises AssertionError: if ``path`` does not exist
    """
    if tag_types is None:
        tag_types = ['ner']

    if isinstance(path, str):
        path = Path(path)
    assert path.exists(), f"Corpus path does not exist: {path}"
    self.path = path

    if corpus:
        self.corpus = corpus
    else:
        self.corpus = ColumnCorpus(self.path, columns)

    self.sentences = ColumnCorpusAnalysis.index_spans(
        self.corpus.get_all_sentences(), tag_types)
    print(self.corpus)
def train_model(directory='Data', use_BERT=True):
    """Train a CRF sequence tagger for the 'tox' column of a ten-column CoNLL corpus.

    The model is written to ``resources/taggers/toxic_classifier_bert``
    regardless of which embeddings are selected.

    :param directory: folder containing converted_data_{train,test,dev}.conll
    :param use_BERT: when truthy, use fine-tuned 'bert-large-uncased'
        transformer embeddings; otherwise 'glove' word embeddings
    """
    # Column layout of the converted files.
    column_names = (
        'ID', 'text', 'empty_0', 'pos', 'empty_1',
        'empty_2', 'empty_3', 'empty_4', 'empty_5', 'tox',
    )
    columns = dict(enumerate(column_names))

    corpus: Corpus = ColumnCorpus(
        directory,
        columns,
        train_file='converted_data_train.conll',
        test_file='converted_data_test.conll',
        dev_file='converted_data_dev.conll',
    )

    # Tag to predict.
    tag_type = 'tox'
    tag_dictionary = corpus.make_tag_dictionary(tag_type=tag_type)

    # Pick the embedding list first, then stack it once.
    if use_BERT:
        chosen_embeddings = [
            TransformerWordEmbeddings('bert-large-uncased', fine_tune=True)
        ]
    else:
        chosen_embeddings = [WordEmbeddings('glove')]
    embeddings: StackedEmbeddings = StackedEmbeddings(embeddings=chosen_embeddings)

    tagger: SequenceTagger = SequenceTagger(
        hidden_size=256,
        embeddings=embeddings,
        tag_dictionary=tag_dictionary,
        tag_type=tag_type,
        use_crf=True,
    )

    trainer: ModelTrainer = ModelTrainer(tagger, corpus)
    trainer.train(
        'resources/taggers/toxic_classifier_bert',
        learning_rate=0.1,
        mini_batch_size=32,
        max_epochs=5,
    )