# Module-level imports assumed by the snippets in this section (the Pandora import
# paths are a best guess based on how the names are used below). Other names that
# appear later (tokenize, Logger, MODELS, TRAIN, DEV, TEST) are expected to be
# defined in their respective modules.
import os
import sys
import codecs

import pandora.utils
from pandora.tagger import Tagger


def tag_dir(model, input_dir, output_dir, tokenized_input, string=None, **kwargs):
    """ Tag a directory of texts

    :param model: Path to a model directory
    :param input_dir: Path to a directory containing text files
    :param output_dir: Path to a directory for the tagged output files
    :param tokenized_input: Whether the input files are already tokenized
    :param string: Unused in this function
    :param kwargs: Extra settings used to overwrite the stored model configuration
    """
    print('::: started :::')
    tagger = Tagger(load=True, model_dir=model, overwrite=kwargs)
    print('Tagger loaded, now annotating...')

    orig_path = input_dir
    new_path = output_dir
    for filename in os.listdir(orig_path):
        if not filename.endswith('.txt'):
            continue
        print('\t +', filename)
        unseen_tokens = pandora.utils.load_unannotated_file(
            os.path.join(orig_path, filename),
            nb_instances=None,
            tokenized_input=tokenized_input
        )

        annotations = tagger.annotate(unseen_tokens)
        keys = list(annotations.keys())
        print("Keys: " + "\t".join(keys))

        with codecs.open(os.path.join(new_path, filename + ".tsv"), 'w', 'utf8') as f:
            f.write("\t".join(keys) + "\n")
            for x in zip(*tuple([annotations[k] for k in keys])):
                f.write('\t'.join(list(x)) + '\n')
    print('::: ended :::')
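# A minimal usage sketch for tag_dir(), reusing the model and data paths from the
# scripts below; any trained Pandora model directory and folder of .txt files work.
if __name__ == '__main__':
    tag_dir(
        model='models/wilhelmus_full',        # directory produced by a training run
        input_dir='data/wilhelmus/orig/',     # plain-text files to annotate
        output_dir='data/wilhelmus/tagged/',  # one .tsv file per input file is written here
        tokenized_input=False                 # let the loader tokenize the raw text
    )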
def main():
    print('::: started :::')
    tagger = Tagger(load=True, model_dir='models/wilhelmus_full')
    print('Tagger loaded, now annotating...')

    orig_path = 'data/wilhelmus/orig/'
    new_path = 'data/wilhelmus/tagged/'
    for filename in os.listdir(orig_path):
        if not filename.endswith('.txt'):
            continue
        print('\t +', filename)
        unseen_tokens = pandora.utils.load_unannotated_file(
            orig_path + filename,
            nb_instances=None,
            tokenized_input=False
        )

        annotations = tagger.annotate(unseen_tokens)

        with codecs.open(new_path + filename, 'w', 'utf8') as f:
            # for t, l, p in zip(annotations['tokens'], annotations['postcorrect_lemmas'], annotations['pos']):
            # for t, l in zip(annotations['tokens'], annotations['lemmas']):
            for t, l, p in zip(annotations['tokens'], annotations['lemmas'], annotations['pos']):
                f.write('\t'.join((t, l, p)) + '\n')
    print('::: ended :::')
def tag_string(model, input_dir, output_dir=None, string=None, **kwargs):
    """ Tag a single, untokenized string

    :param model: Path to a model directory
    :param input_dir: Untokenized string to tag (despite the name, this holds the raw string)
    :param output_dir: Unused; results are printed to stdout
    :param string: Unused in this function
    :param kwargs: Extra settings used to overwrite the stored model configuration
    """
    print('::: started :::')
    tagger = Tagger(load=True, model_dir=model, overwrite=kwargs)
    print('Tagger loaded, now annotating...')

    # `tokenize` is expected to be defined at module level
    # (e.g. a compiled regex or a tokenizer exposing a split() method).
    unseen_tokens = tokenize.split(input_dir)
    print(unseen_tokens)

    annotations = tagger.annotate(unseen_tokens)
    keys = list(annotations.keys())

    print("--------------------")
    print('\t'.join(keys))
    print("--------------------")
    for x in zip(*tuple([annotations[k] for k in keys])):
        print('\t'.join(list(x)))
    print('::: ended :::')
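# A minimal usage sketch for tag_string(); the sentence is only illustrative and the
# model path is the one used by the scripts below.
if __name__ == '__main__':
    tag_string(
        model='models/wilhelmus_full',
        input_dir='Wilhelmus van Nassouwe ben ick van Duytschen bloet'  # raw string to tag
    )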
def main():
    print('::: started :::')

    cf_path = sys.argv[1]
    params = pandora.utils.get_param_dict(cf_path)
    params['config_path'] = cf_path

    # train_data = pandora.utils.load_annotated_file('data/capitula_classic/train0.tsv',
    # train_data = pandora.utils.load_annotated_file('data/mdu/relig/relig_train.tab',
    train_data = pandora.utils.load_annotated_dir(
        'data/wilhelmus/all_train',
        format='tab',
        extension='.tab',
        include_pos=params['include_pos'],
        include_lemma=params['include_lemma'],
        include_morph=params['include_morph'],
        nb_instances=None
    )

    # dev_data = pandora.utils.load_annotated_file('data/mdu/relig/relig_dev.tab',
    dev_data = pandora.utils.load_annotated_dir(
        'data/wilhelmus/all_dev',
        format='tab',
        extension='.tab',
        include_pos=params['include_pos'],
        include_lemma=params['include_lemma'],
        include_morph=params['include_morph'],
        nb_instances=None
    )

    # test_data = pandora.utils.load_annotated_file('data/capitula_classic/test0.tsv',
    # dev_data = pandora.utils.load_annotated_file('data/mdu/cg-lit/cg-lit_dev.tab',
    # dev_data = pandora.utils.load_annotated_file('data/EMDu/train.txt',
    # test_data = pandora.utils.load_annotated_file('data/mdu/cg-lit/cg-lit_test.tab',
    #                                               format='tab',
    #                                               include_pos=params['include_pos'],
    #                                               include_lemma=params['include_lemma'],
    #                                               include_morph=params['include_morph'],
    #                                               nb_instances=None)

    tagger = Tagger(**params)
    tagger.setup_to_train(train_data=train_data, dev_data=dev_data)

    for i in range(int(params['nb_epochs'])):
        tagger.epoch()
        tagger.save()
    tagger.save()

    # tagger = Tagger(load=True, model_dir='models/mdu_all')
    print('::: ended :::')
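# Invocation sketch for the training script above: the single command-line argument
# is the path to a Pandora configuration file (the file name below is hypothetical):
#
#     python train.py config_wilhelmus.txt
#
# get_param_dict() turns that file into the **params passed to Tagger(), so the keys
# read above (include_pos, include_lemma, include_morph, nb_epochs) must be present.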
def test_load(self):
    """ Ensure params are correctly loaded """
    tagger = Tagger(config_path="./tests/test_configs/config_chrestien.txt")
    self.assertEqual(tagger.nb_encoding_layers, 2, "nb_encoding_layers should be correctly loaded")
    self.assertEqual(tagger.nb_epochs, 3, "nb_epochs should be correctly loaded")
    self.assertEqual(tagger.nb_dense_dims, 1000, "nb_dense_dims should be correctly loaded")
    self.assertEqual(tagger.batch_size, 100, "batch_size should be correctly loaded")
    self.assertEqual(tagger.nb_left_tokens, 2, "nb_left_tokens should be correctly loaded")
    self.assertEqual(tagger.nb_right_tokens, 1, "nb_right_tokens should be correctly loaded")
    self.assertEqual(tagger.nb_context_tokens, 3, "nb_context_tokens should be correctly computed")
    self.assertEqual(tagger.nb_embedding_dims, 100, "nb_embedding_dims should be correctly loaded")
    self.assertEqual(tagger.model_dir, "fake_model", "model_dir should be correctly loaded")
    self.assertEqual(tagger.postcorrect, False, "postcorrect should be correctly loaded")
    self.assertEqual(tagger.nb_filters, 100, "nb_filters should be correctly loaded")
    self.assertEqual(tagger.filter_length, 3, "filter_length should be correctly loaded")
    self.assertEqual(tagger.focus_repr, "convolutions", "focus_repr should be correctly loaded")
    self.assertEqual(tagger.dropout_level, 0.15, "dropout_level should be correctly loaded")
    self.assertEqual(tagger.include_token, True, "include_token should be correctly loaded")
    self.assertEqual(tagger.include_context, True, "include_context should be correctly loaded")
    self.assertEqual(tagger.include_lemma, "label", "include_lemma should be correctly loaded")
    self.assertEqual(tagger.include_pos, True, "include_pos should be correctly loaded")
    self.assertEqual(tagger.include_morph, False, "include_morph should be correctly loaded")
    self.assertEqual(tagger.include_dev, True, "include_dev should be correctly loaded")
    self.assertEqual(tagger.include_test, True, "include_test should be correctly loaded")
    self.assertEqual(tagger.min_token_freq_emb, 5, "min_token_freq_emb should be correctly loaded")
    self.assertEqual(tagger.halve_lr_at, 75, "halve_lr_at should be correctly loaded")
    self.assertEqual(tagger.max_token_len, 20, "max_token_len should be correctly loaded")
    self.assertEqual(tagger.min_lem_cnt, 1, "min_lem_cnt should be correctly loaded")
def test_load_after_save(self):
    """ Ensure params are correctly saved """
    tagger = Tagger.setup_from_disk(
        config_path="./tests/test_configs/config_chrestien.txt",
        train_data=TRAIN,
        dev_data=DEV,
        test_data=TEST
    )
    tagger.include_pos = False
    tagger.curr_nb_epochs = 10
    tagger.save_params()
    self.assertEqual(tagger.pretrainer.nb_workers, 1, "Pretrainer Workers should be correctly loaded")

    del tagger

    tagger = Tagger(config_path="./fake_model/config.txt")
    self.assertEqual(tagger.nb_encoding_layers, 2, "nb_encoding_layers should be correctly loaded")
    self.assertEqual(tagger.nb_epochs, 3, "nb_epochs should be correctly loaded")
    self.assertEqual(tagger.nb_dense_dims, 1000, "nb_dense_dims should be correctly loaded")
    self.assertEqual(tagger.batch_size, 100, "batch_size should be correctly loaded")
    self.assertEqual(tagger.nb_left_tokens, 2, "nb_left_tokens should be correctly loaded")
    self.assertEqual(tagger.nb_right_tokens, 1, "nb_right_tokens should be correctly loaded")
    self.assertEqual(tagger.nb_context_tokens, 3, "nb_context_tokens should be correctly computed")
    self.assertEqual(tagger.nb_embedding_dims, 100, "nb_embedding_dims should be correctly loaded")
    self.assertEqual(tagger.model_dir, "fake_model", "model_dir should be correctly loaded")
    self.assertEqual(tagger.postcorrect, False, "postcorrect should be correctly loaded")
    self.assertEqual(tagger.nb_filters, 100, "nb_filters should be correctly loaded")
    self.assertEqual(tagger.filter_length, 3, "filter_length should be correctly loaded")
    self.assertEqual(tagger.focus_repr, "convolutions", "focus_repr should be correctly loaded")
    self.assertEqual(tagger.dropout_level, 0.15, "dropout_level should be correctly loaded")
    self.assertEqual(tagger.include_token, True, "include_token should be correctly loaded")
    self.assertEqual(tagger.include_context, True, "include_context should be correctly loaded")
    self.assertEqual(tagger.include_lemma, "label", "include_lemma should be correctly loaded")
    self.assertEqual(tagger.include_pos, False, "include_pos should be correctly loaded")
    self.assertEqual(tagger.include_morph, False, "include_morph should be correctly loaded")
    self.assertEqual(tagger.include_dev, True, "include_dev should be correctly loaded")
    self.assertEqual(tagger.include_test, True, "include_test should be correctly loaded")
    self.assertEqual(tagger.min_token_freq_emb, 5, "min_token_freq_emb should be correctly loaded")
    self.assertEqual(tagger.halve_lr_at, 75, "halve_lr_at should be correctly loaded")
    self.assertEqual(tagger.max_token_len, 20, "max_token_len should be correctly loaded")
    self.assertEqual(tagger.min_lem_cnt, 1, "min_lem_cnt should be correctly loaded")
    self.assertEqual(tagger.curr_nb_epochs, 10, "Current number of epochs should be correctly loaded")
    self.assertEqual(tagger.model, "PyTorch", "PyTorch implementation is loaded")

    tagger = Tagger(config_path="./fake_model/config.txt", load=True)
    self.assertIsInstance(tagger.model, MODELS["PyTorch"], "PyTorch implementation is loaded")
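# The two test methods above belong to a unittest.TestCase; assuming the usual
# tests/ layout, they can be run with the standard runner:
#
#     python -m unittest discover tests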
def train_func(config, train, dev=None, test=None, load=False, verbose=True,
               first=1, each=1, eval_file=None, no_shell=False, **kwargs):
    """ Main CLI interface (training)

    :param config: Path to the configuration file
    :type config: str
    :param train: Path to the directory containing the train files
    :type train: str
    :param dev: Path to the directory containing the dev files
    :type dev: str
    :param test: Path to the directory containing the test files
    :type test: str
    :param embed: Path to the directory containing the embedding files (passed through kwargs)
    :type embed: str
    :param load: Whether to load an existing model and continue training it (default: False)
    :type load: bool
    :param nb_epochs: Number of epochs (passed through kwargs)
    :type nb_epochs: int
    :param verbose: Overrides the next few options; print only the first and last evaluation if False
    :param first: Evaluate the first N epochs
    :param each: Evaluate every Nth epoch
    :param eval_file: Store the evaluation in a file
    :param no_shell: Do not print to the shell
    :param kwargs: Other arguments
    :type kwargs: dict
    :return:
    """
    tagger = Tagger.setup_from_disk(config, train, dev, test, verbose=True, load=load, **kwargs)
    nb_epochs = tagger.nb_epochs

    # Set up the logger
    logger_params = dict(
        shell=not no_shell,
        file=eval_file,
        first=first,
        nb_epochs=nb_epochs,
        each=each
    )
    if verbose is False:
        # Set `each` past the total number of epochs so no intermediate epoch is evaluated
        logger_params = dict(shell=True, file=logger_params["file"], first=1,
                             nb_epochs=nb_epochs, each=nb_epochs + 1)
    tagger.logger = Logger(**logger_params)

    for i in range(nb_epochs):
        tagger.epoch(autosave=True, eval_test=tagger.include_test)

    tagger.save()
    print('::: ended :::')
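# A minimal usage sketch for train_func(); the config and data paths mirror those
# used elsewhere in this section, and the evaluation file name is hypothetical.
if __name__ == '__main__':
    train_func(
        config='./tests/test_configs/config_chrestien.txt',
        train='data/wilhelmus/all_train',
        dev='data/wilhelmus/all_dev',
        eval_file='eval_scores.txt'  # hypothetical file for per-epoch evaluation output
    )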