def tag_dir(model, input_dir, output_dir, tokenized_input, string=None, **kwargs):
    """
    Tag every ``.txt`` file in a directory and write one ``.tsv`` per file.

    :param model: Path to a model directory to load the Tagger from
    :param input_dir: Path to a directory containing ``.txt`` files
    :param output_dir: Path to the directory receiving tagged ``.tsv`` files
    :param tokenized_input: Whether the input files are already tokenized
        (forwarded to ``pandora.utils.load_unannotated_file``)
    :param string: Unused; kept for signature compatibility with ``tag_string``
    :param kwargs: Extra settings forwarded to the Tagger as overrides
    """
    print('::: started :::')
    tagger = Tagger(load=True, model_dir=model, overwrite=kwargs)
    print('Tagger loaded, now annotating...')
    for filename in os.listdir(input_dir):
        if not filename.endswith('.txt'):
            continue
        print('\t +', filename)
        unseen_tokens = pandora.utils.load_unannotated_file(
            # os.path.join is robust whether or not input_dir ends with '/'
            # (the original string concatenation required a trailing slash)
            os.path.join(input_dir, filename),
            nb_instances=None,
            tokenized_input=tokenized_input
        )
        annotations = tagger.annotate(unseen_tokens)
        keys = list(annotations.keys())
        print("Keys :" + "\t".join(keys))
        # One header row of annotation keys, then one row per token;
        # columns follow the order of `keys`.
        with codecs.open(os.path.join(output_dir, filename + ".tsv"),
                         'w', 'utf8') as f:
            f.write("\t".join(keys) + "\n")
            for row in zip(*(annotations[k] for k in keys)):
                f.write('\t'.join(row) + '\n')
    print('::: ended :::')
def main():
    """Annotate every ``.txt`` file in the Wilhelmus source folder and
    write token / lemma / POS triples, tab-separated, to the tagged folder."""
    print('::: started :::')
    tagger = Tagger(load=True, model_dir='models/wilhelmus_full')
    print('Tagger loaded, now annotating...')
    orig_path = 'data/wilhelmus/orig/'
    new_path = 'data/wilhelmus/tagged/'
    # Only plain-text sources are eligible for tagging.
    text_files = (name for name in os.listdir(orig_path)
                  if name.endswith('.txt'))
    for name in text_files:
        print('\t +', name)
        tokens = pandora.utils.load_unannotated_file(
            orig_path + name, nb_instances=None, tokenized_input=False)
        annotations = tagger.annotate(tokens)
        rows = zip(annotations['tokens'],
                   annotations['lemmas'],
                   annotations['pos'])
        with codecs.open(new_path + name, 'w', 'utf8') as out:
            for token, lemma, pos in rows:
                out.write('\t'.join((token, lemma, pos)) + '\n')
    print('::: ended :::')
def tag_string(model, input_dir, output_dir=None, string=None, **kwargs):
    """
    Tag a single untokenized string and print the annotations as TSV.

    NOTE: despite its name, ``input_dir`` carries the raw string to tag —
    the signature mirrors ``tag_dir`` for interface compatibility.
    (The original docstring wrongly described this as tagging a directory.)

    :param model: Path to a model directory to load the Tagger from
    :param input_dir: Untokenized string to tag
    :param output_dir: Unused; kept for signature compatibility with ``tag_dir``
    :param string: Unused; kept for signature compatibility
    :param kwargs: Extra settings forwarded to the Tagger as overrides
    """
    print('::: started :::')
    tagger = Tagger(load=True, model_dir=model, overwrite=kwargs)
    print('Tagger loaded, now annotating...')
    # `tokenize` is presumably a module-level compiled regex — TODO confirm.
    unseen_tokens = tokenize.split(input_dir)
    print(unseen_tokens)
    annotations = tagger.annotate(unseen_tokens)
    keys = list(annotations.keys())
    print("--------------------")
    print('\t'.join(keys))
    print("--------------------")
    # One line per token; columns follow the order of `keys`.
    for row in zip(*(annotations[k] for k in keys)):
        print('\t'.join(row))
    print('::: ended :::')
def main():
    """Tag the Wilhelmus corpus: for each ``.txt`` source file, emit a
    tab-separated token / lemma / POS file into the tagged directory."""
    print('::: started :::')
    tagger = Tagger(load=True, model_dir='models/wilhelmus_full')
    print('Tagger loaded, now annotating...')
    orig_path = 'data/wilhelmus/orig/'
    new_path = 'data/wilhelmus/tagged/'
    for fname in os.listdir(orig_path):
        # Skip anything that is not a plain-text source.
        if not fname.endswith('.txt'):
            continue
        print('\t +', fname)
        tokens = pandora.utils.load_unannotated_file(
            orig_path + fname,
            nb_instances=None,
            tokenized_input=False)
        result = tagger.annotate(tokens)
        lines = ('\t'.join(triple) + '\n'
                 for triple in zip(result['tokens'],
                                   result['lemmas'],
                                   result['pos']))
        with codecs.open(new_path + fname, 'w', 'utf8') as handle:
            handle.writelines(lines)
    print('::: ended :::')