def __init__(self, emb_file):
    """
    Load an embedding file and report its dimensions on stderr.

    The three values returned by ``load_embeddings`` are stored as
    ``word2i``, ``i2word`` and ``emb_space``; the two values returned by
    ``emb_space.size()`` are stored as ``vocab_size`` and ``emb_size``.

    :param emb_file: embedding file, passed unchanged to ``load_embeddings``
    """
    err = sys.stderr

    # Progress message is flushed right away so it shows up before the
    # load has finished.
    err.write(f"Loading embeddings from {emb_file} ... ")
    err.flush()
    self.word2i, self.i2word, self.emb_space = load_embeddings(emb_file)
    err.write("DONE!\n")

    self.vocab_size, self.emb_size = self.emb_space.size()

    # The horizontal rule is as long as the formatted vocabulary line,
    # trailing newline included.
    vocab_line = "Vocabulary size:{:>8d}\n".format(self.vocab_size)
    emb_line = "Embedding size: {:>8d}\n".format(self.emb_size)
    rule = "-" * len(vocab_line) + "\n"

    err.write(rule)
    err.write(vocab_line)
    err.write(emb_line)
    err.write(rule)
    err.flush()
def main():
    """
    Load a trained model from file (-m) and use it to PoS-tag input file.

    If no input file is specified, read from stdin. The model file is
    expected to hold a mapping with the entries "model", "emb_file" and
    "tag_file"; "padding_emb", "unknown_emb" and "language" are optional.
    Tagged text is emitted through ``datautil.print_words_with_tags``.

    :raises Exception: if the "model", "emb_file" or "tag_file" entry is
        missing from the loaded file
    :return: None
    """
    # parse args (file/stdin, )
    input_file = args.input_file
    model_file = args.model_file
    language = args.language

    # load model
    stderr_print(f"Loading model from {model_file} ... ")
    # NOTE(review): torch.load unpickles its input, which can execute
    # arbitrary code -- only load model files from a trusted source.
    loaded = torch.load(model_file)

    # Mandatory entries: abort with a descriptive error when missing.
    try:
        model = loaded["model"]
    except KeyError as err:
        raise Exception("Failed to load model.") from err

    try:
        emb_file = loaded["emb_file"]
    except KeyError as err:
        raise Exception("No embedding file specified.") from err
    stderr_print(f"Embedding file: {emb_file}")

    try:
        tag_file = loaded["tag_file"]
    except KeyError as err:
        raise Exception("No tag file specified.") from err
    stderr_print(f"Tag file: {tag_file}")

    # Optional entries: fall back to a default and warn.
    try:
        padding_emb = loaded["padding_emb"]
    except KeyError:
        padding_emb = None
        warn("No padding embedding specified, defaulting to embedding[0].")
    else:
        stderr_print("Padding embedding loaded.")

    try:
        unknown_emb = loaded["unknown_emb"]
    except KeyError:
        unknown_emb = None
        warn("No 'unknown' embedding specified, defaulting to embedding[1].")
    else:
        stderr_print("'Unknown' embedding loaded.")

    # A language given on the command line takes precedence over the one
    # stored with the model.
    if language is None:
        try:
            language = loaded["language"]
        except KeyError:
            language = "english"
            warn("No language specified, defaulting to english.")
        else:
            stderr_print(f"Language: {language}")
    stderr_print("DONE")

    # First we read the word embeddings file
    # This function returns a word-to-index dictionary and the embedding tensor
    stderr_print(f"Loading embeddings from {emb_file} ... ", end="")
    word2i, _, embeddings = datautil.load_embeddings(emb_file)
    # Rows 0 and 1 are overwritten with the padding / unknown vectors that
    # were stored alongside the model, when present.
    if padding_emb is not None:
        embeddings[0] = padding_emb
    if unknown_emb is not None:
        embeddings[1] = unknown_emb
    stderr_print("DONE")

    # Load and index POS tag list
    stderr_print(f"Loading tagset from {tag_file} ... ", end="")
    tag2i, i2tag = datautil.load_postags(tag_file)
    tagset_size = len(tag2i)
    stderr_print("DONE")

    # Read input text from file
    # Read from stdin if no file (-f) is specified
    if input_file is not None:
        stderr_print(f"Reading from {input_file}...")
        fin = open(input_file, "r")
    else:
        stderr_print("Reading from standard input...")
        fin = sys.stdin

    try:
        sent_ids, X, L, X_words = datautil.prepare_raw_text(fin,
                                                            word2i,
                                                            pad_id=0,
                                                            unk_id=1,
                                                            language=language)
        sent_ids, X, L = datautil.sort_batch(sent_ids, X, L)
    finally:
        # Release the handle even when preprocessing fails, but only if we
        # opened it ourselves -- stdin must stay open.
        if input_file is not None:
            fin.close()

    # Predict
    Y_h = predict(model, X, L)

    # Reshape flattened output tensor, match tag labels
    # and pair them with input words
    Y_h = Y_h.view(len(X_words), -1)
    Y_h, L, sent_ids = datautil.sort_batch(Y_h, L, sent_ids, descending=False)
    paired = datautil.pair_words_with_tags(X_words, Y_h, L, i2tag)

    # Print to output in the word_TAG format
    datautil.print_words_with_tags(paired)
def main():
    """
    Train a tagger, save it, and evaluate it on dev (optionally test) data.

    Configuration is read from the ``dataparams`` and ``hyperparams``
    mappings. ``hyperparams`` is modified in place: "tagset_size" and
    "padding_id" are set, and "loss_function" is replaced by the object
    obtained from calling it with ``ignore_index=0``. Output files are
    written to ``dataparams["output_dir"]`` with ``TIMESTAMP`` as the file
    name stem.

    :return: None
    """
    # ---- Data -------------------------------------------------------------
    # Embedding file -> word-to-index dictionary and embedding tensor
    stderr_print("Loading embeddings ... ", end="")
    word2i, _, embeddings = datautil.load_embeddings(dataparams["emb_file"])
    stderr_print("DONE")

    # POS tag inventory
    stderr_print("Loading tagset ... ", end="")
    tag2i, i2tag = datautil.load_postags(dataparams["tag_file"])
    hyperparams["tagset_size"] = len(tag2i)
    hyperparams["padding_id"] = 0
    stderr_print("DONE")

    # Index the three splits (train, dev, test -- in that order)
    stderr_print("Loading datasets ... ", end="")
    train_data, dev_data, test_data = [
        datautil.prepare_data(dataparams[split_key], word2i, tag2i,
                              dataparams["input_len"])
        for split_key in ("train_file", "dev_file", "test_file")
    ]

    def build_loader(dataset, shuffle):
        # All loaders share batch size, worker count and collate function;
        # only the dataset and the shuffling flag differ.
        return torch.utils.data.DataLoader(
            dataset,
            batch_size=hyperparams["batch_size"],
            shuffle=shuffle,
            num_workers=8,
            collate_fn=datautil.pad_sort_batch)

    train_loader = build_loader(train_data, True)
    dev_loader = build_loader(dev_data, False)
    test_loader = build_loader(test_data, False)
    stderr_print("DONE")

    # ---- Model ------------------------------------------------------------
    # The configured loss entry is called once here; from now on
    # hyperparams holds its result instead.
    loss_factory = hyperparams["loss_function"]
    hyperparams["loss_function"] = loss_factory(ignore_index=0)
    model = tagger.RNNTagger(embedding_tensor=embeddings, **hyperparams)

    print()
    print("Hyperparameters:")
    print("\n".join(f"{name}: {value}" for name, value in hyperparams.items()))
    print()
    n_trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
    print("Number of trainable parameters:", n_trainable)

    # One confusion matrix is shared by training and both evaluations
    conf_matrix = util.ConfusionMatrix(hyperparams["tagset_size"],
                                       ignore_index=0,
                                       class_dict=i2tag)

    # ---- Training ---------------------------------------------------------
    model, training_log = train(model,
                                train_loader=train_loader,
                                dev_loader=dev_loader,
                                conf_matrix=conf_matrix,
                                **hyperparams,
                                **dataparams)

    # ---- Persist model and log --------------------------------------------
    checkpoint = {
        "model": model,
        "emb_file": dataparams["emb_file"],
        "tag_file": dataparams["tag_file"],
        "padding_emb": embeddings[0],
        "unknown_emb": embeddings[1],
        "language": dataparams["language"]
    }
    model_path = f"{dataparams['output_dir']}/{TIMESTAMP}.model"
    torch.save(checkpoint, model_path)
    print(f"Best model saved to {model_path}")

    if dataparams["save_log"]:
        log_path = f"{dataparams['output_dir']}/{TIMESTAMP}-log.csv"
        util.dictlist_to_csv(training_log, log_path)
        print(f"Training log saved to {log_path}")

    # ---- Dev evaluation ---------------------------------------------------
    print()
    print("Evaluating on dev data:")
    conf_matrix.reset()
    dev_loss = batch_predict(model,
                             data_loader=dev_loader,
                             conf_matrix=conf_matrix,
                             mean_loss=True,
                             **hyperparams)
    conf_matrix.print_class_stats()
    print(f"Dev set accuracy: {conf_matrix.accuracy():.4f}")
    print(f"Dev set mean loss: {dev_loss:8g}")
    print()
    if dataparams["save_conf_matrix"]:
        dev_confmat_path = (
            f"{dataparams['output_dir']}/{TIMESTAMP}-confmat-dev.csv")
        conf_matrix.matrix_to_csv(dev_confmat_path)
        print(f"Confusion matrix saved to {dev_confmat_path}")

    # ---- Test evaluation (optional) ---------------------------------------
    if not evaluate_on_test_data:
        return

    print()
    print("Evaluating on test data:")
    conf_matrix.reset()
    test_loss = batch_predict(model,
                              data_loader=test_loader,
                              conf_matrix=conf_matrix,
                              mean_loss=True,
                              **hyperparams)
    conf_matrix.print_class_stats()
    print(f"Test set accuracy: {conf_matrix.accuracy():.4f}")
    print(f"Test set mean loss: {test_loss:8g}")
    print()
    if dataparams["save_conf_matrix"]:
        test_confmat_path = (
            f"{dataparams['output_dir']}/{TIMESTAMP}-confmat-test.csv")
        conf_matrix.matrix_to_csv(test_confmat_path)
        print(f"Confusion matrix saved to {test_confmat_path}")