def __init__(self, emb_file):
    """Load an embedding space from ``emb_file`` and report its dimensions.

    The three values returned by ``load_embeddings`` are stored as
    ``self.word2i``, ``self.i2word`` and ``self.emb_space``. The pair
    returned by ``self.emb_space.size()`` is stored as ``self.vocab_size``
    and ``self.emb_size``. Progress and a short summary framed by dashed
    rules are written to stderr.

    :param emb_file: embeddings file, passed unchanged to ``load_embeddings``.
    """
    log = sys.stderr

    log.write(f"Loading embeddings from {emb_file} ... ")
    log.flush()  # show the progress message before the load starts
    self.word2i, self.i2word, self.emb_space = load_embeddings(emb_file)
    log.write("DONE!\n")

    self.vocab_size, self.emb_size = self.emb_space.size()

    # The rule is sized from the full vocabulary line, trailing newline
    # included, so it is one dash wider than the visible text.
    vocab_line = "Vocabulary size:{:>8d}\n".format(self.vocab_size)
    rule = "-" * len(vocab_line) + "\n"

    log.write(rule)
    log.write(vocab_line)
    log.write("Embedding size: {:>8d}\n".format(self.emb_size))
    log.write(rule)
    log.flush()
Exemple #2
0
def main():
    """
    Load a trained model from file (-m) and use it to PoS-tag input file.
    If no file is specified, read from stdin.

    The loaded checkpoint must provide the keys "model", "emb_file" and
    "tag_file". The keys "padding_emb", "unknown_emb" and "language" are
    optional: a missing embedding leaves the corresponding row of the loaded
    embeddings untouched, and a missing language falls back to "english"
    unless one was given in the arguments.

    :raises Exception: if "model", "emb_file" or "tag_file" is missing from
        the checkpoint.
    :return: None; the tagged text is emitted by
        datautil.print_words_with_tags.
    """

    # parse args (file/stdin, )
    # NOTE(review): `args` is not defined inside this function; presumably it
    # is parsed at module level -- confirm.
    input_file = args.input_file
    model_file = args.model_file
    language = args.language

    # load model
    # NOTE: torch.load unpickles the file, which can execute arbitrary code.
    # Only load model files that come from a trusted source.
    stderr_print(f"Loading model from {model_file} ... ")
    loaded = torch.load(model_file)

    # Required entries: fail early with a readable message.
    try:
        model = loaded["model"]
    except KeyError as err:
        raise Exception("Failed to load model.") from err

    try:
        emb_file = loaded["emb_file"]
    except KeyError as err:
        raise Exception("No embedding file specified.") from err
    stderr_print(f"Embedding file: {emb_file}")

    try:
        tag_file = loaded["tag_file"]
    except KeyError as err:
        raise Exception("No tag file specified.") from err
    stderr_print(f"Tag file: {tag_file}")

    # Optional entries: warn and fall back instead of failing.
    try:
        padding_emb = loaded["padding_emb"]
    except KeyError:
        padding_emb = None
        warn("No padding embedding specified, defaulting to embedding[0].")
    else:
        stderr_print("Padding embedding loaded.")

    try:
        unknown_emb = loaded["unknown_emb"]
    except KeyError:
        unknown_emb = None
        warn("No 'unknown' embedding specified, defaulting to embedding[1].")
    else:
        stderr_print("'Unknown' embedding loaded.")

    # A language given in the arguments takes precedence over the checkpoint.
    if language is None:
        try:
            language = loaded["language"]
        except KeyError:
            language = "english"
            warn("No language specified, defaulting to english.")
        else:
            stderr_print(f"Language: {language}")

    stderr_print("DONE")

    # First we read the word embeddings file
    # This function returns a word-to-index dictionary and the embedding tensor
    stderr_print(f"Loading embeddings from {emb_file} ... ", end="")
    word2i, _, embeddings = datautil.load_embeddings(emb_file)
    # Rows 0 and 1 are the padding and unknown-word slots (see pad_id/unk_id
    # below); overwrite them with the vectors stored in the checkpoint.
    if padding_emb is not None:
        embeddings[0] = padding_emb
    if unknown_emb is not None:
        embeddings[1] = unknown_emb
    stderr_print("DONE")

    # Load and index POS tag list (only the index-to-tag mapping is needed)
    stderr_print(f"Loading tagset from {tag_file} ... ", end="")
    _, i2tag = datautil.load_postags(tag_file)
    stderr_print("DONE")

    # Read input text from file
    # Read from stdin if no file (-f) is specified
    if input_file is not None:
        stderr_print(f"Reading from {input_file}...")
        fin = open(input_file, "r")
    else:
        stderr_print("Reading from standard input...")
        fin = sys.stdin

    try:
        sent_ids, X, L, X_words = datautil.prepare_raw_text(fin,
                                                            word2i,
                                                            pad_id=0,
                                                            unk_id=1,
                                                            language=language)
        sent_ids, X, L = datautil.sort_batch(sent_ids, X, L)
    finally:
        # Close the file even if reading or sorting fails; never close stdin.
        if input_file is not None:
            fin.close()

    # Predict
    Y_h = predict(model, X, L)

    # Reshape flattened output tensor, match tag labels
    # and pair them with input words
    Y_h = Y_h.view(len(X_words), -1)
    # NOTE(review): presumably this restores the original sentence order by
    # sorting on sent_ids -- confirm against datautil.sort_batch.
    Y_h, L, sent_ids = datautil.sort_batch(Y_h, L, sent_ids, descending=False)
    paired = datautil.pair_words_with_tags(X_words, Y_h, L, i2tag)

    # Print to output in the word_TAG format
    datautil.print_words_with_tags(paired)
Exemple #3
0
def _make_loader(dataset, shuffle):
    """Wrap *dataset* in a DataLoader with the batch settings shared by all splits."""
    return torch.utils.data.DataLoader(
        dataset,
        batch_size=hyperparams["batch_size"],
        shuffle=shuffle,
        num_workers=8,
        collate_fn=datautil.pad_sort_batch)


def _evaluate_split(model, data_loader, conf_matrix, split, out_prefix):
    """Evaluate *model* on one data split, print its statistics and optionally save the confusion matrix.

    :param model: the trained model handed to ``batch_predict``.
    :param data_loader: loader yielding the batches of the split.
    :param conf_matrix: confusion matrix; it is reset before evaluation.
    :param split: lower-case split name ("dev" or "test"), used in the
        printed messages and in the confusion-matrix file name.
    :param out_prefix: common "<output_dir>/<TIMESTAMP>" prefix of all
        output files.
    """
    print()
    print(f"Evaluating on {split} data:")
    conf_matrix.reset()
    loss = batch_predict(model,
                         data_loader=data_loader,
                         conf_matrix=conf_matrix,
                         mean_loss=True,
                         **hyperparams)

    conf_matrix.print_class_stats()
    label = split.capitalize()
    print(f"{label} set accuracy: {conf_matrix.accuracy():.4f}")
    print(f"{label} set mean loss: {loss:8g}")
    print()
    if dataparams["save_conf_matrix"]:
        confmat_path = f"{out_prefix}-confmat-{split}.csv"
        conf_matrix.matrix_to_csv(confmat_path)
        print(f"Confusion matrix saved to {confmat_path}")


def main():
    """Train a tagger, save it, and evaluate it on the dev (and optionally test) data.

    Configuration is read from the names ``hyperparams``, ``dataparams``,
    ``TIMESTAMP`` and ``evaluate_on_test_data``, none of which are defined
    in this function. ``hyperparams`` is modified in place: "tagset_size"
    and "padding_id" are set, and "loss_function" is replaced by the result
    of calling it with ``ignore_index=0``.

    Output files are written as "<output_dir>/<TIMESTAMP><suffix>": the
    model always, the training log if ``dataparams["save_log"]`` is set,
    and the confusion matrices if ``dataparams["save_conf_matrix"]`` is set.
    """
    # Prepare data

    # First we read the word embeddings file
    # This function returns a word-to-index dictionary and the embedding tensor
    stderr_print("Loading embeddings ... ", end="")
    word2i, _, embeddings = datautil.load_embeddings(dataparams["emb_file"])
    stderr_print("DONE")

    # Load and index POS tag list
    stderr_print("Loading tagset ... ", end="")
    tag2i, i2tag = datautil.load_postags(dataparams["tag_file"])
    hyperparams["tagset_size"] = len(tag2i)
    hyperparams["padding_id"] = 0
    stderr_print("DONE")

    # Read and index datasets, then create the dataloaders that batch them
    stderr_print("Loading datasets ... ", end="")
    train_data, dev_data, test_data = [
        datautil.prepare_data(dataparams[file_key], word2i, tag2i,
                              dataparams["input_len"])
        for file_key in ("train_file", "dev_file", "test_file")
    ]

    # Only the training data is shuffled
    train_loader = _make_loader(train_data, shuffle=True)
    dev_loader = _make_loader(dev_data, shuffle=False)
    test_loader = _make_loader(test_data, shuffle=False)
    stderr_print("DONE")

    # Set up the model
    # NOTE: this replaces the stored entry with the value returned by the
    # call, so running main() a second time in the same process would call
    # that returned value instead of the original entry.
    hyperparams["loss_function"] = hyperparams["loss_function"](ignore_index=0)
    model = tagger.RNNTagger(embedding_tensor=embeddings, **hyperparams)
    print()
    print("Hyperparameters:")
    print("\n".join([f"{k}: {v}" for k, v in hyperparams.items()]))
    print()
    print("Number of trainable parameters:",
          sum(p.numel() for p in model.parameters() if p.requires_grad))

    # Set up the confusion matrix to record our predictions
    conf_matrix = util.ConfusionMatrix(hyperparams["tagset_size"],
                                       ignore_index=0,
                                       class_dict=i2tag)

    # Train the model
    model, training_log = train(model,
                                train_loader=train_loader,
                                dev_loader=dev_loader,
                                conf_matrix=conf_matrix,
                                **hyperparams,
                                **dataparams)

    # Save model and training log
    checkpoint = {
        "model": model,
        "emb_file": dataparams["emb_file"],
        "tag_file": dataparams["tag_file"],
        "padding_emb": embeddings[0],
        "unknown_emb": embeddings[1],
        "language": dataparams["language"],
    }
    # All output files share this prefix and differ only in their suffix
    out_prefix = f"{dataparams['output_dir']}/{TIMESTAMP}"

    model_path = f"{out_prefix}.model"
    torch.save(checkpoint, model_path)
    print(f"Best model saved to {model_path}")

    if dataparams["save_log"]:
        log_path = f"{out_prefix}-log.csv"
        util.dictlist_to_csv(training_log, log_path)
        print(f"Training log saved to {log_path}")

    # Evaluate model on dev data
    _evaluate_split(model, dev_loader, conf_matrix, "dev", out_prefix)

    # Evaluate model on test data
    if evaluate_on_test_data:
        _evaluate_split(model, test_loader, conf_matrix, "test", out_prefix)