Example #1
def _run_model(_, args):
    run_file = args.test
    out_file = args.output
    vocab_file = args.vocab
    model_file = args.parameter_file
    batch_size = args.batch_size
    word_embeddings = None

    model_class = INCLUDED_MODELS.get(args.model_name)

    vocab = Vocabulary().load(vocab_file)
    model = model_class(vocab, word_embeddings)
    parser = Model(
        model,
        decoder="eisner",
        loss="kiperwasser",
        optimizer="adam",
        strategy="bucket",
        vocab=vocab,
    )

    parser.load_from_file(model_file)

    run_data = vocab.tokenize_conll(run_file)
    predictions = parser.run(run_data, batch_size)
    write_predictions_to_file(predictions,
                              reference_file=run_file,
                              output_file=out_file,
                              vocab=vocab)

    print(">> Wrote predictions to conllu file %s" % out_file)
Example #2
def _eval_model(_, args):
    test_file = args.filename
    vocab_file = args.vocab
    model_file = args.parameter_file
    batch_size = args.batch_size
    word_embeddings = None

    model_class = INCLUDED_MODELS.get(args.model_name)

    vocab = Vocabulary().load(vocab_file)
    model = model_class(vocab, word_embeddings)
    parser = Model(
        model,
        decoder="eisner",
        loss="kiperwasser",
        optimizer="adam",
        strategy="bucket",
        vocab=vocab,
    )

    parser.load_from_file(model_file)
    test_data = vocab.tokenize_conll(test_file)
    metrics = parser.evaluate(test_file, test_data, batch_size=batch_size)

    for key, value in metrics.items():
        print(key, round(value, 3))
Example #3
def _load_model(self):
    """Load the original K&G model and vocab."""
    self.vocab = Vocabulary(self.model_config['only_words'])
    self.vocab.load(self.model_config['vocab_file'])
    self.parser = DependencyParserPytorch(self.vocab,
                                          self.model_config['upos_dim'],
                                          self.model_config['word_dim'],
                                          self.model_config['hidden_dim'])
    self.model = ParserModel(self.parser,
                             decoder="eisner",
                             loss="kiperwasser",
                             optimizer="adam",
                             strategy="bucket",
                             vocab=self.vocab)
    self.model.load_from_file(self.model_config['model_file'])
Example #4
def _train_model(_, args):
    train_file = args.train
    dev_file = args.dev
    epochs = args.epochs
    vocab_dest = args.vocab
    model_dest = args.parameter_file
    batch_size = args.batch_size
    embedding_file = None

    model_class = INCLUDED_MODELS.get(args.model_name)

    if not model_class:
        raise ValueError("Model %s doesn't exist." % args.model_name)

    # Disable patience if there is no dev. set
    patience = args.patience if dev_file else -1

    vocab = Vocabulary().fit(train_file, embedding_file)
    word_embeddings = vocab.load_embedding() if embedding_file else None
    if word_embeddings is not None:
        print("> Embedding shape", word_embeddings.shape)

    # save vocab for reproducibility later
    print("> Saving vocabulary to", vocab_dest)
    vocab.save(vocab_dest)

    # prep data
    print(">> Loading in data")
    training_data = vocab.tokenize_conll(train_file)

    dev_data = vocab.tokenize_conll(dev_file) if dev_file else None

    # instantiate model
    model = model_class(vocab, word_embeddings)

    # 'best' only saves models that improve results on the dev. set
    # 'epoch' saves models on each epoch to a file appended with the epoch number
    save_mode = "best" if dev_file else "epoch"
    save_callback = ModelSaveCallback(model_dest, mode=save_mode)
    callbacks = [save_callback]

    # prep params
    parser = Model(
        model,
        decoder="eisner",
        loss="kiperwasser",
        optimizer="adam",
        strategy="bucket",
        vocab=vocab,
    )
    parser.train(
        training_data,
        dev_file,
        dev_data,
        epochs=epochs,
        batch_size=batch_size,
        callbacks=callbacks,
        patience=patience,
    )
Example #5
def load_or_create_vocab(arguments):

    vocab = Vocabulary(arguments.only_words)

    # load or create vocabulary
    try:
        vocab.load(arguments.vocab_file)
    except Exception:
        vocab = vocab.fit(arguments.train)

        # save vocab for reproducibility later
        logging.info("> saving vocab to %s" % (arguments.vocab_file))
        vocab.save(arguments.vocab_file)

    return vocab
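
A hypothetical invocation of the helper above, assuming an argparse namespace that exposes the fields it reads (only_words, vocab_file, train); the file names are placeholders:

from argparse import Namespace

# Placeholder arguments mirroring what load_or_create_vocab expects.
args = Namespace(only_words=True,
                 vocab_file="vocab.pkl",
                 train="train.conllu")
vocab = load_or_create_vocab(args)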
Example #6
def load_or_create_vocab_and_embs(arguments):

    vocab = Vocabulary(arguments.only_words)

    # load or create vocabulary
    try:
        vocab.load(arguments.vocab_file)
    except Exception:
        if arguments.embs is None:
            vocab = vocab.fit(arguments.train)
        else:
            vocab = vocab.fit(arguments.train, arguments.embs)

        # save vocab for reproducibility later
        logging.info("> saving vocab to %s" % (arguments.vocab_file))
        vocab.save(arguments.vocab_file)

    if arguments.embs is None:
        embs = None
    else:
        embs = vocab.load_embedding()
        logging.info('shape %s' % (embs.shape))

    return vocab, embs
Example #7
def write_predictions_to_file(predictions: Iterable, reference_file: str,
                              output_file: str, vocab: Vocabulary):
    indices, arcs, rels = zip(*predictions)
    flat_arcs = _flat_map(arcs)
    flat_rels = _flat_map(rels)

    idx = 0
    with open(reference_file,
              encoding="UTF-8") as f, open(output_file, 'w',
                                           encoding="UTF-8") as fo:
        for line in f.readlines():
            if re.match(r'\d+\t', line):
                info = line.strip().split()
                assert len(info) == 10, 'Illegal line: %s' % line
                info[6] = str(flat_arcs[idx])
                info[7] = vocab.id2rel(flat_rels[idx])
                fo.write('\t'.join(info) + '\n')
                idx += 1
            else:
                fo.write(line)
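
For context on the two columns this function overwrites: a CoNLL-U token line has 10 tab-separated fields, and indices 6 and 7 hold HEAD and DEPREL. A minimal, self-contained illustration with a made-up token line:

# Minimal illustration (made-up token line): rewrite the HEAD and DEPREL columns,
# which is exactly what write_predictions_to_file does for each matched line.
fields = "1\tThe\tthe\tDET\tDT\t_\t2\tdet\t_\t_".split("\t")
assert len(fields) == 10
fields[6] = "3"      # predicted head index
fields[7] = "det"    # predicted relation label
print("\t".join(fields))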
Example #8
parser = argparse.ArgumentParser()
parser.add_argument("--train", required=True)
parser.add_argument("--dev", required=True)
parser.add_argument("--test", required=True)
parser.add_argument("--model", required=True)

arguments, unknown = parser.parse_known_args()

TRAIN_FILE = arguments.train
DEV_FILE = arguments.dev
TEST_FILE = arguments.test
MODEL_FILE = arguments.model
n_epochs = 5

vocab = Vocabulary()
vocab.fit(TRAIN_FILE)

print(">> Loading in data")
TRAIN = vocab.tokenize_conll(arguments.train)
DEV = vocab.tokenize_conll(arguments.dev)
TEST = vocab.tokenize_conll(arguments.test)

encoder = BetaEncodeHandler()
print("> pre-encoding edges")
s = time.time()
TRAIN = pre_encode(encoder, TRAIN, accumulate_vocab=True)
DEV = pre_encode(encoder, DEV)
TEST = pre_encode(encoder, TEST)
print(">> done pre-encoding", time.time() - s)
Example #9
class EmbeddingsExtractor(object):
    def __init__(self, logging_file, model_config):

        # configure logging
        self.logging_file = logging_file
        self._configure_logging()

        self.model_config = model_config
        logging.info(model_config)

        # load vocabulary, parser and model
        self._load_model()

        # create lstms
        self._create_lstms()

    def _configure_logging(self):
        logging.basicConfig(filename=self.logging_file,
                            level=logging.DEBUG,
                            format="%(asctime)s:%(levelname)s:\t%(message)s")

    def _load_model(self):
        """ load original K&G model and  vocab
        """
        self.vocab = Vocabulary(self.model_config['only_words'])
        self.vocab.load(self.model_config['vocab_file'])
        self.parser = DependencyParserPytorch(self.vocab,
                                              self.model_config['upos_dim'],
                                              self.model_config['word_dim'],
                                              self.model_config['hidden_dim'])
        self.model = ParserModel(self.parser,
                                 decoder="eisner",
                                 loss="kiperwasser",
                                 optimizer="adam",
                                 strategy="bucket",
                                 vocab=self.vocab)
        self.model.load_from_file(self.model_config['model_file'])

    def _create_lstms(self):
        # create and initialize FWD and BWD biLSTMs with model parameters

        input_size = self.model_config['word_dim'] + self.model_config[
            'upos_dim']

        state_dict = self.parser.deep_bilstm.state_dict()

        self.lstm_fwd_0 = nn.LSTM(input_size=input_size,
                                  hidden_size=self.model_config['hidden_dim'],
                                  num_layers=1,
                                  batch_first=True,
                                  bidirectional=False)
        new_state_dict = collections.OrderedDict()
        new_state_dict['weight_hh_l0'] = state_dict['lstm.weight_hh_l0']
        new_state_dict['weight_ih_l0'] = state_dict['lstm.weight_ih_l0']
        new_state_dict['bias_hh_l0'] = state_dict['lstm.bias_hh_l0']
        new_state_dict['bias_ih_l0'] = state_dict['lstm.bias_ih_l0']
        self.lstm_fwd_0.load_state_dict(new_state_dict)

        self.lstm_bwd_0 = nn.LSTM(input_size=input_size,
                                  hidden_size=self.model_config['hidden_dim'],
                                  num_layers=1,
                                  batch_first=True,
                                  bidirectional=False)
        new_state_dict = collections.OrderedDict()
        new_state_dict['weight_hh_l0'] = state_dict[
            'lstm.weight_hh_l0_reverse']
        new_state_dict['weight_ih_l0'] = state_dict[
            'lstm.weight_ih_l0_reverse']
        new_state_dict['bias_hh_l0'] = state_dict['lstm.bias_hh_l0_reverse']
        new_state_dict['bias_ih_l0'] = state_dict['lstm.bias_ih_l0_reverse']
        self.lstm_bwd_0.load_state_dict(new_state_dict)

        # NOTE: input_size = 2*hidden_dim because layer 1 consumes the concatenated fwd/bwd outputs of layer 0
        self.lstm_fwd_1 = nn.LSTM(input_size=2 *
                                  self.model_config['hidden_dim'],
                                  hidden_size=self.model_config['hidden_dim'],
                                  num_layers=1,
                                  batch_first=True,
                                  bidirectional=False)
        new_state_dict = collections.OrderedDict()
        new_state_dict['weight_hh_l0'] = state_dict['lstm.weight_hh_l1']
        new_state_dict['weight_ih_l0'] = state_dict['lstm.weight_ih_l1']
        new_state_dict['bias_hh_l0'] = state_dict['lstm.bias_hh_l1']
        new_state_dict['bias_ih_l0'] = state_dict['lstm.bias_ih_l1']
        self.lstm_fwd_1.load_state_dict(new_state_dict)

        # NOTE: input_size = 2*hidden_dim because layer 1 consumes the concatenated fwd/bwd outputs of layer 0
        self.lstm_bwd_1 = nn.LSTM(input_size=2 *
                                  self.model_config['hidden_dim'],
                                  hidden_size=self.model_config['hidden_dim'],
                                  num_layers=1,
                                  batch_first=True,
                                  bidirectional=False)
        new_state_dict = collections.OrderedDict()
        new_state_dict['weight_hh_l0'] = state_dict[
            'lstm.weight_hh_l1_reverse']
        new_state_dict['weight_ih_l0'] = state_dict[
            'lstm.weight_ih_l1_reverse']
        new_state_dict['bias_hh_l0'] = state_dict['lstm.bias_hh_l1_reverse']
        new_state_dict['bias_ih_l0'] = state_dict['lstm.bias_ih_l1_reverse']
        self.lstm_bwd_1.load_state_dict(new_state_dict)

    def generate_embeddings(self, input_file):

        logging.info(
            "\n\n\n==================================================================================================="
        )
        logging.info("Generating K&G contextual embeddings for %s" %
                     input_file)
        logging.info(
            "===================================================================================================\n"
        )

        # generate tokenized data
        tokenized_sentences = self.vocab.tokenize_conll(input_file)

        embs = {}
        for i, sample in enumerate(tokenized_sentences):
            self.model.backend.renew_cg()  # for pytorch it is just 'pass'

            # get embeddings

            words, lemmas, tags, heads, rels, chars = sample

            words = self.model.backend.input_tensor(np.array([words]),
                                                    dtype="int")
            tags = self.model.backend.input_tensor(np.array([tags]),
                                                   dtype="int")

            word_embs = self.parser.wlookup(words)
            tags_embs = self.parser.tlookup(
                tags)  # TODO think if it makes sense to use tag_embs or not!

            input_data0 = torch.cat(
                [word_embs, tags_embs],
                dim=-1)  # dim 1x8x125 (if we have 8 words in the sentence)
            input_data0_reversed = torch.flip(input_data0, (1, ))

            # feed data

            out_lstm_fwd_0, hidden_lstm_fwd_0 = self.lstm_fwd_0(input_data0)
            out_lstm_bwd_0, hidden_lstm_bwd_0 = self.lstm_bwd_0(
                input_data0_reversed)

            input_data1 = torch.cat((out_lstm_fwd_0, out_lstm_bwd_0), 2)
            input_data1_reversed = torch.flip(input_data1, (1, ))
            out_lstm_fwd_1, hidden_lstm_fwd_1 = self.lstm_fwd_1(input_data1)
            out_lstm_bwd_1, hidden_lstm_bwd_1 = self.lstm_bwd_1(
                input_data1_reversed)

            # generate embeddings

            out_lstm_bwd_0 = torch.flip(out_lstm_bwd_0, (1, ))
            out_lstm_bwd_1 = torch.flip(out_lstm_bwd_1, (1, ))

            # TODO in ELMo they perform a task-dependent weighted sum of the concatenation of L0 (initial embeddings), L1 and L2;
            #  As our input has varying sizes and we are not weighting the layers, we'll just concatenate everything.
            # TODO for the syntactic probes, ELMo stores the three layers separately, so maybe we can do the same, at least with layer 0 and layer 1?
            sentence_embeddings = torch.cat(
                (input_data0, out_lstm_fwd_0, out_lstm_bwd_0, out_lstm_fwd_1,
                 out_lstm_bwd_1), 2)  # 1 x 8 x 125+100+100+100+100 = 525
            embs[i] = sentence_embeddings

        return embs

    @staticmethod
    def save_to_hdf5(embeddings, file_path, skip_root=False):
        # save embeddings in hdf5 format

        # Write contextual word representations to disk for each of the train, dev, and
        # test splits in hdf5 format, where the index of the sentence in the conllx file
        # is the key to the hdf5 dataset object. That is, the dataset file should look
        # a bit like {'0': <np.ndarray(size=(1,SEQLEN1,FEATURE_COUNT))>,
        # '1': <np.ndarray(size=(1,SEQLEN2,FEATURE_COUNT))>, ...}, etc.
        # Note that SEQLEN for each sentence must be the number of tokens in the
        # sentence as specified by the conllx file.

        with h5py.File(file_path, 'w') as f:
            for k, v in embeddings.items():
                logging.info('creating dataset for k %s' % str(k))
                sentence_embs = v.detach().numpy()
                if skip_root:
                    sentence_embs = sentence_embs[:, 1:, :]
                f.create_dataset(str(k), data=sentence_embs)

    @staticmethod
    def check_hdf5_file(file_path):

        with h5py.File(file_path, 'r') as f:
            for item in f.items():
                logging.info(item)
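
A minimal read-back sketch for the HDF5 layout described in save_to_hdf5 above; the file name is a placeholder, and the shape assumes the (1, SEQLEN, FEATURE_COUNT) layout noted in the comments there:

import h5py

# Read the per-sentence embeddings back; keys are sentence indices as strings,
# values are arrays of shape (1, SEQLEN, FEATURE_COUNT).
with h5py.File("embeddings.hdf5", "r") as f:   # placeholder file name
    for key in sorted(f.keys(), key=int):
        sentence_embs = f[key][()]
        print(key, sentence_embs.shape)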
Example #10
def compute_vocab_size(vocab_file):
    vocab = Vocabulary()
    vocab.load(vocab_file)
    logging.info('Size of %s: %s' % (vocab_file, vocab.vocab_size))
Example #11
def main():
    parser = argparse.ArgumentParser()

    parser.add_argument(
        "--train",
        dest="train",
        help="Annotated CONLL train file",
        metavar="FILE",
        required=True,
    )
    parser.add_argument(
        "--dev",
        dest="dev",
        help="Annotated CONLL dev file",
        metavar="FILE",
        required=True,
    )
    parser.add_argument(
        "--test",
        dest="test",
        help="Annotated CONLL dev test",
        metavar="FILE",
        required=True,
    )
    parser.add_argument("--epochs", dest="epochs", type=int, default=30)
    parser.add_argument("--tb_dest", dest="tb_dest")
    parser.add_argument("--vocab_dest", dest="vocab_dest")
    parser.add_argument("--model_dest", dest="model_dest", required=True)
    parser.add_argument(
        "--embs", dest="embs", help="pre-trained embeddings file name", required=False
    )
    parser.add_argument(
        "--no_update_pretrained_emb",
        dest="no_update_pretrained_emb",
        help="don't update the pretrained embeddings during training",
        default=False,
        action="store_true",
    )
    parser.add_argument("--patience", dest="patience", type=int, default=-1)

    arguments, unknown = parser.parse_known_args()

    n_epochs = arguments.epochs

    vocab = Vocabulary()
    if arguments.embs:
        vocab = vocab.fit(arguments.train, arguments.embs)
        embs = vocab.load_embedding()
        print("shape", embs.shape)
    else:
        vocab = vocab.fit(arguments.train)
        embs = None

    # save vocab for reproducibility later
    if arguments.vocab_dest:
        print("> saving vocab to", arguments.vocab_dest)
        vocab.save(arguments.vocab_dest)

    # prep data
    print(">> Loading in data")
    training_data = vocab.tokenize_conll(arguments.train)
    dev_data = vocab.tokenize_conll(arguments.dev)
    test_data = vocab.tokenize_conll(arguments.test)

    # instantiate model
    model = DependencyParser(vocab, embs)

    callbacks = []
    tensorboard_logger = None
    if arguments.tb_dest:
        tensorboard_logger = TensorboardLoggerCallback(arguments.tb_dest)
        callbacks.append(tensorboard_logger)

    save_callback = ModelSaveCallback(arguments.model_dest)
    callbacks.append(save_callback)

    # prep params
    parser = Model(
        model,
        decoder="eisner",
        loss="kiperwasser",
        optimizer="adam",
        strategy="bucket",
        vocab=vocab,
    )
    parser.train(
        training_data,
        arguments.dev,
        dev_data,
        epochs=n_epochs,
        batch_size=32,
        callbacks=callbacks,
        patience=arguments.patience,
    )
    parser.load_from_file(arguments.model_dest)

    metrics = parser.evaluate(arguments.test, test_data, batch_size=32)
    test_UAS = metrics["nopunct_uas"]
    test_LAS = metrics["nopunct_las"]

    print(metrics)

    if arguments.tb_dest and tensorboard_logger:
        tensorboard_logger.raw_write("test_UAS", test_UAS)
        tensorboard_logger.raw_write("test_LAS", test_LAS)

    print()
    print(">>> Model maxed on dev at epoch", save_callback.best_epoch)
    print(">>> Test score:", test_UAS, test_LAS)
Example #12
parser.add_argument("--dev", dest="dev", help="Annotated CONLL dev file", metavar="FILE", required=True)
parser.add_argument("--test", dest="test", help="Annotated CONLL dev test", metavar="FILE", required=True)
parser.add_argument("--epochs", dest="epochs", type=int, default=30)
parser.add_argument("--tb_dest", dest="tb_dest")
parser.add_argument("--vocab_dest", dest="vocab_dest")
parser.add_argument("--model_dest", dest="model_dest", required=True)
parser.add_argument("--embs", dest="embs", help="pre-trained embeddings file name", required=False)
parser.add_argument("--no_update_pretrained_emb", dest="no_update_pretrained_emb", help="don't update the pretrained embeddings during training", default=False, action='store_true')
parser.add_argument("--patience", dest='patience', type=int, default=-1)
parser.add_argument("--dev_mode", dest='dev_mode', default=False, help='small subset of training examples, for code testing', action='store_true')

arguments, unknown = parser.parse_known_args()

n_epochs = arguments.epochs

vocab = Vocabulary()
if arguments.embs is None:
    vocab = vocab.fit(arguments.train)
    embs = None
else:
    vocab = vocab.fit(arguments.train, arguments.embs)
    embs = vocab.load_embedding()
    print('shape', embs.shape)

# save vocab for reproducibility later
if arguments.vocab_dest:
    print("> saving vocab to", arguments.vocab_dest)
    vocab.save(arguments.vocab_dest)

# prep data
print(">> Loading in data")
Example #13
if __name__ == "__main__":
    ARGPARSER = argparse.ArgumentParser()
    ARGPARSER.add_argument("--train", required=True)
    ARGPARSER.add_argument("--dev", required=True)
    ARGPARSER.add_argument("--test", required=True)
    ARGPARSER.add_argument("--model", required=True)

    ARGUMENTS, UNK = ARGPARSER.parse_known_args()

    TRAIN_FILE = ARGUMENTS.train
    DEV_FILE = ARGUMENTS.dev
    TEST_FILE = ARGUMENTS.test
    MODEL_FILE = ARGUMENTS.model
    N_EPOCHS = 5

    VOCAB = Vocabulary()
    VOCAB.fit(TRAIN_FILE)

    print("> Loading in data")
    TRAIN = VOCAB.tokenize_conll(ARGUMENTS.train)
    DEV = VOCAB.tokenize_conll(ARGUMENTS.dev)
    TEST = VOCAB.tokenize_conll(ARGUMENTS.test)

    ENCODER = BetaEncodeHandler()
    print("> Pre-encoding edges")
    START_TIME = time.time()
    TRAIN = pre_encode(ENCODER, TRAIN, accumulate_vocab=True)
    DEV = pre_encode(ENCODER, DEV)
    TEST = pre_encode(ENCODER, TEST)
    print(">> Done pre-encoding edges", time.time() - START_TIME)
Example #14
def main():
    """Main function."""
    argparser = argparse.ArgumentParser()

    argparser.add_argument("--train", required=True)
    argparser.add_argument("--dev", required=True)
    argparser.add_argument("--test", required=True)
    argparser.add_argument("--emb", dest="emb")
    argparser.add_argument("--epochs", dest="epochs", type=int, default=283)
    argparser.add_argument("--vocab_dest", dest="vocab_dest", required=True)
    argparser.add_argument("--model_dest", dest="model_dest", required=True)

    argparser.add_argument("--lstm_layers", dest="lstm_layers", type=int, default=3)
    argparser.add_argument("--dropout", type=int, default=0.33)

    arguments, _ = argparser.parse_known_args()

    # [Data]
    min_occur_count = 2
    train_file = arguments.train
    dev_file = arguments.dev
    vocab_destination = arguments.vocab_dest
    model_destination = arguments.model_dest

    # [Network]
    word_dims = 100
    tag_dims = 100
    lstm_hiddens = 400
    mlp_arc_size = 500
    mlp_rel_size = 100
    lstm_layers = arguments.lstm_layers
    dropout_emb = arguments.dropout
    dropout_lstm_input = arguments.dropout
    dropout_lstm_hidden = arguments.dropout
    dropout_mlp = arguments.dropout

    # [Hyperparameters for optimizer]
    learning_rate = 2e-3
    decay = 0.75
    decay_steps = 5000
    beta_1 = 0.9
    beta_2 = 0.9
    epsilon = 1e-12

    # [Run]
    batch_scale = 5000  # for scaled batching
    n_epochs = arguments.epochs

    vocab = Vocabulary()
    vocab = vocab.fit(train_file, arguments.emb, min_occur_count)
    embs = vocab.load_embedding(True) if arguments.emb else None

    vocab.save(vocab_destination)

    model = DozatManning(
        vocab,
        word_dims,
        tag_dims,
        dropout_emb,
        lstm_layers,
        lstm_hiddens,
        dropout_lstm_input,
        dropout_lstm_hidden,
        mlp_arc_size,
        mlp_rel_size,
        dropout_mlp,
        pretrained_embeddings=embs,
    )

    optimizer = dy.AdamTrainer(
        model.parameter_collection, learning_rate, beta_1, beta_2, epsilon
    )

    # Callbacks
    custom_learning_update_callback = UpdateParamsCallback(
        optimizer, learning_rate, decay, decay_steps
    )
    save_callback = ModelSaveCallback(model_destination)
    callbacks = [custom_learning_update_callback, save_callback]

    parser = Model(
        model,
        decoder="cle",
        loss="crossentropy",
        optimizer=optimizer,
        strategy="scaled_batch",
        vocab=vocab,
    )

    # Prep data
    training_data = vocab.tokenize_conll(arguments.train)
    dev_data = vocab.tokenize_conll(arguments.dev)
    test_data = vocab.tokenize_conll(arguments.test)

    parser.train(
        training_data,
        dev_file,
        dev_data,
        epochs=n_epochs,
        batch_size=batch_scale,
        callbacks=callbacks,
    )

    parser.load_from_file(model_destination)

    metrics = parser.evaluate(arguments.test, test_data, batch_size=batch_scale)
    test_uas = metrics["nopunct_uas"]
    test_las = metrics["nopunct_las"]

    print()
    print(metrics)
    print(">> Test score:", test_uas, test_las)
Example #15
if arguments.lstm_layers != 3:
    print(
        ">> WARNING: running with more or fewer BiLSTM layers than the original (%d)"
        % arguments.lstm_layers)

if arguments.embedding_file is None:
    print(">> WARNING: Running without pretrained embeddings.")

if arguments.no_orth_init:
    print(">> Warning: running without orthogonal initilization on parameters")

if arguments.dropout < 0.33:
    print(">> Warning: running model with less dropout")

vocab = Vocabulary()
vocab = vocab.fit(train_file, pretrained_embeddings_file, min_occur_count)
embs = vocab.load_embedding(
    variance_normalize=True) if arguments.embedding_file else None

# save vocab for reproducibility later
if vocab_destination:
    vocab.save(vocab_destination)
    print("> saving vocab to", vocab_destination)
""" """
model = BaseParser(vocab, word_dims, tag_dims, dropout_emb, lstm_layers,
                   lstm_hiddens, dropout_lstm_input, dropout_lstm_hidden,
                   mlp_arc_size, mlp_rel_size, dropout_mlp, embs, orth_init)
""" Instantiate custom optimizer """
optimizer = dy.AdamTrainer(model.parameter_collection, learning_rate, beta_1,
                           beta_2, epsilon)
Example #16
                        format="%(asctime)s:%(levelname)s:\t%(message)s")

    logging.info(
        "\n\n\n==================================================================================================="
    )
    logging.info("kiperwasser_embeddings_extractor")
    logging.info(
        "===================================================================================================\n"
    )

    # load model and  vocab

    vocab_file = '/home/lpmayos/hd/code/UniParse/models/kiperwasser/1b/bpe/mini/only_words_true/run1/vocab.pkl'
    model_file = '/home/lpmayos/hd/code/UniParse/models/kiperwasser/1b/bpe/mini/only_words_true/run1/model.model'
    only_words = True
    vocab = Vocabulary(only_words)
    vocab.load(vocab_file)
    embs = None
    parser = DependencyParser(vocab, embs, False)

    model = ParserModel(parser,
                        decoder="eisner",
                        loss="kiperwasser",
                        optimizer="adam",
                        strategy="bucket",
                        vocab=vocab)
    model.load_from_file(model_file)

    # input_file = '/home/lpmayos/hd/code/cvt_text/data/raw_data/depparse/test_mini.txt'
    input_file = '/home/lpmayos/hd/code/structural-probes/example/data/en_ewt-ud-sample/en_ewt-ud-dev.conllu'