Example #1
def do_training(arguments, vocab):
    logging.debug("Init training")
    n_epochs = arguments.epochs
    batch_size = arguments.batch_size

    # prep data
    logging.info(">> Loading in data")

    logging.info("tokenizing train data ...")
    training_data = vocab.tokenize_conll(arguments.train)
    logging.info("... tokenized train data")

    if arguments.dev_mode:
        training_data = training_data[:100]

    logging.info("tokenizing dev data ...")
    dev_data = vocab.tokenize_conll(arguments.dev)
    logging.info("... tokenized dev data")

    # instantiate model
    logging.info("creating model ...")
    model = DependencyParser(vocab, arguments.upos_dim, arguments.word_dim,
                             arguments.hidden_dim)
    logging.info("... model created")

    callbacks = []
    tensorboard_logger = None
    if arguments.tb_dest:
        tensorboard_logger = TensorboardLoggerCallback(arguments.tb_dest)
        callbacks.append(tensorboard_logger)

    logging.info("creating ModelSaveCallback ...")
    save_callback = ModelSaveCallback(arguments.model_file)
    callbacks.append(save_callback)
    logging.info("... ModelSaveCallback created")

    # prep params
    logging.info("creating Model ...")
    parser = ParserModel(model,
                         decoder="eisner",
                         loss="kiperwasser",
                         optimizer="adam",
                         strategy="bucket",
                         vocab=vocab)
    logging.info("... Model created")

    logging.info("training Model ...")
    parser.train(training_data,
                 arguments.dev,
                 dev_data,
                 epochs=n_epochs,
                 batch_size=batch_size,
                 callbacks=callbacks,
                 patience=arguments.patience)
    logging.info("...Model trained")

    logging.info("Model maxed on dev at epoch %s " %
                 (save_callback.best_epoch))

    return parser
Example #2
def do_training_big_datasets(arguments, vocab, embs, subset_size):
    logging.debug("Init training with big dataset (there is no dev mode)")
    n_epochs = arguments.epochs
    batch_size = arguments.batch_size

    logging.info("tokenizing dev data ...")
    dev_data = vocab.tokenize_conll(arguments.dev)
    logging.info("... tokenized dev data")

    # instantiate model
    logging.info("creating model ...")
    model = DependencyParser(vocab, embs, arguments.no_update_pretrained_emb)
    logging.info("... model created")

    callbacks = []
    if arguments.tb_dest:
        tensorboard_logger = TensorboardLoggerCallback(arguments.tb_dest)
        callbacks.append(tensorboard_logger)

    logging.info("creating ModelSaveCallback ...")
    save_callback = ModelSaveCallback(arguments.model_file)
    callbacks.append(save_callback)
    logging.info("... ModelSaveCallback created")

    # prep params
    logging.info("creating Model ...")
    parser = ParserModel(model,
                         decoder="eisner",
                         loss="kiperwasser",
                         optimizer="adam",
                         strategy="bucket",
                         vocab=vocab)
    logging.info("... Model created")

    logging.info("training Model ...")
    parser.train_big_datasets(arguments.train,
                              arguments.dev,
                              dev_data,
                              epochs=n_epochs,
                              batch_size=batch_size,
                              callbacks=callbacks,
                              patience=arguments.patience,
                              subset_size=subset_size)
    logging.info("...Model trained")

    logging.info("Model maxed on dev at epoch %s " %
                 (save_callback.best_epoch))

    return parser
Example #3
    def _load_model(self):
        """ load original K&G model and vocab
        """
        self.vocab = Vocabulary(self.model_config['only_words'])
        self.vocab.load(self.model_config['vocab_file'])
        self.parser = DependencyParserPytorch(self.vocab,
                                              self.model_config['upos_dim'],
                                              self.model_config['word_dim'],
                                              self.model_config['hidden_dim'])
        self.model = ParserModel(self.parser,
                                 decoder="eisner",
                                 loss="kiperwasser",
                                 optimizer="adam",
                                 strategy="bucket",
                                 vocab=self.vocab)
        self.model.load_from_file(self.model_config['model_file'])
Example #4
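# str2bool (used by several of the boolean flags below) is not shown in this snippet.
# A minimal sketch of such a helper (hypothetical; adapt to the project's actual implementation):
def str2bool(value):
    """Parse a command-line string into a boolean for argparse."""
    if isinstance(value, bool):
        return value
    if value.lower() in ("yes", "true", "t", "y", "1"):
        return True
    if value.lower() in ("no", "false", "f", "n", "0"):
        return False
    raise argparse.ArgumentTypeError("Boolean value expected, got %r" % value)
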
def main():

    parser = argparse.ArgumentParser()

    parser.add_argument(
        "--only_words",
        dest="only_words",
        type=str2bool,
        default=False,
        help=
        "Should we use only words to train? Lemmas and POS will be ignored",
        required=True)

    parser.add_argument("--do_training",
                        dest="do_training",
                        type=str2bool,
                        default=False,
                        help="Should we train the model?",
                        required=True)
    parser.add_argument("--train_file",
                        dest="train",
                        help="Annotated CONLL train file",
                        metavar="FILE",
                        required=False)
    parser.add_argument("--dev_file",
                        dest="dev",
                        help="Annotated CONLL dev file",
                        metavar="FILE",
                        required=False)
    parser.add_argument("--test_file",
                        dest="test",
                        help="Annotated CONLL dev test",
                        metavar="FILE",
                        required=True)

    parser.add_argument(
        "--results_folder",
        dest="results_folder",
        help="Folder to store log, model, vocabulary and output",
        metavar="FILE",
        required=True)
    parser.add_argument("--logging_file",
                        dest="logging_file",
                        help="File to store the logs",
                        metavar="FILE",
                        required=True)
    parser.add_argument("--output_file",
                        dest="output_file",
                        help="CONLL output file",
                        metavar="FILE",
                        required=True)
    parser.add_argument("--vocab_file", dest="vocab_file", required=True)
    parser.add_argument("--model_file", dest="model_file", required=True)

    parser.add_argument("--epochs", dest="epochs", type=int, default=30)
    parser.add_argument("--batch_size",
                        dest="batch_size",
                        type=int,
                        default=32)
    parser.add_argument("--tb_dest", dest="tb_dest")
    parser.add_argument("--embs",
                        dest="embs",
                        help="pre-trained embeddings file name",
                        required=False)
    parser.add_argument(
        "--no_update_pretrained_emb",
        dest="no_update_pretrained_emb",
        type=str2bool,
        default=False,
        help="don't update the pretrained embeddings during training")
    parser.add_argument("--patience", dest='patience', type=int, default=-1)
    parser.add_argument(
        "--dev_mode",
        dest='dev_mode',
        type=str2bool,
        default=False,
        help='small subset of training examples, for code testing')

    parser.add_argument(
        "--big_dataset",
        dest='big_dataset',
        type=str2bool,
        default=False,
        help='Are you training with a huge dataset? (i.e. 1B benchmark)')

    arguments, unknown = parser.parse_known_args()

    # create results folder if needed

    if not os.path.exists(arguments.results_folder):
        os.makedirs(arguments.results_folder)

    # configure logging

    logging.basicConfig(filename=arguments.logging_file,
                        level=logging.DEBUG,
                        format="%(asctime)s:%(levelname)s:\t%(message)s")

    logging.info(
        "\n\n\n==================================================================================================="
    )
    logging.info("kiperwasser_main")
    logging.info(
        "===================================================================================================\n"
    )

    logging.info("\nArguments:")
    logging.info(arguments)
    logging.info("\n")

    # load or create vocabulary and embeddings

    vocab, embs = load_or_create_vocab_and_embs(arguments)

    # transform input files into conllu if needed

    arguments.train = transform_to_conllu(arguments.train)
    arguments.dev = transform_to_conllu(arguments.dev)
    arguments.test = transform_to_conllu(arguments.test)

    # load or train parser

    tensorboard_logger = None  # do_training() creates its own logger and does not return it
    if arguments.do_training:
        if not arguments.big_dataset:
            logging.info('Training with normal dataset')
            parser = do_training(arguments, vocab, embs)
        else:
            subset_size = 10000
            logging.info('Training with big dataset; subset_size = %i' %
                         subset_size)
            parser = do_training_big_datasets(arguments, vocab, embs,
                                              subset_size)

    else:
        logging.info('No training; loading model')
        model = DependencyParser(vocab, embs,
                                 arguments.no_update_pretrained_emb)
        parser = ParserModel(model,
                             decoder="eisner",
                             loss="kiperwasser",
                             optimizer="adam",
                             strategy="bucket",
                             vocab=vocab)

    parser.load_from_file(arguments.model_file)

    # parse test file

    test_data = vocab.tokenize_conll(arguments.test)
    output_file, temporal = parser.parse(arguments.test, test_data,
                                         arguments.batch_size,
                                         arguments.output_file)

    # evaluate output

    metrics = parser.evaluate(output_file, arguments.test)
    test_UAS = metrics["nopunct_uas"]
    test_LAS = metrics["nopunct_las"]

    logging.info(metrics)

    if arguments.tb_dest and tensorboard_logger:
        tensorboard_logger.raw_write("test_UAS", test_UAS)
        tensorboard_logger.raw_write("test_LAS", test_LAS)

    logging.info("\n--------------------------------------------------------")
    logging.info("Test score: %s %s" % (test_UAS, test_LAS))
    logging.info("--------------------------------------------------------\n")
class EmbeddingsExtractor(object):
    def __init__(self, logging_file, model_config):

        # configure logging
        self.logging_file = logging_file
        self._configure_logging()

        self.model_config = model_config
        logging.info(model_config)

        # load vocabulary, parser and model
        self._load_model()

        # create lstms
        self._create_lstms()

    def _configure_logging(self):
        logging.basicConfig(filename=self.logging_file,
                            level=logging.DEBUG,
                            format="%(asctime)s:%(levelname)s:\t%(message)s")

    def _load_model(self):
        """ load original K&G model and  vocab
        """
        self.vocab = Vocabulary(self.model_config['only_words'])
        self.vocab.load(self.model_config['vocab_file'])
        self.parser = DependencyParserPytorch(self.vocab,
                                              self.model_config['upos_dim'],
                                              self.model_config['word_dim'],
                                              self.model_config['hidden_dim'])
        self.model = ParserModel(self.parser,
                                 decoder="eisner",
                                 loss="kiperwasser",
                                 optimizer="adam",
                                 strategy="bucket",
                                 vocab=self.vocab)
        self.model.load_from_file(self.model_config['model_file'])

    def _create_lstms(self):
        # create and initialize FWD and BWD biLSTMs with model parameters

        input_size = self.model_config['word_dim'] + self.model_config[
            'upos_dim']

        state_dict = self.parser.deep_bilstm.state_dict()

        self.lstm_fwd_0 = nn.LSTM(input_size=input_size,
                                  hidden_size=self.model_config['hidden_dim'],
                                  num_layers=1,
                                  batch_first=True,
                                  bidirectional=False)
        new_state_dict = collections.OrderedDict()
        new_state_dict['weight_hh_l0'] = state_dict['lstm.weight_hh_l0']
        new_state_dict['weight_ih_l0'] = state_dict['lstm.weight_ih_l0']
        new_state_dict['bias_hh_l0'] = state_dict['lstm.bias_hh_l0']
        new_state_dict['bias_ih_l0'] = state_dict['lstm.bias_ih_l0']
        self.lstm_fwd_0.load_state_dict(new_state_dict)

        self.lstm_bwd_0 = nn.LSTM(input_size=input_size,
                                  hidden_size=self.model_config['hidden_dim'],
                                  num_layers=1,
                                  batch_first=True,
                                  bidirectional=False)
        new_state_dict = collections.OrderedDict()
        new_state_dict['weight_hh_l0'] = state_dict[
            'lstm.weight_hh_l0_reverse']
        new_state_dict['weight_ih_l0'] = state_dict[
            'lstm.weight_ih_l0_reverse']
        new_state_dict['bias_hh_l0'] = state_dict['lstm.bias_hh_l0_reverse']
        new_state_dict['bias_ih_l0'] = state_dict['lstm.bias_ih_l0_reverse']
        self.lstm_bwd_0.load_state_dict(new_state_dict)

        # NOTE: the second-layer LSTMs take the concatenation of the layer-0 fwd and bwd outputs, so input_size = 2 * hidden_dim
        self.lstm_fwd_1 = nn.LSTM(input_size=2 *
                                  self.model_config['hidden_dim'],
                                  hidden_size=self.model_config['hidden_dim'],
                                  num_layers=1,
                                  batch_first=True,
                                  bidirectional=False)
        new_state_dict = collections.OrderedDict()
        new_state_dict['weight_hh_l0'] = state_dict['lstm.weight_hh_l1']
        new_state_dict['weight_ih_l0'] = state_dict['lstm.weight_ih_l1']
        new_state_dict['bias_hh_l0'] = state_dict['lstm.bias_hh_l1']
        new_state_dict['bias_ih_l0'] = state_dict['lstm.bias_ih_l1']
        self.lstm_fwd_1.load_state_dict(new_state_dict)

        # NOTE: the second-layer LSTMs take the concatenation of the layer-0 fwd and bwd outputs, so input_size = 2 * hidden_dim
        self.lstm_bwd_1 = nn.LSTM(input_size=2 *
                                  self.model_config['hidden_dim'],
                                  hidden_size=self.model_config['hidden_dim'],
                                  num_layers=1,
                                  batch_first=True,
                                  bidirectional=False)
        new_state_dict = collections.OrderedDict()
        new_state_dict['weight_hh_l0'] = state_dict[
            'lstm.weight_hh_l1_reverse']
        new_state_dict['weight_ih_l0'] = state_dict[
            'lstm.weight_ih_l1_reverse']
        new_state_dict['bias_hh_l0'] = state_dict['lstm.bias_hh_l1_reverse']
        new_state_dict['bias_ih_l0'] = state_dict['lstm.bias_ih_l1_reverse']
        self.lstm_bwd_1.load_state_dict(new_state_dict)

    def generate_embeddings(self, input_file):

        logging.info(
            "\n\n\n==================================================================================================="
        )
        logging.info("Generating K&G contextual embeddings for %s" %
                     input_file)
        logging.info(
            "===================================================================================================\n"
        )

        # generate tokenized data
        tokenized_sentences = self.vocab.tokenize_conll(input_file)

        embs = {}
        for i, sample in enumerate(tokenized_sentences):
            self.model.backend.renew_cg()  # for pytorch it is just 'pass'

            # get embeddings

            words, lemmas, tags, heads, rels, chars = sample

            words = self.model.backend.input_tensor(np.array([words]),
                                                    dtype="int")
            tags = self.model.backend.input_tensor(np.array([tags]),
                                                   dtype="int")

            word_embs = self.parser.wlookup(words)
            tags_embs = self.parser.tlookup(
                tags)  # TODO think if it makes sense to use tag_embs or not!

            input_data0 = torch.cat(
                [word_embs, tags_embs],
                dim=-1)  # dim 1x8x125 (if we have 8 words in the sentence)
            input_data0_reversed = torch.flip(input_data0, (1, ))

            # feed data

            out_lstm_fwd_0, hidden_lstm_fwd_0 = self.lstm_fwd_0(input_data0)
            out_lstm_bwd_0, hidden_lstm_bwd_0 = self.lstm_bwd_0(
                input_data0_reversed)

            input_data1 = torch.cat((out_lstm_fwd_0, out_lstm_bwd_0), 2)
            input_data1_reversed = torch.flip(input_data1, (1, ))
            out_lstm_fwd_1, hidden_lstm_fwd_1 = self.lstm_fwd_1(input_data1)
            out_lstm_bwd_1, hidden_lstm_bwd_1 = self.lstm_bwd_1(
                input_data1_reversed)

            # generate embeddings

            out_lstm_bwd_0 = torch.flip(out_lstm_bwd_0, (1, ))
            out_lstm_bwd_1 = torch.flip(out_lstm_bwd_1, (1, ))

            # TODO: in ELMo a task-dependent weighted sum of the concatenation of L0 (initial embeddings), L1 and L2 is used;
            #  as our input has varying sizes and we are not weighting the layers, we just concatenate everything.
            # TODO: for the syntactic probes, ELMo stores the three layers separately, so maybe we can do the same, at least with layer 0 and layer 1?
            sentence_embeddings = torch.cat(
                (input_data0, out_lstm_fwd_0, out_lstm_bwd_0, out_lstm_fwd_1,
                 out_lstm_bwd_1), 2)  # 1 x 8 x 125+100+100+100+100 = 525
            embs[i] = sentence_embeddings

        return embs

    @staticmethod
    def save_to_hdf5(embeddings, file_path, skip_root=False):
        # save embeddings in hdf5 format

        # Write contextual word representations to disk for each of the train, dev, and test splits in hdf5
        # format, where the index of the sentence in the conllx file is the key to the hdf5 dataset object.
        # That is, the dataset file should look roughly like
        # {'0': <np.ndarray(size=(1,SEQLEN_0,FEATURE_COUNT))>, '1': <np.ndarray(size=(1,SEQLEN_1,FEATURE_COUNT))>, ...},
        # where SEQLEN for each sentence must be the number of tokens in that sentence as specified by the conllx file.

        with h5py.File(file_path, 'w') as f:
            for k, v in embeddings.items():
                logging.info('creating dataset for k %s' % str(k))
                sentence_embs = v.detach().numpy()
                if skip_root:
                    sentence_embs = sentence_embs[:, 1:, :]
                f.create_dataset(str(k), data=sentence_embs)

    @staticmethod
    def check_hdf5_file(file_path):

        with h5py.File(file_path, 'r') as f:
            for item in f.items():
                logging.info(item)
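
# A minimal usage sketch for EmbeddingsExtractor. Paths and dimensions below are placeholders;
# the config keys are the ones read by _load_model and must match the trained model being loaded.
model_config = {
    'only_words': False,
    'vocab_file': 'run1/vocab.pkl',
    'model_file': 'run1/model.model',
    'upos_dim': 25,
    'word_dim': 100,
    'hidden_dim': 100,
}
extractor = EmbeddingsExtractor('embeddings.log', model_config)
embeddings = extractor.generate_embeddings('en_ewt-ud-dev.conllu')
EmbeddingsExtractor.save_to_hdf5(embeddings, 'en_ewt-ud-dev.hdf5')
EmbeddingsExtractor.check_hdf5_file('en_ewt-ud-dev.hdf5')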
Example #6
vocab = Vocabulary()
vocab = vocab.fit(arguments.train)

# prep data
training_data = vocab.tokenize_conll(arguments.train)
dev_data = vocab.tokenize_conll(arguments.dev)
test_data = vocab.tokenize_conll(arguments.test)

model = DependencyParser(vocab)

save_callback = ModelSaveCallback(arguments.model)

# prep params
parser = ParserModel(model,
                     decoder=arguments.decoder,
                     loss="hinge",
                     optimizer="adam",
                     strategy="bucket",
                     vocab=vocab)

parser.train(training_data,
             arguments.dev,
             dev_data,
             epochs=arguments.epochs,
             batch_size=arguments.batch_size,
             callbacks=[save_callback])

# load best model
model.load_from_file(arguments.model)

metrics = parser.parse_and_evaluate(arguments.test,
                                    test_data,
                                    batch_size=arguments.batch_size)
Example #7
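# Data preparation is not shown in this snippet. A minimal sketch, assuming the argparse setup
# of Example #4 and the Vocabulary usage of Example #6 (the embs handling here is a guess):
vocab = Vocabulary()
vocab = vocab.fit(arguments.train)
embs = None  # or pre-trained embeddings loaded for this vocabulary
n_epochs = arguments.epochs
training_data = vocab.tokenize_conll(arguments.train)
dev_data = vocab.tokenize_conll(arguments.dev)
test_data = vocab.tokenize_conll(arguments.test)
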
# instantiate model
model = DependencyParser(vocab, embs, arguments.no_update_pretrained_emb)

callbacks = []
tensorboard_logger = None
if arguments.tb_dest:
    tensorboard_logger = TensorboardLoggerCallback(arguments.tb_dest)
    callbacks.append(tensorboard_logger)


save_callback = ModelSaveCallback(arguments.model_dest)
callbacks.append(save_callback)

# prep params
parser = ParserModel(model, decoder="eisner", loss="kiperwasser", optimizer="adam", strategy="bucket", vocab=vocab)
parser.train(training_data, arguments.dev, dev_data, epochs=n_epochs, batch_size=32, callbacks=callbacks, patience=arguments.patience)
parser.load_from_file(arguments.model_dest)

metrics = parser.parse_and_evaluate(arguments.test, test_data, batch_size=32)
test_UAS = metrics["nopunct_uas"]
test_LAS = metrics["nopunct_las"]

print(metrics)

if arguments.tb_dest and tensorboard_logger:
    tensorboard_logger.raw_write("test_UAS", test_UAS)
    tensorboard_logger.raw_write("test_LAS", test_LAS)

print()
print(">>> Model maxed on dev at epoch", save_callback.best_epoch)
Example #8
optimizer = dy.AdamTrainer(model.parameter_collection, learning_rate, beta_1,
                           beta_2, epsilon)
""" Callbacks """
custom_learning_update_callback = UpdateParamsCallback()
save_callback = ModelSaveCallback(model_destination)
if arguments.tb_dest:
    tensorboard_logger = TensorboardLoggerCallback(tensorboard_destination)
    callbacks = [
        tensorboard_logger, custom_learning_update_callback, save_callback
    ]
else:
    callbacks = [custom_learning_update_callback, save_callback]

parser = ParserModel(model,
                     decoder="cle",
                     loss="crossentropy",
                     optimizer=optimizer,
                     strategy="scaled_batch",
                     vocab=vocab)
""" Prep data """
training_data = vocab.tokenize_conll(arguments.train)
dev_data = vocab.tokenize_conll(arguments.dev)
test_data = vocab.tokenize_conll(arguments.test)

parser.train(training_data,
             dev_file,
             dev_data,
             epochs=n_epochs,
             batch_size=batch_scale,
             callbacks=callbacks)

parser.load_from_file(model_destination)
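
# A possible follow-up, mirroring Example #7: evaluate the reloaded model on the test split
# (batch_size here is a placeholder).
metrics = parser.parse_and_evaluate(arguments.test, test_data, batch_size=32)
print(metrics["nopunct_uas"], metrics["nopunct_las"])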
Example #9
        "===================================================================================================\n"
    )

    # load model and  vocab

    vocab_file = '/home/lpmayos/hd/code/UniParse/models/kiperwasser/1b/bpe/mini/only_words_true/run1/vocab.pkl'
    model_file = '/home/lpmayos/hd/code/UniParse/models/kiperwasser/1b/bpe/mini/only_words_true/run1/model.model'
    only_words = True
    vocab = Vocabulary(only_words)
    vocab.load(vocab_file)
    embs = None
    parser = DependencyParser(vocab, embs, False)

    model = ParserModel(parser,
                        decoder="eisner",
                        loss="kiperwasser",
                        optimizer="adam",
                        strategy="bucket",
                        vocab=vocab)
    model.load_from_file(model_file)

    # input_file = '/home/lpmayos/hd/code/cvt_text/data/raw_data/depparse/test_mini.txt'
    input_file = '/home/lpmayos/hd/code/structural-probes/example/data/en_ewt-ud-sample/en_ewt-ud-dev.conllu'

    input_file = transform_to_conllu(input_file)
    input_data = vocab.tokenize_conll(input_file)

    embeddings = parser.extract_embeddings(
        input_data,
        model.backend,
        format='concat',
        save=True,