Code example #1
File: implementations.py Project: zfjmike/NLUProject
def create_shared_resources(
        resources_or_config: Optional[Union[dict, SharedResources]] = None
) -> SharedResources:
    """
    Produces a SharedResources object based on the input.
    Args:
        resources_or_config: None, a configuration dictionary, or an existing SharedResources instance

    Returns: a SharedResources object.
    """
    if resources_or_config is None:
        return SharedResources()
    elif isinstance(resources_or_config, SharedResources):
        return resources_or_config
    else:
        return SharedResources(config=resources_or_config)
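
A minimal usage sketch exercising all three branches above (the "repr_dim" config key is illustrative, not taken from the project):

fresh = create_shared_resources()                    # None -> fresh, empty SharedResources
wrapped = create_shared_resources({"repr_dim": 10})  # dict -> wrapped in a new SharedResources
passthrough = create_shared_resources(wrapped)       # SharedResources -> returned unchanged
assert passthrough is wrapped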
Code example #2
def smoke_test(reader_name):
    """Instantiate the reader, train for one epoch, and run inference."""

    data_set = [
        (QASetting(question="Which is it?",
                   support=["While b seems plausible, answer a is correct."],
                   id="1",
                   candidates=["a", "b", "c"]), [Answer("a", (6, 6))])
    ]
    questions = [q for q, _ in data_set]
    v, e = build_vocab(questions)
    shared_resources = SharedResources(v, {"repr_dim": 10, "dropout": 0.5}, e)
    tf.reset_default_graph()
    reader = readers.readers[reader_name](shared_resources)
    if isinstance(reader, TFReader):
        reader.train(tf.train.AdamOptimizer(),
                     data_set,
                     batch_size=1,
                     max_epochs=1)
    else:
        import torch
        reader.setup_from_data(data_set, is_training=True)
        params = list(reader.model_module.prediction_module.parameters())
        params.extend(reader.model_module.loss_module.parameters())
        optimizer = torch.optim.Adam(params, lr=0.01)
        reader.train(optimizer, data_set, batch_size=1, max_epochs=1)

    answers = reader(questions)

    assert answers, "{} should produce answers".format(reader_name)
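
A smoke test like this is typically run once per registered reader; a plausible pytest driver (the parametrization is an assumption, not part of the snippet):

import pytest

# Hypothetical harness: exercise the smoke test for every registered reader.
@pytest.mark.parametrize("reader_name", list(readers.readers.keys()))
def test_reader_smoke(reader_name):
    smoke_test(reader_name)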
Code example #3
def smoke_test(reader_name):
    """Instantiate the reader, train for one epoch, and run inference."""

    data_set = [
        (QASetting(question="Which is it?",
                   support=["While b seems plausible, answer a is correct."],
                   id="1",
                   atomic_candidates=["a", "b", "c"]), [Answer("a", (6, 6))])
    ]
    questions = [q for q, _ in data_set]

    shared_resources = SharedResources(build_vocab(questions), {
        "repr_dim": 10,
        "repr_dim_input": 10,
        "dropout": 0.5,
        "batch_size": 1
    })
    tf.reset_default_graph()
    reader = readers.readers[reader_name](shared_resources)

    reader.train(tf.train.AdamOptimizer(),
                 data_set,
                 batch_size=1,
                 max_epochs=1)

    answers = reader(questions)

    assert answers, "{} should produce answers".format(reader_name)
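
Unlike example #2, which unpacks a (vocab, embeddings) pair from build_vocab, this variant passes the return value straight into SharedResources, so the helper's signature evidently differs between versions. A hypothetical sketch of the single-return variant (the callable Vocab interface is an assumption):

def build_vocab(questions):
    # Hypothetical sketch: register every question and support token in a Vocab.
    # Calling vocab(token) to add/look up a token is an assumed interface.
    vocab = Vocab()
    for question in questions:
        tokens = question.question.split()
        for support in question.support:
            tokens.extend(support.split())
        for token in tokens:
            vocab(token)
    return vocab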
Code example #4
File: jack-train.py Project: jg8610/jack
def main(config,
         loader,
         debug,
         debug_examples,
         embedding_file,
         embedding_format,
         experiments_db,
         reader,
         train,
         num_train_examples,
         dev,
         num_dev_examples,
         test,
         vocab_from_embeddings):
    logger.info("TRAINING")

    if 'JACK_TEMP' not in os.environ:
        jack_temp = os.path.join(tempfile.gettempdir(), 'jack', str(uuid.uuid4()))
        os.environ['JACK_TEMP'] = jack_temp
        logger.info("JACK_TEMP not set, setting it to %s. Might be used for caching." % jack_temp)
    else:
        jack_temp = os.environ['JACK_TEMP']
    if not os.path.exists(jack_temp):
        os.makedirs(jack_temp)

    if experiments_db is not None:
        ex.observers.append(SqlObserver.create('sqlite:///%s' % experiments_db))

    if debug:
        train_data = loaders[loader](train, debug_examples)

        logger.info('loaded {} samples as debug train/dev/test dataset'.format(debug_examples))

        dev_data = train_data
        test_data = train_data

        if embedding_file is not None and embedding_format is not None:
            emb_file = 'glove.6B.50d.txt'
            embeddings = load_embeddings(path.join('data', 'GloVe', emb_file), 'glove')
            logger.info('loaded pre-trained embeddings ({})'.format(emb_file))
            ex.current_run.config["repr_dim_input"] = 50
        else:
            embeddings = Embeddings(None, None)
    else:
        train_data = loaders[loader](train, num_train_examples)
        dev_data = loaders[loader](dev, num_dev_examples)
        test_data = loaders[loader](test) if test else None

        logger.info('loaded train/dev/test data')
        if embedding_file is not None and embedding_format is not None:
            embeddings = load_embeddings(embedding_file, embedding_format)
            logger.info('loaded pre-trained embeddings ({})'.format(embedding_file))
            ex.current_run.config["repr_dim_input"] = embeddings.lookup[0].shape[0]
        else:
            embeddings = None
            if ex.current_run.config["vocab_from_embeddings"]:
                raise RuntimeError("If you want to create vocab from embeddings, embeddings have to be provided")

    vocab = Vocab(emb=embeddings, init_from_embeddings=vocab_from_embeddings)

    # build JTReader
    checkpoint()
    parsed_config = ex.current_run.config
    ex.run('print_config', config_updates=parsed_config)

    # name defaults to name of the model
    if 'name' not in parsed_config or parsed_config['name'] is None:
        parsed_config['name'] = reader

    shared_resources = SharedResources(vocab, parsed_config)
    jtreader = readers.readers[reader](shared_resources)

    checkpoint()

    try:
        jtrain(jtreader, train_data, test_data, dev_data, parsed_config, debug=debug)
    finally:  # clean up temporary dir
        if os.path.exists(jack_temp):
            shutil.rmtree(jack_temp)
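
main indexes a loaders dispatch table with loaders[loader](path, max_examples); a hypothetical sketch of such a table (the format name, JSON layout, and helper are illustrative, the real table is defined elsewhere in jack):

import json

def load_jack_json(path, max_examples=None):
    # Illustrative loader: read a list of QA records from a JSON file and
    # optionally truncate it, mirroring the (path, num_examples) call sites above.
    with open(path) as f:
        records = json.load(f)
    if max_examples is not None:
        records = records[:max_examples]
    return records

loaders = {'jack': load_jack_json}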
Code example #5
def main(batch_size, clip_value, config, loader, debug, debug_examples, dev,
         embedding_file, embedding_format, experiments_db, epochs, l2,
         optimizer, learning_rate, learning_rate_decay, log_interval,
         validation_interval, model, model_dir, seed, tensorboard_folder, test,
         train, vocab_from_embeddings, write_metrics_to):
    logger.info("TRAINING")

    if experiments_db is not None:
        ex.observers.append(SqlObserver.create('sqlite:///%s' %
                                               experiments_db))

    if debug:
        train_data = loaders[loader](train, debug_examples)

        logger.info(
            'loaded {} samples as debug train/dev/test dataset'.format(
                debug_examples))

        dev_data = train_data
        test_data = train_data

        if embedding_file is not None and embedding_format is not None:
            emb_file = 'glove.6B.50d.txt'
            embeddings = load_embeddings(path.join('data', 'GloVe', emb_file),
                                         'glove')
            logger.info('loaded pre-trained embeddings ({})'.format(emb_file))
            ex.current_run.config["repr_dim_input"] = 50
        else:
            embeddings = Embeddings(None, None)
    else:
        train_data = loaders[loader](train)
        dev_data = loaders[loader](dev)
        test_data = loaders[loader](test) if test else None

        logger.info('loaded train/dev/test data')
        if embedding_file is not None and embedding_format is not None:
            embeddings = load_embeddings(embedding_file, embedding_format)
            logger.info(
                'loaded pre-trained embeddings ({})'.format(embedding_file))
            ex.current_run.config["repr_dim_input"] = embeddings.lookup[0].shape[0]
        else:
            embeddings = None
            if ex.current_run.config["vocab_from_embeddings"]:
                raise RuntimeError(
                    "If you want to create vocab from embeddings, embeddings have to be provided"
                )

    vocab = Vocab(emb=embeddings, init_from_embeddings=vocab_from_embeddings)

    # build JTReader
    checkpoint()
    parsed_config = ex.current_run.config
    ex.run('print_config', config_updates=parsed_config)

    # name defaults to name of the model
    if 'name' not in parsed_config or parsed_config['name'] is None:
        parsed_config['name'] = model

    shared_resources = SharedResources(vocab, parsed_config)
    reader = readers.readers[model](shared_resources)

    checkpoint()

    configuration = {
        'seed': seed,
        'clip_value': clip_value,
        'batch_size': batch_size,
        'epochs': epochs,
        'l2': l2,
        'optimizer': optimizer,
        'learning_rate': learning_rate,
        'learning_rate_decay': learning_rate_decay,
        'log_interval': log_interval,
        'validation_interval': validation_interval,
        'tensorboard_folder': tensorboard_folder,
        'model': model,
        'model_dir': model_dir,
        'write_metrics_to': write_metrics_to
    }

    jtrain(reader, train_data, test_data, dev_data, configuration, debug=debug)
Code example #6
def run(loader, debug, debug_examples, embedding_file, embedding_format,
        repr_dim_task_embedding, reader, train, num_train_examples, dev,
        num_dev_examples, test, vocab_from_embeddings, **kwargs):
    logger.info("TRAINING")

    # build JTReader
    parsed_config = ex.current_run.config
    ex.run('print_config', config_updates=parsed_config)

    if 'JACK_TEMP' not in os.environ:
        jack_temp = os.path.join(tempfile.gettempdir(), 'jack',
                                 str(uuid.uuid4()))
        os.environ['JACK_TEMP'] = jack_temp
        logger.info(
            "JACK_TEMP not set, setting it to %s. Might be used for caching." %
            jack_temp)
    else:
        jack_temp = os.environ['JACK_TEMP']
    if not os.path.exists(jack_temp):
        os.makedirs(jack_temp)

    if debug:
        train_data = loaders[loader](train, debug_examples)

        logger.info(
            'loaded {} samples as debug train/dev/test dataset'.format(
                debug_examples))

        dev_data = train_data
        test_data = train_data

        if embedding_file is not None and embedding_format is not None:
            emb_file = 'glove.6B.50d.txt'
            embeddings = load_embeddings(path.join('data', 'GloVe', emb_file),
                                         'glove')
            logger.info('loaded pre-trained embeddings ({})'.format(emb_file))
        else:
            embeddings = None
    else:
        train_data = loaders[loader](train, num_train_examples)
        dev_data = loaders[loader](dev, num_dev_examples)
        test_data = loaders[loader](test) if test else None

        logger.info('loaded train/dev/test data')
        if embedding_file is not None and embedding_format is not None:
            embeddings = load_embeddings(embedding_file, embedding_format)
            logger.info(
                'loaded pre-trained embeddings ({})'.format(embedding_file))
        else:
            embeddings = None
            if vocab_from_embeddings:
                raise ValueError(
                    "If you want to create vocab from embeddings, embeddings have to be provided"
                )

    vocab = Vocab(vocab=embeddings.vocabulary
                  if vocab_from_embeddings and embeddings is not None else None)

    if repr_dim_task_embedding < 1 and embeddings is None:
        raise ValueError(
            "Either provide pre-trained embeddings or set repr_dim_task_embedding > 0."
        )

    # name defaults to name of the model
    if 'name' not in parsed_config or parsed_config['name'] is None:
        parsed_config['name'] = reader

    shared_resources = SharedResources(vocab, parsed_config, embeddings)
    jtreader = readers.readers[reader](shared_resources)

    try:
        jtrain(jtreader,
               train_data,
               test_data,
               dev_data,
               parsed_config,
               debug=debug)
    finally:  # clean up temporary dir
        if os.path.exists(jack_temp):
            shutil.rmtree(jack_temp)