def create_shared_resources(
        resources_or_config: Union[dict, SharedResources] = None) -> SharedResources:
    """Produces a SharedResources object based on the input.

    Args:
        resources_or_config: either nothing, a configuration dictionary, or shared resources

    Returns:
        A SharedResources object.
    """
    if resources_or_config is None:
        return SharedResources()
    elif isinstance(resources_or_config, SharedResources):
        return resources_or_config
    else:
        return SharedResources(config=resources_or_config)
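# Illustrative calls (not part of the module), one per accepted input form:
#
#     create_shared_resources()                      # -> fresh, empty SharedResources
#     create_shared_resources({'repr_dim': 10})      # -> wraps the config dict
#     create_shared_resources(SharedResources())     # -> returned unchanged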
def smoke_test(reader_name): """Instantiate the reader, train for one epoch, and run inference.""" data_set = [ (QASetting(question="Which is it?", support=["While b seems plausible, answer a is correct."], id="1", candidates=["a", "b", "c"]), [Answer("a", (6, 6))]) ] questions = [q for q, _ in data_set] v, e = build_vocab(questions) shared_resources = SharedResources(v, {"repr_dim": 10, "dropout": 0.5}, e) tf.reset_default_graph() reader = readers.readers[reader_name](shared_resources) if isinstance(reader, TFReader): reader.train(tf.train.AdamOptimizer(), data_set, batch_size=1, max_epochs=1) else: import torch reader.setup_from_data(data_set, is_training=True) params = list(reader.model_module.prediction_module.parameters()) params.extend(reader.model_module.loss_module.parameters()) optimizer = torch.optim.Adam(params, lr=0.01) reader.train(optimizer, data_set, batch_size=1, max_epochs=1) answers = reader(questions) assert answers, "{} should produce answers".format(reader_name)
def smoke_test(reader_name): """Instantiate the reader, train for one epoch, and run inference.""" data_set = [ (QASetting(question="Which is it?", support=["While b seems plausible, answer a is correct."], id="1", atomic_candidates=["a", "b", "c"]), [Answer("a", (6, 6))]) ] questions = [q for q, _ in data_set] shared_resources = SharedResources(build_vocab(questions), { "repr_dim": 10, "repr_dim_input": 10, "dropout": 0.5, "batch_size": 1 }) tf.reset_default_graph() reader = readers.readers[reader_name](shared_resources) reader.train(tf.train.AdamOptimizer(), data_set, batch_size=1, max_epochs=1) answers = reader(questions) assert answers, "{} should produce answers".format(reader_name)
def main(config, loader, debug, debug_examples, embedding_file, embedding_format,
         experiments_db, reader, train, num_train_examples, dev, num_dev_examples,
         test, vocab_from_embeddings):
    logger.info("TRAINING")

    if 'JACK_TEMP' not in os.environ:
        jack_temp = os.path.join(tempfile.gettempdir(), 'jack', str(uuid.uuid4()))
        os.environ['JACK_TEMP'] = jack_temp
        logger.info("JACK_TEMP not set, setting it to %s. Might be used for caching." % jack_temp)
    else:
        jack_temp = os.environ['JACK_TEMP']
    if not os.path.exists(jack_temp):
        os.makedirs(jack_temp)

    if experiments_db is not None:
        ex.observers.append(SqlObserver.create('sqlite:///%s' % experiments_db))

    if debug:
        train_data = loaders[loader](train, debug_examples)
        logger.info('loaded {} samples as debug train/dev/test dataset'.format(debug_examples))
        dev_data = train_data
        test_data = train_data

        if embedding_file is not None and embedding_format is not None:
            emb_file = 'glove.6B.50d.txt'
            embeddings = load_embeddings(path.join('data', 'GloVe', emb_file), 'glove')
            logger.info('loaded pre-trained embeddings ({})'.format(emb_file))
            ex.current_run.config["repr_dim_input"] = 50
        else:
            embeddings = Embeddings(None, None)
    else:
        train_data = loaders[loader](train, num_train_examples)
        dev_data = loaders[loader](dev, num_dev_examples)
        test_data = loaders[loader](test) if test else None
        logger.info('loaded train/dev/test data')

        if embedding_file is not None and embedding_format is not None:
            embeddings = load_embeddings(embedding_file, embedding_format)
            logger.info('loaded pre-trained embeddings ({})'.format(embedding_file))
            ex.current_run.config["repr_dim_input"] = embeddings.lookup[0].shape[0]
        else:
            embeddings = None
            if ex.current_run.config["vocab_from_embeddings"]:
                raise RuntimeError("If you want to create vocab from embeddings, embeddings have to be provided")

    vocab = Vocab(emb=embeddings, init_from_embeddings=vocab_from_embeddings)

    # build JTReader
    checkpoint()
    parsed_config = ex.current_run.config
    ex.run('print_config', config_updates=parsed_config)

    # name defaults to name of the model
    if 'name' not in parsed_config or parsed_config['name'] is None:
        parsed_config['name'] = reader

    shared_resources = SharedResources(vocab, parsed_config)
    jtreader = readers.readers[reader](shared_resources)

    checkpoint()

    try:
        jtrain(jtreader, train_data, test_data, dev_data, parsed_config, debug=debug)
    finally:
        # clean up temporary dir
        if os.path.exists(jack_temp):
            shutil.rmtree(jack_temp)
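# Hypothetical command line for this Sacred entry point (flag names mirror the
# function's parameters; paths and the reader name are placeholders, assuming
# the function is exposed via @ex.automain):
#
#     python3 jack/train_reader.py with reader='fastqa_reader' loader='squad' \
#         train='data/SQuAD/train-v1.1.json' dev='data/SQuAD/dev-v1.1.json' \
#         embedding_file='data/GloVe/glove.6B.50d.txt' embedding_format='glove'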
def main(batch_size, clip_value, config, loader, debug, debug_examples, dev,
         embedding_file, embedding_format, experiments_db, epochs, l2, optimizer,
         learning_rate, learning_rate_decay, log_interval, validation_interval,
         model, model_dir, seed, tensorboard_folder, test, train,
         vocab_from_embeddings, write_metrics_to):
    logger.info("TRAINING")

    if experiments_db is not None:
        ex.observers.append(SqlObserver.create('sqlite:///%s' % experiments_db))

    if debug:
        train_data = loaders[loader](train, debug_examples)
        logger.info('loaded {} samples as debug train/dev/test dataset'.format(debug_examples))
        dev_data = train_data
        test_data = train_data

        if embedding_file is not None and embedding_format is not None:
            emb_file = 'glove.6B.50d.txt'
            embeddings = load_embeddings(path.join('data', 'GloVe', emb_file), 'glove')
            logger.info('loaded pre-trained embeddings ({})'.format(emb_file))
            ex.current_run.config["repr_dim_input"] = 50
        else:
            embeddings = Embeddings(None, None)
    else:
        train_data = loaders[loader](train)
        dev_data = loaders[loader](dev)
        test_data = loaders[loader](test) if test else None
        logger.info('loaded train/dev/test data')

        if embedding_file is not None and embedding_format is not None:
            embeddings = load_embeddings(embedding_file, embedding_format)
            logger.info('loaded pre-trained embeddings ({})'.format(embedding_file))
            ex.current_run.config["repr_dim_input"] = embeddings.lookup[0].shape[0]
        else:
            embeddings = None
            if ex.current_run.config["vocab_from_embeddings"]:
                raise RuntimeError("If you want to create vocab from embeddings, embeddings have to be provided")

    vocab = Vocab(emb=embeddings, init_from_embeddings=vocab_from_embeddings)

    # build JTReader
    checkpoint()
    parsed_config = ex.current_run.config
    ex.run('print_config', config_updates=parsed_config)

    # name defaults to name of the model
    if 'name' not in parsed_config or parsed_config['name'] is None:
        parsed_config['name'] = model

    shared_resources = SharedResources(vocab, parsed_config)
    reader = readers.readers[model](shared_resources)

    checkpoint()

    configuration = {
        'seed': seed,
        'clip_value': clip_value,
        'batch_size': batch_size,
        'epochs': epochs,
        'l2': l2,
        'optimizer': optimizer,
        'learning_rate': learning_rate,
        'learning_rate_decay': learning_rate_decay,
        'log_interval': log_interval,
        'validation_interval': validation_interval,
        'tensorboard_folder': tensorboard_folder,
        'model': model,
        'model_dir': model_dir,
        'write_metrics_to': write_metrics_to,
    }

    jtrain(reader, train_data, test_data, dev_data, configuration, debug=debug)
def run(loader, debug, debug_examples, embedding_file, embedding_format,
        repr_dim_task_embedding, reader, train, num_train_examples, dev,
        num_dev_examples, test, vocab_from_embeddings, **kwargs):
    logger.info("TRAINING")

    # build JTReader
    parsed_config = ex.current_run.config
    ex.run('print_config', config_updates=parsed_config)

    if 'JACK_TEMP' not in os.environ:
        jack_temp = os.path.join(tempfile.gettempdir(), 'jack', str(uuid.uuid4()))
        os.environ['JACK_TEMP'] = jack_temp
        logger.info("JACK_TEMP not set, setting it to %s. Might be used for caching." % jack_temp)
    else:
        jack_temp = os.environ['JACK_TEMP']
    if not os.path.exists(jack_temp):
        os.makedirs(jack_temp)

    if debug:
        train_data = loaders[loader](train, debug_examples)
        logger.info('loaded {} samples as debug train/dev/test dataset'.format(debug_examples))
        dev_data = train_data
        test_data = train_data

        if embedding_file is not None and embedding_format is not None:
            emb_file = 'glove.6B.50d.txt'
            embeddings = load_embeddings(path.join('data', 'GloVe', emb_file), 'glove')
            logger.info('loaded pre-trained embeddings ({})'.format(emb_file))
        else:
            embeddings = None
    else:
        train_data = loaders[loader](train, num_train_examples)
        dev_data = loaders[loader](dev, num_dev_examples)
        test_data = loaders[loader](test) if test else None
        logger.info('loaded train/dev/test data')

        if embedding_file is not None and embedding_format is not None:
            embeddings = load_embeddings(embedding_file, embedding_format)
            logger.info('loaded pre-trained embeddings ({})'.format(embedding_file))
        else:
            embeddings = None
            if vocab_from_embeddings:
                raise ValueError("If you want to create vocab from embeddings, embeddings have to be provided")

    vocab = Vocab(vocab=embeddings.vocabulary
                  if vocab_from_embeddings and embeddings is not None else None)

    if repr_dim_task_embedding < 1 and embeddings is None:
        raise ValueError("Either provide pre-trained embeddings or set repr_dim_task_embedding > 0.")

    # name defaults to name of the model
    if 'name' not in parsed_config or parsed_config['name'] is None:
        parsed_config['name'] = reader

    shared_resources = SharedResources(vocab, parsed_config, embeddings)
    jtreader = readers.readers[reader](shared_resources)

    try:
        jtrain(jtreader, train_data, test_data, dev_data, parsed_config, debug=debug)
    finally:
        # clean up temporary dir
        if os.path.exists(jack_temp):
            shutil.rmtree(jack_temp)
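# Hypothetical invocation of this updated entry point; since no embedding file
# is given here, repr_dim_task_embedding must be positive (reader, loader, and
# paths are placeholders):
#
#     python3 bin/jack-train.py with reader='dam_snli_reader' loader='snli' \
#         train='data/SNLI/train.jsonl' dev='data/SNLI/dev.jsonl' \
#         repr_dim_task_embedding=128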