def test_serialization():
    all_readers = [
        fastqa_reader,
        modular_qa_reader,
        # fastqa_reader_torch,
        dam_snli_reader,
        cbilstm_nli_reader,
        modular_nli_reader,
        distmult_reader,
        complex_reader,
        transe_reader,
    ]

    for reader in all_readers:
        vocab, config = {}, {}
        data = None

        if reader in {distmult_reader, complex_reader, transe_reader}:
            data = loaders['jack']('tests/test_data/WN18/wn18-snippet.jack.json')
            config['repr_dim'] = 50
        elif reader in {cbilstm_nli_reader, dam_snli_reader}:
            data = loaders['snli']('tests/test_data/SNLI/1000_samples_snli_1.0_train.jsonl')
            embeddings = load_embeddings("data/GloVe/glove.the.50d.txt", 'glove')
            vocab = Vocab(emb=embeddings, init_from_embeddings=True)
            config['repr_dim_input'] = 50
            config['repr_dim'] = 50
        elif reader in {fastqa_reader}:
            data = loaders['squad']('data/SQuAD/snippet.json')
            embeddings = load_embeddings("data/GloVe/glove.the.50d.txt", 'glove')
            vocab = Vocab(emb=embeddings, init_from_embeddings=True)
            config['repr_dim_input'] = 50
            config['repr_dim'] = 50

        if data is not None:
            tf.reset_default_graph()

            shared_resources = SharedResources(vocab, config)
            reader_instance = reader(shared_resources)
            reader_instance.setup_from_data(data)

            temp_dir_path = tempfile.mkdtemp()
            reader_instance.store(temp_dir_path)
            reader_instance.load(temp_dir_path)

            assert reader_instance is not None
def load(self, path):
    """
    Loads this (potentially empty) resource from path (all object attributes).

    Args:
        path: path to shared resources
    """
    if os.path.exists(path):
        with open(path, 'rb') as f:
            self.__dict__.update(pickle.load(f))
    for f in os.listdir(os.path.dirname(path)):
        if f.startswith(os.path.basename(path) + '_') and \
                os.path.isdir(os.path.join(os.path.dirname(path), f)):
            key = f.split('_')[-1]
            v = Vocab()
            v.load(path + '_' + key)
            self.__dict__[key] = v
def test_single_support_fixed_class_inputs():
    import logging
    logging.basicConfig(level=logging.INFO)

    data_set = [
        (QASetting("Where is the cat?", ["the cat is on the mat."]),
         [Answer("mat")])
    ]

    shared_resources = SharedResources(Vocab(), {})
    input_module = MultipleChoiceSingleSupportInputModule(shared_resources)
    input_module.setup_from_data(data_set)

    assert len(input_module.shared_resources.answer_vocab) == 1
    assert len(input_module.shared_resources.vocab) == 9

    tensor_data_set = list(input_module.batch_generator(data_set, batch_size=3, is_eval=False))
    first_instance = tensor_data_set[0]

    expected_support = ["the", "cat", "is", "on", "the", "mat", "."]
    expected_support_ids = [[shared_resources.vocab.get_id(sym) for sym in expected_support]]
    actual_support_ids = first_instance[Ports.Input.support]
    assert np.array_equal(actual_support_ids, expected_support_ids)
    assert first_instance[Ports.Input.support_length][0] == len(expected_support)

    actual_answer_ids = first_instance[Ports.Target.target_index]
    expected_answer = [input_module.shared_resources.answer_vocab.get_id("mat")]
    assert np.array_equal(actual_answer_ids, expected_answer)

    actual_question_ids = first_instance[Ports.Input.question]
    expected_question = ["where", "is", "the", "cat", "?"]
    expected_question_ids = [[shared_resources.vocab.get_id(sym) for sym in expected_question]]
    assert np.array_equal(actual_question_ids, expected_question_ids)
    assert first_instance[Ports.Input.question_length][0] == len(expected_question)
def test_fastqa():
    tf.reset_default_graph()

    data = load_jack('tests/test_data/squad/snippet_jtr.json')

    questions = []
    # FastQA must be initialized with existing embeddings, so we create some
    vocab = dict()
    for question, _ in data:
        questions.append(question)
        for t in tokenize(question.question):
            if t not in vocab:
                vocab[t] = len(vocab)
    embeddings = Embeddings(vocab, np.random.random([len(vocab), 10]))

    # we need a vocabulary (with embeddings) for our fastqa_reader, but this is not always necessary
    vocab = Vocab(emb=embeddings, init_from_embeddings=True)

    # ... and a config
    config = {"batch_size": 1, "repr_dim": 10,
              "repr_dim_input": embeddings.lookup.shape[1],
              "with_char_embeddings": True}

    # create/setup reader
    shared_resources = SharedResources(vocab, config)
    fastqa_reader = readers.fastqa_reader(shared_resources)
    fastqa_reader.setup_from_data(data)

    answers = fastqa_reader(questions)
    assert answers, "FastQA reader should produce answers"
def test_shared_resources_store():
    embeddings_file = "data/GloVe/glove.the.50d.txt"
    embeddings = load_embeddings(embeddings_file, 'glove')
    config = {"embedding_file": embeddings_file, "embedding_format": "glove"}
    some_vocab = Vocab(vocab=embeddings.vocabulary)
    some_vocab('foo')
    shared_resources = SharedResources(some_vocab, config, embeddings)

    import tempfile
    with tempfile.TemporaryDirectory() as tmp_dir:
        path = tmp_dir + "_resources"
        shared_resources.store(path)

        new_shared_resources = SharedResources()
        new_shared_resources.load(path)

        type_a, type_b = type(new_shared_resources.vocab), type(shared_resources.vocab)
        assert type_a == type_b
        for k in new_shared_resources.vocab.__dict__:
            assert new_shared_resources.vocab.__dict__[k] == shared_resources.vocab.__dict__[k]

        assert new_shared_resources.config == shared_resources.config
        assert new_shared_resources.embeddings.lookup.shape == embeddings.lookup.shape
        assert np.array_equal(new_shared_resources.embeddings.get(b"the"), embeddings.get(b"the"))
def test_prepare_data():
    result = prepare_data(qa_setting, answers, Vocab(), with_answers=True)

    question_tokens, question_ids, question_lemmas, question_length, \
        support_tokens, support_ids, support_lemmas, support_length, \
        word_in_question, token_offsets, answer_spans = result

    assert question_tokens == ['What', 'is', 'the', 'answer', '?']
    assert question_ids == [1, 2, 3, 4, 5]
    assert question_lemmas is None
    assert question_length == 5
    assert support_tokens == [['It', 'is', 'not', 'A', '.'], ['It', 'is', 'B', '.']]
    assert support_ids == [[6, 2, 7, 8, 9], [6, 2, 10, 9]]
    assert support_lemmas == [None, None]
    assert support_length == [5, 4]
    assert word_in_question == [[0.0, 1.0, 0.0, 0.0, 0.0], [0.0, 1.0, 0.0, 0.0]]
    assert token_offsets == [[0, 3, 6, 10, 11], [0, 3, 6, 7]]
    assert answer_spans == [[], [(2, 2)]]
def fill_vocab(qa_settings, vocab=None, lowercase=False, lemmatize=False, spacy_nlp=False):
    vocab = vocab or Vocab(unk=None)
    assert not vocab.frozen, 'Filling frozen vocabs does not make a lot of sense...'
    for qa_setting in qa_settings:
        nlp_preprocess(qa_setting.question, vocab, lowercase, lemmatize, use_spacy=spacy_nlp)
        for s in qa_setting.support:
            nlp_preprocess(s, vocab, lowercase, lemmatize, use_spacy=spacy_nlp)
    return vocab
def create_answer_vocab(qa_settings: Iterable[QASetting] = None, answers: Iterable[Answer] = None):
    vocab = Vocab(unk=None)
    if qa_settings is not None:
        for qa in qa_settings:
            for c in qa.atomic_candidates:
                vocab(c)
    if answers is not None:
        for a in answers:
            vocab(a.text)
    return vocab
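# Usage sketch (illustrative, not part of the library code above): building a token
# vocabulary with fill_vocab and an answer vocabulary with create_answer_vocab from a
# small hand-written dataset. QASetting and Answer are constructed exactly as in
# test_single_support_fixed_class_inputs; the example sentences are made up.
toy_data = [
    (QASetting("Where is the cat?", ["the cat is on the mat."]), [Answer("mat")]),
]
qa_settings = [qa for qa, _ in toy_data]
all_answers = [a for _, answer_list in toy_data for a in answer_list]

token_vocab = fill_vocab(qa_settings, lowercase=True)
answer_vocab = create_answer_vocab(answers=all_answers)
print(len(token_vocab), len(answer_vocab))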
def load(self, path):
    """
    Loads this (potentially empty) resource from path (all object attributes).

    Args:
        path: path to shared resources
    """
    if os.path.exists(path):
        with open(path, 'rb') as f:
            self.__dict__.update(pickle.load(f))
    dirname = os.path.dirname(path)
    for f in os.listdir(dirname):
        if f.startswith(os.path.basename(path) + '_'):
            key = f[len(os.path.basename(path) + '_'):]
            if key == 'config.yaml':
                with open(os.path.join(dirname, f), 'r') as config_file:
                    self.config = yaml.load(config_file)
            elif os.path.isdir(os.path.join(dirname, f)):
                v = Vocab()
                v.load(path + '_' + key)
                self.__dict__[key] = v
def load(self, path):
    """
    Loads this (potentially empty) resource from path (all object attributes).

    Args:
        path: path to shared resources
    """
    remainder_path = os.path.join(path, 'remainder')
    if os.path.exists(remainder_path):
        with open(remainder_path, 'rb') as f:
            self.__dict__.update(pickle.load(f))
    for f in os.listdir(path):
        if f == 'config.yaml':
            with open(os.path.join(path, f), 'r') as config_file:
                self.config = yaml.load(config_file)
        elif f == 'embeddings':
            self.embeddings = Embeddings.from_dir(os.path.join(path, f))
        else:
            v = Vocab()
            v.load(os.path.join(path, f))
            self.__dict__[f] = v
def nlp_preprocess(text: str,
                   vocab: Vocab,
                   lowercase: bool = False,
                   lemmatize: bool = False,
                   with_lemmas: bool = False,
                   with_tokens_offsets: bool = False,
                   use_spacy: bool = False) \
        -> Tuple[List[str], List[int], int, Optional[List[str]], Optional[List[int]]]:
    """Preprocesses a question or support text: tokenization and optional lower-casing.

    It also includes the computation of token-to-character offsets for the support.
    Lemmatization is supported in two ways: if `lemmatize` is True, the returned tokens are
    lemmatized and the ids correspond to the lemma ids in the vocab; if `with_lemmas` is set
    and `lemmatize` is not, an additional list of the lemmatized tokens (as strings) is
    returned.

    Returns:
        tokens, ids, length, lemmas or None, token_offsets or None
    """
    assert not with_lemmas or use_spacy, "enable spacy when using lemmas"
    assert not lemmatize or use_spacy, "enable spacy when using lemmas"

    if use_spacy:
        import spacy
        nlp = spacy_nlp()
        thistokenize = lambda t: nlp(t)
    else:
        thistokenize = tokenize

    if lowercase:
        text = text.lower()
    tokens = thistokenize(text)

    token_offsets = None
    lemmas = None
    if use_spacy:
        if with_lemmas:
            lemmas = [t.lemma_ for t in tokens]
        if with_tokens_offsets:
            token_offsets = [t.idx for t in tokens]
        tokens = [t.lemma_ for t in tokens] if lemmatize else [t.orth_ for t in tokens]
    else:
        # char to token offsets
        if with_tokens_offsets:
            token_offsets = token_to_char_offsets(text, tokens)

    length = len(tokens)

    ids = vocab(tokens)
    # make sure ids are non-negative
    if not vocab.frozen:
        for i in range(len(ids)):
            ids[i] = vocab.normalize(ids[i])

    return tokens, ids, length, lemmas, token_offsets
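# Usage sketch (illustrative): preprocessing a single support string with a fresh Vocab.
# The sentence is a made-up example; the return order follows the signature above.
vocab = Vocab()
tokens, ids, length, lemmas, offsets = nlp_preprocess(
    "the cat is on the mat.", vocab, lowercase=True, with_tokens_offsets=True)
assert length == len(tokens) == len(ids)
# lemmas is None here because neither with_lemmas nor use_spacy was requested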
def build_vocab(questions):
    """Since some readers require an initialized vocabulary, initialize it here."""
    vocab = dict()
    for question in questions:
        for t in tokenize(question.question):
            if t not in vocab:
                vocab[t] = len(vocab)
    embeddings = Embeddings(vocab, np.random.random([len(vocab), 10]))
    vocab = Vocab(vocab=embeddings.vocabulary)
    return vocab, embeddings
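# Usage sketch (illustrative): some readers, e.g. FastQA, expect a vocabulary backed by
# embeddings. `questions` is assumed to be a list of QASetting objects as loaded in the
# tests above; the random 10-dimensional embeddings are what build_vocab creates internally.
vocab, embeddings = build_vocab(questions)
config = {"repr_dim": 10, "repr_dim_input": embeddings.lookup.shape[1]}
shared_resources = SharedResources(vocab, config)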
def test_fastqa():
    tf.reset_default_graph()

    data = load_jack('tests/test_data/squad/snippet_jtr.json')

    # FastQA must be initialized with existing embeddings, so we load some
    embeddings = load_embeddings('./tests/test_data/glove.840B.300d_top256.txt', 'glove')

    # we need a vocabulary (with embeddings) for our fastqa_reader, but this is not always necessary
    vocab = Vocab(emb=embeddings, init_from_embeddings=True)

    # ... and a config
    config = {
        "batch_size": 1,
        "repr_dim": 10,
        "repr_dim_input": embeddings.lookup.shape[1],
        "with_char_embeddings": True
    }

    # create/setup reader
    shared_resources = SharedResources(vocab, config)

    input_module = XQAInputModule(shared_resources)
    model_module = FastQAModule(shared_resources)
    output_module = XQAOutputModule()
    reader = TFReader(shared_resources, input_module, model_module, output_module)
    reader.setup_from_data(data, is_training=True)

    loss = reader.model_module.tensors[Ports.loss]
    optimizer = tf.train.AdagradOptimizer(learning_rate=0.01)
    min_op = optimizer.minimize(loss)

    session = model_module.tf_session
    session.run(tf.global_variables_initializer())

    for epoch in range(0, 10):
        for batch in reader.input_module.batch_generator(data, 1, False):
            feed_dict = reader.model_module.convert_to_feed_dict(batch)
            loss_value, _ = session.run((loss, min_op), feed_dict=feed_dict)
            print(loss_value)
def main(config, loader, debug, debug_examples, embedding_file, embedding_format, experiments_db,
         reader, train, num_train_examples, dev, num_dev_examples, test, vocab_from_embeddings):
    logger.info("TRAINING")

    if 'JACK_TEMP' not in os.environ:
        jack_temp = os.path.join(tempfile.gettempdir(), 'jack', str(uuid.uuid4()))
        os.environ['JACK_TEMP'] = jack_temp
        logger.info("JACK_TEMP not set, setting it to %s. Might be used for caching." % jack_temp)
    else:
        jack_temp = os.environ['JACK_TEMP']
    if not os.path.exists(jack_temp):
        os.makedirs(jack_temp)

    if experiments_db is not None:
        ex.observers.append(SqlObserver.create('sqlite:///%s' % experiments_db))

    if debug:
        train_data = loaders[loader](train, debug_examples)
        logger.info('loaded {} samples as debug train/dev/test dataset'.format(debug_examples))
        dev_data = train_data
        test_data = train_data
        if embedding_file is not None and embedding_format is not None:
            emb_file = 'glove.6B.50d.txt'
            embeddings = load_embeddings(path.join('data', 'GloVe', emb_file), 'glove')
            logger.info('loaded pre-trained embeddings ({})'.format(emb_file))
            ex.current_run.config["repr_dim_input"] = 50
        else:
            embeddings = Embeddings(None, None)
    else:
        train_data = loaders[loader](train, num_train_examples)
        dev_data = loaders[loader](dev, num_dev_examples)
        test_data = loaders[loader](test) if test else None
        logger.info('loaded train/dev/test data')
        if embedding_file is not None and embedding_format is not None:
            embeddings = load_embeddings(embedding_file, embedding_format)
            logger.info('loaded pre-trained embeddings ({})'.format(embedding_file))
            ex.current_run.config["repr_dim_input"] = embeddings.lookup[0].shape[0]
        else:
            embeddings = None
            if ex.current_run.config["vocab_from_embeddings"]:
                raise RuntimeError("If you want to create vocab from embeddings, embeddings have to be provided")

    vocab = Vocab(emb=embeddings, init_from_embeddings=vocab_from_embeddings)

    # build JTReader
    checkpoint()
    parsed_config = ex.current_run.config
    ex.run('print_config', config_updates=parsed_config)

    # name defaults to name of the model
    if 'name' not in parsed_config or parsed_config['name'] is None:
        parsed_config['name'] = reader

    shared_resources = SharedResources(vocab, parsed_config)
    jtreader = readers.readers[reader](shared_resources)

    checkpoint()

    try:
        jtrain(jtreader, train_data, test_data, dev_data, parsed_config, debug=debug)
    finally:
        # clean up temporary dir
        if os.path.exists(jack_temp):
            shutil.rmtree(jack_temp)
def run(loader, debug, debug_examples, embedding_file, embedding_format, repr_dim_task_embedding,
        reader, train, num_train_examples, dev, num_dev_examples, test, vocab_from_embeddings,
        **kwargs):
    logger.info("TRAINING")

    # build JTReader
    parsed_config = ex.current_run.config
    ex.run('print_config', config_updates=parsed_config)

    if 'JACK_TEMP' not in os.environ:
        jack_temp = os.path.join(tempfile.gettempdir(), 'jack', str(uuid.uuid4()))
        os.environ['JACK_TEMP'] = jack_temp
        logger.info("JACK_TEMP not set, setting it to %s. Might be used for caching." % jack_temp)
    else:
        jack_temp = os.environ['JACK_TEMP']
    if not os.path.exists(jack_temp):
        os.makedirs(jack_temp)

    if debug:
        train_data = loaders[loader](train, debug_examples)
        logger.info('loaded {} samples as debug train/dev/test dataset'.format(debug_examples))
        dev_data = train_data
        test_data = train_data
        if embedding_file is not None and embedding_format is not None:
            emb_file = 'glove.6B.50d.txt'
            embeddings = load_embeddings(path.join('data', 'GloVe', emb_file), 'glove')
            logger.info('loaded pre-trained embeddings ({})'.format(emb_file))
        else:
            embeddings = None
    else:
        train_data = loaders[loader](train, num_train_examples)
        dev_data = loaders[loader](dev, num_dev_examples)
        test_data = loaders[loader](test) if test else None
        logger.info('loaded train/dev/test data')
        if embedding_file is not None and embedding_format is not None:
            embeddings = load_embeddings(embedding_file, embedding_format)
            logger.info('loaded pre-trained embeddings ({})'.format(embedding_file))
        else:
            embeddings = None
            if vocab_from_embeddings:
                raise ValueError("If you want to create vocab from embeddings, embeddings have to be provided")

    vocab = Vocab(vocab=embeddings.vocabulary
                  if vocab_from_embeddings and embeddings is not None else None)

    if repr_dim_task_embedding < 1 and embeddings is None:
        raise ValueError("Either provide pre-trained embeddings or set repr_dim_task_embedding > 0.")

    # name defaults to name of the model
    if 'name' not in parsed_config or parsed_config['name'] is None:
        parsed_config['name'] = reader

    shared_resources = SharedResources(vocab, parsed_config, embeddings)
    jtreader = readers.readers[reader](shared_resources)

    try:
        jtrain(jtreader, train_data, test_data, dev_data, parsed_config, debug=debug)
    finally:
        # clean up temporary dir
        if os.path.exists(jack_temp):
            shutil.rmtree(jack_temp)
def main(batch_size, clip_value, config, loader, debug, debug_examples, dev, embedding_file,
         embedding_format, experiments_db, epochs, l2, optimizer, learning_rate,
         learning_rate_decay, log_interval, validation_interval, model, model_dir, seed,
         tensorboard_folder, test, train, vocab_from_embeddings, write_metrics_to):
    logger.info("TRAINING")

    if experiments_db is not None:
        ex.observers.append(SqlObserver.create('sqlite:///%s' % experiments_db))

    if debug:
        train_data = loaders[loader](train, debug_examples)
        logger.info('loaded {} samples as debug train/dev/test dataset'.format(debug_examples))
        dev_data = train_data
        test_data = train_data
        if embedding_file is not None and embedding_format is not None:
            emb_file = 'glove.6B.50d.txt'
            embeddings = load_embeddings(path.join('data', 'GloVe', emb_file), 'glove')
            logger.info('loaded pre-trained embeddings ({})'.format(emb_file))
            ex.current_run.config["repr_dim_input"] = 50
        else:
            embeddings = Embeddings(None, None)
    else:
        train_data = loaders[loader](train)
        dev_data = loaders[loader](dev)
        test_data = loaders[loader](test) if test else None
        logger.info('loaded train/dev/test data')
        if embedding_file is not None and embedding_format is not None:
            embeddings = load_embeddings(embedding_file, embedding_format)
            logger.info('loaded pre-trained embeddings ({})'.format(embedding_file))
            ex.current_run.config["repr_dim_input"] = embeddings.lookup[0].shape[0]
        else:
            embeddings = None
            if ex.current_run.config["vocab_from_embeddings"]:
                raise RuntimeError("If you want to create vocab from embeddings, embeddings have to be provided")

    vocab = Vocab(emb=embeddings, init_from_embeddings=vocab_from_embeddings)

    # build JTReader
    checkpoint()
    parsed_config = ex.current_run.config
    ex.run('print_config', config_updates=parsed_config)

    # name defaults to name of the model
    if 'name' not in parsed_config or parsed_config['name'] is None:
        parsed_config['name'] = model

    shared_resources = SharedResources(vocab, parsed_config)
    reader = readers.readers[model](shared_resources)

    checkpoint()

    configuration = {
        'seed': seed,
        'clip_value': clip_value,
        'batch_size': batch_size,
        'epochs': epochs,
        'l2': l2,
        'optimizer': optimizer,
        'learning_rate': learning_rate,
        'learning_rate_decay': learning_rate_decay,
        'log_interval': log_interval,
        'validation_interval': validation_interval,
        'tensorboard_folder': tensorboard_folder,
        'model': model,
        'model_dir': model_dir,
        'write_metrics_to': write_metrics_to
    }

    jtrain(reader, train_data, test_data, dev_data, configuration, debug=debug)
tf.app.flags.DEFINE_string('dataset_type', 'squad', 'either squad or jack')
tf.app.flags.DEFINE_string('model', None, 'Name of the reader')
tf.app.flags.DEFINE_string('model_dir', None, 'directory to saved model')
tf.app.flags.DEFINE_string('embedding_path', None, 'path to embeddings')
tf.app.flags.DEFINE_string('embedding_format', 'glove', 'embeddings format')
tf.app.flags.DEFINE_string('device', "/cpu:0", 'device to use')
tf.app.flags.DEFINE_string('out', "results.json", 'Result file path.')
tf.app.flags.DEFINE_integer('batch_size', 64, 'batch size')
tf.app.flags.DEFINE_integer('beam_size', 1, 'beam size')
tf.app.flags.DEFINE_string('kwargs', '{}', 'additional reader-specific configurations')

FLAGS = tf.app.flags.FLAGS

logger.info("Loading embeddings from {}...".format(FLAGS.embedding_path))
emb = load_embeddings(FLAGS.embedding_path, FLAGS.embedding_format)
vocab = Vocab(emb=emb, init_from_embeddings=True)

logger.info("Creating and loading reader from {}...".format(FLAGS.model_dir))
config = {"beam_size": FLAGS.beam_size, 'batch_size': FLAGS.batch_size, "max_support_length": None}
config.update(json.loads(FLAGS.kwargs))
reader = readers[FLAGS.model](vocab, config)

with tf.device(FLAGS.device):
    reader.load_and_setup(FLAGS.model_dir)

if FLAGS.dataset_type == "squad":
    dataset_jtr = convert_squad(FLAGS.file)
else:
    with open(FLAGS.file) as f:
        dataset_jtr = json.load(f)

dataset = jtr_to_qasetting(dataset_jtr)