Code example #1
def test_serialization():
    all_readers = [
        fastqa_reader,
        modular_qa_reader,
        # fastqa_reader_torch,
        dam_snli_reader,
        cbilstm_nli_reader,
        modular_nli_reader,
        distmult_reader,
        complex_reader,
        transe_reader,
    ]

    for reader in all_readers:
        vocab, config = {}, {}

        data = None
        if reader in {distmult_reader, complex_reader, transe_reader}:
            data = loaders['jack'](
                'tests/test_data/WN18/wn18-snippet.jack.json')
            config['repr_dim'] = 50
        elif reader in {cbilstm_nli_reader, dam_snli_reader}:
            data = loaders['snli'](
                'tests/test_data/SNLI/1000_samples_snli_1.0_train.jsonl')

            embeddings = load_embeddings("data/GloVe/glove.the.50d.txt",
                                         'glove')
            vocab = Vocab(emb=embeddings, init_from_embeddings=True)
            config['repr_dim_input'] = 50
            config['repr_dim'] = 50
        elif reader in {fastqa_reader}:
            data = loaders['squad']('data/SQuAD/snippet.json')

            embeddings = load_embeddings("data/GloVe/glove.the.50d.txt",
                                         'glove')
            vocab = Vocab(emb=embeddings, init_from_embeddings=True)
            config['repr_dim_input'] = 50
            config['repr_dim'] = 50

        if data is not None:
            tf.reset_default_graph()

            shared_resources = SharedResources(vocab, config)
            reader_instance = reader(shared_resources)
            reader_instance.setup_from_data(data)

            temp_dir_path = tempfile.mkdtemp()
            reader_instance.store(temp_dir_path)

            reader_instance.load(temp_dir_path)

            assert reader_instance is not None
Code example #2
 def load(self, path):
     """
     Loads this (potentially empty) resource from path (all object attributes).
     Args:
         path: path to shared resources
     """
     if os.path.exists(path):
         with open(path, 'rb') as f:
             self.__dict__.update(pickle.load(f))
     for f in os.listdir(os.path.dirname(path)):
         if f.startswith(os.path.basename(path) + '_') and os.path.isdir(os.path.join(os.path.dirname(path), f)):
             key = f.split('_')[-1]
             v = Vocab()
             v.load(path + '_' + key)
             self.__dict__[key] = v
Code example #3
def test_single_support_fixed_class_inputs():
    import logging
    logging.basicConfig(level=logging.INFO)
    data_set = [
        (QASetting("Where is the cat?", ["the cat is on the mat."]), [Answer("mat")])
    ]
    shared_resources = SharedResources(Vocab(), {})
    input_module = MultipleChoiceSingleSupportInputModule(shared_resources)
    input_module.setup_from_data(data_set)

    assert len(input_module.shared_resources.answer_vocab) == 1
    assert len(input_module.shared_resources.vocab) == 9

    tensor_data_set = list(input_module.batch_generator(data_set, batch_size=3, is_eval=False))

    expected_support = ["the", "cat", "is", "on", "the", "mat", "."]
    expected_support_ids = [[shared_resources.vocab.get_id(sym) for sym in expected_support]]
    first_instance = tensor_data_set[0]
    actual_support_ids = first_instance[Ports.Input.support]
    assert np.array_equal(actual_support_ids, expected_support_ids)
    assert first_instance[Ports.Input.support_length][0] == len(expected_support)

    actual_answer_ids = first_instance[Ports.Target.target_index]
    expected_answer = [input_module.shared_resources.answer_vocab.get_id("mat")]
    assert np.array_equal(actual_answer_ids, expected_answer)

    actual_question_ids = first_instance[Ports.Input.question]
    expected_question = ["where", "is", "the", "cat", "?"]
    expected_question_ids = [[shared_resources.vocab.get_id(sym) for sym in expected_question]]
    assert np.array_equal(actual_question_ids, expected_question_ids)
    assert first_instance[Ports.Input.question_length][0] == len(expected_question)
Code example #4
def test_fastqa():
    tf.reset_default_graph()

    data = load_jack('tests/test_data/squad/snippet_jtr.json')
    questions = []
    # fast qa must be initialized with existing embeddings, so we create some
    vocab = dict()
    for question, _ in data:
        questions.append(question)
        for t in tokenize(question.question):
            if t not in vocab:
                vocab[t] = len(vocab)
    embeddings = Embeddings(vocab, np.random.random([len(vocab), 10]))

    # we need a vocabulary (with embeddings) for our fastqa_reader, but this is not always necessary
    vocab = Vocab(emb=embeddings, init_from_embeddings=True)

    # ... and a config
    config = {"batch_size": 1, "repr_dim": 10, "repr_dim_input": embeddings.lookup.shape[1],
              "with_char_embeddings": True}

    # create/setup reader
    shared_resources = SharedResources(vocab, config)
    fastqa_reader = readers.fastqa_reader(shared_resources)
    fastqa_reader.setup_from_data(data)

    answers = fastqa_reader(questions)

    assert answers, "FastQA reader should produce answers"
Code example #5
def test_shared_resources_store():
    embeddings_file = "data/GloVe/glove.the.50d.txt"
    embeddings = load_embeddings(embeddings_file, 'glove')
    config = {"embedding_file": embeddings_file, "embedding_format": "glove"}
    some_vocab = Vocab(vocab=embeddings.vocabulary)
    some_vocab('foo')
    shared_resources = SharedResources(some_vocab, config, embeddings)

    import tempfile
    with tempfile.TemporaryDirectory() as tmp_dir:
        path = tmp_dir + "_resources"
        shared_resources.store(path)

        new_shared_resources = SharedResources()
        new_shared_resources.load(path)

        type_a, type_b = type(new_shared_resources.vocab), type(
            shared_resources.vocab)
        assert type_a == type_b

        for k in new_shared_resources.vocab.__dict__:
            assert new_shared_resources.vocab.__dict__[
                k] == shared_resources.vocab.__dict__[k]
        assert new_shared_resources.config == shared_resources.config
        assert new_shared_resources.embeddings.lookup.shape == embeddings.lookup.shape
        assert np.array_equal(new_shared_resources.embeddings.get(b"the"),
                              embeddings.get(b"the"))
Code example #6
File: test_util.py  Project: 5l1v3r1/jack-1
def test_prepare_data():
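    # note: qa_setting and answers are presumably module-level fixtures defined earlier in test_util.py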

    result = prepare_data(qa_setting, answers, Vocab(), with_answers=True)

    question_tokens, question_ids, question_lemmas, question_length, \
    support_tokens, support_ids, support_lemmas, support_length, \
    word_in_question, token_offsets, answer_spans = result

    assert question_tokens == ['What', 'is', 'the', 'answer', '?']
    assert question_ids == [1, 2, 3, 4, 5]
    assert question_lemmas is None
    assert question_length == 5

    assert support_tokens == [[
        'It',
        'is',
        'not',
        'A',
        '.',
    ], ['It', 'is', 'B', '.']]
    assert support_ids == [[6, 2, 7, 8, 9], [6, 2, 10, 9]]
    assert support_lemmas == [None, None]
    assert support_length == [5, 4]
    assert word_in_question == [[0.0, 1.0, 0.0, 0.0, 0.0],
                                [0.0, 1.0, 0.0, 0.0]]
    assert token_offsets == [[0, 3, 6, 10, 11], [0, 3, 6, 7]]
    assert answer_spans == [[], [(2, 2)]]
Code example #7
File: preprocessing.py  Project: tbmihailov/jack
def fill_vocab(qa_settings, vocab=None, lowercase=False, lemmatize=False, spacy_nlp=False):
    vocab = vocab or Vocab(unk=None)
    assert not vocab.frozen, 'Filling frozen vocabs does not make a lot of sense...'
    for qa_setting in qa_settings:
        nlp_preprocess(qa_setting.question, vocab, lowercase, lemmatize, use_spacy=spacy_nlp)
        for s in qa_setting.support:
            nlp_preprocess(s, vocab, lowercase, lemmatize, use_spacy=spacy_nlp)
    return vocab
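
A minimal usage sketch (hypothetical, reusing the QASetting constructor seen in the tests above):

vocab = fill_vocab([QASetting("Where is the cat?", ["the cat is on the mat."])])
# the returned vocab now contains every token of the question and of its support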
Code example #8
def create_answer_vocab(qa_settings: Iterable[QASetting] = None, answers: Iterable[Answer] = None):
    vocab = Vocab(unk=None)
    if qa_settings is not None:
        for qa in qa_settings:
            for c in qa.atomic_candidates:
                vocab(c)
    if answers is not None:
        for a in answers:
            vocab(a.text)
    return vocab
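
A minimal usage sketch (hypothetical, reusing the Answer class from the tests above):

answer_vocab = create_answer_vocab(answers=[Answer("mat"), Answer("on")])
# answer_vocab now maps the answer strings "mat" and "on" to ids (no UNK entry, since unk=None)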
Code example #9
 def load(self, path):
     """
     Loads this (potentially empty) resource from path (all object attributes).
     Args:
         path: path to shared resources
     """
     if os.path.exists(path):
         with open(path, 'rb') as f:
             self.__dict__.update(pickle.load(f))
     dirname = os.path.dirname(path)
     for f in os.listdir(dirname):
         if f.startswith(os.path.basename(path) + '_'):
             key = f[len(os.path.basename(path) + '_'):]
             if key == 'config.yaml':
                 with open(os.path.join(dirname, f), 'r') as f:
                     self.config = yaml.load(f)
             elif os.path.isdir(os.path.join(dirname, f)):
                 v = Vocab()
                 v.load(path + '_' + key)
                 self.__dict__[key] = v
Code example #10
 def load(self, path):
     """
     Loads this (potentially empty) resource from path (all object attributes).
     Args:
         path: path to shared resources
     """
     remainder_path = os.path.join(path, 'remainder')
     if os.path.exists(remainder_path):
         with open(remainder_path, 'rb') as f:
             self.__dict__.update(pickle.load(f))
     for f in os.listdir(path):
         if f == 'config.yaml':
             with open(os.path.join(path, f), 'r') as f:
                 self.config = yaml.load(f)
         elif f == 'embeddings':
             self.embeddings = Embeddings.from_dir(os.path.join(path, f))
         else:
             v = Vocab()
             v.load(os.path.join(path, f))
             self.__dict__[f] = v
Code example #11
File: preprocessing.py  Project: tbmihailov/jack
def nlp_preprocess(text: str,
                   vocab: Vocab,
                   lowercase: bool = False,
                   lemmatize: bool = False,
                   with_lemmas: bool = False,
                   with_tokens_offsets: bool = False,
                   use_spacy: bool = False) \
        -> Tuple[List[str], List[int], int, Optional[List[str]],
                 Optional[List[int]]]:
    """Preprocesses a question and support:
    The steps include tokenization, lower-casing. It also includes the computation of token-to-character offsets for
    the support. Lemmatization is supported in 2 ways. If lemmatize is True then the returned tokens are lemmatized
    and the ids correspond to the lemma ids in the vocab. If with_lemmas and not lemmatize then an additional list
    of the lemmatized token in string form is returned.

    Returns:
        tokens, ids, length, lemmas or None, token_offsets or None
    """
    assert not with_lemmas or use_spacy, "enable spacy when using lemmas"
    assert not lemmatize or use_spacy, "enable spacy when lemmatizing"

    if use_spacy:
        import spacy
        nlp = spacy_nlp()
        thistokenize = lambda t: nlp(t)
    else:
        thistokenize = tokenize
    if lowercase:
        text = text.lower()
    tokens = thistokenize(text)

    token_offsets = None
    lemmas = None
    if use_spacy:
        if with_lemmas:
            lemmas = [t.lemma_ for t in tokens]
        if with_tokens_offsets:
            token_offsets = [t.idx for t in tokens]
        tokens = [t.lemma for t in tokens] if lemmatize else [t.orth_ for t in tokens]
    else:
        # char to token offsets
        if with_tokens_offsets:
            token_offsets = token_to_char_offsets(text, tokens)

    length = len(tokens)

    ids = vocab(tokens)
    # make sure ids are non-negative
    if not vocab.frozen:
        for i in range(len(ids)):
            ids[i] = vocab.normalize(ids[i])

    return tokens, ids, length, lemmas, token_offsets
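
A minimal usage sketch (hypothetical call; assumes spaCy and an English model are installed, since with_lemmas requires use_spacy=True):

tokens, ids, length, lemmas, offsets = nlp_preprocess(
    "The cat sat on the mat.", Vocab(),
    lowercase=True, with_lemmas=True, with_tokens_offsets=True, use_spacy=True)
# tokens are the surface forms, ids their vocab ids, lemmas the lemmatized strings,
# and offsets the character offset of each token in the input text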
Code example #12
def build_vocab(questions):
    """Since some readers require an initialized vocabulary, initialize it here."""

    vocab = dict()
    for question in questions:
        for t in tokenize(question.question):
            if t not in vocab:
                vocab[t] = len(vocab)
    embeddings = Embeddings(vocab, np.random.random([len(vocab), 10]))

    vocab = Vocab(vocab=embeddings.vocabulary)
    return vocab, embeddings
Code example #13
def test_fastqa():
    tf.reset_default_graph()

    data = load_jack('tests/test_data/squad/snippet_jtr.json')

    # fast qa must be initialized with existing embeddings, so we create some
    embeddings = load_embeddings('./tests/test_data/glove.840B.300d_top256.txt', 'glove')

    # we need a vocabulary (with embeddings) for our fastqa_reader, but this is not always necessary
    vocab = Vocab(emb=embeddings, init_from_embeddings=True)

    # ... and a config
    config = {
        "batch_size": 1,
        "repr_dim": 10,
        "repr_dim_input": embeddings.lookup.shape[1],
        "with_char_embeddings": True
    }

    # create/setup reader
    shared_resources = SharedResources(vocab, config)

    input_module = XQAInputModule(shared_resources)
    model_module = FastQAModule(shared_resources)
    output_module = XQAOutputModule()

    reader = TFReader(shared_resources, input_module, model_module, output_module)
    reader.setup_from_data(data, is_training=True)

    loss = reader.model_module.tensors[Ports.loss]
    optimizer = tf.train.AdagradOptimizer(learning_rate=0.01)
    min_op = optimizer.minimize(loss)

    session = model_module.tf_session
    session.run(tf.global_variables_initializer())

    for epoch in range(0, 10):
        for batch in reader.input_module.batch_generator(data, 1, False):
            feed_dict = reader.model_module.convert_to_feed_dict(batch)
            loss_value, _ = session.run((loss, min_op), feed_dict=feed_dict)
            print(loss_value)
Code example #14
File: jack-train.py  Project: jg8610/jack
def main(config,
         loader,
         debug,
         debug_examples,
         embedding_file,
         embedding_format,
         experiments_db,
         reader,
         train,
         num_train_examples,
         dev,
         num_dev_examples,
         test,
         vocab_from_embeddings):
    logger.info("TRAINING")

    if 'JACK_TEMP' not in os.environ:
        jack_temp = os.path.join(tempfile.gettempdir(), 'jack', str(uuid.uuid4()))
        os.environ['JACK_TEMP'] = jack_temp
        logger.info("JACK_TEMP not set, setting it to %s. Might be used for caching." % jack_temp)
    else:
        jack_temp = os.environ['JACK_TEMP']
    if not os.path.exists(jack_temp):
        os.makedirs(jack_temp)

    if experiments_db is not None:
        ex.observers.append(SqlObserver.create('sqlite:///%s' % experiments_db))

    if debug:
        train_data = loaders[loader](train, debug_examples)

        logger.info('loaded {} samples as debug train/dev/test dataset '.format(debug_examples))

        dev_data = train_data
        test_data = train_data

        if embedding_file is not None and embedding_format is not None:
            emb_file = 'glove.6B.50d.txt'
            embeddings = load_embeddings(path.join('data', 'GloVe', emb_file), 'glove')
            logger.info('loaded pre-trained embeddings ({})'.format(emb_file))
            ex.current_run.config["repr_dim_input"] = 50
        else:
            embeddings = Embeddings(None, None)
    else:
        train_data = loaders[loader](train, num_train_examples)
        dev_data = loaders[loader](dev, num_dev_examples)
        test_data = loaders[loader](test) if test else None

        logger.info('loaded train/dev/test data')
        if embedding_file is not None and embedding_format is not None:
            embeddings = load_embeddings(embedding_file, embedding_format)
            logger.info('loaded pre-trained embeddings ({})'.format(embedding_file))
            ex.current_run.config["repr_dim_input"] = embeddings.lookup[0].shape[0]
        else:
            embeddings = None
            if ex.current_run.config["vocab_from_embeddings"]:
                raise RuntimeError("If you want to create vocab from embeddings, embeddings have to be provided")

    vocab = Vocab(emb=embeddings, init_from_embeddings=vocab_from_embeddings)

    # build JTReader
    checkpoint()
    parsed_config = ex.current_run.config
    ex.run('print_config', config_updates=parsed_config)

    # name defaults to name of the model
    if 'name' not in parsed_config or parsed_config['name'] is None:
        parsed_config['name'] = reader

    shared_resources = SharedResources(vocab, parsed_config)
    jtreader = readers.readers[reader](shared_resources)

    checkpoint()

    try:
        jtrain(jtreader, train_data, test_data, dev_data, parsed_config, debug=debug)
    finally:  # clean up temporary dir
        if os.path.exists(jack_temp):
            shutil.rmtree(jack_temp)
Code example #15
def run(loader, debug, debug_examples, embedding_file, embedding_format,
        repr_dim_task_embedding, reader, train, num_train_examples, dev,
        num_dev_examples, test, vocab_from_embeddings, **kwargs):
    logger.info("TRAINING")

    # build JTReader
    parsed_config = ex.current_run.config
    ex.run('print_config', config_updates=parsed_config)

    if 'JACK_TEMP' not in os.environ:
        jack_temp = os.path.join(tempfile.gettempdir(), 'jack',
                                 str(uuid.uuid4()))
        os.environ['JACK_TEMP'] = jack_temp
        logger.info(
            "JACK_TEMP not set, setting it to %s. Might be used for caching." %
            jack_temp)
    else:
        jack_temp = os.environ['JACK_TEMP']
    if not os.path.exists(jack_temp):
        os.makedirs(jack_temp)

    if debug:
        train_data = loaders[loader](train, debug_examples)

        logger.info(
            'loaded {} samples as debug train/dev/test dataset '.format(
                debug_examples))

        dev_data = train_data
        test_data = train_data

        if embedding_file is not None and embedding_format is not None:
            emb_file = 'glove.6B.50d.txt'
            embeddings = load_embeddings(path.join('data', 'GloVe', emb_file),
                                         'glove')
            logger.info('loaded pre-trained embeddings ({})'.format(emb_file))
        else:
            embeddings = None
    else:
        train_data = loaders[loader](train, num_train_examples)
        dev_data = loaders[loader](dev, num_dev_examples)
        test_data = loaders[loader](test) if test else None

        logger.info('loaded train/dev/test data')
        if embedding_file is not None and embedding_format is not None:
            embeddings = load_embeddings(embedding_file, embedding_format)
            logger.info(
                'loaded pre-trained embeddings ({})'.format(embedding_file))
        else:
            embeddings = None
            if vocab_from_embeddings:
                raise ValueError(
                    "If you want to create vocab from embeddings, embeddings have to be provided"
                )

    vocab = Vocab(vocab=embeddings.vocabulary if vocab_from_embeddings
                  and embeddings is not None else None)

    if repr_dim_task_embedding < 1 and embeddings is None:
        raise ValueError(
            "Either provide pre-trained embeddings or set repr_dim_task_embedding > 0."
        )

    # name defaults to name of the model
    if 'name' not in parsed_config or parsed_config['name'] is None:
        parsed_config['name'] = reader

    shared_resources = SharedResources(vocab, parsed_config, embeddings)
    jtreader = readers.readers[reader](shared_resources)

    try:
        jtrain(jtreader,
               train_data,
               test_data,
               dev_data,
               parsed_config,
               debug=debug)
    finally:  # clean up temporary dir
        if os.path.exists(jack_temp):
            shutil.rmtree(jack_temp)
Code example #16
def main(batch_size, clip_value, config, loader, debug, debug_examples, dev,
         embedding_file, embedding_format, experiments_db, epochs, l2,
         optimizer, learning_rate, learning_rate_decay, log_interval,
         validation_interval, model, model_dir, seed, tensorboard_folder, test,
         train, vocab_from_embeddings, write_metrics_to):
    logger.info("TRAINING")

    if experiments_db is not None:
        ex.observers.append(SqlObserver.create('sqlite:///%s' %
                                               experiments_db))

    if debug:
        train_data = loaders[loader](train, debug_examples)

        logger.info(
            'loaded {} samples as debug train/dev/test dataset '.format(
                debug_examples))

        dev_data = train_data
        test_data = train_data

        if embedding_file is not None and embedding_format is not None:
            emb_file = 'glove.6B.50d.txt'
            embeddings = load_embeddings(path.join('data', 'GloVe', emb_file),
                                         'glove')
            logger.info('loaded pre-trained embeddings ({})'.format(emb_file))
            ex.current_run.config["repr_dim_input"] = 50
        else:
            embeddings = Embeddings(None, None)
    else:
        train_data = loaders[loader](train)
        dev_data = loaders[loader](dev)
        test_data = loaders[loader](test) if test else None

        logger.info('loaded train/dev/test data')
        if embedding_file is not None and embedding_format is not None:
            embeddings = load_embeddings(embedding_file, embedding_format)
            logger.info(
                'loaded pre-trained embeddings ({})'.format(embedding_file))
            ex.current_run.config["repr_dim_input"] = embeddings.lookup[
                0].shape[0]
        else:
            embeddings = None
            if ex.current_run.config["vocab_from_embeddings"]:
                raise RuntimeError(
                    "If you want to create vocab from embeddings, embeddings have to be provided"
                )

    vocab = Vocab(emb=embeddings, init_from_embeddings=vocab_from_embeddings)

    # build JTReader
    checkpoint()
    parsed_config = ex.current_run.config
    ex.run('print_config', config_updates=parsed_config)

    # name defaults to name of the model
    if 'name' not in parsed_config or parsed_config['name'] is None:
        parsed_config['name'] = model

    shared_resources = SharedResources(vocab, parsed_config)
    reader = readers.readers[model](shared_resources)

    checkpoint()

    configuration = {
        'seed': seed,
        'clip_value': clip_value,
        'batch_size': batch_size,
        'epochs': epochs,
        'l2': l2,
        'optimizer': optimizer,
        'learning_rate': learning_rate,
        'learning_rate_decay': learning_rate_decay,
        'log_interval': log_interval,
        'validation_interval': validation_interval,
        'tensorboard_folder': tensorboard_folder,
        'model': model,
        'model_dir': model_dir,
        'write_metrics_to': write_metrics_to
    }

    jtrain(reader, train_data, test_data, dev_data, configuration, debug=debug)
Code example #17
tf.app.flags.DEFINE_string('dataset_type', 'squad', 'either squad or jack')
tf.app.flags.DEFINE_string('model', None, 'Name of the reader')
tf.app.flags.DEFINE_string('model_dir', None, 'directory to saved model')
tf.app.flags.DEFINE_string('embedding_path', None, 'path to embeddings')
tf.app.flags.DEFINE_string('embedding_format', 'glove', 'embeddings format')
tf.app.flags.DEFINE_string('device', "/cpu:0", 'device to use')
tf.app.flags.DEFINE_string('out', "results.json", 'Result file path.')
tf.app.flags.DEFINE_integer('batch_size', 64, 'batch size')
tf.app.flags.DEFINE_integer('beam_size', 1, 'beam size')
tf.app.flags.DEFINE_string('kwargs', '{}', 'additional reader-specific configurations')
# FLAGS.file is referenced below; the original script presumably defines it, so a plausible definition is added here
tf.app.flags.DEFINE_string('file', None, 'path to the dataset file to evaluate')

FLAGS = tf.app.flags.FLAGS

logger.info("Loading embeddings from {}...".format(FLAGS.embedding_path))
emb = load_embeddings(FLAGS.embedding_path, FLAGS.embedding_format)
vocab = Vocab(emb=emb, init_from_embeddings=True)

logger.info("Creating and loading reader from {}...".format(FLAGS.model_dir))
config = {"beam_size": FLAGS.beam_size, 'batch_size': FLAGS.batch_size, "max_support_length": None}
config.update(json.loads(FLAGS.kwargs))
reader = readers[FLAGS.model](vocab, config)
with tf.device(FLAGS.device):
    reader.load_and_setup(FLAGS.model_dir)

if FLAGS.dataset_type == "squad":
    dataset_jtr = convert_squad(FLAGS.file)
else:
    with open(FLAGS.file) as f:
        dataset_jtr = json.load(f)

dataset = jtr_to_qasetting(dataset_jtr)