Example #1
    def test_read_from_file(self, lazy):
        reader = SnliReader(lazy=lazy)
        instances = reader.read('tests/fixtures/multinli_1.0_train.jsonl')
        instances = ensure_list(instances)

        instance0 = {"premise": ["Conceptually", "cream", "skimming", "has", "two", "basic", "dimensions", "-", "product", "and", "geography", "."],
        			 "hypothesis": ["Product", "and", "geography", "are", "what", "make", "cream", "skimming", "work", "."],
        			 "label": "neutral"}

        instance1 = {"premise": ["you", "know", "during", "the", "season", "and", "i", "guess", "at", "at", "your", "level", "uh", "you", "lose", "them", "to", "the", "next", "level", "if", "if", "they", "decide", "to", "recall", "the", "the", "parent", "team", "the", "Braves", "decide", "to", "call", "to", "recall", "a", "guy", "from", "triple", "A", "then", "a", "double", "A", "guy", "goes", "up", "to", "replace", "him", "and", "a", "single", "A", "guy", "goes", "up", "to", "replace", "him"],
                     "hypothesis": ["You", "lose", "the", "things", "to", "the", "following", "level", "if", "the", "people", "recall", "."],
                     "label": "entailment"}

        instance2 = {"premise": ["One", "of", "our", "number", "will", "carry", "out", "your", "instructions", "minutely", "."],
                     "hypothesis": ["A", "member", "of", "my", "team", "will", "execute", "your", "orders", "with", "immense", "precision", "."],
                     "label": "entailment"}

        assert len(instances) == 3
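        # Helper that checks an instance's fields against the expected dict.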
        def equals(fields, instance):
            assert [t.text for t in fields["premise"].tokens] == instance["premise"]
            assert [t.text for t in fields["hypothesis"].tokens] == instance["hypothesis"]
            assert fields["label"].label == instance["label"]

        equals(instances[0].fields, instance0)
        equals(instances[1].fields, instance1)
        equals(instances[2].fields, instance2)
    def test_read_from_file(self, lazy):
        reader = SnliReader(lazy=lazy)
        instances = reader.read(AllenNlpTestCase.FIXTURES_ROOT / u'data' / u'snli.jsonl')
        instances = ensure_list(instances)

        instance1 = {u"premise": [u"A", u"person", u"on", u"a", u"horse", u"jumps", u"over", u"a", u"broken",
                                 u"down", u"airplane", u"."],
                     u"hypothesis": [u"A", u"person", u"is", u"training", u"his", u"horse", u"for", u"a",
                                    u"competition", u"."],
                     u"label": u"neutral"}

        instance2 = {u"premise": [u"A", u"person", u"on", u"a", u"horse", u"jumps", u"over", u"a", u"broken",
                                 u"down", u"airplane", u"."],
                     u"hypothesis": [u"A", u"person", u"is", u"at", u"a", u"diner", u",", u"ordering", u"an",
                                    u"omelette", u"."],
                     u"label": u"contradiction"}
        instance3 = {u"premise": [u"A", u"person", u"on", u"a", u"horse", u"jumps", u"over", u"a", u"broken",
                                 u"down", u"airplane", u"."],
                     u"hypothesis": [u"A", u"person", u"is", u"outdoors", u",", u"on", u"a", u"horse", u"."],
                     u"label": u"entailment"}

        assert len(instances) == 3
        fields = instances[0].fields
        assert [t.text for t in fields[u"premise"].tokens] == instance1[u"premise"]
        assert [t.text for t in fields[u"hypothesis"].tokens] == instance1[u"hypothesis"]
        assert fields[u"label"].label == instance1[u"label"]
        fields = instances[1].fields
        assert [t.text for t in fields[u"premise"].tokens] == instance2[u"premise"]
        assert [t.text for t in fields[u"hypothesis"].tokens] == instance2[u"hypothesis"]
        assert fields[u"label"].label == instance2[u"label"]
        fields = instances[2].fields
        assert [t.text for t in fields[u"premise"].tokens] == instance3[u"premise"]
        assert [t.text for t in fields[u"hypothesis"].tokens] == instance3[u"hypothesis"]
        assert fields[u"label"].label == instance3[u"label"]
Example #3
    def test_read_from_file(self):

        reader = SnliReader()
        dataset = reader.read('tests/fixtures/data/snli.jsonl')

        instance1 = {"premise": ["A", "person", "on", "a", "horse",
                                 "jumps", "over", "a", "broken", "down", "airplane", "."],
                     "hypothesis": ["A", "person", "is", "training",
                                    "his", "horse", "for", "a", "competition", "."],
                     "label": "neutral"}

        instance2 = {"premise": ["A", "person", "on", "a", "horse",
                                 "jumps", "over", "a", "broken", "down", "airplane", "."],
                     "hypothesis": ["A", "person", "is", "at", "a", "diner",
                                    ",", "ordering", "an", "omelette", "."],
                     "label": "contradiction"}
        instance3 = {"premise": ["A", "person", "on", "a", "horse",
                                 "jumps", "over", "a", "broken", "down", "airplane", "."],
                     "hypothesis": ["A", "person", "is", "outdoors", ",", "on", "a", "horse", "."],
                     "label": "entailment"}

        assert len(dataset.instances) == 3
        fields = dataset.instances[0].fields()
        assert fields["premise"].tokens() == instance1["premise"]
        assert fields["hypothesis"].tokens() == instance1["hypothesis"]
        assert fields["label"].label() == instance1["label"]
        fields = dataset.instances[1].fields()
        assert fields["premise"].tokens() == instance2["premise"]
        assert fields["hypothesis"].tokens() == instance2["hypothesis"]
        assert fields["label"].label() == instance2["label"]
        fields = dataset.instances[2].fields()
        assert fields["premise"].tokens() == instance3["premise"]
        assert fields["hypothesis"].tokens() == instance3["hypothesis"]
        assert fields["label"].label() == instance3["label"]
    def test_read_creates_cache_file_when_not_present(self):
        snli_file = AllenNlpTestCase.FIXTURES_ROOT / "data" / "snli.jsonl"
        reader = SnliReader(cache_directory=self.cache_directory)
        cache_file = reader._get_cache_location_for_file_path(snli_file)
        assert not os.path.exists(cache_file)
        reader.read(snli_file)
        assert os.path.exists(cache_file)
    def test_read_from_file(self, lazy):
        reader = SnliReader(lazy=lazy)
        instances = reader.read('tests/fixtures/data/snli.jsonl')
        instances = ensure_list(instances)

        instance1 = {"premise": ["A", "person", "on", "a", "horse", "jumps", "over", "a", "broken",
                                 "down", "airplane", "."],
                     "hypothesis": ["A", "person", "is", "training", "his", "horse", "for", "a",
                                    "competition", "."],
                     "label": "neutral"}

        instance2 = {"premise": ["A", "person", "on", "a", "horse", "jumps", "over", "a", "broken",
                                 "down", "airplane", "."],
                     "hypothesis": ["A", "person", "is", "at", "a", "diner", ",", "ordering", "an",
                                    "omelette", "."],
                     "label": "contradiction"}
        instance3 = {"premise": ["A", "person", "on", "a", "horse", "jumps", "over", "a", "broken",
                                 "down", "airplane", "."],
                     "hypothesis": ["A", "person", "is", "outdoors", ",", "on", "a", "horse", "."],
                     "label": "entailment"}

        assert len(instances) == 3
        fields = instances[0].fields
        assert [t.text for t in fields["premise"].tokens] == instance1["premise"]
        assert [t.text for t in fields["hypothesis"].tokens] == instance1["hypothesis"]
        assert fields["label"].label == instance1["label"]
        fields = instances[1].fields
        assert [t.text for t in fields["premise"].tokens] == instance2["premise"]
        assert [t.text for t in fields["hypothesis"].tokens] == instance2["hypothesis"]
        assert fields["label"].label == instance2["label"]
        fields = instances[2].fields
        assert [t.text for t in fields["premise"].tokens] == instance3["premise"]
        assert [t.text for t in fields["hypothesis"].tokens] == instance3["hypothesis"]
        assert fields["label"].label == instance3["label"]
def main():
    args = parse_args()
    checkpoint_path = Path(args.checkpoint)
    checkpoint_dir = checkpoint_path.parent
    params_path = checkpoint_dir / 'params.json'
    vocab_dir = checkpoint_dir / 'vocab'

    params = Params.from_file(params_path)
    train_params, model_params = params.pop('train'), params.pop('model')

    tokenizer = WordTokenizer(
        start_tokens=['<s>'],
        end_tokens=['</s>'],
    )
    token_indexer = SingleIdTokenIndexer(lowercase_tokens=True)
    dataset_reader = SnliReader(tokenizer=tokenizer,
                                token_indexers={'tokens': token_indexer})

    valid_dataset = dataset_reader.read(train_params.pop('valid_dataset_path'))
    vocab = Vocabulary.from_files(vocab_dir)
    random.shuffle(valid_dataset)

    model_params['token_embedder']['pretrained_file'] = None
    model = SNLIModel(params=model_params, vocab=vocab)
    model.load_state_dict(torch.load(checkpoint_path, map_location='cpu'),
                          strict=False)
    model.eval()

    iterator = BasicIterator(batch_size=1)
    iterator.index_with(vocab)
    generator = iterator(valid_dataset)

    for i in range(10):
        batch = next(generator)
        label_token_to_index = vocab.get_token_to_index_vocabulary('labels')
        print('----')
        print(' '.join(
            model.convert_to_readable_text(batch['premise']['tokens'])[0]))
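        # Generate one hypothesis for each candidate label, conditioned on the premise's latent code.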
        for label, label_index in label_token_to_index.items():
            label_tensor = torch.tensor([label_index])
            enc_embs = model.embed(batch['premise']['tokens'])
            enc_mask = get_text_field_mask(batch['premise'])
            enc_hidden = model.encode(inputs=enc_embs,
                                      mask=enc_mask,
                                      drop_start_token=True)
            code, kld = model.sample_code_and_compute_kld(enc_hidden)
            generated = model.generate(code=code,
                                       label=label_tensor,
                                       max_length=enc_mask.sum(1) * 2,
                                       beam_size=10,
                                       lp_alpha=args.lp_alpha)
            text = model.convert_to_readable_text(generated[:, 0])[0]
            print(label)
            print(' '.join(text))
Example #7
def main():
    args = parse_args()
    checkpoint_path = Path(args.checkpoint)
    checkpoint_dir = checkpoint_path.parent
    params_path = checkpoint_dir / 'params.json'
    vocab_dir = checkpoint_dir / 'vocab'

    params = Params.from_file(params_path)
    train_params, model_params = params.pop('train'), params.pop('model')

    tokenizer = WordTokenizer(
        start_tokens=['<s>'],
        end_tokens=['</s>'],
    )
    token_indexer = SingleIdTokenIndexer(lowercase_tokens=True)
    dataset_reader = SnliReader(tokenizer=tokenizer,
                                token_indexers={'tokens': token_indexer})

    valid_dataset = dataset_reader.read(train_params.pop('valid_dataset_path'))
    if not args.test_dataset:
        test_dataset_path = train_params.pop('test_dataset_path')
    else:
        test_dataset_path = args.test_dataset
    test_dataset = dataset_reader.read(test_dataset_path)
    if args.only_label:
        test_dataset = [
            d for d in test_dataset
            if d.fields['label'].label == args.only_label
        ]
    vocab = Vocabulary.from_files(vocab_dir)
    random.shuffle(valid_dataset)

    model_params['token_embedder']['pretrained_file'] = None
    model = SNLIModel(params=model_params, vocab=vocab)
    model.load_state_dict(torch.load(checkpoint_path, map_location='cpu'),
                          strict=False)
    model.to(args.cuda_device)
    model.eval()

    torch.set_grad_enabled(False)

    iterator = BasicIterator(batch_size=32)
    iterator.index_with(vocab)

    for dataset in (valid_dataset, test_dataset):
        generator = iterator(dataset, shuffle=False, num_epochs=1)
        model.get_metrics(reset=True)
        for batch in tqdm(generator):
            batch = move_to_device(batch, cuda_device=args.cuda_device)
            model(premise=batch['premise'],
                  hypothesis=batch['hypothesis'],
                  label=batch['label'])
        metrics = model.get_metrics()
        pprint(metrics)
Example #8
def main():
    args = parse_args()
    checkpoint_path = Path(args.checkpoint)
    checkpoint_dir = checkpoint_path.parent
    params_path = checkpoint_dir / 'params.json'
    vocab_dir = checkpoint_dir / 'vocab'

    params = Params.from_file(params_path)
    train_params, model_params = params.pop('train'), params.pop('model')

    tokenizer = WordTokenizer(start_tokens=['<s>'], end_tokens=['</s>'],)
    token_indexer = SingleIdTokenIndexer(lowercase_tokens=True)
    dataset_reader = SnliReader(
        tokenizer=tokenizer, token_indexers={'tokens': token_indexer})

    valid_dataset = dataset_reader.read(
        train_params.pop('valid_dataset_path'))
    vocab = Vocabulary.from_files(vocab_dir)

    model_params['token_embedder']['pretrained_file'] = None
    model = SNLIModel(params=model_params, vocab=vocab)
    model.load_state_dict(torch.load(checkpoint_path, map_location='cpu'),
                          strict=False)
    model.to(args.device)
    model.eval()

    iterator = BasicIterator(batch_size=args.batch_size)
    iterator.index_with(vocab)
    generator = iterator(valid_dataset, num_epochs=1, shuffle=False)
    label_index_to_token = vocab.get_index_to_token_vocabulary('labels')

    out_file = open(args.out, 'w')

    for batch in tqdm(generator):
        premise_tokens = batch['premise']['tokens']
        enc_embs = model.embed(premise_tokens.to(args.device))
        enc_mask = get_text_field_mask(batch['premise']).to(args.device)
        enc_hidden = model.encode(inputs=enc_embs, mask=enc_mask,
                                  drop_start_token=True)
        code, kld = model.sample_code_and_compute_kld(enc_hidden)
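        # Skip the leading <s> token when converting the premise back to readable text.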
        pre_text = model.convert_to_readable_text(premise_tokens[:, 1:])
        label_tensor = batch['label'].to(args.device)
        generated = model.generate(
            code=code, label=label_tensor, max_length=25,
            beam_size=10, lp_alpha=args.lp_alpha)
        text = model.convert_to_readable_text(generated[:, 0])
        for pre_text_b, text_b, label_index_b in zip(pre_text, text, label_tensor):
            obj = {'sentence1': ' '.join(pre_text_b), 'sentence2': ' '.join(text_b),
                   'gold_label': label_index_to_token[label_index_b.item()]}
            out_file.write(json.dumps(obj))
            out_file.write('\n')
    def setUp(self):
        super(TestDecomposableAttention, self).setUp()

        constants.GLOVE_PATH = 'tests/fixtures/glove.6B.300d.sample.txt.gz'
        dataset = SnliReader().read('tests/fixtures/data/snli.jsonl')
        vocab = Vocabulary.from_dataset(dataset)
        self.vocab = vocab
        dataset.index_instances(vocab)
        self.dataset = dataset
        self.token_indexers = {'tokens': SingleIdTokenIndexer()}

        self.model = DecomposableAttention.from_params(self.vocab, Params({}))
        initializer = InitializerApplicator()
        initializer(self.model)
    def test_read_uses_existing_cache_file_when_present(self):
        snli_file = AllenNlpTestCase.FIXTURES_ROOT / "data" / "snli.jsonl"
        snli_copy_file = str(snli_file) + ".copy"
        shutil.copyfile(snli_file, snli_copy_file)
        reader = SnliReader(cache_directory=self.cache_directory)

        # The first read will create the cache.
        instances = reader.read(snli_copy_file)
        # Now we _remove_ the data file, to be sure we're reading from the cache.
        os.remove(snli_copy_file)
        cached_instances = reader.read(snli_copy_file)
        # We should get the same instances both times.
        assert len(instances) == len(cached_instances)
        for instance, cached_instance in zip(instances, cached_instances):
            assert instance.fields == cached_instance.fields
Example #11
    def test_combine_input_fields(self):
        reader = SnliReader(
            tokenizer=PretrainedTransformerTokenizer("bert-base-uncased"),
            combine_input_fields=True)
        instances = reader.read(AllenNlpTestCase.FIXTURES_ROOT / "data" /
                                "snli.jsonl")
        instances = ensure_list(instances)

        instance1 = {
            "tokens": [
                "[CLS]",
                "a",
                "person",
                "on",
                "a",
                "horse",
                "jumps",
                "over",
                "a",
                "broken",
                "down",
                "airplane",
                ".",
                "[SEP]",
                "a",
                "person",
                "is",
                "training",
                "his",
                "horse",
                "for",
                "a",
                "competition",
                ".",
                "[SEP]",
            ],
            "label":
            "neutral",
        }

        assert len(instances) == 3
        fields = instances[0].fields
        assert [t.text for t in fields["tokens"].tokens] == instance1["tokens"]
        assert fields["label"].label == instance1["label"]
    def test_read_only_creates_cache_file_once(self):
        snli_file = AllenNlpTestCase.FIXTURES_ROOT / "data" / "snli.jsonl"
        reader = SnliReader(cache_directory=self.cache_directory)
        cache_file = reader._get_cache_location_for_file_path(snli_file)

        # The first read will create the cache.
        reader.read(snli_file)
        assert os.path.exists(cache_file)
        with open(cache_file, "r") as in_file:
            cache_contents = in_file.read()
        # The second and all subsequent reads should _use_ the cache, not modify it.  I looked
        # into checking file modification times, but this test will probably be faster than the
        # granularity of `os.path.getmtime()` (which only returns values in seconds).
        reader.read(snli_file)
        reader.read(snli_file)
        reader.read(snli_file)
        reader.read(snli_file)
        with open(cache_file, "r") as in_file:
            final_cache_contents = in_file.read()
        assert cache_contents == final_cache_contents
    def test_cached_max_instances(self, lazy):
        snli_file = AllenNlpTestCase.FIXTURES_ROOT / "data" / "snli.jsonl"

        # The first read will create the cache if it's not there already.
        reader = SnliReader(cache_directory=self.cache_directory, lazy=lazy)
        instances = reader.read(snli_file)
        instance_count = sum(1 for _ in instances)
        assert instance_count > 2

        # The second read should only return two instances, even though it's from the cache.
        reader = SnliReader(cache_directory=self.cache_directory,
                            max_instances=2,
                            lazy=lazy)
        instances = reader.read(snli_file)
        instance_count = sum(1 for _ in instances)
        assert instance_count == 2
    def test_caching_works_with_lazy_reading(self):
        snli_file = AllenNlpTestCase.FIXTURES_ROOT / "data" / "snli.jsonl"
        snli_copy_file = str(snli_file) + ".copy"
        shutil.copyfile(snli_file, snli_copy_file)
        reader = SnliReader(lazy=True, cache_directory=self.cache_directory)
        cache_file = reader._get_cache_location_for_file_path(snli_copy_file)

        # The call to read() will give us an _iterator_.  We'll iterate over it multiple times,
        # and the caching behavior should change as we go.
        instances = reader.read(snli_copy_file)
        assert isinstance(instances, _LazyInstances)

        # The first iteration will create the cache
        assert not os.path.exists(cache_file)
        first_pass_instances = []
        for instance in instances:
            first_pass_instances.append(instance)
        assert os.path.exists(cache_file)

        # Now we _remove_ the data file, to be sure we're reading from the cache.
        os.remove(snli_copy_file)
        second_pass_instances = []
        for instance in instances:
            second_pass_instances.append(instance)

        # We should get the same instances both times.
        assert len(first_pass_instances) == len(second_pass_instances)
        for instance, cached_instance in zip(first_pass_instances,
                                             second_pass_instances):
            assert instance.fields == cached_instance.fields

        # And just to be super paranoid, in case the second pass somehow bypassed the cache
        # because of a bug in `_CachedLazyInstance` that's hard to detect, we'll read the
        # instances from the cache with a non-lazy iterator and make sure they're the same.
        reader = SnliReader(lazy=False, cache_directory=self.cache_directory)
        cached_instances = reader.read(snli_copy_file)
        assert len(first_pass_instances) == len(cached_instances)
        for instance, cached_instance in zip(first_pass_instances,
                                             cached_instances):
            assert instance.fields == cached_instance.fields
Example #15
vocab_dir = 'vocab/'
serialization_dir = 'checkpoints/'

device = 5
torch.cuda.set_device(device)

batch_size = 64
hid_dim = 100
embed_dim = 300

lr = 1e-3
grad_clipping = 5
dropout = 0.1
max_vocab_size = None  # no cap on the vocabulary size
# lazy=False loads the whole dataset into memory at once; lazy=True streams instances during training
print('data loading, please wait...')
reader = SnliReader(lazy=True)
train_dataset = reader.read("snli_1.0/snli_1.0_train.jsonl")
dev_dataset = reader.read("snli_1.0/snli_1.0_dev.jsonl")
test_dataset = reader.read("snli_1.0/snli_1.0_test.jsonl")

if os.path.exists(vocab_dir):
    vocab = Vocabulary.from_files(vocab_dir)
else:
    vocab = Vocabulary.from_instances(chain(train_dataset, dev_dataset),
                                      max_vocab_size=max_vocab_size)
    vocab.save_to_files(vocab_dir)

print("vocab_size: {}".format(vocab.get_vocab_size()))

train_iterator = BucketIterator(batch_size=batch_size,
                                sorting_keys=[("premise", "num_tokens")])
Example #16
def main():
    args = parse_args()
    params = Params.from_file(args.params)
    save_dir = Path(args.save)
    save_dir.mkdir(parents=True)

    params.to_file(save_dir / 'params.json')

    train_params, model_params = params.pop('train'), params.pop('model')

    random_seed = train_params.pop_int('random_seed', 2019)
    torch.manual_seed(random_seed)
    random.seed(random_seed)

    log_filename = save_dir / 'stdout.log'
    sys.stdout = TeeLogger(filename=log_filename,
                           terminal=sys.stdout,
                           file_friendly_terminal_output=False)
    sys.stderr = TeeLogger(filename=log_filename,
                           terminal=sys.stderr,
                           file_friendly_terminal_output=False)

    tokenizer = WordTokenizer(start_tokens=['<s>'], end_tokens=['</s>'])
    token_indexer = SingleIdTokenIndexer(lowercase_tokens=True)
    dataset_reader = SnliReader(tokenizer=tokenizer,
                                token_indexers={'tokens': token_indexer})

    train_labeled_dataset_path = train_params.pop('train_labeled_dataset_path')
    train_unlabeled_dataset_path = train_params.pop(
        'train_unlabeled_dataset_path', None)
    train_labeled_dataset = dataset_reader.read(train_labeled_dataset_path)
    train_labeled_dataset = filter_dataset_by_length(
        dataset=train_labeled_dataset, max_length=30)
    if train_unlabeled_dataset_path is not None:
        train_unlabeled_dataset = dataset_reader.read(
            train_unlabeled_dataset_path)
        train_unlabeled_dataset = filter_dataset_by_length(
            dataset=train_unlabeled_dataset, max_length=30)
    else:
        train_unlabeled_dataset = []

    valid_dataset = dataset_reader.read(train_params.pop('valid_dataset_path'))

    vocab = Vocabulary.from_instances(
        instances=train_labeled_dataset + train_unlabeled_dataset,
        max_vocab_size=train_params.pop_int('max_vocab_size', None))
    vocab.save_to_files(save_dir / 'vocab')

    labeled_batch_size = train_params.pop_int('labeled_batch_size')
    unlabeled_batch_size = train_params.pop_int('unlabeled_batch_size')
    labeled_iterator = BasicIterator(batch_size=labeled_batch_size)
    unlabeled_iterator = BasicIterator(batch_size=unlabeled_batch_size)
    labeled_iterator.index_with(vocab)
    unlabeled_iterator.index_with(vocab)

    if not train_unlabeled_dataset:
        unlabeled_iterator = None

    pretrained_checkpoint_path = train_params.pop('pretrained_checkpoint_path',
                                                  None)
    model = SNLIModel(params=model_params, vocab=vocab)
    if pretrained_checkpoint_path:
        model.load_state_dict(
            torch.load(pretrained_checkpoint_path, map_location='cpu'))
    model.add_finetune_parameters(
        con_autoweight=train_params.pop_bool('con_autoweight', False),
        con_y_weight=train_params.pop_float('con_y_weight'),
        con_z_weight=train_params.pop_float('con_z_weight'),
        con_z2_weight=train_params.pop_float('con_z2_weight'))

    main_optimizer = optim.Adam(params=model.finetune_main_parameters(
        exclude_generator=train_params.pop_bool('exclude_generator')),
                                lr=train_params.pop_float('lr', 1e-3))
    aux_optimizer = optim.Adam(params=model.finetune_aux_parameters(),
                               lr=train_params.pop_float('aux_lr', 1e-4))

    summary_writer = SummaryWriter(log_dir=save_dir / 'log')

    kl_anneal_rate = train_params.pop_float('kl_anneal_rate', None)
    if kl_anneal_rate is None:
        kl_weight_scheduler = None
    else:
        kl_weight_scheduler = (lambda step: min(1.0, kl_anneal_rate * step))
        model.kl_weight = 0.0

    gumbel_anneal_rate = train_params.pop_float('gumbel_anneal_rate', None)
    if gumbel_anneal_rate is None:
        gumbel_temperature_scheduler = None
    else:
        gumbel_temperature_scheduler = (
            lambda step: max(0.1, 1.0 - gumbel_anneal_rate * step))
        model.gumbel_temperature = 1.0
    iters_per_epoch = train_params.pop_int(
        'iters_per_epoch',
        len(train_labeled_dataset) // labeled_batch_size)

    trainer = FineTuningTrainer(
        model=model,
        main_optimizer=main_optimizer,
        aux_optimizer=aux_optimizer,
        labeled_iterator=labeled_iterator,
        unlabeled_iterator=unlabeled_iterator,
        train_labeled_dataset=train_labeled_dataset,
        train_unlabeled_dataset=train_unlabeled_dataset,
        validation_dataset=valid_dataset,
        summary_writer=summary_writer,
        serialization_dir=save_dir,
        num_epochs=train_params.pop_int('num_epochs', 50),
        iters_per_epoch=iters_per_epoch,
        write_summary_every=100,
        validate_every=1000,
        patience=train_params.pop_int('patience', 5),
        clip_grad_max_norm=train_params.pop_float('grad_max_norm', 5.0),
        kl_weight_scheduler=kl_weight_scheduler,
        gumbel_temperature_scheduler=gumbel_temperature_scheduler,
        cuda_device=train_params.pop_int('cuda_device', 0),
    )
    trainer.train()
    def test_max_instances(self, lazy):
        snli_file = AllenNlpTestCase.FIXTURES_ROOT / "data" / "snli.jsonl"
        reader = SnliReader(max_instances=2, lazy=lazy)
        instances = reader.read(snli_file)
        instance_count = sum(1 for _ in instances)
        assert instance_count == 2
def main():
    args = parse_args()
    params = Params.from_file(args.params)
    save_dir = Path(args.save)
    save_dir.mkdir(parents=True)

    params.to_file(save_dir / 'params.json')

    train_params, model_params = params.pop('train'), params.pop('model')

    random_seed = train_params.pop_int('random_seed', 2019)
    torch.manual_seed(random_seed)
    random.seed(random_seed)

    log_filename = save_dir / 'stdout.log'
    sys.stdout = TeeLogger(filename=log_filename,
                           terminal=sys.stdout,
                           file_friendly_terminal_output=False)
    sys.stderr = TeeLogger(filename=log_filename,
                           terminal=sys.stderr,
                           file_friendly_terminal_output=False)

    tokenizer = WordTokenizer(
        start_tokens=['<s>'],
        end_tokens=['</s>'],
    )
    token_indexer = SingleIdTokenIndexer(lowercase_tokens=True)
    dataset_reader = SnliReader(tokenizer=tokenizer,
                                token_indexers={'tokens': token_indexer})

    train_labeled_dataset_path = train_params.pop('train_labeled_dataset_path')
    train_unlabeled_dataset_path = train_params.pop(
        'train_unlabeled_dataset_path', None)
    train_labeled_dataset = dataset_reader.read(train_labeled_dataset_path)
    train_labeled_dataset = filter_dataset_by_length(
        dataset=train_labeled_dataset, max_length=30)
    if train_unlabeled_dataset_path is not None:
        train_unlabeled_dataset = dataset_reader.read(
            train_unlabeled_dataset_path)
        train_unlabeled_dataset = filter_dataset_by_length(
            dataset=train_unlabeled_dataset, max_length=30)
    else:
        train_unlabeled_dataset = []

    valid_dataset = dataset_reader.read(train_params.pop('valid_dataset_path'))

    vocab = Vocabulary.from_instances(
        instances=train_labeled_dataset + train_unlabeled_dataset,
        max_vocab_size=train_params.pop_int('max_vocab_size', None))
    vocab.save_to_files(save_dir / 'vocab')

    labeled_batch_size = train_params.pop_int('labeled_batch_size')
    unlabeled_batch_size = train_params.pop_int('unlabeled_batch_size')
    labeled_iterator = BasicIterator(batch_size=labeled_batch_size)
    unlabeled_iterator = BasicIterator(batch_size=unlabeled_batch_size)
    labeled_iterator.index_with(vocab)
    unlabeled_iterator.index_with(vocab)

    if not train_unlabeled_dataset:
        unlabeled_iterator = None

    model = SNLIModel(params=model_params, vocab=vocab)
    optimizer = optim.Adam(params=model.parameters(),
                           lr=train_params.pop_float('lr', 1e-3))
    summary_writer = SummaryWriter(log_dir=save_dir / 'log')

    kl_anneal_rate = train_params.pop_float('kl_anneal_rate', None)
    if kl_anneal_rate is None:
        kl_weight_scheduler = None
    else:
        kl_weight_scheduler = (lambda step: min(1.0, kl_anneal_rate * step))
        model.kl_weight = 0.0

    trainer = Trainer(model=model,
                      optimizer=optimizer,
                      labeled_iterator=labeled_iterator,
                      unlabeled_iterator=unlabeled_iterator,
                      train_labeled_dataset=train_labeled_dataset,
                      train_unlabeled_dataset=train_unlabeled_dataset,
                      validation_dataset=valid_dataset,
                      summary_writer=summary_writer,
                      serialization_dir=save_dir,
                      num_epochs=train_params.pop('num_epochs', 50),
                      iters_per_epoch=len(train_labeled_dataset) //
                      labeled_batch_size,
                      write_summary_every=100,
                      validate_every=2000,
                      patience=2,
                      clip_grad_max_norm=5,
                      kl_weight_scheduler=kl_weight_scheduler,
                      cuda_device=train_params.pop_int('cuda_device', 0),
                      early_stop=train_params.pop_bool('early_stop', True))
    trainer.train()
Example #19
""" This file was used both for generating stress test results and for MultiNLI tests """
import json
import csv

from tqdm import tqdm

import torch

from allennlp.data import Vocabulary
from allennlp.data.dataset_readers import SnliReader

from allennlp.modules.token_embedders.embedding import Embedding

from model import BowmanEtAlSumOfWords, NLIPredictor, ChenEtAlESIM, RocktaschelEtAlConditionalEncoding, RocktaschelEtAlAttention

t = SnliReader()
### ... or the vocabulary your model used
vocab = Vocabulary.from_files('./.vocab/multinli_vocab')
emb = Embedding(vocab.get_vocab_size(), 300)

### choose your architecture here
model = RocktaschelEtAlAttention(vocab, emb).to("cuda")

### load from serialised model here
with open(
        './.serialization_data/C.E. Attention MultiNLI_Adam_32_0.2_0.0003_0.0_True/best.th',
        'rb') as f:
    model.load_state_dict(torch.load(f))

p = NLIPredictor(model, t)
Example #20
        i = i+1
    return bows

def getMnliBow(dataset, vocab, bow_type='groundBow'):
    premises, hypothesis, labs = mnliToList(dataset, vocab)

    premises = getBow(premises, vocab, bow_type)
    hypothesis = getBow(hypothesis, vocab, bow_type)

    labels = np.zeros((len(dataset), vocab.get_vocab_size(namespace='labels')))
    for i in range(len(dataset)):
        labels[i, labs[i]] = 1

    return (premises, hypothesis, labels)

reader = SnliReader()

# train_dataset = reader.read(cached_path('datasets/multinli_1.0/multinli_1.0_train.jsonl'))
train_dataset = reader.read('tests/fixtures/train1000.jsonl') # Fixture
validation_dataset = reader.read('tests/fixtures/val1000.jsonl') # Fixture
#validation_dataset = reader.read('datasets/multinli_1.0/multinli_1.0_dev_matched.jsonl')

# print(train_dataset)

vocab = Vocabulary.from_instances(train_dataset + validation_dataset)
# vocab.print_statistics()

t_premises, t_hypothesis, t_labels = getMnliBow(train_dataset, vocab, 'freqBow')
v_premises, v_hypothesis, v_labels = getMnliBow(validation_dataset, vocab, 'freqBow')

# for i in range(3):
Example #21
""" Script for evaluation """
import torch

from allennlp.data import Vocabulary
from allennlp.data.dataset_readers import SnliReader
from allennlp.data.iterators.basic_iterator import BasicIterator

from allennlp.modules.token_embedders.embedding import Embedding

from allennlp.training.util import evaluate

from model import BowmanEtAlRNN, BowmanEtAlSumOfWords, NLIPredictor, RocktaschelEtAlConditionalEncoding, RocktaschelEtAlAttention, ChenEtAlESIM

from utils import grad_zero, comb_to_str, re_read_embeddings_from_text_file

t = SnliReader()
### You can choose train/val/test datasets here
train_dataset = t.read('.data/snli_1.0/snli_1.0_train.jsonl')
val_dataset = t.read('.data/snli_1.0/snli_1.0_dev.jsonl')
test_dataset = t.read('.data/snli_1.0/snli_1.0_test.jsonl')

vocab = Vocabulary.from_instances(train_dataset + val_dataset)
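# NOTE: loading the saved vocabulary below replaces the one built from instances above.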

vocab = Vocabulary.from_files('./.vocab/snli_vocab')

glove = Embedding(vocab.get_vocab_size(), 300)

### Choose and load model here
model = RocktaschelEtAlAttention(vocab, glove, word_by_word=False).to("cuda")
with open(
        './.serialization_data/C.E. Attention_Adam_32_0.1_0.0003_5e-05_True/best.th',
        'rb') as f:
    model.load_state_dict(torch.load(f))
Example #22
    def test_read_from_file(self, lazy):
        reader = SnliReader(lazy=lazy)
        instances = reader.read(AllenNlpTestCase.FIXTURES_ROOT / "data" /
                                "snli.jsonl")
        instances = ensure_list(instances)

        instance1 = {
            "premise": [
                "A",
                "person",
                "on",
                "a",
                "horse",
                "jumps",
                "over",
                "a",
                "broken",
                "down",
                "airplane",
                ".",
            ],
            "hypothesis": [
                "A",
                "person",
                "is",
                "training",
                "his",
                "horse",
                "for",
                "a",
                "competition",
                ".",
            ],
            "label":
            "neutral",
        }

        instance2 = {
            "premise": [
                "A",
                "person",
                "on",
                "a",
                "horse",
                "jumps",
                "over",
                "a",
                "broken",
                "down",
                "airplane",
                ".",
            ],
            "hypothesis": [
                "A",
                "person",
                "is",
                "at",
                "a",
                "diner",
                ",",
                "ordering",
                "an",
                "omelette",
                ".",
            ],
            "label":
            "contradiction",
        }
        instance3 = {
            "premise": [
                "A",
                "person",
                "on",
                "a",
                "horse",
                "jumps",
                "over",
                "a",
                "broken",
                "down",
                "airplane",
                ".",
            ],
            "hypothesis":
            ["A", "person", "is", "outdoors", ",", "on", "a", "horse", "."],
            "label":
            "entailment",
        }

        assert len(instances) == 3
        fields = instances[0].fields
        assert [t.text
                for t in fields["premise"].tokens] == instance1["premise"]
        assert [t.text for t in fields["hypothesis"].tokens
                ] == instance1["hypothesis"]
        assert fields["label"].label == instance1["label"]
        fields = instances[1].fields
        assert [t.text
                for t in fields["premise"].tokens] == instance2["premise"]
        assert [t.text for t in fields["hypothesis"].tokens
                ] == instance2["hypothesis"]
        assert fields["label"].label == instance2["label"]
        fields = instances[2].fields
        assert [t.text
                for t in fields["premise"].tokens] == instance3["premise"]
        assert [t.text for t in fields["hypothesis"].tokens
                ] == instance3["hypothesis"]
        assert fields["label"].label == instance3["label"]
def main():
    args = parse_args()
    params = Params.from_file(args.params)
    save_dir = Path(args.save)
    save_dir.mkdir(parents=True)

    params.to_file(save_dir / 'params.json')

    train_params, model_params = params.pop('train'), params.pop('model')

    random_seed = train_params.pop_int('random_seed', 2019)
    torch.manual_seed(random_seed)
    random.seed(random_seed)

    log_filename = save_dir / 'stdout.log'
    sys.stdout = TeeLogger(
        filename=log_filename, terminal=sys.stdout,
        file_friendly_terminal_output=False)
    sys.stderr = TeeLogger(
        filename=log_filename, terminal=sys.stderr,
        file_friendly_terminal_output=False)

    tokenizer = WordTokenizer()
    token_indexer = SingleIdTokenIndexer(lowercase_tokens=True)
    dataset_reader = SnliReader(
        tokenizer=tokenizer, token_indexers={'tokens': token_indexer})

    train_labeled_dataset_path = train_params.pop(
        'train_labeled_dataset_path')
    train_unlabeled_dataset_path = train_params.pop(
        'train_unlabeled_dataset_path', None)
    train_labeled_dataset = dataset_reader.read(train_labeled_dataset_path)
    truncate_or_pad_dataset(dataset=train_labeled_dataset, length=29)
    if train_unlabeled_dataset_path is not None:
        train_unlabeled_dataset = dataset_reader.read(
            train_unlabeled_dataset_path)
        truncate_or_pad_dataset(
            dataset=train_unlabeled_dataset, length=29)
    else:
        train_unlabeled_dataset = []

    valid_dataset = dataset_reader.read(
        train_params.pop('valid_dataset_path'))
    truncate_or_pad_dataset(valid_dataset, length=29)

    vocab = Vocabulary.from_instances(
        instances=train_labeled_dataset + train_unlabeled_dataset,
        max_vocab_size=train_params.pop_int('max_vocab_size', None))
    vocab.save_to_files(save_dir / 'vocab')

    labeled_batch_size = train_params.pop_int('labeled_batch_size')
    unlabeled_batch_size = train_params.pop_int('unlabeled_batch_size')
    labeled_iterator = BasicIterator(batch_size=labeled_batch_size)
    unlabeled_iterator = BasicIterator(batch_size=unlabeled_batch_size)
    labeled_iterator.index_with(vocab)
    unlabeled_iterator.index_with(vocab)

    if not train_unlabeled_dataset:
        unlabeled_iterator = None

    model = DeconvSNLIModel(params=model_params, vocab=vocab)
    optimizer = optim.Adam(params=model.parameters())
    summary_writer = SummaryWriter(log_dir=save_dir / 'log')

    trainer = SeparatedLVMTrainer(
        model=model,
        optimizer=optimizer,
        labeled_iterator=labeled_iterator,
        unlabeled_iterator=unlabeled_iterator,
        train_labeled_dataset=train_labeled_dataset,
        train_unlabeled_dataset=train_unlabeled_dataset,
        validation_dataset=valid_dataset,
        summary_writer=summary_writer,
        serialization_dir=save_dir,
        num_epochs=train_params.pop('num_epochs', 50),
        iters_per_epoch=len(train_labeled_dataset) // labeled_batch_size,
        write_summary_every=100,
        validate_every=2000,
        patience=2,
        clip_grad_max_norm=5,
        cuda_device=train_params.pop_int('cuda_device', 0)
    )
    trainer.train()
Example #24
import torch
from allennlp.data.dataset_readers import SnliReader
from allennlp.predictors.predictor import Predictor
from allennlp.data.tokenizers import CharacterTokenizer
from allennlp.data.token_indexers import ELMoTokenCharactersIndexer
torch.manual_seed(1)

DATA_PT = 'data/multinli_1.0/multinli_1.0_dev_matched.jsonl'
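# The ELMo indexer maps each token to character ids, which the pretrained ELMo embedder expects.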
reader = SnliReader(CharacterTokenizer(),
                    {"elmo": ELMoTokenCharactersIndexer()})
data = reader.read(DATA_PT)

MODEL_PT = "models/decomposable-attention-elmo-2018.02.19.tar.gz"
predictor = Predictor.from_path(MODEL_PT)
result = predictor.predict_instance(data[0])
print(result)