Esempio n. 1
0
def test_basic_e2e(use_cuda, try_serialize):
    """Overfit a tiny fixed batch, then optionally round-trip the model to disk.

    The model is trained on the same six examples for up to 1000 steps and
    must drive all three reported losses under fixed thresholds. When
    ``try_serialize`` is set, the save state is written with ``torch.save``,
    reloaded, rebuilt into a new model, and that model must meet the same
    loss thresholds on the same batch.

    Args:
        use_cuda: Run on the GPU. The test is skipped when CUDA is unavailable.
        try_serialize: Also exercise the save / load / rebuild path.
    """
    if use_cuda and not torch.cuda.is_available():
        pytest.skip("Cuda not available")
    vocab = BasicVocab(["a", "b", "c", "d", "e", "f", "g", "h", "i"] +
                       parse_constants.ALL_SPECIALS)
    model = make_default_cookie_monster(vocab,
                                        hidden_size_base=16,
                                        use_cuda=use_cuda)
    pad_ind = vocab.token_to_index(parse_constants.PAD)
    split_ind = vocab.token_to_index(parse_constants.TASK_SPLITTER)
    dvc = torch.device("cuda" if use_cuda else "cpu")
    # Each example is a token-index sequence containing the task splitter plus
    # a boolean label. NOTE(review): the label presumably marks whether the
    # second half follows the first -- confirm against make_fake_lm_example.
    examples = [
        make_fake_lm_example([1, 2, 3, split_ind, 1, 2], True, dvc),
        make_fake_lm_example([1, 2, 3, split_ind, 6, 4], False, dvc),
        make_fake_lm_example([1, 3, 2, split_ind, 3, 2], True, dvc),
        make_fake_lm_example([6, 5, 4, split_ind, 6, 5], True, dvc),
        make_fake_lm_example([1, 2, 3, split_ind, 5, 6], False, dvc),
        make_fake_lm_example([6, 5, 3, split_ind, 1, 1], False, dvc),
    ]
    model.start_train_session()
    for _ in range(1000):
        ns_loss, lm_loss, total_loss = model.train_batch(
            LMBatch.from_example_list(examples, pad_ind, dvc))
        if ns_loss < math.log(
                1.25) and lm_loss < 0.0001 and total_loss < math.log(1.25):
            break
    else:
        # for/else: only reached when the loop ran out without a `break`.
        pytest.fail("Did not converge in expected number of iterations")

    if try_serialize:
        save_state = model.get_save_state_dict()
        # mkstemp returns an already-open OS-level descriptor along with the
        # path. Only the path is needed here, so close the descriptor right
        # away instead of leaking it.
        fd, f = tempfile.mkstemp()
        os.close(fd)
        try:
            torch.save(save_state, f)
            new_state = torch.load(f)
            new_model = model.create_from_save_state_dict(new_state)
            new_model.start_train_session()
            if use_cuda:
                new_model.cuda()
            # TODO really should be eval
            ns_loss, lm_loss, total_loss = new_model.train_batch(
                LMBatch.from_example_list(examples, pad_ind, dvc))
            assert ns_loss < math.log(
                1.25) and lm_loss < 0.0001 and total_loss < math.log(1.25)
        finally:
            os.remove(f)


# TODO (DNGros): Add test for mask task
Esempio n. 2
0
def _get_default_tokenizers(
) -> Tuple[Tuple[tokenizers.Tokenizer, Optional[Vocab]], tokenizers.Tokenizer]:
    """Build the default word-piece x tokenizer (with its vocab) and y tokenizer.

    Returns:
        ``((x_tokenizer, x_vocab), y_tokenizer)``. The x vocab is the default
        word-piece list extended with every special token, and the y
        tokenizer is an ``AstValTokenizer``.
    """
    # TODO: this is duplicated from encdecmodel.py and should be shared
    # rather than copied.
    piece_tokenizer, pieces = get_default_pieced_tokenizer_word_list()
    x_side = (piece_tokenizer,
              BasicVocab(pieces + parse_constants.ALL_SPECIALS))
    y_tokenizer = AstValTokenizer()
    return x_side, y_tokenizer
Esempio n. 3
0
def get_default_tokenizers(
    use_word_piece: bool = False
) -> Tuple[Tuple[tokenizers.Tokenizer, Optional[Vocab]], tokenizers.Tokenizer]:
    """Return the default tokenizers for the x (input) and y (output) sides.

    Args:
        use_word_piece: When true, the x side uses the default word-piece
            tokenizer together with a vocab built from its word list plus
            all special tokens. Otherwise the x side is a
            ``NonLetterTokenizer`` with no vocab.

    Returns:
        ``((x_tokenizer, x_vocab_or_None), y_tokenizer)``, where the y
        tokenizer is always an ``AstValTokenizer``.
    """
    if use_word_piece:
        piece_tokenizer, pieces = get_default_pieced_tokenizer_word_list()
        x_side = (piece_tokenizer,
                  BasicVocab(pieces + parse_constants.ALL_SPECIALS))
    else:
        x_side = (NonLetterTokenizer(), None)
    return x_side, AstValTokenizer()
Esempio n. 4
0
 def create_from_save_state_dict(cls,
                                 state_dict: dict) -> 'StringQueryEncoder':
     """Rebuild an encoder from a previously saved state dict.

     Instantiates through ``cls`` rather than a hard-coded class name so that
     a subclass calling this constructor gets an instance of its own type.
     NOTE(review): presumably decorated with ``@classmethod`` just above this
     view -- confirm.

     Args:
         state_dict: Mapping with the keys ``'tokenizer'``, ``'query_vocab'``,
             ``'query_vectorizer'`` and ``'internal_encoder'``.

     Returns:
         A new encoder built from the restored tokenizer, vocab and
         vectorizer; the stored ``'internal_encoder'`` value is passed
         through unchanged.

     Raises:
         KeyError: If any of the expected keys is missing.
     """
     return cls(
         tokenizer=tokenizers.tokenizer_from_save_dict(
             state_dict['tokenizer']),
         query_vocab=BasicVocab.create_from_save_state_dict(
             state_dict['query_vocab']),
         query_vectorizer=vectorizer_from_save_dict(
             state_dict['query_vectorizer']),
         internal_encoder=state_dict['internal_encoder'])
Esempio n. 5
0
 def create_from_save_state_dict(
         cls, state_dict: dict) -> 'PretrainPoweredQueryEncoder':
     """Rebuild an encoder from a saved state dict and reload its sub-models.

     Args:
         state_dict: Mapping with the keys ``'tokenizer'``, ``'query_vocab'``,
             ``'initial_encoder'``, ``'summary_size'`` and
             ``'other_models_state'``.

     Returns:
         A new ``cls`` instance whose ``other_models`` have had
         ``'other_models_state'`` loaded into them.
     """
     restored_tokenizer = tokenizers.tokenizer_from_save_dict(
         state_dict['tokenizer'])
     restored_vocab = BasicVocab.create_from_save_state_dict(
         state_dict['query_vocab'])
     encoder = cls(tokenizer=restored_tokenizer,
                   query_vocab=restored_vocab,
                   initial_encoder=state_dict['initial_encoder'],
                   summary_size=state_dict['summary_size'])
     # The constructor builds fresh sub-models; overwrite their state with
     # the saved one.
     encoder.other_models.load_state_dict(state_dict['other_models_state'])
     return encoder
Esempio n. 6
0
        "--epochs",
        default=1,
        type=int,
        help=
        "Number of epochs. Note that since pairs of are dynamically generated"
        "there are actually many more possible examples")
    parser.add_argument("--restore_from",
                        default=None,
                        type=str,
                        help="A path to restore from")
    args = parser.parse_args()

    tokenizer, vocab_list = tokenizers.get_default_pieced_tokenizer_word_list()
    use_cuda = True
    vocab = BasicVocab(
        vocab_list + parse_constants.ALL_SPECIALS,
        default_device=torch.device("cuda" if use_cuda else "cpu"))
    dataset = CookieMonsterDataset(
        #["../../builtin_types/otherdata/stackexchange/unix-stackexchange/sentences.txt"],
        args.files,
        tokenizer,
        vocab,
        max_docs_to_load=9e9,
        use_cuda=use_cuda)
    if args.restore_from is None:
        model = make_default_cookie_monster(vocab,
                                            hidden_size_base=args.hiddensize,
                                            use_cuda=use_cuda)
    else:
        restore = torch.load(args.restore_from)
        model = CookieMonsterForPretraining.create_from_save_state_dict(
Esempio n. 7
0
                                         device=self.dataset.device)


def human_test(dataset: CookieMonsterDataset, samples):
    """Interactively quiz a person on dataset samples and print their accuracy.

    For each sample, a sentence string from the dataset is shown and the
    user types a number (``1`` / ``0``). The answer is converted to a bool
    and compared with the dataset's ground-truth flag. After all samples the
    percentage of correct answers is printed.

    Args:
        dataset: Source of ``(sentence, ground_truth)`` pairs via
            ``random_sample_sentence_str()``.
        samples: Number of questions to ask. With no questions asked, the
            reported percentage is ``0.0`` instead of dividing by zero.

    Raises:
        ValueError: If the typed answer cannot be parsed as a number.
    """
    rights = []
    for _ in range(samples):
        sent, gt_seq = dataset.random_sample_sentence_str()
        # Any non-zero number counts as answering "seq".
        guess = bool(float(input(f"TRY(1 seq/ 0 not): {sent}")))
        right = guess == gt_seq
        print("guess", guess, "actual", gt_seq, "right", right)
        rights.append(right)
    # Guard the empty case: dividing by a zero sample count would raise.
    percent = sum(rights) / len(rights) * 100 if rights else 0.0
    print(f"Right percent {percent}")


if __name__ == "__main__":
    # Manual smoke run: load a small slice of the corpus, print a few
    # samples and batches, then run the interactive human quiz.
    tokenizer, vocab = tokenizers.get_default_pieced_tokenizer_word_list()
    sentence_files = [
        "../../../builtin_types/otherdata/stackexchange/unix-stackexchange/sentences.txt"
    ]
    full_vocab = BasicVocab(vocab + parse_constants.ALL_SPECIALS)
    dataset = CookieMonsterDataset(sentence_files,
                                   tokenizer,
                                   full_vocab,
                                   max_docs_to_load=100)
    print(len(dataset))
    print(dataset.random_sample())
    batch_iterator = CookieMonsterBatchIterator(dataset,
                                                batch_size=4,
                                                max_num_batches=2)
    for batch in batch_iterator:
        print(batch)
    human_test(dataset, 10)