def test_basic_e2e(use_cuda, try_serialize):
    """Overfit a tiny fixed batch end to end, optionally round-tripping the model.

    The model is trained on six hand-written examples for at most 1000
    batches. Training stops as soon as the three reported losses fall below
    their thresholds; if that never happens the test fails. When
    ``try_serialize`` is truthy, the trained model is saved with
    ``torch.save``, reloaded, rebuilt from the loaded state, and checked
    against the same thresholds.

    Args:
        use_cuda: Run on the "cuda" device when truthy, otherwise on "cpu".
            The test is skipped when CUDA is requested but unavailable.
        try_serialize: Also exercise the save / load round trip when truthy.
    """
    if use_cuda and not torch.cuda.is_available():
        pytest.skip("Cuda not available")
    vocab = BasicVocab(["a", "b", "c", "d", "e", "f", "g", "h", "i"] +
                       parse_constants.ALL_SPECIALS)
    model = make_default_cookie_monster(vocab,
                                        hidden_size_base=16,
                                        use_cuda=use_cuda)
    pad_ind = vocab.token_to_index(parse_constants.PAD)
    split_ind = vocab.token_to_index(parse_constants.TASK_SPLITTER)
    dvc = torch.device("cuda" if use_cuda else "cpu")
    # NOTE(review): the boolean is presumably the "second half really follows
    # the first" label consumed by the ns_loss objective -- confirm against
    # make_fake_lm_example.
    examples = [
        make_fake_lm_example([1, 2, 3, split_ind, 1, 2], True, dvc),
        make_fake_lm_example([1, 2, 3, split_ind, 6, 4], False, dvc),
        make_fake_lm_example([1, 3, 2, split_ind, 3, 2], True, dvc),
        make_fake_lm_example([6, 5, 4, split_ind, 6, 5], True, dvc),
        make_fake_lm_example([1, 2, 3, split_ind, 5, 6], False, dvc),
        make_fake_lm_example([6, 5, 3, split_ind, 1, 1], False, dvc),
    ]
    # Shared ceiling for ns_loss and total_loss; lm_loss has its own, tighter one.
    loss_ceiling = math.log(1.25)
    lm_loss_ceiling = 0.0001

    model.start_train_session()
    for _ in range(1000):
        ns_loss, lm_loss, total_loss = model.train_batch(
            LMBatch.from_example_list(examples, pad_ind, dvc))
        if (ns_loss < loss_ceiling and lm_loss < lm_loss_ceiling
                and total_loss < loss_ceiling):
            break
    else:
        # Only reached when the loop ran out without hitting the thresholds.
        pytest.fail("Did not converge in expected number of iterations")

    if try_serialize:
        save_state = model.get_save_state_dict()
        # mkstemp hands back an already-open OS-level descriptor along with
        # the path. Only the path is needed here, so the descriptor must be
        # closed explicitly or it leaks for the rest of the test session.
        fd, f = tempfile.mkstemp()
        try:
            os.close(fd)
            torch.save(save_state, f)
            new_state = torch.load(f)
            new_model = model.create_from_save_state_dict(new_state)
            new_model.start_train_session()
            if use_cuda:
                new_model.cuda()
            # TODO really should be eval
            ns_loss, lm_loss, total_loss = new_model.train_batch(
                LMBatch.from_example_list(examples, pad_ind, dvc))
            assert (ns_loss < loss_ceiling and lm_loss < lm_loss_ceiling
                    and total_loss < loss_ceiling)
        finally:
            os.remove(f)


# TODO (DNGros): Add test for mask task
def _get_default_tokenizers(
) -> Tuple[Tuple[tokenizers.Tokenizer, Optional[Vocab]], tokenizers.Tokenizer]:
    """Build the default tokenizers for the x and y sides.

    Returns:
        A pair ``((x_tokenizer, x_vocab), y_tokenizer)``. The x side is the
        default pieced tokenizer together with a ``BasicVocab`` made of its
        word list plus ``parse_constants.ALL_SPECIALS``; the y side is a
        fresh ``AstValTokenizer``.
    """
    # Copy pasta from encdecmodel.py. bad. fix
    pieced_tokenizer, pieces = get_default_pieced_tokenizer_word_list()
    x_side = (pieced_tokenizer,
              BasicVocab(pieces + parse_constants.ALL_SPECIALS))
    y_tokenizer = AstValTokenizer()
    return x_side, y_tokenizer
def get_default_tokenizers(
    use_word_piece: bool = False
) -> Tuple[Tuple[tokenizers.Tokenizer, Optional[Vocab]], tokenizers.Tokenizer]:
    """Build the default tokenizers for the x and y sides.

    Args:
        use_word_piece: When truthy, the x side uses the default pieced
            tokenizer with a matching ``BasicVocab``. Otherwise it uses a
            ``NonLetterTokenizer`` and no vocab.

    Returns:
        A pair ``((x_tokenizer, x_vocab_or_None), y_tokenizer)`` where the
        y tokenizer is always a fresh ``AstValTokenizer``.
    """
    if use_word_piece:
        pieced_tokenizer, pieces = get_default_pieced_tokenizer_word_list()
        x_side = (pieced_tokenizer,
                  BasicVocab(pieces + parse_constants.ALL_SPECIALS))
    else:
        # No vocab accompanies the plain non-letter tokenizer.
        x_side = (NonLetterTokenizer(), None)
    return x_side, AstValTokenizer()
def create_from_save_state_dict(cls, state_dict: dict) -> 'StringQueryEncoder':
    """Rebuild a ``StringQueryEncoder`` from a previously saved state dict.

    Args:
        state_dict: Mapping with the keys ``'tokenizer'``, ``'query_vocab'``,
            ``'query_vectorizer'`` and ``'internal_encoder'``.

    Returns:
        A new ``StringQueryEncoder``. The tokenizer, vocab and vectorizer are
        reconstructed from their saved forms; the internal encoder is passed
        through exactly as stored.
    """
    restored_tokenizer = tokenizers.tokenizer_from_save_dict(
        state_dict['tokenizer'])
    restored_vocab = BasicVocab.create_from_save_state_dict(
        state_dict['query_vocab'])
    restored_vectorizer = vectorizer_from_save_dict(
        state_dict['query_vectorizer'])
    return StringQueryEncoder(
        tokenizer=restored_tokenizer,
        query_vocab=restored_vocab,
        query_vectorizer=restored_vectorizer,
        internal_encoder=state_dict['internal_encoder'],
    )
def create_from_save_state_dict(
        cls, state_dict: dict) -> 'PretrainPoweredQueryEncoder':
    """Rebuild an encoder of type ``cls`` from a previously saved state dict.

    Args:
        state_dict: Mapping with the keys ``'tokenizer'``, ``'query_vocab'``,
            ``'initial_encoder'``, ``'summary_size'`` and
            ``'other_models_state'``.

    Returns:
        A new ``cls`` instance whose ``other_models`` has had the saved
        ``'other_models_state'`` loaded into it.
    """
    restored_tokenizer = tokenizers.tokenizer_from_save_dict(
        state_dict['tokenizer'])
    restored_vocab = BasicVocab.create_from_save_state_dict(
        state_dict['query_vocab'])
    encoder = cls(
        tokenizer=restored_tokenizer,
        query_vocab=restored_vocab,
        initial_encoder=state_dict['initial_encoder'],
        summary_size=state_dict['summary_size'],
    )
    # Weights of the extra sub-models are restored after construction.
    encoder.other_models.load_state_dict(state_dict['other_models_state'])
    return encoder
"--epochs", default=1, type=int, help= "Number of epochs. Note that since pairs of are dynamically generated" "there are actually many more possible examples") parser.add_argument("--restore_from", default=None, type=str, help="A path to restore from") args = parser.parse_args() tokenizer, vocab_list = tokenizers.get_default_pieced_tokenizer_word_list() use_cuda = True vocab = BasicVocab( vocab_list + parse_constants.ALL_SPECIALS, default_device=torch.device("cuda" if use_cuda else "cpu")) dataset = CookieMonsterDataset( #["../../builtin_types/otherdata/stackexchange/unix-stackexchange/sentences.txt"], args.files, tokenizer, vocab, max_docs_to_load=9e9, use_cuda=use_cuda) if args.restore_from is None: model = make_default_cookie_monster(vocab, hidden_size_base=args.hiddensize, use_cuda=use_cuda) else: restore = torch.load(args.restore_from) model = CookieMonsterForPretraining.create_from_save_state_dict(
device=self.dataset.device) def human_test(dataset: CookieMonsterDataset, samples): rights = [] for i in range(samples): sent, gt_seq = dataset.random_sample_sentence_str() guess = input(f"TRY(1 seq/ 0 not): {sent}") right = bool(float(guess)) == gt_seq print("guess", bool(float(guess)), "actual", gt_seq, "right", right) rights.append(right) print(f"Right percent {sum(rights) / samples * 100}") if __name__ == "__main__": tokenizer, vocab = tokenizers.get_default_pieced_tokenizer_word_list() dataset = CookieMonsterDataset([ "../../../builtin_types/otherdata/stackexchange/unix-stackexchange/sentences.txt" ], tokenizer, BasicVocab(vocab + parse_constants.ALL_SPECIALS), max_docs_to_load=100) print(len(dataset)) print(dataset.random_sample()) for batch in CookieMonsterBatchIterator(dataset, batch_size=4, max_num_batches=2): print(batch) human_test(dataset, 10)