def _initialize(self, p, sentence_encoder: SequenceEncoder, min_freq: int):
    self.data = {}
    self.sentence_encoder = sentence_encoder
    jp = os.path.join(p, "lcquad_dataset.json")
    with open(jp, "r") as f:
        examples = ujson.load(f)
    examples = self.lines_to_examples(examples)

    questions, queries = tuple(zip(*examples))
    trainlen = int(round(0.8 * len(examples)))
    validlen = int(round(0.1 * len(examples)))
    testlen = int(round(0.1 * len(examples)))
    splits = ["train"] * trainlen + ["valid"] * validlen + ["test"] * testlen
    random.seed(123456)
    random.shuffle(splits)
    assert len(splits) == len(examples)

    self.query_encoder = SequenceEncoder(
        tokenizer=partial(tree_query_tokenizer, strtok=sentence_encoder.tokenizer),
        add_end_token=True)

    # build vocabularies
    for i, (question, query, split) in enumerate(zip(questions, queries, splits)):
        self.sentence_encoder.inc_build_vocab(question, seen=split == "train")
        self.query_encoder.inc_build_vocab(query, seen=split == "train")
    for word, wordid in self.sentence_encoder.vocab.D.items():
        self.query_encoder.vocab.add_token(word, seen=False)
    self.sentence_encoder.finalize_vocab(min_freq=min_freq)
    self.query_encoder.finalize_vocab(min_freq=min_freq)

    self.build_data(questions, queries, splits)
def _initialize(self, p, sentence_encoder: SequenceEncoder, min_freq: int):
    self.data = {}
    self.sentence_encoder = sentence_encoder
    trainlines = [x.strip() for x in open(os.path.join(p, "train.txt"), "r").readlines()]
    testlines = [x.strip() for x in open(os.path.join(p, "test.txt"), "r").readlines()]
    if self.cvfolds is None:
        splits = ["train"] * len(trainlines) + ["test"] * len(testlines)
    else:
        cvsplit_len = len(trainlines) / self.cvfolds
        splits = []
        for i in range(0, self.cvfolds):
            splits += [i] * round(cvsplit_len * (i + 1) - len(splits))
        random.shuffle(splits)
        splits = ["valid" if x == self.testfold else "train" for x in splits]
        splits = splits + ["test"] * len(testlines)
    questions, queries = zip(*[x.split("\t") for x in trainlines])
    testqs, testxs = zip(*[x.split("\t") for x in testlines])
    questions += testqs
    queries += testxs

    self.query_encoder = SequenceEncoder(
        tokenizer=partial(basic_query_tokenizer, strtok=sentence_encoder.tokenizer),
        add_end_token=True)

    # build vocabularies
    for i, (question, query, split) in enumerate(zip(questions, queries, splits)):
        self.sentence_encoder.inc_build_vocab(question, seen=split == "train")
        self.query_encoder.inc_build_vocab(query, seen=split == "train")
    # for word, wordid in self.sentence_encoder.vocab.D.items():
    #     self.query_encoder.vocab.add_token(word, seen=False)
    self.sentence_encoder.finalize_vocab(min_freq=min_freq, keep_rare=True)
    self.query_encoder.finalize_vocab(min_freq=min_freq)

    token_specs = self.build_token_specs(queries)
    self.token_specs = token_specs

    self.build_data(questions, queries, splits)
def _initialize(self, p, sentence_encoder: SequenceEncoder, min_freq: int):
    self.data = {}
    self.sentence_encoder = sentence_encoder
    trainlines = [x.strip() for x in open(os.path.join(p, "train.txt"), "r").readlines()]
    testlines = [x.strip() for x in open(os.path.join(p, "test.txt"), "r").readlines()]
    splits = ["train"] * len(trainlines) + ["test"] * len(testlines)
    questions, queries = zip(*[x.split("\t") for x in trainlines])
    testqs, testxs = zip(*[x.split("\t") for x in testlines])
    questions += testqs
    queries += testxs

    self.query_encoder = SequenceEncoder(
        tokenizer=partial(basic_query_tokenizer, strtok=sentence_encoder.tokenizer),
        add_end_token=True)

    # build vocabularies
    for i, (question, query, split) in enumerate(zip(questions, queries, splits)):
        self.sentence_encoder.inc_build_vocab(question, seen=split == "train")
        self.query_encoder.inc_build_vocab(query, seen=split == "train")
    # for word, wordid in self.sentence_encoder.vocab.D.items():
    #     self.query_encoder.vocab.add_token(word, seen=False)
    self.sentence_encoder.finalize_vocab(min_freq=min_freq, keep_rare=True)
    self.query_encoder.finalize_vocab(min_freq=min_freq)

    self.build_data(questions, queries, splits)
def _initialize(self, p):
    self.data = {}
    with open(os.path.join(p, "trainpreds.json")) as f:
        trainpreds = ujson.load(f)
    with open(os.path.join(p, "testpreds.json")) as f:
        testpreds = ujson.load(f)
    splits = ["train"] * len(trainpreds) + ["test"] * len(testpreds)
    preds = trainpreds + testpreds

    self.sentence_encoder = SequenceEncoder(tokenizer=lambda x: x.split())
    self.query_encoder = SequenceEncoder(tokenizer=lambda x: x.split())

    # build vocabularies
    for i, (example, split) in enumerate(zip(preds, splits)):
        self.sentence_encoder.inc_build_vocab(" ".join(example["sentence"]), seen=split == "train")
        self.query_encoder.inc_build_vocab(" ".join(example["gold"]), seen=split == "train")
        for can in example["candidates"]:
            self.query_encoder.inc_build_vocab(" ".join(can["tokens"]), seen=False)
    # for word, wordid in self.sentence_encoder.vocab.D.items():
    #     self.query_encoder.vocab.add_token(word, seen=False)
    self.sentence_encoder.finalize_vocab()
    self.query_encoder.finalize_vocab()

    self.build_data(preds, splits)
def _initialize(self, p, bert_tokenizer, min_freq: int):
    self.data = {}
    self.bert_vocab = Vocab()
    self.bert_vocab.set_dict(bert_tokenizer.vocab)
    self.sentence_encoder = SequenceEncoder(
        lambda x: bert_tokenizer.tokenize(f"[CLS] {x} [SEP]"),
        vocab=self.bert_vocab)
    # NOTE: train and test examples are both read from the train_lang file here and filtered on their "split" field
    trainlines = [x for x in ujson.load(open(os.path.join(p, f"geo-{self.train_lang}.json"), "r"))]
    testlines = [x for x in ujson.load(open(os.path.join(p, f"geo-{self.train_lang}.json"), "r"))]
    trainlines = [x for x in trainlines if x["split"] == "train"]
    testlines = [x for x in testlines if x["split"] == "test"]
    if self.cvfolds is None:
        splits = ["train"] * len(trainlines) + ["test"] * len(testlines)
    else:
        cvsplit_len = len(trainlines) / self.cvfolds
        splits = []
        for i in range(0, self.cvfolds):
            splits += [i] * round(cvsplit_len * (i + 1) - len(splits))
        random.shuffle(splits)
        splits = ["valid" if x == self.testfold else "train" for x in splits]
        splits = splits + ["test"] * len(testlines)
    questions = [x["nl"] for x in trainlines]
    queries = [x["mrl"] for x in trainlines]
    xquestions = [x["nl"] for x in testlines]
    xqueries = [x["mrl"] for x in testlines]
    questions += xquestions
    queries += xqueries

    # initialize output vocabulary
    outvocab = Vocab()
    for token, bertid in self.bert_vocab.D.items():
        outvocab.add_token(token, seen=False)

    self.query_encoder = SequenceEncoder(
        tokenizer=partial(basic_query_tokenizer, strtok=bert_tokenizer),
        vocab=outvocab,
        add_end_token=True)

    # build vocabularies
    for i, (question, query, split) in enumerate(zip(questions, queries, splits)):
        self.query_encoder.inc_build_vocab(query, seen=split == "train")
    keeptokens = set(self.bert_vocab.D.keys())
    self.query_encoder.finalize_vocab(min_freq=min_freq, keep_tokens=keeptokens)

    token_specs = self.build_token_specs(queries)
    self.token_specs = token_specs

    self.build_data(questions, queries, splits)
def _initialize(self, p, domain, sentence_encoder: SequenceEncoder, min_freq: int):
    self.data = {}
    self.sentence_encoder = sentence_encoder

    trainexamples, testexamples = None, None
    if self._usecache:
        try:
            trainexamples, testexamples = self._load_cached()
        except (IOError, ValueError) as e:
            pass

    if trainexamples is None:
        trainlines = [x.strip() for x in open(os.path.join(p, f"{domain}.paraphrases.train.examples"), "r").readlines()]
        testlines = [x.strip() for x in open(os.path.join(p, f"{domain}.paraphrases.test.examples"), "r").readlines()]
        trainexamples = self.lines_to_examples(trainlines)
        testexamples = self.lines_to_examples(testlines)
        if self._usecache:
            self._cache(trainexamples, testexamples)

    questions, queries = tuple(zip(*(trainexamples + testexamples)))
    trainlen = int(round(0.8 * len(trainexamples)))
    validlen = int(round(0.2 * len(trainexamples)))
    splits = ["train"] * trainlen + ["valid"] * validlen
    # random.seed(1223)
    random.shuffle(splits)
    assert len(splits) == len(trainexamples)
    splits = splits + ["test"] * len(testexamples)

    self.query_encoder = SequenceEncoder(
        tokenizer=partial(tree_query_tokenizer, strtok=sentence_encoder.tokenizer),
        add_end_token=True)

    # build vocabularies
    for i, (question, query, split) in enumerate(zip(questions, queries, splits)):
        self.sentence_encoder.inc_build_vocab(question, seen=split == "train")
        self.query_encoder.inc_build_vocab(query, seen=split == "train")
    for word, wordid in self.sentence_encoder.vocab.D.items():
        self.query_encoder.vocab.add_token(word, seen=False)
    self.sentence_encoder.finalize_vocab(min_freq=min_freq)
    self.query_encoder.finalize_vocab(min_freq=min_freq)

    self.build_data(questions, queries, splits)
class GeoQueryDatasetSub(GeoQueryDatasetFunQL):
    def __init__(self,
                 p="../../datasets/geo880dong/",
                 sentence_encoder: SequenceEncoder = None,
                 min_freq: int = 2,
                 **kw):
        super(GeoQueryDatasetSub, self).__init__(p, sentence_encoder, min_freq, **kw)

    def _initialize(self, p, sentence_encoder: SequenceEncoder, min_freq: int):
        self.data = {}
        self.sentence_encoder = sentence_encoder
        trainlines = [x.strip() for x in open(os.path.join(p, "train.txt"), "r").readlines()]
        testlines = [x.strip() for x in open(os.path.join(p, "test.txt"), "r").readlines()]
        splits = ["train"] * len(trainlines) + ["test"] * len(testlines)
        questions, queries = zip(*[x.split("\t") for x in trainlines])
        testqs, testxs = zip(*[x.split("\t") for x in testlines])
        questions += testqs
        queries += testxs
        queries = self.lisp2prolog(queries)

        self.query_encoder = SequenceEncoder(
            tokenizer=partial(basic_query_tokenizer, strtok=sentence_encoder.tokenizer),
            add_end_token=True)

        # build vocabularies
        for i, (question, query, split) in enumerate(zip(questions, queries, splits)):
            self.sentence_encoder.inc_build_vocab(question, seen=split == "train")
            self.query_encoder.inc_build_vocab(query, seen=split == "train")
        for word, wordid in self.sentence_encoder.vocab.D.items():
            self.query_encoder.vocab.add_token(word, seen=False)
        self.sentence_encoder.finalize_vocab(min_freq=min_freq)
        self.query_encoder.finalize_vocab(min_freq=min_freq)

        self.build_data(questions, queries, splits)

    def lisp2prolog(self, data: List[str]):
        ret = []
        for x in data:
            pas = lisp_to_pas(x)
            prolog = pas_to_prolog(pas)
            ret.append(prolog)
        return ret
def _initialize(self, p, sentence_encoder: SequenceEncoder, min_freq: int):
    self.data = {}
    self.sentence_encoder = sentence_encoder
    questions = [x.strip() for x in open(os.path.join(p, "questions.txt"), "r").readlines()]
    queries = [x.strip() for x in open(os.path.join(p, "queries.funql"), "r").readlines()]
    trainidxs = set([int(x.strip()) for x in open(os.path.join(p, "train_indexes.txt"), "r").readlines()])
    testidxs = set([int(x.strip()) for x in open(os.path.join(p, "test_indexes.txt"), "r").readlines()])
    splits = [None] * len(questions)
    for trainidx in trainidxs:
        splits[trainidx] = "train"
    for testidx in testidxs:
        splits[testidx] = "test"
    if any(split is None for split in splits):
        print(f"{len([split for split in splits if split is None])} examples not assigned to any split")

    self.query_encoder = SequenceEncoder(
        tokenizer=partial(basic_query_tokenizer, strtok=sentence_encoder.tokenizer),
        add_end_token=True)

    # build vocabularies; collect query tokens that do not occur in the corresponding question
    unktokens = set()
    for i, (question, query, split) in enumerate(zip(questions, queries, splits)):
        question_tokens = self.sentence_encoder.inc_build_vocab(question, seen=split == "train")
        query_tokens = self.query_encoder.inc_build_vocab(query, seen=split == "train")
        unktokens |= set(query_tokens) - set(question_tokens)
    for word in self.sentence_encoder.vocab.counts.keys():
        self.query_encoder.vocab.add_token(word, seen=False)
    self.sentence_encoder.finalize_vocab(min_freq=min_freq, keep_rare=True)
    self.query_encoder.finalize_vocab(min_freq=min_freq, keep_rare=True)
    unktokens = unktokens & self.query_encoder.vocab.rare_tokens

    self.build_data(questions, queries, splits, unktokens=unktokens)
def __init__(self,
             p="../../datasets/geoquery/",
             sentence_encoder: SequenceEncoder = None,
             min_freq: int = 2,
             **kw):
    super(GeoQueryDataset, self).__init__(**kw)
    self.data = {}
    self.sentence_encoder = sentence_encoder

    questions = [x.strip() for x in open(os.path.join(p, "questions.txt"), "r").readlines()]
    queries = [x.strip() for x in open(os.path.join(p, "queries.funql"), "r").readlines()]
    trainidxs = set([int(x.strip()) for x in open(os.path.join(p, "train_indexes.txt"), "r").readlines()])
    testidxs = set([int(x.strip()) for x in open(os.path.join(p, "test_indexes.txt"), "r").readlines()])
    splits = [None] * len(questions)
    for trainidx in trainidxs:
        splits[trainidx] = "train"
    for testidx in testidxs:
        splits[testidx] = "test"
    if any(split is None for split in splits):
        print(f"{len([split for split in splits if split is None])} examples not assigned to any split")

    self.query_encoder = SequenceEncoder(
        tokenizer=partial(basic_query_tokenizer, strtok=sentence_encoder.tokenizer),
        add_end_token=True)

    # build vocabularies
    for i, (question, query, split) in enumerate(zip(questions, queries, splits)):
        self.sentence_encoder.inc_build_vocab(question, seen=split == "train")
        self.query_encoder.inc_build_vocab(query, seen=split == "train")
    self.sentence_encoder.finalize_vocab(min_freq=min_freq)
    self.query_encoder.finalize_vocab(min_freq=min_freq)

    self.build_data(questions, queries, splits)
def try_dataset():
    tt = q.ticktock("dataset")
    tt.tick("building dataset")
    ds = GeoDataset(sentence_encoder=SequenceEncoder(tokenizer=lambda x: x.split()))
    train_dl = ds.dataloader("train", batsize=20)
    test_dl = ds.dataloader("test", batsize=20)
    examples = set()
    examples_list = []
    duplicates = []
    testexamples = set()
    testexamples_list = []
    testduplicates = []
    for b in train_dl:
        for i in range(len(b)):
            example = b.inp_strings[i] + " --> " + str(b.gold_trees[i])
            if example in examples:
                duplicates.append(example)
            examples.add(example)
            examples_list.append(example)
            # print(example)
    for b in test_dl:
        for i in range(len(b)):
            example = b.inp_strings[i] + " --> " + str(b.gold_trees[i])
            if example in examples:
                testduplicates.append(example)
            testexamples.add(example)
            testexamples_list.append(example)
    print(f"duplicates within train: {len(duplicates)} from {len(examples_list)} total")
    print(f"duplicates from test to train: {len(testduplicates)} from {len(testexamples_list)} total:")
    for x in testduplicates:
        print(x)
    tt.tock("dataset built")
def _initialize(self, p, xlmr, min_freq: int):
    self.data = {}
    self.xlmr = xlmr
    self.xlmr_vocab = Vocab()
    self.xlmr_vocab.set_dict(xlmr.model.decoder.dictionary.indices)
    self.sentence_encoder = SequenceEncoder(
        lambda x: f"<s> {xlmr.bpe.encode(x)} </s>".split(),
        vocab=self.xlmr_vocab)
    trainlines = [x for x in ujson.load(open(os.path.join(p, f"geo-{self.train_lang}.json"), "r"))]
    testlines = [x for x in ujson.load(open(os.path.join(p, f"geo-{self.test_lang}.json"), "r"))]
    trainlines = [x for x in trainlines if x["split"] == "train"]
    testlines = [x for x in testlines if x["split"] == "test"]
    if self.cvfolds is None:
        splits = ["train"] * len(trainlines) + ["test"] * len(testlines)
    else:
        cvsplit_len = len(trainlines) / self.cvfolds
        splits = []
        for i in range(0, self.cvfolds):
            splits += [i] * round(cvsplit_len * (i + 1) - len(splits))
        random.shuffle(splits)
        splits = ["valid" if x == self.testfold else "train" for x in splits]
        splits = splits + ["test"] * len(testlines)
    questions = [x["nl"] for x in trainlines]
    queries = [x["mrl"] for x in trainlines]
    xquestions = [x["nl"] for x in testlines]
    xqueries = [x["mrl"] for x in testlines]
    questions += xquestions
    queries += xqueries

    # initialize output vocabulary
    outvocab = Vocab()
    # for token, bertid in self.xlmr_vocab.D.items():
    #     outvocab.add_token(token, seen=False)

    self.query_encoder = SequenceEncoder(
        tokenizer=partial(basic_query_tokenizer, strtok=lambda x: xlmr.bpe.encode(x).split()),
        vocab=outvocab,
        add_end_token=True)

    # build vocabularies
    for i, (question, query, split) in enumerate(zip(questions, queries, splits)):
        question_tokens = self.sentence_encoder.convert(question, return_what="tokens")[0]
        for token in question_tokens:
            self.query_encoder.vocab.add_token(token, seen=False)
        self.query_encoder.inc_build_vocab(query, seen=split == "train")
    keeptokens = set(self.xlmr_vocab.D.keys())
    self.query_encoder.finalize_vocab(min_freq=min_freq, keep_tokens=keeptokens)

    token_specs = self.build_token_specs(queries)
    self.token_specs = token_specs

    self.build_data(questions, queries, splits)
def __init__(self,
             inp_strings: List[str] = None,
             gold_strings: List[str] = None,
             inp_tensor: torch.Tensor = None,
             gold_tensor: torch.Tensor = None,
             inp_tokens: List[List[str]] = None,
             gold_tokens: List[List[str]] = None,
             sentence_encoder: SequenceEncoder = None,
             query_encoder: SequenceEncoder = None,
             **kw):
    if inp_strings is None:
        super(BasicDecoderState, self).__init__(**kw)
    else:
        kw = kw.copy()
        kw.update({"inp_strings": np.asarray(inp_strings),
                   "gold_strings": np.asarray(gold_strings)})
        super(BasicDecoderState, self).__init__(**kw)

        self.sentence_encoder = sentence_encoder
        self.query_encoder = query_encoder

        # self.set(followed_actions_str=np.asarray([None for _ in self.inp_strings]))
        # for i in range(len(self.followed_actions_str)):
        #     self.followed_actions_str[i] = []
        self.set(followed_actions=torch.zeros(len(inp_strings), 0, dtype=torch.long))
        self.set(_is_terminated=np.asarray([False for _ in self.inp_strings]))
        self.set(_timesteps=np.asarray([0 for _ in self.inp_strings]))

        if sentence_encoder is not None:
            x = [sentence_encoder.convert(x, return_what="tensor,tokens") for x in self.inp_strings]
            x = list(zip(*x))
            inp_tokens = np.asarray([None for _ in range(len(x[1]))], dtype=np.object)
            for i, inp_tokens_e in enumerate(x[1]):
                inp_tokens[i] = tuple(inp_tokens_e)
            x = {"inp_tensor": batchstack(x[0]), "inp_tokens": inp_tokens}
            self.set(**x)
        if self.gold_strings is not None:
            if query_encoder is not None:
                x = [query_encoder.convert(x, return_what="tensor,tokens") for x in self.gold_strings]
                x = list(zip(*x))
                gold_tokens = np.asarray([None for _ in range(len(x[1]))])
                for i, gold_tokens_e in enumerate(x[1]):
                    gold_tokens[i] = tuple(gold_tokens_e)
                x = {"gold_tensor": batchstack(x[0]), "gold_tokens": gold_tokens}
                self.set(**x)
def try_perturbed_generated_dataset():
    torch.manual_seed(1234)
    ovd = OvernightDatasetLoader().load()
    govd = PCFGDataset(OvernightPCFGBuilder()
                       .build(ovd[(None, None, lambda x: x in {"train", "valid"})]
                              .map(lambda f: f[1]).examples),
                       N=10000)
    print(govd[0])
    # print(govd[lambda x: True][0])
    # print(govd[:])

    # create vocab from pcfg
    vocab = build_vocab_from_pcfg(govd._pcfg)
    seqenc = SequenceEncoder(vocab=vocab, tokenizer=tree_to_lisp_tokens)
    spanmasker = SpanMasker(seed=12345667)
    treemasker = SubtreeMasker(p=.05, seed=2345677)

    perturbed_govd = govd.cache()\
        .map(lambda x: (seqenc.convert(x, "tensor"), x)) \
        .map(lambda x: x + (seqenc.convert(x[-1], "tokens"),)) \
        .map(lambda x: x + (spanmasker(x[-1]),)) \
        .map(lambda x: x + (seqenc.convert(x[-1], "tensor"),)) \
        .map(lambda x: (x[-1], x[0]))

    dl = DataLoader(perturbed_govd, batch_size=10, shuffle=True, collate_fn=pad_and_default_collate)
    batch = next(iter(dl))
    print(batch)
    print(vocab.tostr(batch[0][1]))
    print(vocab.tostr(batch[1][1]))

    tt = q.ticktock()
    tt.tick("first run")
    for i in range(10000):
        y = perturbed_govd[i]
        if i < 10:
            print(f"{y[0]}\n{y[-2]}")
    tt.tock("first run done")

    tt.tick("second run")
    for i in range(10000):
        y = perturbed_govd[i]
        if i < 10:
            print(f"{y[0]}\n{y[-2]}")
    tt.tock("second run done")
def __init__(self, maxlen=10, NperY=10, **kw):
    super(ConditionalRecallDataset, self).__init__(**kw)
    self.data = {}
    self.NperY, self.maxlen = NperY, maxlen
    self._seqs, self._ys = gen_data(self.maxlen, self.NperY)

    self.encoder = SequenceEncoder(tokenizer=lambda x: list(x))
    for seq, y in zip(self._seqs, self._ys):
        self.encoder.inc_build_vocab(seq)
        self.encoder.inc_build_vocab(y)

    self.N = len(self._seqs)
    N = self.N
    splits = ["train"] * int(N * 0.8) + ["valid"] * int(N * 0.1) + ["test"] * int(N * 0.1)
    random.shuffle(splits)

    self.encoder.finalize_vocab()
    self.build_data(self._seqs, self._ys, splits)
def test_beam_transition(self):
    texts = ["i went to chocolate @END@", "awesome is @END@", "the meaning of life @END@"]
    from parseq.vocab import SequenceEncoder
    se = SequenceEncoder(tokenizer=lambda x: x.split())
    for t in texts:
        se.inc_build_vocab(t)
    se.finalize_vocab()
    x = BasicDecoderState(texts, texts, se, se)
    x.start_decoding()

    class Model(TransitionModel):
        def forward(self, x: BasicDecoderState):
            outprobs = torch.randn(len(x), x.query_encoder.vocab.number_of_ids())
            outprobs = torch.nn.functional.log_softmax(outprobs, -1)
            return outprobs, x

    model = Model()
    beamsize = 50
    maxtime = 10
    beam_xs = [x.make_copy(detach=False, deep=True) for _ in range(beamsize)]
    beam_states = BeamState(beam_xs)
    print(len(beam_xs))
    print(len(beam_states))

    bt = BeamTransition(model, beamsize, maxtime=maxtime)
    i = 0
    _, _, y, _ = bt(x, i)
    i += 1
    _, _, y, _ = bt(y, i)

    all_terminated = y.all_terminated()
    while not all_terminated:
        _, predactions, y, all_terminated = bt(y, i)
        i += 1

    print("timesteps done:")
    print(i)
    print(y)
    print(predactions[0])
    for i in range(beamsize):
        print("-")
        # print(y.bstates[0].get(i).followed_actions)
        # print(predactions[0, i, :])
        pa = predactions[0, i, :]
        # print((pa == se.vocab[se.vocab.endtoken]).cumsum(0))
        pa = ((pa == se.vocab[se.vocab.endtoken]).long().cumsum(0) < 1).long() * pa
        yb = y.bstates[0].get(i).followed_actions[0, :]
        yb = yb * (yb != se.vocab[se.vocab.endtoken]).long()
        print(pa)
        print(yb)
        self.assertTrue(torch.allclose(pa, yb))
def test_beam_search(self):
    texts = ["i went to chocolate @END@", "awesome is @END@", "the meaning of life @END@"]
    from parseq.vocab import SequenceEncoder
    se = SequenceEncoder(tokenizer=lambda x: x.split())
    for t in texts:
        se.inc_build_vocab(t)
    se.finalize_vocab()
    x = BasicDecoderState(texts, texts, se, se)
    x.start_decoding()

    class Model(TransitionModel):
        def forward(self, x: BasicDecoderState):
            outprobs = torch.randn(len(x), x.query_encoder.vocab.number_of_ids())
            outprobs = torch.nn.functional.log_softmax(outprobs, -1)
            return outprobs, x

    model = Model()
    beamsize = 50
    maxtime = 10
    bs = BeamDecoder(model,
                     eval=[CELoss(ignore_index=0), SeqAccuracies()],
                     eval_beam=[BeamSeqAccuracies()],
                     beamsize=beamsize,
                     maxtime=maxtime)

    y = bs(x)
    print(y)
def test_free_decoder(self):
    texts = [
        "i went to chocolate a b c d e f g h i j k l m n o p q r @END@",
        "awesome is @END@",
        "the meaning of life @END@"
    ]
    se = SequenceEncoder(tokenizer=lambda x: x.split())
    for t in texts:
        se.inc_build_vocab(t)
    se.finalize_vocab()
    texts = ["@END@"] * 100
    x = BasicDecoderState(texts, texts, se, se)

    class Model(TransitionModel):
        def forward(self, x: BasicDecoderState):
            outprobs = torch.rand(len(x), x.query_encoder.vocab.number_of_ids())
            return outprobs, x

    MAXTIME = 10
    dec = SeqDecoder(FreerunningTransition(Model(), maxtime=MAXTIME))
    y = dec(x)

    print(y[1].followed_actions)
    print(max([len(y[1].followed_actions[i]) for i in range(len(y[1]))]))
    print(min([len(y[1].followed_actions[i]) for i in range(len(y[1]))]))
    self.assertTrue(max([len(y[1].followed_actions[i]) for i in range(len(y[1]))]) <= MAXTIME + 1)
def load_ds(domain="restaurants",
            min_freq=0,
            top_k=np.infty,
            nl_mode="bart-large",
            trainonvalid=False):
    ds = OvernightDatasetLoader(simplify_mode="light").load(domain=domain, trainonvalid=trainonvalid)

    seqenc_vocab = Vocab(padid=1, startid=0, endid=2, unkid=UNKID)
    seqenc = SequenceEncoder(vocab=seqenc_vocab,
                             tokenizer=tree_to_lisp_tokens,
                             add_start_token=True,
                             add_end_token=True)
    for example in ds.examples:
        query = example[1]
        seqenc.inc_build_vocab(query, seen=example[2] == "train")
    seqenc.finalize_vocab(min_freq=min_freq, top_k=top_k)

    nl_tokenizer = AutoTokenizer.from_pretrained(nl_mode)

    def tokenize(x):
        ret = (nl_tokenizer.encode(x[0], return_tensors="pt")[0],
               seqenc.convert(x[1], return_what="tensor"),
               x[2], x[0], x[1])
        return ret

    tds, vds, xds = ds[(None, None, "train")].map(tokenize), \
                    ds[(None, None, "valid")].map(tokenize), \
                    ds[(None, None, "test")].map(tokenize)
    return tds, vds, xds, nl_tokenizer, seqenc
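# Minimal usage sketch for load_ds() above (hypothetical invocation, not part of the original
# code): it assumes the Overnight data for the chosen domain is available locally and that the
# module-level imports used by load_ds() are in scope. The unpacked fields follow the tuple
# returned by tokenize().
if __name__ == "__main__":
    tds, vds, xds, nl_tokenizer, seqenc = load_ds(domain="restaurants", nl_mode="bart-large")
    nl_ids, lf_tensor, split, nl_str, lf_tree = tds[0]
    print(nl_str)                        # natural-language input string
    print(nl_tokenizer.decode(nl_ids))   # round-trip through the pretrained tokenizer
    print(lf_tensor)                     # encoded logical form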
def test_tf_decoder(self):
    texts = ["i went to chocolate @END@", "awesome is @END@", "the meaning of life @END@"]
    se = SequenceEncoder(tokenizer=lambda x: x.split())
    for t in texts:
        se.inc_build_vocab(t)
    se.finalize_vocab()
    x = BasicDecoderState(texts, texts, se, se)

    class Model(TransitionModel):
        def forward(self, x: BasicDecoderState):
            outprobs = torch.rand(len(x), x.query_encoder.vocab.number_of_ids())
            return outprobs, x

    dec = SeqDecoder(TFTransition(Model()))
    y = dec(x)

    print(y[1].followed_actions)
    outactions = y[1].followed_actions.detach().cpu().numpy()
    print(outactions[0])
    print(se.vocab.print(outactions[0]))
    print(se.vocab.print(outactions[1]))
    print(se.vocab.print(outactions[2]))
    self.assertTrue(se.vocab.print(outactions[0]) == texts[0])
    self.assertTrue(se.vocab.print(outactions[1]) == texts[1])
    self.assertTrue(se.vocab.print(outactions[2]) == texts[2])
class ConditionalRecallDataset(object):
    def __init__(self, maxlen=10, NperY=10, **kw):
        super(ConditionalRecallDataset, self).__init__(**kw)
        self.data = {}
        self.NperY, self.maxlen = NperY, maxlen
        self._seqs, self._ys = gen_data(self.maxlen, self.NperY)

        self.encoder = SequenceEncoder(tokenizer=lambda x: list(x))
        for seq, y in zip(self._seqs, self._ys):
            self.encoder.inc_build_vocab(seq)
            self.encoder.inc_build_vocab(y)

        self.N = len(self._seqs)
        N = self.N
        splits = ["train"] * int(N * 0.8) + ["valid"] * int(N * 0.1) + ["test"] * int(N * 0.1)
        random.shuffle(splits)

        self.encoder.finalize_vocab()
        self.build_data(self._seqs, self._ys, splits)

    def build_data(self, seqs, ys, splits):
        for seq, y, split in zip(seqs, ys, splits):
            seq_tensor = self.encoder.convert(seq, return_what="tensor")
            y_tensor = self.encoder.convert(y, return_what="tensor")
            if split not in self.data:
                self.data[split] = []
            self.data[split].append((seq_tensor[0], y_tensor[0][0]))

    def get_split(self, split: str):
        return DatasetSplitProxy(self.data[split])

    def dataloader(self, split: str = None, batsize: int = 5, shuffle=None):
        if split is None:  # return all splits
            ret = {}
            for split in self.data.keys():
                ret[split] = self.dataloader(batsize=batsize, split=split, shuffle=shuffle)
            return ret
        else:
            assert split in self.data.keys()
            shuffle = shuffle if shuffle is not None else split in ("train", "train+valid")
            dl = DataLoader(self.get_split(split), batch_size=batsize, shuffle=shuffle)
            return dl
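# Hedged usage sketch for ConditionalRecallDataset above (illustrative only; gen_data() and
# DatasetSplitProxy are assumed to be defined elsewhere in this module). batsize=1 is used here
# so the default collate needs no padding of variable-length sequences.
if __name__ == "__main__":
    ds = ConditionalRecallDataset(maxlen=10, NperY=10)
    train_dl = ds.dataloader("train", batsize=1)
    seq_tensor, y_id = next(iter(train_dl))
    print(seq_tensor, y_id)   # encoded input sequence and the id of its target symbol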
def test_beam_search_vs_greedy(self):
    with torch.no_grad():
        texts = ["a b"] * 10
        from parseq.vocab import SequenceEncoder
        se = SequenceEncoder(tokenizer=lambda x: x.split())
        for t in texts:
            se.inc_build_vocab(t)
        se.finalize_vocab()
        x = BasicDecoderState(texts, texts, se, se)
        x.start_decoding()

        class Model(TransitionModel):
            transition_tensor = torch.tensor([[0, 0, 0, 0, .51, .49],
                                              [0, 0, 0, 0, .51, .49],
                                              [0, 0, 0, 0, .51, .49],
                                              [0, 0, 0, 0, .51, .49],
                                              [0, 0, 0, 0, .51, .49],
                                              [0, 0, 0, 0, .01, .99]])

            def forward(self, x: BasicDecoderState):
                prev = x.prev_actions
                outprobs = self.transition_tensor[prev]
                outprobs = torch.log(outprobs)
                return outprobs, x

        model = Model()
        beamsize = 50
        maxtime = 10
        beam_xs = [x.make_copy(detach=False, deep=True) for _ in range(beamsize)]
        beam_states = BeamState(beam_xs)
        print(len(beam_xs))
        print(len(beam_states))

        bt = BeamTransition(model, beamsize, maxtime=maxtime)
        i = 0
        _, _, y, _ = bt(x, i)
        i += 1
        _, _, y, _ = bt(y, i)

        all_terminated = y.all_terminated()
        while not all_terminated:
            start_time = time.time()
            _, _, y, all_terminated = bt(y, i)
            i += 1
            # print(i)
            end_time = time.time()
            print(f"{i}: {end_time - start_time}")

        print(y)
        print(y.bstates.get(0).followed_actions)
def try_tokenizer_dataset():
    from transformers import BartTokenizer

    ovd = OvernightDatasetLoader().load()
    seqenc = SequenceEncoder(tokenizer=tree_to_lisp_tokens)
    for example in ovd.examples:
        query = example[1]
        seqenc.inc_build_vocab(query, seen=example[2] == "train")
    seqenc.finalize_vocab()
    nl_tokenizer = BartTokenizer.from_pretrained("bart-large")

    def tokenize(x):
        ret = [xe for xe in x]
        ret.append(nl_tokenizer.tokenize(ret[0]))
        ret.append(nl_tokenizer.encode(ret[0], return_tensors="pt"))
        ret.append(seqenc.convert(ret[1], return_what="tensor")[0][None])
        return ret

    ovd = ovd.map(tokenize)
    print(ovd[0])
def test_decoder_API(self):
    texts = ["i went to chocolate", "awesome is", "the meaning of life"]
    se = SequenceEncoder(tokenizer=lambda x: x.split())
    for t in texts:
        se.inc_build_vocab(t)
    se.finalize_vocab()
    x = BasicDecoderState(texts, texts, se, se)
    print(x.inp_tensor)

    print("terminated")
    print(x.is_terminated())
    print(x.all_terminated())

    print("prev_actions")
    x.start_decoding()
    print(x.prev_actions)

    print("step")
    x.step(["i", torch.tensor([7]), "the"])
    print(x.prev_actions)
    print(x.followed_actions)
def test_tf_decoder_with_losses_with_gold(self):
    texts = ["i went to chocolate @END@", "awesome is @END@", "the meaning of life @END@"]
    se = SequenceEncoder(tokenizer=lambda x: x.split())
    for t in texts:
        se.inc_build_vocab(t)
    se.finalize_vocab()
    x = BasicDecoderState(texts, texts, se, se)

    class Model(TransitionModel):
        def forward(self, x: BasicDecoderState):
            # put all probability mass on the gold token for the current timestep
            outprobs = torch.zeros(len(x), x.query_encoder.vocab.number_of_ids())
            golds = x.get_gold().gather(1, torch.tensor(x._timesteps).to(torch.long)[:, None])
            outprobs.scatter_(1, golds, 1)
            return outprobs, x

    celoss = CELoss(ignore_index=0)
    accs = SeqAccuracies()
    dec = SeqDecoder(TFTransition(Model()), eval=[celoss, accs])
    y = dec(x)

    print(y[0])
    print(y[1].followed_actions)
    print(y[1].get_gold())
    self.assertEqual(y[0]["seq_acc"], 1)
    self.assertEqual(y[0]["elem_acc"], 1)

    # print(y[1].followed_actions)
    outactions = y[1].followed_actions.detach().cpu().numpy()
    # print(outactions[0])
    # print(se.vocab.print(outactions[0]))
    # print(se.vocab.print(outactions[1]))
    # print(se.vocab.print(outactions[2]))
    self.assertTrue(se.vocab.print(outactions[0]) == texts[0])
    self.assertTrue(se.vocab.print(outactions[1]) == texts[1])
    self.assertTrue(se.vocab.print(outactions[2]) == texts[2])
def try_dataset():
    tt = q.ticktock("dataset")
    tt.tick("building dataset")
    ds = GeoQueryDatasetFunQL(sentence_encoder=SequenceEncoder(tokenizer=lambda x: x.split()))
    train_dl = ds.dataloader("train", batsize=19)
    test_dl = ds.dataloader("test", batsize=20)
    examples = set()
    examples_list = []
    duplicates = []
    for b in train_dl:
        print(len(b))
        for i in range(len(b)):
            example = b.inp_strings[i] + " --> " + b.gold_strings[i]
            if example in examples:
                duplicates.append(example)
            examples.add(example)
            examples_list.append(example)
            # print(example)
        pass
    print(f"duplicates within train: {len(duplicates)} from {len(examples_list)} total")
    tt.tock("dataset built")
def test_create(self):
    se = SequenceEncoder(tokenizer=lambda x: x.split())
    texts = ["i went to chocolate", "awesome is @PAD@ @PAD@", "the meaning of life"]
    for t in texts:
        se.inc_build_vocab(t)
    se.finalize_vocab()
    x = [BasicDecoderState([t], [t], se, se) for t in texts]
    merged_x = x[0].merge(x)

    texts = ["i went to chocolate", "awesome is", "the meaning of life"]
    batch_x = BasicDecoderState(texts, texts, se, se)

    print(merged_x.inp_tensor)
    print(batch_x.inp_tensor)
    self.assertTrue(torch.allclose(merged_x.inp_tensor, batch_x.inp_tensor))
    self.assertTrue(torch.allclose(merged_x.gold_tensor, batch_x.gold_tensor))
def test_tf_decoder_with_losses(self):
    texts = ["i went to chocolate @END@", "awesome is @END@", "the meaning of life @END@"]
    se = SequenceEncoder(tokenizer=lambda x: x.split())
    for t in texts:
        se.inc_build_vocab(t)
    se.finalize_vocab()
    x = BasicDecoderState(texts, texts, se, se)

    class Model(TransitionModel):
        def forward(self, x: BasicDecoderState):
            outprobs = torch.rand(len(x), x.query_encoder.vocab.number_of_ids())
            outprobs = torch.nn.functional.log_softmax(outprobs, -1)
            return outprobs, x

    celoss = CELoss(ignore_index=0)
    accs = SeqAccuracies()
    dec = SeqDecoder(TFTransition(Model()), eval=[celoss, accs])
    y = dec(x)

    print(y[0])
    print(y[1].followed_actions)
    print(y[1].get_gold())
    # print(y[1].followed_actions)
    outactions = y[1].followed_actions.detach().cpu().numpy()
    # print(outactions[0])
    # print(se.vocab.print(outactions[0]))
    # print(se.vocab.print(outactions[1]))
    # print(se.vocab.print(outactions[2]))
    self.assertTrue(se.vocab.print(outactions[0]) == texts[0])
    self.assertTrue(se.vocab.print(outactions[1]) == texts[1])
    self.assertTrue(se.vocab.print(outactions[2]) == texts[2])
class GeoDatasetRank(object):
    def __init__(self, p="geoquery_gen/run4/", min_freq: int = 2, splits=None, **kw):
        super(GeoDatasetRank, self).__init__(**kw)
        self._initialize(p)
        self.splits_proportions = splits

    def _initialize(self, p):
        self.data = {}
        with open(os.path.join(p, "trainpreds.json")) as f:
            trainpreds = ujson.load(f)
        with open(os.path.join(p, "testpreds.json")) as f:
            testpreds = ujson.load(f)
        splits = ["train"] * len(trainpreds) + ["test"] * len(testpreds)
        preds = trainpreds + testpreds

        self.sentence_encoder = SequenceEncoder(tokenizer=lambda x: x.split())
        self.query_encoder = SequenceEncoder(tokenizer=lambda x: x.split())

        # build vocabularies
        for i, (example, split) in enumerate(zip(preds, splits)):
            self.sentence_encoder.inc_build_vocab(" ".join(example["sentence"]), seen=split == "train")
            self.query_encoder.inc_build_vocab(" ".join(example["gold"]), seen=split == "train")
            for can in example["candidates"]:
                self.query_encoder.inc_build_vocab(" ".join(can["tokens"]), seen=False)
        # for word, wordid in self.sentence_encoder.vocab.D.items():
        #     self.query_encoder.vocab.add_token(word, seen=False)
        self.sentence_encoder.finalize_vocab()
        self.query_encoder.finalize_vocab()

        self.build_data(preds, splits)

    def build_data(self, examples: Iterable[dict], splits: Iterable[str]):
        maxlen_in, maxlen_out = 0, 0
        for example, split in zip(examples, splits):
            inp, out = " ".join(example["sentence"]), " ".join(example["gold"])

            inp_tensor, inp_tokens = self.sentence_encoder.convert(inp, return_what="tensor,tokens")
            gold_tree = lisp_to_tree(" ".join(example["gold"][:-1]))
            if not isinstance(gold_tree, Tree):
                assert gold_tree is not None
            gold_tensor, gold_tokens = self.query_encoder.convert(out, return_what="tensor,tokens")

            candidate_tensors, candidate_tokens, candidate_align_tensors = [], [], []
            candidate_align_entropies = []
            candidate_trees = []
            candidate_same = []
            for cand in example["candidates"]:
                cand_tree, _ = lisp_to_tree(" ".join(cand["tokens"][:-1]), None)
                if cand_tree is None:
                    cand_tree = Tree("@UNK@", [])
                assert cand_tree is not None
                cand_tensor, cand_tokens = self.query_encoder.convert(" ".join(cand["tokens"]), return_what="tensor,tokens")
                candidate_tensors.append(cand_tensor)
                candidate_tokens.append(cand_tokens)
                candidate_align_tensors.append(torch.tensor(cand["alignments"]))
                candidate_align_entropies.append(torch.tensor(cand["align_entropies"]))
                candidate_trees.append(cand_tree)
                candidate_same.append(are_equal_trees(cand_tree, gold_tree,
                                                      orderless={"and", "or"},
                                                      unktoken="@NOUNKTOKENHERE@"))

            candidate_tensor = torch.stack(q.pad_tensors(candidate_tensors, 0), 0)
            candidate_align_tensor = torch.stack(q.pad_tensors(candidate_align_tensors, 0), 0)
            candidate_align_entropy = torch.stack(q.pad_tensors(candidate_align_entropies, 0), 0)
            candidate_same = torch.tensor(candidate_same)

            state = RankState(inp_tensor[None, :],
                              gold_tensor[None, :],
                              candidate_tensor[None, :, :],
                              candidate_same[None, :],
                              candidate_align_tensor[None, :],
                              candidate_align_entropy[None, :],
                              self.sentence_encoder.vocab,
                              self.query_encoder.vocab,
                              )
            if split not in self.data:
                self.data[split] = []
            self.data[split].append(state)
            maxlen_in = max(maxlen_in, len(inp_tokens))
            maxlen_out = max(maxlen_out, candidate_tensor.size(-1), gold_tensor.size(-1))
        self.maxlen_input = maxlen_in
        self.maxlen_output = maxlen_out

    def get_split(self, split: str):
        return DatasetSplitProxy(self.data[split])

    @staticmethod
    def collate_fn(data: Iterable):
        goldmaxlen = 0
        inpmaxlen = 0
        data = [state.make_copy(detach=True, deep=True) for state in data]
        for state in data:
            goldmaxlen = max(goldmaxlen, state.gold_tensor.size(1))
            inpmaxlen = max(inpmaxlen, state.inp_tensor.size(1))
            goldmaxlen = max(goldmaxlen, state.candtensors.size(-1))
        inp_tensors = q.pad_tensors([state.inp_tensor for state in data], 1, 0)
        gold_tensors = q.pad_tensors([state.gold_tensor for state in data], 1, 0)
        candtensors = q.pad_tensors([state.candtensors for state in data], 2, 0)
        alignments = q.pad_tensors([state.alignments for state in data], 2, 0)
        alignment_entropies = q.pad_tensors([state.alignment_entropies for state in data], 2, 0)
        for i, state in enumerate(data):
            state.inp_tensor = inp_tensors[i]
            state.gold_tensor = gold_tensors[i]
            state.candtensors = candtensors[i]
            state.alignments = alignments[i]
            state.alignment_entropies = alignment_entropies[i]
        ret = data[0].merge(data)
        return ret

    def dataloader(self, split: str = None, batsize: int = 5, shuffle=None):
        if split is None:  # return all splits
            ret = {}
            for split in self.data.keys():
                ret[split] = self.dataloader(batsize=batsize, split=split, shuffle=shuffle)
            return ret
        else:
            assert split in self.data.keys()
            shuffle = shuffle if shuffle is not None else split in ("train", "train+valid")
            dl = DataLoader(self.get_split(split),
                            batch_size=batsize,
                            shuffle=shuffle,
                            collate_fn=type(self).collate_fn)
            return dl
def run(lr=0.001,
        batsize=50,
        epochs=50,
        embdim=100,
        encdim=100,
        numlayers=1,
        beamsize=1,
        dropout=.2,
        wreg=1e-10,
        cuda=False,
        gpu=0,
        minfreq=3,
        gradnorm=3.,
        cosine_restarts=1.,
        beta=0.001,
        vib_init=True,
        vib_enc=True,
        ):
    localargs = locals().copy()
    print(locals())
    tt = q.ticktock("script")
    device = torch.device("cpu") if not cuda else torch.device("cuda", gpu)
    tt.tick("loading data")
    ds = LCQuaDnoENTDataset(sentence_encoder=SequenceEncoder(tokenizer=split_tokenizer),
                            min_freq=minfreq)
    print(f"max lens: {ds.maxlen_input} (input) and {ds.maxlen_output} (output)")
    tt.tock("data loaded")

    do_rare_stats(ds)
    # batch = next(iter(train_dl))
    # print(batch)
    # print("input graph")
    # print(batch.batched_states)

    model = BasicGenModel_VIB(embdim=embdim, hdim=encdim, dropout=dropout, numlayers=numlayers,
                              sentence_encoder=ds.sentence_encoder, query_encoder=ds.query_encoder,
                              feedatt=True, vib_init=vib_init, vib_enc=vib_enc)

    # sentence_rare_tokens = set([ds.sentence_encoder.vocab(i) for i in model.inp_emb.rare_token_ids])
    # do_rare_stats(ds, sentence_rare_tokens=sentence_rare_tokens)

    losses = [CELoss(ignore_index=0, mode="logprobs")]
    if vib_init:
        losses.append(StatePenalty(lambda state: sum(state.mstate.vib.init), weight=beta))
    if vib_enc:
        losses.append(StatePenalty("mstate.vib.enc", weight=beta))

    tfdecoder = SeqDecoder(model, tf_ratio=1.,
                           eval=losses + [SeqAccuracies(),
                                          TreeAccuracy(tensor2tree=partial(tensor2tree, D=ds.query_encoder.vocab),
                                                       orderless={"select", "count", "ask"})])
    # beamdecoder = BeamActionSeqDecoder(tfdecoder.model, beamsize=beamsize, maxsteps=50)
    if beamsize == 1:
        freedecoder = SeqDecoder(model, maxtime=40, tf_ratio=0.,
                                 eval=[SeqAccuracies(),
                                       TreeAccuracy(tensor2tree=partial(tensor2tree, D=ds.query_encoder.vocab),
                                                    orderless={"select", "count", "ask"})])
    else:
        freedecoder = BeamDecoder(model, maxtime=30, beamsize=beamsize,
                                  eval=[SeqAccuracies(),
                                        TreeAccuracy(tensor2tree=partial(tensor2tree, D=ds.query_encoder.vocab),
                                                     orderless={"select", "count", "ask"})])

    # # test
    # tt.tick("doing one epoch")
    # for batch in iter(train_dl):
    #     batch = batch.to(device)
    #     ttt.tick("start batch")
    #     # with torch.no_grad():
    #     out = tfdecoder(batch)
    #     ttt.tock("end batch")
    # tt.tock("done one epoch")
    # print(out)
    # sys.exit()

    # beamdecoder(next(iter(train_dl)))

    # print(dict(tfdecoder.named_parameters()).keys())

    losses = make_array_of_metrics("loss", "elem_acc", "seq_acc", "tree_acc")
    vlosses = make_array_of_metrics("seq_acc", "tree_acc")
    # if beamsize >= 3:
    #     vlosses = make_loss_array("seq_acc", "tree_acc", "tree_acc_at3", "tree_acc_at_last")
    # else:
    #     vlosses = make_loss_array("seq_acc", "tree_acc", "tree_acc_at_last")

    # trainable_params = tfdecoder.named_parameters()
    # exclude_params = set()
    # exclude_params.add("model.model.inp_emb.emb.weight")   # don't train input embeddings if doing glove
    # trainable_params = [v for k, v in trainable_params if k not in exclude_params]

    # 4. define optim
    # optim = torch.optim.Adam(trainable_params, lr=lr, weight_decay=wreg)
    optim = torch.optim.Adam(tfdecoder.parameters(), lr=lr, weight_decay=wreg)

    # lr schedule
    if cosine_restarts >= 0:
        # t_max = epochs * len(train_dl)
        t_max = epochs
        print(f"Total number of updates: {t_max}")
        lr_schedule = q.WarmupCosineWithHardRestartsSchedule(optim, 0, t_max, cycles=cosine_restarts)
        reduce_lr = [lambda: lr_schedule.step()]
    else:
        reduce_lr = []

    # 6. define training function
    clipgradnorm = lambda: torch.nn.utils.clip_grad_norm_(tfdecoder.parameters(), gradnorm)
    # clipgradnorm = lambda: None
    trainbatch = partial(q.train_batch, on_before_optim_step=[clipgradnorm])
    trainepoch = partial(q.train_epoch, model=tfdecoder,
                         dataloader=ds.dataloader("train", batsize),
                         optim=optim,
                         losses=losses,
                         _train_batch=trainbatch,
                         device=device,
                         on_end=reduce_lr)

    # 7. define validation function (using partial)
    validepoch = partial(q.test_epoch, model=freedecoder,
                         dataloader=ds.dataloader("test", batsize),
                         losses=vlosses,
                         device=device)
    # validepoch = partial(q.test_epoch, model=freedecoder, dataloader=valid_dl, losses=vlosses, device=device)

    # p = q.save_run(freedecoder, localargs, filepath=__file__)
    # q.save_dataset(ds, p)
    # _freedecoder, _localargs = q.load_run(p)
    # _ds = q.load_dataset(p)
    # sys.exit()

    # 7. run training
    tt.tick("training")
    q.run_training(run_train_epoch=trainepoch, run_valid_epoch=validepoch, max_epochs=epochs)
    tt.tock("done training")

    # testing
    tt.tick("testing")
    testresults = q.test_epoch(model=freedecoder, dataloader=ds.dataloader("valid", batsize),
                               losses=vlosses, device=device)
    print("validation test results: ", testresults)
    tt.tock("tested")
    tt.tick("testing")
    testresults = q.test_epoch(model=freedecoder, dataloader=ds.dataloader("test", batsize),
                               losses=vlosses, device=device)
    print("test results: ", testresults)
    tt.tock("tested")

    # save model?
    tosave = input("Save this model? 'y(es)'=Yes, <int>=overwrite previous, otherwise=No) \n>")
    if tosave.lower() == "y" or tosave.lower() == "yes" or re.match(r"\d+", tosave.lower()):
        overwrite = int(tosave) if re.match(r"\d+", tosave) else None
        p = q.save_run(model, localargs, filepath=__file__, overwrite=overwrite)
        q.save_dataset(ds, p)
        _model, _localargs = q.load_run(p)
        _ds = q.load_dataset(p)

        _freedecoder = BeamDecoder(_model, maxtime=50, beamsize=beamsize,
                                   eval_beam=[TreeAccuracy(tensor2tree=partial(tensor2tree, D=ds.query_encoder.vocab),
                                                           orderless={"op:and", "SW:concat"})])

        # testing
        tt.tick("testing reloaded")
        _testresults = q.test_epoch(model=_freedecoder, dataloader=_ds.dataloader("test", batsize),
                                    losses=vlosses, device=device)
        print(_testresults)
        assert testresults == _testresults
        tt.tock("tested")
class LCQuaDnoENTDataset(object):
    def __init__(self,
                 p="../../datasets/lcquad/",
                 sentence_encoder: SequenceEncoder = None,
                 min_freq: int = 2,
                 splits=None,
                 **kw):
        super(LCQuaDnoENTDataset, self).__init__(**kw)
        self._simplify_filters = True    # if True, filter expressions are converted to orderless and-expressions
        self._initialize(p, sentence_encoder, min_freq)
        self.splits_proportions = splits

    def lines_to_examples(self, lines: List[str]):
        maxsize_before = 0
        avgsize_before = []
        maxsize_after = 0
        avgsize_after = []
        afterstring = set()

        def convert_to_lispstr(_x):
            splits = _x.split()
            assert sum([1 if xe == "~" else 0 for xe in splits]) == 1
            assert splits[1] == "~"
            splits = ["," if xe == "&" else xe for xe in splits]
            pstr = f"{splits[0]} ({' '.join(splits[2:])})"
            return pstr

        ret = []
        ltp = None
        j = 0
        for i, line in enumerate(lines):
            question = line["question"]
            query = line["logical_form"]
            query = convert_to_lispstr(query)
            z, ltp = prolog_to_pas(query, ltp)
            if z is not None:
                ztree = pas_to_tree(z)
                maxsize_before = max(maxsize_before, tree_size(ztree))
                avgsize_before.append(tree_size(ztree))
                lf = ztree
                ret.append((question, lf))
                # print(f"Example {j}:")
                # print(ret[-1][0])
                # print(ret[-1][1])
                # print()
                ltp = None
                maxsize_after = max(maxsize_after, tree_size(lf))
                avgsize_after.append(tree_size(lf))
                j += 1

        avgsize_before = sum(avgsize_before) / len(avgsize_before)
        avgsize_after = sum(avgsize_after) / len(avgsize_after)

        print(f"Sizes ({j} examples):")
        # print(f"\t Max, Avg size before: {maxsize_before}, {avgsize_before}")
        print(f"\t Max, Avg size: {maxsize_after}, {avgsize_after}")

        return ret

    def _initialize(self, p, sentence_encoder: SequenceEncoder, min_freq: int):
        self.data = {}
        self.sentence_encoder = sentence_encoder

        jp = os.path.join(p, "lcquad_dataset.json")
        with open(jp, "r") as f:
            examples = ujson.load(f)
        examples = self.lines_to_examples(examples)

        questions, queries = tuple(zip(*examples))
        trainlen = int(round(0.8 * len(examples)))
        validlen = int(round(0.1 * len(examples)))
        testlen = int(round(0.1 * len(examples)))
        splits = ["train"] * trainlen + ["valid"] * validlen + ["test"] * testlen
        random.seed(1337)
        random.shuffle(splits)
        assert len(splits) == len(examples)

        self.query_encoder = SequenceEncoder(
            tokenizer=partial(tree_query_tokenizer, strtok=sentence_encoder.tokenizer),
            add_end_token=True)

        # build vocabularies
        for i, (question, query, split) in enumerate(zip(questions, queries, splits)):
            self.sentence_encoder.inc_build_vocab(question, seen=split == "train")
            self.query_encoder.inc_build_vocab(query, seen=split == "train")
        for word, wordid in self.sentence_encoder.vocab.D.items():
            self.query_encoder.vocab.add_token(word, seen=False)
        self.sentence_encoder.finalize_vocab(min_freq=min_freq)
        self.query_encoder.finalize_vocab(min_freq=min_freq)

        self.build_data(questions, queries, splits)

    def build_data(self, inputs: Iterable[str], outputs: Iterable[str], splits: Iterable[str]):
        maxlen_in, maxlen_out = 0, 0
        eid = 0

        # map rare output tokens that don't also occur on the input side to the unk token
        gold_map = torch.arange(0, self.query_encoder.vocab.number_of_ids(last_nonrare=False))
        rare_tokens = self.query_encoder.vocab.rare_tokens - set(self.sentence_encoder.vocab.D.keys())
        for rare_token in rare_tokens:
            gold_map[self.query_encoder.vocab[rare_token]] = \
                self.query_encoder.vocab[self.query_encoder.vocab.unktoken]

        for inp, out, split in zip(inputs, outputs, splits):
            inp_tensor, inp_tokens = self.sentence_encoder.convert(inp, return_what="tensor,tokens")
            out_tensor, out_tokens = self.query_encoder.convert(out, return_what="tensor,tokens")
            out_tensor = gold_map[out_tensor]

            state = TreeDecoderState([inp], [out],
                                     inp_tensor[None, :], out_tensor[None, :],
                                     [inp_tokens], [out_tokens],
                                     self.sentence_encoder.vocab, self.query_encoder.vocab)
            state.eids = np.asarray([eid], dtype="int64")
            maxlen_in, maxlen_out = max(maxlen_in, len(state.inp_tokens[0])), max(maxlen_out, len(state.gold_tokens[0]))
            if split not in self.data:
                self.data[split] = []
            self.data[split].append(state)
            eid += 1
        self.maxlen_input, self.maxlen_output = maxlen_in, maxlen_out

    def get_split(self, split: str):
        splits = split.split("+")
        data = []
        for split in splits:
            data += self.data[split]
        return DatasetSplitProxy(data)

    @staticmethod
    def collate_fn(data: Iterable):
        goldmaxlen = 0
        inpmaxlen = 0
        data = [state.make_copy(detach=True, deep=True) for state in data]
        for state in data:
            goldmaxlen = max(goldmaxlen, state.gold_tensor.size(1))
            inpmaxlen = max(inpmaxlen, state.inp_tensor.size(1))
        for state in data:
            state.gold_tensor = torch.cat([
                state.gold_tensor,
                state.gold_tensor.new_zeros(1, goldmaxlen - state.gold_tensor.size(1))], 1)
            state.inp_tensor = torch.cat([
                state.inp_tensor,
                state.inp_tensor.new_zeros(1, inpmaxlen - state.inp_tensor.size(1))], 1)
        ret = data[0].merge(data)
        return ret

    def dataloader(self, split: str = None, batsize: int = 5):
        if split is None:  # return all splits
            ret = {}
            for split in self.data.keys():
                ret[split] = self.dataloader(batsize=batsize, split=split)
            return ret
        else:
            dl = DataLoader(self.get_split(split),
                            batch_size=batsize,
                            shuffle=split in ("train", "train+valid"),
                            collate_fn=type(self).collate_fn)
            return dl