def test_beam(self):
    x = [
        "( and ( got the walk ) ( got the talk ) ( and ( got thatstyle ) ( got thatsmile ) ) )",
        "( and ( got the walk ) ( got talk the ) ( and ( got thatstyle ) ( got thatsmile ) ) )",
        "( and ( got the walk ) ( got the walk ) ( and ( got thatstyle ) ( got thatsmile ) ) )",
        "( and ( got the talk ) ( got the walk ) ( and ( got thatsmile ) ( got thatstyle ) ) )",
        "( too_bad ( she ( has ( a penis ) ) ) )"
    ]
    D = Vocab()
    for xe in x:
        for xes in xe.split():
            D.add_token(xes, seen=True)
    print(D.D)
    acc = TreeAccuracy(tensor2tree=partial(tensor2tree, D=D), orderless={"and"})
    x = [[D[xes] for xes in xe.split()] for xe in x]
    # equalize dims
    maxlen = max([len(xe) for xe in x])
    x = [xe + [0] * (maxlen - len(xe)) for xe in x]
    x = torch.tensor(x)
    print(x)
    # beam order is x[1], x[4], x[2], x[3], x[0]; the first candidate that matches the
    # gold tree x[0] (up to reordering of "and" children) is the 4th one (x[3])
    a = acc(None, x[torch.tensor([1, 4, 2, 3, 0])][None, :, :], x[0:1])
    print(a)
    self.assertTrue(a["tree_acc"] == 0)
    self.assertTrue(a["tree_acc_at1"] == 0)
    self.assertTrue(a["tree_acc_at2"] == 0)
    self.assertTrue(a["tree_acc_at3"] == 0)
    self.assertTrue(a["tree_acc_at4"] == 1)
    self.assertTrue(a["tree_acc_at5"] == 1)
    self.assertTrue(a["tree_acc_at_last"] == 1)
def _initialize(self, p, bert_tokenizer, min_freq: int):
    self.data = {}
    self.bert_vocab = Vocab()
    self.bert_vocab.set_dict(bert_tokenizer.vocab)
    self.sentence_encoder = SequenceEncoder(
        lambda x: bert_tokenizer.tokenize(f"[CLS] {x} [SEP]"),
        vocab=self.bert_vocab)

    trainlines = [x for x in ujson.load(
        open(os.path.join(p, f"geo-{self.train_lang}.json"), "r"))]
    testlines = [x for x in ujson.load(
        open(os.path.join(p, f"geo-{self.train_lang}.json"), "r"))]
    trainlines = [x for x in trainlines if x["split"] == "train"]
    testlines = [x for x in testlines if x["split"] == "test"]

    if self.cvfolds is None:
        splits = ["train"] * len(trainlines) + ["test"] * len(testlines)
    else:
        cvsplit_len = len(trainlines) / self.cvfolds
        splits = []
        for i in range(0, self.cvfolds):
            splits += [i] * round(cvsplit_len * (i + 1) - len(splits))
        random.shuffle(splits)
        splits = ["valid" if x == self.testfold else "train" for x in splits]
        splits = splits + ["test"] * len(testlines)

    questions = [x["nl"] for x in trainlines]
    queries = [x["mrl"] for x in trainlines]
    xquestions = [x["nl"] for x in testlines]
    xqueries = [x["mrl"] for x in testlines]
    questions += xquestions
    queries += xqueries

    # initialize output vocabulary
    outvocab = Vocab()
    for token, bertid in self.bert_vocab.D.items():
        outvocab.add_token(token, seen=False)

    self.query_encoder = SequenceEncoder(
        tokenizer=partial(basic_query_tokenizer, strtok=bert_tokenizer),
        vocab=outvocab,
        add_end_token=True)

    # build vocabularies
    for i, (question, query, split) in enumerate(zip(questions, queries, splits)):
        self.query_encoder.inc_build_vocab(query, seen=split == "train")
    keeptokens = set(self.bert_vocab.D.keys())
    self.query_encoder.finalize_vocab(min_freq=min_freq, keep_tokens=keeptokens)

    token_specs = self.build_token_specs(queries)
    self.token_specs = token_specs

    self.build_data(questions, queries, splits)
def test_normal(self):
    x = [
        "( and ( has service ) ( has money ) ( and ( got thatstyle ) ( got thatsmile ) ) )",
        "( and ( has service ) ( has service ) ( and ( got thatstyle ) ( got thatsmile ) ) )",
        "( and ( has money ) ( has service ) ( and ( got thatsmile ) ( got thatstyle ) ) )"
    ]
    D = Vocab()
    for xe in x:
        for xes in xe.split():
            D.add_token(xes, seen=True)
    print(D.D)
    acc = TreeAccuracy(tensor2tree=partial(tensor2tree, D=D), orderless={"and"})
    x = [[D[xes] for xes in xe.split()] for xe in x]
    x = torch.tensor(x)
    print(x)
    # x[1] is a different tree from x[0], so no match
    a = acc(None, x[0:1], x[1:2])
    self.assertEqual(a["tree_acc"], 0)
    print(a)
    # x[2] equals x[0] up to reordering of "and" children, so it counts as a match
    a = acc(None, x[0:1], x[2:3])
    self.assertEqual(a["tree_acc"], 1.)
    print(a)
def build_vocab_from_pcfg(pcfg, min_freq=0, top_k=np.infty) -> Vocab:
    vocab = Vocab()
    vocab.add_token("(")
    vocab.add_token(")")
    for rule in pcfg.productions():
        vocab.add_token(str(rule.lhs()))
        for rhse in rule.rhs():
            vocab.add_token(str(rhse))
    vocab.finalize(min_freq=min_freq, top_k=top_k)
    return vocab
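# A minimal usage sketch for build_vocab_from_pcfg (illustrative only; it assumes nltk is
# installed and the toy grammar below is made up for the example, it is not part of the data).
def example_build_vocab_from_pcfg():
    from nltk import PCFG
    toy_pcfg = PCFG.fromstring("""
        S -> NP VP [1.0]
        NP -> 'she' [1.0]
        VP -> 'walks' [0.5] | 'talks' [0.5]
    """)
    vocab = build_vocab_from_pcfg(toy_pcfg)
    # the vocab contains "(", ")", every LHS nonterminal and every RHS symbol as strings
    print(vocab.D)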
def load_ds(dataset="scan/random", validfrac=0.1, recompute=False, bertname="bert-base-uncased"): tt = q.ticktock("data") tt.tick(f"loading '{dataset}'") if bertname.startswith("none"): bertname = "bert" + bertname[4:] if dataset.startswith("cfq/") or dataset.startswith("scan/mcd"): key = f"{dataset}|bertname={bertname}" print(f"validfrac is ineffective with dataset '{dataset}'") else: key = f"{dataset}|validfrac={validfrac}|bertname={bertname}" shelfname = os.path.basename(__file__) + ".cache.shelve" if not recompute: tt.tick(f"loading from shelf (key '{key}')") with shelve.open(shelfname) as shelf: if key not in shelf: recompute = True tt.tock("couldn't load from shelf") else: shelved = shelf[key] trainex, validex, testex, fldic = shelved["trainex"], shelved[ "validex"], shelved["testex"], shelved["fldic"] inpdic = shelved["inpdic"] if "inpdic" in shelved else None trainds, validds, testds = Dataset(trainex), Dataset( validex), Dataset(testex) tt.tock("loaded from shelf") if recompute: tt.tick("loading data") splits = dataset.split("/") dataset, splits = splits[0], splits[1:] split = "/".join(splits) if dataset == "scan": ds = SCANDatasetLoader().load(split, validfrac=validfrac) elif dataset == "cfq": ds = CFQDatasetLoader().load(split + "/modent") else: raise Exception(f"Unknown dataset: '{dataset}'") tt.tock("loaded data") tt.tick("creating tokenizer") tokenizer = Tokenizer(bertname=bertname) tt.tock("created tokenizer") print(len(ds)) tt.tick("dictionaries") inpdic = Vocab() inplens, outlens = [0], [] fldic = Vocab() for x in ds: outtoks = tokenizer.get_out_toks(x[1]) outlens.append(len(outtoks)) for tok in outtoks: fldic.add_token(tok, seen=x[2] == "train") inptoks = tokenizer.get_toks(x[0]) for tok in inptoks: inpdic.add_token(tok, seen=x[2] == "train") inpdic.finalize(min_freq=0, top_k=np.infty) fldic.finalize(min_freq=0, top_k=np.infty) print( f"input avg/max length is {np.mean(inplens):.1f}/{max(inplens)}, output avg/max length is {np.mean(outlens):.1f}/{max(outlens)}" ) print( f"output vocabulary size: {len(fldic.D)} at output, {len(inpdic.D)} at input" ) tt.tock() tt.tick("tensorizing") tokenizer.inpvocab = inpdic tokenizer.outvocab = fldic trainds = ds.filter(lambda x: x[-1] == "train").map( lambda x: x[:-1]).map( lambda x: tokenizer.tokenize(x[0], x[1])).cache(True) validds = ds.filter(lambda x: x[-1] == "valid").map( lambda x: x[:-1]).map( lambda x: tokenizer.tokenize(x[0], x[1])).cache(True) testds = ds.filter(lambda x: x[-1] == "test").map( lambda x: x[:-1]).map( lambda x: tokenizer.tokenize(x[0], x[1])).cache(True) # ds = ds.map(lambda x: tokenizer.tokenize(x[0], x[1]) + (x[2],)).cache(True) tt.tock("tensorized") tt.tick("shelving") with shelve.open(shelfname) as shelf: shelved = { "trainex": trainds.examples, "validex": validds.examples, "testex": testds.examples, "fldic": fldic, "inpdic": inpdic, } shelf[key] = shelved tt.tock("shelved") tt.tock(f"loaded '{dataset}'") tt.msg( f"#train={len(trainds)}, #valid={len(validds)}, #test={len(testds)}") tt.msg("Overlap of validation with train:") overlaps = compute_overlaps(trainds, validds) print(json.dumps(overlaps, indent=4)) tt.msg("Overlap of test with train:") overlaps = compute_overlaps(trainds, testds) print(json.dumps(overlaps, indent=4)) return trainds, validds, testds, fldic, inpdic
def load_ds(domain="restaurants", nl_mode="bert-base-uncased", trainonvalid=False, noreorder=False): """ Creates a dataset of examples which have * NL question and tensor * original FL tree * reduced FL tree with slots (this is randomly generated) * tensor corresponding to reduced FL tree with slots * mask specifying which elements in reduced FL tree are terminated * 2D gold that specifies whether a token/action is in gold for every position (compatibility with MML!) """ orderless = {"op:and", "SW:concat"} # only use in eval!! ds = OvernightDatasetLoader().load(domain=domain, trainonvalid=trainonvalid) ds = ds.map(lambda x: (x[0], ATree("@START@", [x[1]]), x[2])) if not noreorder: ds = ds.map(lambda x: (x[0], reorder_tree(x[1], orderless=orderless), x[2])) vocab = Vocab(padid=0, startid=2, endid=3, unkid=1) vocab.add_token("@START@", seen=np.infty) vocab.add_token( "@CLOSE@", seen=np.infty ) # only here for the action of closing an open position, will not be seen at input vocab.add_token( "@OPEN@", seen=np.infty ) # only here for the action of opening a closed position, will not be seen at input vocab.add_token( "@REMOVE@", seen=np.infty ) # only here for deletion operations, won't be seen at input vocab.add_token( "@REMOVESUBTREE@", seen=np.infty ) # only here for deletion operations, won't be seen at input vocab.add_token("@SLOT@", seen=np.infty) # will be seen at input, can't be produced! nl_tokenizer = BertTokenizer.from_pretrained(nl_mode) # for tok, idd in nl_tokenizer.vocab.items(): # vocab.add_token(tok, seen=np.infty) # all wordpieces are added for possible later generation tds, vds, xds = ds[lambda x: x[2] == "train"], \ ds[lambda x: x[2] == "valid"], \ ds[lambda x: x[2] == "test"] seqenc = SequenceEncoder( vocab=vocab, tokenizer=lambda x: extract_info(x, onlytokens=True), add_start_token=False, add_end_token=False) for example in tds.examples: query = example[1] seqenc.inc_build_vocab(query, seen=True) for example in vds.examples: query = example[1] seqenc.inc_build_vocab(query, seen=False) for example in xds.examples: query = example[1] seqenc.inc_build_vocab(query, seen=False) seqenc.finalize_vocab(min_freq=0) def mapper(x): nl = x[0] fl = x[1] fltoks = extract_info(fl, onlytokens=True) seq = seqenc.convert(fltoks, return_what="tensor") ret = (nl_tokenizer.encode(nl, return_tensors="pt")[0], seq) return ret tds_seq = tds.map(mapper) vds_seq = vds.map(mapper) xds_seq = xds.map(mapper) return tds_seq, vds_seq, xds_seq, nl_tokenizer, seqenc, orderless
def load_ds(domain="restaurants", nl_mode="bert-base-uncased", trainonvalid=False, noreorder=False, numbered=False): """ Creates a dataset of examples which have * NL question and tensor * original FL tree * reduced FL tree with slots (this is randomly generated) * tensor corresponding to reduced FL tree with slots * mask specifying which elements in reduced FL tree are terminated * 2D gold that specifies whether a token/action is in gold for every position (compatibility with MML!) """ # orderless = {"op:and", "SW:concat"} # only use in eval!! orderless = ORDERLESS ds = OvernightDatasetLoader(simplify_mode="none").load( domain=domain, trainonvalid=trainonvalid) # ds contains 3-tuples of (input, output tree, split name) if not noreorder: ds = ds.map(lambda x: (x[0], reorder_tree(x[1], orderless=orderless), x[2])) ds = ds.map(lambda x: (x[0], tree_to_seq(x[1]), x[2])) if numbered: ds = ds.map(lambda x: (x[0], make_numbered_tokens(x[1]), x[2])) vocab = Vocab(padid=0, startid=2, endid=3, unkid=1) vocab.add_token("@BOS@", seen=np.infty) vocab.add_token("@EOS@", seen=np.infty) vocab.add_token("@STOP@", seen=np.infty) nl_tokenizer = BertTokenizer.from_pretrained(nl_mode) tds, vds, xds = ds[lambda x: x[2] == "train"], \ ds[lambda x: x[2] == "valid"], \ ds[lambda x: x[2] == "test"] seqenc = SequenceEncoder(vocab=vocab, tokenizer=lambda x: x, add_start_token=False, add_end_token=False) for example in tds.examples: query = example[1] seqenc.inc_build_vocab(query, seen=True) for example in vds.examples: query = example[1] seqenc.inc_build_vocab(query, seen=False) for example in xds.examples: query = example[1] seqenc.inc_build_vocab(query, seen=False) seqenc.finalize_vocab(min_freq=0) def mapper(x): seq = seqenc.convert(x[1], return_what="tensor") ret = (nl_tokenizer.encode(x[0], return_tensors="pt")[0], seq) return ret tds_seq = tds.map(mapper) vds_seq = vds.map(mapper) xds_seq = xds.map(mapper) return tds_seq, vds_seq, xds_seq, nl_tokenizer, seqenc, orderless