def load_ds(domain="restaurants", min_freq=0, top_k=np.infty, nl_mode="bart-large", trainonvalid=False): ds = OvernightDatasetLoader(simplify_mode="light").load( domain=domain, trainonvalid=trainonvalid) seqenc_vocab = Vocab(padid=1, startid=0, endid=2, unkid=UNKID) seqenc = SequenceEncoder(vocab=seqenc_vocab, tokenizer=tree_to_lisp_tokens, add_start_token=True, add_end_token=True) for example in ds.examples: query = example[1] seqenc.inc_build_vocab(query, seen=example[2] == "train") seqenc.finalize_vocab(min_freq=min_freq, top_k=top_k) nl_tokenizer = AutoTokenizer.from_pretrained(nl_mode) def tokenize(x): ret = (nl_tokenizer.encode(x[0], return_tensors="pt")[0], seqenc.convert(x[1], return_what="tensor"), x[2], x[0], x[1]) return ret tds, vds, xds = ds[(None, None, "train")].map(tokenize), \ ds[(None, None, "valid")].map(tokenize), \ ds[(None, None, "test")].map(tokenize) return tds, vds, xds, nl_tokenizer, seqenc
def load_ds(traindomains=("restaurants",), testdomain="housing", min_freq=1, mincoverage=1, top_k=np.infty, nl_mode="bert-base-uncased", fullsimplify=False, onlyabstract=False, pretrainsetting="all+lex", # "all", "lex" or "all+lex" finetunesetting="lex", # "lex", "all", "min" ): """ :param traindomains: :param testdomain: :param min_freq: :param mincoverage: :param top_k: :param nl_mode: :param fullsimplify: :param add_domain_start: :param onlyabstract: :param pretrainsetting: "all": use all examples from every domain "lex": use only lexical examples "all+lex": use both :param finetunesetting: "lex": use lexical examples "all": use all training examples "min": use minimal lexicon-covering set of examples ! Test is always over the same original test set. ! Validation is over a fraction of training data :return: """ general_tokens = { "(", ")", "arg:~type", "arg:type", "op:and", "SW:concat", "cond:has", "arg:<=", "arg:<", "arg:>=", "arg:>", "arg:!=", "arg:=", "SW:superlative", "SW:CNT-arg:min", "SW:CNT-arg:<", "SW:CNT-arg:<=", "SW:CNT-arg:>=", "SW:CNT-arg:>", "SW:CNT-arg:max", "SW:CNT-arg:=", "arg:max", } def tokenize_and_add_start(t): tokens = tree_to_lisp_tokens(t) starttok = "@START@" tokens = [starttok] + tokens return tokens sourceex = [] for traindomain in traindomains: ds = OvernightDatasetLoader(simplify_mode="light" if not fullsimplify else "full", simplify_blocks=True, restore_reverse=DATA_RESTORE_REVERSE, validfrac=.10)\ .load(domain=traindomain) sourceex += ds[(None, None, lambda x: x in ("train", "valid", "lexicon"))].map(lambda x: (x[0], x[1], x[2], traindomain)).examples # don't use test examples testds = OvernightDatasetLoader(simplify_mode="light" if not fullsimplify else "full", simplify_blocks=True, restore_reverse=DATA_RESTORE_REVERSE)\ .load(domain=testdomain) targetex = testds.map(lambda x: x + (testdomain,)).examples pretrainex = [] if "all" in pretrainsetting.split("+"): pretrainex += [(a, tokenize_and_add_start(b), "pretrain", d) for a, b, c, d in sourceex if c == "train"] if "lex" in pretrainsetting.split("+"): pretrainex += [(a, tokenize_and_add_start(b), "pretrain", d) for a, b, c, d in sourceex if c == "lexicon"] pretrainvalidex = [(a, tokenize_and_add_start(b), "pretrainvalid", d) for a, b, c, d in sourceex if c == "valid"] if finetunesetting == "all": finetunetrainex = [(a, tokenize_and_add_start(b), "fttrain", d) for a, b, c, d in targetex if c == "train"] elif finetunesetting == "lex": finetunetrainex = [(a, tokenize_and_add_start(b), "fttrain", d) for a, b, c, d in targetex if c == "lexicon"] elif finetunesetting == "min": finetunetrainex = get_maximum_spanning_examples([(a, b, c, d) for a, b, c, d in targetex if c == "train"], mincoverage=mincoverage, loadedex=[e for e in pretrainex if e[2] == "pretrain"]) finetunetrainex = [(a, tokenize_and_add_start(b), "fttrain", d) for a, b, c, d in finetunetrainex] finetunevalidex = [(a, tokenize_and_add_start(b), "ftvalid", d) for a, b, c, d in targetex if c == "valid"] finetunetestex = [(a, tokenize_and_add_start(b), "fttest", d) for a, b, c, d in targetex if c == "test"] print(f"Using mode \"{finetunesetting}\" for finetuning data: " f"\n\t{len(finetunetrainex)} training examples") allex = pretrainex + pretrainvalidex + finetunetrainex + finetunevalidex + finetunetestex ds = Dataset(allex) if onlyabstract: et = get_lf_abstract_transform(ds[lambda x: x[3] != testdomain].examples) ds = ds.map(lambda x: (x[0], et(x[1]), x[2], x[3])) seqenc_vocab = Vocab(padid=0, startid=1, endid=2, unkid=UNKID) seqenc = 
SequenceEncoder(vocab=seqenc_vocab, tokenizer=lambda x: x, add_start_token=False, add_end_token=True) for example in ds.examples: query = example[1] seqenc.inc_build_vocab(query, seen=example[2] in ("pretrain", "fttrain")) seqenc.finalize_vocab(min_freq=min_freq, top_k=top_k) generaltokenmask = torch.zeros(seqenc_vocab.number_of_ids(), dtype=torch.long) for token, tokenid in seqenc_vocab.D.items(): if token in general_tokens: generaltokenmask[tokenid] = 1 nl_tokenizer = AutoTokenizer.from_pretrained(nl_mode) def tokenize(x): ret = (nl_tokenizer.encode(x[0], return_tensors="pt")[0], seqenc.convert(x[1], return_what="tensor"), x[2], x[0], x[1], x[3]) return ret tds, ftds, vds, fvds, xds = ds[(None, None, "pretrain", None)].map(tokenize), \ ds[(None, None, "fttrain", None)].map(tokenize), \ ds[(None, None, "pretrainvalid", None)].map(tokenize), \ ds[(None, None, "ftvalid", None)].map(tokenize), \ ds[(None, None, "fttest", None)].map(tokenize) return tds, ftds, vds, fvds, xds, nl_tokenizer, seqenc, generaltokenmask
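# Usage sketch (illustrative): pretrain on source domains and finetune on the held-out
# "housing" domain using only its lexicon examples. "publications" is just another
# Overnight domain name used here for illustration.
if __name__ == "__main__":
    tds, ftds, vds, fvds, xds, nl_tokenizer, seqenc, generaltokenmask = load_ds(
        traindomains=("restaurants", "publications"), testdomain="housing",
        pretrainsetting="all+lex", finetunesetting="lex")
    # generaltokenmask flags output-vocabulary ids of domain-independent LF tokens
    print(generaltokenmask.sum().item(), "general tokens in the output vocabulary")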
def load_ds(domain="restaurants", nl_mode="bert-base-uncased", trainonvalid=False, noreorder=False): """ Creates a dataset of examples which have * NL question and tensor * original FL tree * reduced FL tree with slots (this is randomly generated) * tensor corresponding to reduced FL tree with slots * mask specifying which elements in reduced FL tree are terminated * 2D gold that specifies whether a token/action is in gold for every position (compatibility with MML!) """ orderless = {"op:and", "SW:concat"} # only use in eval!! ds = OvernightDatasetLoader().load(domain=domain, trainonvalid=trainonvalid) ds = ds.map(lambda x: (x[0], ATree("@START@", [x[1]]), x[2])) if not noreorder: ds = ds.map(lambda x: (x[0], reorder_tree(x[1], orderless=orderless), x[2])) vocab = Vocab(padid=0, startid=2, endid=3, unkid=1) vocab.add_token("@START@", seen=np.infty) vocab.add_token( "@CLOSE@", seen=np.infty ) # only here for the action of closing an open position, will not be seen at input vocab.add_token( "@OPEN@", seen=np.infty ) # only here for the action of opening a closed position, will not be seen at input vocab.add_token( "@REMOVE@", seen=np.infty ) # only here for deletion operations, won't be seen at input vocab.add_token( "@REMOVESUBTREE@", seen=np.infty ) # only here for deletion operations, won't be seen at input vocab.add_token("@SLOT@", seen=np.infty) # will be seen at input, can't be produced! nl_tokenizer = BertTokenizer.from_pretrained(nl_mode) # for tok, idd in nl_tokenizer.vocab.items(): # vocab.add_token(tok, seen=np.infty) # all wordpieces are added for possible later generation tds, vds, xds = ds[lambda x: x[2] == "train"], \ ds[lambda x: x[2] == "valid"], \ ds[lambda x: x[2] == "test"] seqenc = SequenceEncoder( vocab=vocab, tokenizer=lambda x: extract_info(x, onlytokens=True), add_start_token=False, add_end_token=False) for example in tds.examples: query = example[1] seqenc.inc_build_vocab(query, seen=True) for example in vds.examples: query = example[1] seqenc.inc_build_vocab(query, seen=False) for example in xds.examples: query = example[1] seqenc.inc_build_vocab(query, seen=False) seqenc.finalize_vocab(min_freq=0) def mapper(x): nl = x[0] fl = x[1] fltoks = extract_info(fl, onlytokens=True) seq = seqenc.convert(fltoks, return_what="tensor") ret = (nl_tokenizer.encode(nl, return_tensors="pt")[0], seq) return ret tds_seq = tds.map(mapper) vds_seq = vds.map(mapper) xds_seq = xds.map(mapper) return tds_seq, vds_seq, xds_seq, nl_tokenizer, seqenc, orderless
def load_ds(traindomains=("restaurants", ), testdomain="housing", min_freq=1, mincoverage=1, top_k=np.infty, nl_mode="bert-base-uncased", fullsimplify=False, add_domain_start=True, useall=False): def tokenize_and_add_start(t, _domain): tokens = tree_to_lisp_tokens(t) starttok = f"@START/{_domain}@" if add_domain_start else "@START@" tokens = [starttok] + tokens return tokens allex = [] for traindomain in traindomains: ds = OvernightDatasetLoader(simplify_mode="light" if not fullsimplify else "full", simplify_blocks=True, restore_reverse=DATA_RESTORE_REVERSE, validfrac=.10)\ .load(domain=traindomain) allex += ds[(None, None, lambda x: x in ("train", "valid"))].map(lambda x: (x[0], x[1], x[ 2], traindomain)).examples # don't use test examples testds = OvernightDatasetLoader(simplify_mode="light" if not fullsimplify else "full", simplify_blocks=True, restore_reverse=DATA_RESTORE_REVERSE)\ .load(domain=testdomain) if useall: print("using all training examples") sortedexamples = testds[(None, None, "train")].examples else: sortedexamples = get_maximum_spanning_examples( testds[(None, None, "train")].examples, mincoverage=mincoverage, loadedex=[e for e in allex if e[2] == "train"]) allex += testds[( None, None, "valid")].map(lambda x: (x[0], x[1], "ftvalid", testdomain)).examples allex += testds[( None, None, "test")].map(lambda x: (x[0], x[1], x[2], testdomain)).examples allex += [(ex[0], ex[1], "fttrain", testdomain) for ex in sortedexamples] _ds = Dataset(allex) ds = _ds.map(lambda x: (x[0], tokenize_and_add_start(x[1], x[3]), x[2], x[3])) et = get_lf_abstract_transform(ds[lambda x: x[3] != testdomain].examples) ds = ds.map(lambda x: (x[0], et(x[1]), x[1], x[2], x[3])) seqenc_vocab = Vocab(padid=0, startid=1, endid=2, unkid=UNKID) absseqenc_vocab = Vocab(padid=0, startid=1, endid=2, unkid=UNKID) absseqenc = SequenceEncoder(vocab=seqenc_vocab, tokenizer=lambda x: x, add_start_token=False, add_end_token=True) fullseqenc = SequenceEncoder(vocab=absseqenc_vocab, tokenizer=lambda x: x, add_start_token=False, add_end_token=True) for example in ds.examples: absseqenc.inc_build_vocab(example[1], seen=example[3] in ("train", "fttrain")) fullseqenc.inc_build_vocab(example[2], seen=example[3] in ("train", "fttrain")) absseqenc.finalize_vocab(min_freq=min_freq, top_k=top_k) fullseqenc.finalize_vocab(min_freq=min_freq, top_k=top_k) nl_tokenizer = AutoTokenizer.from_pretrained(nl_mode) def tokenize(x): ret = (nl_tokenizer.encode(x[0], return_tensors="pt")[0], absseqenc.convert(x[1], return_what="tensor"), fullseqenc.convert(x[2], return_what="tensor"), x[3], x[0], x[1], x[4]) return ret tds, ftds, vds, fvds, xds = ds[(None, None, None, "train", None)].map(tokenize), \ ds[(None, None, None, "fttrain", None)].map(tokenize), \ ds[(None, None, None, "valid", None)].map(tokenize), \ ds[(None, None, None, "ftvalid", None)].map(tokenize), \ ds[(None, None, None, "test", None)].map(tokenize) return tds, ftds, vds, fvds, xds, nl_tokenizer, fullseqenc, absseqenc
def load_ds(domain="restaurants", nl_mode="bert-base-uncased", trainonvalid=False, noreorder=False, numbered=False): """ Creates a dataset of examples which have * NL question and tensor * original FL tree * reduced FL tree with slots (this is randomly generated) * tensor corresponding to reduced FL tree with slots * mask specifying which elements in reduced FL tree are terminated * 2D gold that specifies whether a token/action is in gold for every position (compatibility with MML!) """ # orderless = {"op:and", "SW:concat"} # only use in eval!! orderless = ORDERLESS ds = OvernightDatasetLoader(simplify_mode="none").load( domain=domain, trainonvalid=trainonvalid) # ds contains 3-tuples of (input, output tree, split name) if not noreorder: ds = ds.map(lambda x: (x[0], reorder_tree(x[1], orderless=orderless), x[2])) ds = ds.map(lambda x: (x[0], tree_to_seq(x[1]), x[2])) if numbered: ds = ds.map(lambda x: (x[0], make_numbered_tokens(x[1]), x[2])) vocab = Vocab(padid=0, startid=2, endid=3, unkid=1) vocab.add_token("@BOS@", seen=np.infty) vocab.add_token("@EOS@", seen=np.infty) vocab.add_token("@STOP@", seen=np.infty) nl_tokenizer = BertTokenizer.from_pretrained(nl_mode) tds, vds, xds = ds[lambda x: x[2] == "train"], \ ds[lambda x: x[2] == "valid"], \ ds[lambda x: x[2] == "test"] seqenc = SequenceEncoder(vocab=vocab, tokenizer=lambda x: x, add_start_token=False, add_end_token=False) for example in tds.examples: query = example[1] seqenc.inc_build_vocab(query, seen=True) for example in vds.examples: query = example[1] seqenc.inc_build_vocab(query, seen=False) for example in xds.examples: query = example[1] seqenc.inc_build_vocab(query, seen=False) seqenc.finalize_vocab(min_freq=0) def mapper(x): seq = seqenc.convert(x[1], return_what="tensor") ret = (nl_tokenizer.encode(x[0], return_tensors="pt")[0], seq) return ret tds_seq = tds.map(mapper) vds_seq = vds.map(mapper) xds_seq = xds.map(mapper) return tds_seq, vds_seq, xds_seq, nl_tokenizer, seqenc, orderless