import os
import json
import shelve

import numpy as np
import qelos as q

# NOTE: project-local helpers (Vocab, Tokenizer, Dataset, SCANDatasetLoader,
# CFQDatasetLoader, compute_overlaps) are assumed to be imported elsewhere
# in this module.


def build_vocab_from_pcfg(pcfg, min_freq=0, top_k=np.inf) -> Vocab:
    """ Builds a Vocab over brackets plus all LHS and RHS symbols of the PCFG's productions. """
    vocab = Vocab()
    vocab.add_token("(")
    vocab.add_token(")")
    for rule in pcfg.productions():
        vocab.add_token(str(rule.lhs()))
        for rhse in rule.rhs():
            vocab.add_token(str(rhse))
    vocab.finalize(min_freq=min_freq, top_k=top_k)
    return vocab
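
# Usage sketch (hedged): builds a vocab from a toy probabilistic grammar.
# Assumes nltk is installed; nltk.PCFG objects expose the productions()/lhs()/rhs()
# interface used above. The grammar string and the helper name are illustrative only.
def _example_build_vocab_from_pcfg():
    from nltk import PCFG
    toy_pcfg = PCFG.fromstring("""
        S -> NP VP [1.0]
        NP -> 'john' [0.5] | 'mary' [0.5]
        VP -> 'runs' [0.7] | 'sleeps' [0.3]
    """)
    vocab = build_vocab_from_pcfg(toy_pcfg)
    # vocab now contains "(", ")", "S", "NP", "VP" and the four terminals
    return vocab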
def load_ds(dataset="scan/random", validfrac=0.1, recompute=False, bertname="bert-base-uncased"):
    """
    Loads, tokenizes and tensorizes the given dataset ("scan/<split>" or "cfq/<split>"),
    caching the results in a local shelve file keyed on the settings.
    Returns (trainds, validds, testds, fldic, inpdic).
    """
    tt = q.ticktock("data")
    tt.tick(f"loading '{dataset}'")
    if bertname.startswith("none"):
        # a "none..." bertname is mapped onto the corresponding "bert..." name
        bertname = "bert" + bertname[4:]
    if dataset.startswith("cfq/") or dataset.startswith("scan/mcd"):
        # these datasets ship with fixed splits, so validfrac does not affect them
        key = f"{dataset}|bertname={bertname}"
        print(f"validfrac is ineffective with dataset '{dataset}'")
    else:
        key = f"{dataset}|validfrac={validfrac}|bertname={bertname}"

    shelfname = os.path.basename(__file__) + ".cache.shelve"
    if not recompute:
        tt.tick(f"loading from shelf (key '{key}')")
        with shelve.open(shelfname) as shelf:
            if key not in shelf:
                recompute = True
                tt.tock("couldn't load from shelf")
            else:
                shelved = shelf[key]
                trainex, validex, testex, fldic = \
                    shelved["trainex"], shelved["validex"], shelved["testex"], shelved["fldic"]
                inpdic = shelved["inpdic"] if "inpdic" in shelved else None
                trainds, validds, testds = Dataset(trainex), Dataset(validex), Dataset(testex)
                tt.tock("loaded from shelf")

    if recompute:
        tt.tick("loading data")
        splits = dataset.split("/")
        dataset, splits = splits[0], splits[1:]
        split = "/".join(splits)
        if dataset == "scan":
            ds = SCANDatasetLoader().load(split, validfrac=validfrac)
        elif dataset == "cfq":
            ds = CFQDatasetLoader().load(split + "/modent")
        else:
            raise Exception(f"Unknown dataset: '{dataset}'")
        tt.tock("loaded data")

        tt.tick("creating tokenizer")
        tokenizer = Tokenizer(bertname=bertname)
        tt.tock("created tokenizer")

        print(f"{len(ds)} examples")

        tt.tick("dictionaries")
        inpdic, fldic = Vocab(), Vocab()
        inplens, outlens = [], []
        for x in ds:   # x = (input, output, split)
            outtoks = tokenizer.get_out_toks(x[1])
            outlens.append(len(outtoks))
            for tok in outtoks:
                fldic.add_token(tok, seen=x[2] == "train")
            inptoks = tokenizer.get_toks(x[0])
            inplens.append(len(inptoks))
            for tok in inptoks:
                inpdic.add_token(tok, seen=x[2] == "train")
        inpdic.finalize(min_freq=0, top_k=np.inf)
        fldic.finalize(min_freq=0, top_k=np.inf)
        print(f"input avg/max length is {np.mean(inplens):.1f}/{max(inplens)}, "
              f"output avg/max length is {np.mean(outlens):.1f}/{max(outlens)}")
        print(f"vocabulary sizes: {len(fldic.D)} at output, {len(inpdic.D)} at input")
        tt.tock()

        tt.tick("tensorizing")
        tokenizer.inpvocab = inpdic
        tokenizer.outvocab = fldic
        trainds = (ds.filter(lambda x: x[-1] == "train")
                     .map(lambda x: x[:-1])
                     .map(lambda x: tokenizer.tokenize(x[0], x[1]))
                     .cache(True))
        validds = (ds.filter(lambda x: x[-1] == "valid")
                     .map(lambda x: x[:-1])
                     .map(lambda x: tokenizer.tokenize(x[0], x[1]))
                     .cache(True))
        testds = (ds.filter(lambda x: x[-1] == "test")
                    .map(lambda x: x[:-1])
                    .map(lambda x: tokenizer.tokenize(x[0], x[1]))
                    .cache(True))
        # ds = ds.map(lambda x: tokenizer.tokenize(x[0], x[1]) + (x[2],)).cache(True)
        tt.tock("tensorized")

        tt.tick("shelving")
        with shelve.open(shelfname) as shelf:
            shelved = {
                "trainex": trainds.examples,
                "validex": validds.examples,
                "testex": testds.examples,
                "fldic": fldic,
                "inpdic": inpdic,
            }
            shelf[key] = shelved
        tt.tock("shelved")

    tt.tock(f"loaded '{dataset}'")
    tt.msg(f"#train={len(trainds)}, #valid={len(validds)}, #test={len(testds)}")

    tt.msg("Overlap of validation with train:")
    overlaps = compute_overlaps(trainds, validds)
    print(json.dumps(overlaps, indent=4))

    tt.msg("Overlap of test with train:")
    overlaps = compute_overlaps(trainds, testds)
    print(json.dumps(overlaps, indent=4))

    return trainds, validds, testds, fldic, inpdic
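
# Usage sketch (hedged): a minimal end-to-end call on the SCAN "random" split,
# assuming this module is run as a script. Only names defined or used above are
# relied on here; the printed fields mirror what load_ds() itself reports.
if __name__ == "__main__":
    trainds, validds, testds, fldic, inpdic = load_ds(dataset="scan/random", validfrac=0.1)
    # a second run with recompute=False (the default) reuses the shelve cache built above
    print(f"vocab sizes: input={len(inpdic.D)}, output={len(fldic.D)}")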