Ejemplo n.º 1
0
    def _initialize(self, p, bert_tokenizer, min_freq: int):
        """Load the geo-query data, build the sentence/query encoders and
        the output vocabulary, and tensorize the dataset.

        :param p: directory containing the ``geo-<lang>.json`` files
        :param bert_tokenizer: BERT tokenizer exposing ``.vocab`` and ``.tokenize``
        :param min_freq: minimum token frequency when finalizing the query vocab
        """
        self.data = {}
        self.bert_vocab = Vocab()
        self.bert_vocab.set_dict(bert_tokenizer.vocab)
        self.sentence_encoder = SequenceEncoder(
            lambda x: bert_tokenizer.tokenize(f"[CLS] {x} [SEP]"),
            vocab=self.bert_vocab)
        # use context managers so the json files are closed promptly
        # (they were previously opened and never closed)
        with open(os.path.join(p, f"geo-{self.train_lang}.json"), "r") as f:
            trainlines = list(ujson.load(f))
        # NOTE(review): the test examples are read from the *train*-language
        # file here, while the xlmr variant of this method reads
        # geo-{self.test_lang}.json — confirm whether this is intentional
        # (monolingual setting) or a copy-paste slip.
        with open(os.path.join(p, f"geo-{self.train_lang}.json"), "r") as f:
            testlines = list(ujson.load(f))
        trainlines = [x for x in trainlines if x["split"] == "train"]
        testlines = [x for x in testlines if x["split"] == "test"]
        if self.cvfolds is None:
            splits = ["train"] * len(trainlines) + ["test"] * len(testlines)
        else:
            # spread training examples over the cv folds as evenly as possible
            cvsplit_len = len(trainlines) / self.cvfolds
            splits = []
            for i in range(0, self.cvfolds):
                splits += [i] * round(cvsplit_len * (i + 1) - len(splits))
            random.shuffle(splits)
            splits = [
                "valid" if x == self.testfold else "train" for x in splits
            ]
            splits = splits + ["test"] * len(testlines)
        questions = [x["nl"] for x in trainlines]
        queries = [x["mrl"] for x in trainlines]
        xquestions = [x["nl"] for x in testlines]
        xqueries = [x["mrl"] for x in testlines]
        questions += xquestions
        queries += xqueries

        # initialize output vocabulary: all bert wordpieces, marked unseen
        outvocab = Vocab()
        for token, bertid in self.bert_vocab.D.items():
            outvocab.add_token(token, seen=False)

        self.query_encoder = SequenceEncoder(tokenizer=partial(
            basic_query_tokenizer, strtok=bert_tokenizer),
                                             vocab=outvocab,
                                             add_end_token=True)

        # build vocabularies: only training queries mark tokens as seen
        for i, (question, query,
                split) in enumerate(zip(questions, queries, splits)):
            self.query_encoder.inc_build_vocab(query, seen=split == "train")
        keeptokens = set(self.bert_vocab.D.keys())
        self.query_encoder.finalize_vocab(min_freq=min_freq,
                                          keep_tokens=keeptokens)

        token_specs = self.build_token_specs(queries)
        self.token_specs = token_specs

        self.build_data(questions, queries, splits)
Ejemplo n.º 2
0
    def test_beam(self):
        """Tree accuracy at k over a beam of 5 candidates: the first exact
        (order-insensitive) match sits at beam position 4."""
        sentences = [
            "( and ( got the walk ) ( got the talk ) ( and ( got thatstyle ) ( got thatsmile ) ) )",
            "( and ( got the walk ) ( got talk the ) ( and ( got thatstyle ) ( got thatsmile ) ) )",
            "( and ( got the walk ) ( got the walk ) ( and ( got thatstyle ) ( got thatsmile ) ) )",
            "( and ( got the talk ) ( got the walk ) ( and ( got thatsmile ) ( got thatstyle ) ) )",
            "( too_bad ( she ( has ( a penis ) ) ) )"
        ]
        vocab = Vocab()
        for sent in sentences:
            for tok in sent.split():
                vocab.add_token(tok, seen=True)
        print(vocab.D)
        acc = TreeAccuracy(tensor2tree=partial(tensor2tree, D=vocab),
                           orderless={"and"})
        seqs = [[vocab[tok] for tok in sent.split()] for sent in sentences]
        # right-pad every sequence with zeros up to the longest one
        longest = max(len(seq) for seq in seqs)
        x = torch.tensor([seq + [0] * (longest - len(seq)) for seq in seqs])
        print(x)

        a = acc(None, x[torch.tensor([1, 4, 2, 3, 0])][None, :, :], x[0:1])
        print(a)
        self.assertTrue(a["tree_acc"] == 0)
        self.assertTrue(a["tree_acc_at1"] == 0)
        self.assertTrue(a["tree_acc_at2"] == 0)
        self.assertTrue(a["tree_acc_at3"] == 0)
        self.assertTrue(a["tree_acc_at4"] == 1)
        self.assertTrue(a["tree_acc_at5"] == 1)
        self.assertTrue(a["tree_acc_at_last"] == 1)
Ejemplo n.º 3
0
def load_ds(domain="restaurants",
            min_freq=0,
            top_k=np.inf,
            nl_mode="bart-large",
            trainonvalid=False):
    """Load an Overnight domain and build tensorized train/valid/test sets.

    :param domain: Overnight domain name
    :param min_freq: minimum frequency for output vocabulary tokens
    :param top_k: keep at most this many output tokens (``np.inf`` means no
        limit; the old ``np.infty`` alias was removed in NumPy 2.0)
    :param nl_mode: huggingface tokenizer/model name for the NL side
    :param trainonvalid: if True, the loader folds validation data into train
    :return: (train ds, valid ds, test ds, NL tokenizer, output seq encoder)
    """
    ds = OvernightDatasetLoader(simplify_mode="light").load(
        domain=domain, trainonvalid=trainonvalid)

    seqenc_vocab = Vocab(padid=1, startid=0, endid=2, unkid=UNKID)
    seqenc = SequenceEncoder(vocab=seqenc_vocab,
                             tokenizer=tree_to_lisp_tokens,
                             add_start_token=True,
                             add_end_token=True)
    # only training queries mark tokens as "seen"
    for example in ds.examples:
        query = example[1]
        seqenc.inc_build_vocab(query, seen=example[2] == "train")
    seqenc.finalize_vocab(min_freq=min_freq, top_k=top_k)

    nl_tokenizer = AutoTokenizer.from_pretrained(nl_mode)

    def tokenize(x):
        # (NL tensor, FL tensor, split name, raw NL, raw FL)
        ret = (nl_tokenizer.encode(x[0], return_tensors="pt")[0],
               seqenc.convert(x[1], return_what="tensor"), x[2], x[0], x[1])
        return ret
    tds, vds, xds = ds[(None, None, "train")].map(tokenize), \
                    ds[(None, None, "valid")].map(tokenize), \
                    ds[(None, None, "test")].map(tokenize)
    return tds, vds, xds, nl_tokenizer, seqenc
Ejemplo n.º 4
0
    def _initialize(self, p, xlmr, min_freq: int):
        """Load geo-query data for the XLM-R setting and build encoders.

        :param p: directory containing the ``geo-<lang>.json`` files
        :param xlmr: XLM-R wrapper exposing ``bpe`` and the decoder dictionary
        :param min_freq: minimum token frequency for the output vocabulary
        """
        self.data = {}
        self.xlmr = xlmr
        self.xlmr_vocab = Vocab()
        self.xlmr_vocab.set_dict(xlmr.model.decoder.dictionary.indices)
        self.sentence_encoder = SequenceEncoder(lambda x: f"<s> {xlmr.bpe.encode(x)} </s>".split(), vocab=self.xlmr_vocab)
        # use context managers so the json files are closed promptly
        # (they were previously opened and never closed)
        with open(os.path.join(p, f"geo-{self.train_lang}.json"), "r") as f:
            trainlines = list(ujson.load(f))
        with open(os.path.join(p, f"geo-{self.test_lang}.json"), "r") as f:
            testlines = list(ujson.load(f))
        trainlines = [x for x in trainlines if x["split"] == "train"]
        testlines = [x for x in testlines if x["split"] == "test"]
        if self.cvfolds is None:
            splits = ["train"]*len(trainlines) + ["test"] * len(testlines)
        else:
            # spread training examples over the cv folds as evenly as possible
            cvsplit_len = len(trainlines)/self.cvfolds
            splits = []
            for i in range(0, self.cvfolds):
                splits += [i] * round(cvsplit_len * (i+1) - len(splits))
            random.shuffle(splits)
            splits = ["valid" if x == self.testfold else "train" for x in splits]
            splits = splits + ["test"] * len(testlines)
        questions = [x["nl"] for x in trainlines]
        queries = [x["mrl"] for x in trainlines]
        xquestions = [x["nl"] for x in testlines]
        xqueries = [x["mrl"] for x in testlines]
        questions += xquestions
        queries += xqueries

        # initialize output vocabulary (starts empty; filled below)
        outvocab = Vocab()

        self.query_encoder = SequenceEncoder(tokenizer=partial(basic_query_tokenizer, strtok=lambda x: xlmr.bpe.encode(x).split()), vocab=outvocab, add_end_token=True)

        # build vocabularies: input-side tokens are added unseen (available
        # e.g. for copying); only training queries mark tokens as seen
        for i, (question, query, split) in enumerate(zip(questions, queries, splits)):
            question_tokens = self.sentence_encoder.convert(question, return_what="tokens")[0]
            for token in question_tokens:
                self.query_encoder.vocab.add_token(token, seen=False)
            self.query_encoder.inc_build_vocab(query, seen=split=="train")
        keeptokens = set(self.xlmr_vocab.D.keys())
        self.query_encoder.finalize_vocab(min_freq=min_freq, keep_tokens=keeptokens)

        token_specs = self.build_token_specs(queries)
        self.token_specs = token_specs

        self.build_data(questions, queries, splits)
Ejemplo n.º 5
0
def build_vocab_from_pcfg(pcfg, min_freq=0, top_k=np.inf) -> Vocab:
    """Build a Vocab from the symbols of a PCFG.

    Adds "(" and ")" plus the LHS and every RHS symbol of each production,
    then finalizes the vocabulary.

    :param pcfg: grammar exposing ``productions()`` whose rules have
        ``lhs()``/``rhs()`` (e.g. an NLTK grammar)
    :param min_freq: minimum frequency passed to ``Vocab.finalize``
    :param top_k: maximum vocabulary size (``np.inf`` = unlimited; the old
        ``np.infty`` alias was removed in NumPy 2.0)
    :return: the finalized Vocab
    """
    vocab = Vocab()
    vocab.add_token("(")
    vocab.add_token(")")
    for rule in pcfg.productions():
        vocab.add_token(str(rule.lhs()))
        for rhse in rule.rhs():
            vocab.add_token(str(rhse))
    vocab.finalize(min_freq=min_freq, top_k=top_k)
    return vocab
Ejemplo n.º 6
0
    def test_normal(self):
        """Single-prediction tree accuracy: 0 for a truly different tree,
        1 when trees differ only in the order of "and" children."""
        sentences = [
            "( and ( has service ) ( has money ) ( and ( got thatstyle ) ( got thatsmile ) ) )",
            "( and ( has service ) ( has service ) ( and ( got thatstyle ) ( got thatsmile ) ) )",
            "( and ( has money ) ( has service ) ( and ( got thatsmile ) ( got thatstyle ) ) )"
        ]
        vocab = Vocab()
        for sent in sentences:
            for tok in sent.split():
                vocab.add_token(tok, seen=True)
        print(vocab.D)
        acc = TreeAccuracy(tensor2tree=partial(tensor2tree, D=vocab),
                           orderless={"and"})
        x = torch.tensor([[vocab[tok] for tok in sent.split()]
                          for sent in sentences])
        print(x)

        a = acc(None, x[0:1], x[1:2])
        self.assertEqual(a["tree_acc"], 0)
        print(a)
        a = acc(None, x[0:1], x[2:3])
        self.assertEqual(a["tree_acc"], 1.)
        print(a)
Ejemplo n.º 7
0
def load_ds(dataset="scan/random",
            validfrac=0.1,
            recompute=False,
            bertname="bert-base-uncased"):
    """Load (and cache via ``shelve``) a tensorized SCAN or CFQ dataset.

    :param dataset: "<name>/<split>", e.g. "scan/random" or "cfq/mcd1"
    :param validfrac: fraction of training data held out for validation
        (ineffective for cfq/* and scan/mcd*, which ship their own splits)
    :param recompute: if True, bypass the shelf cache and rebuild everything
    :param bertname: tokenizer name; a "none..." prefix is mapped onto the
        corresponding "bert..." name for tokenization purposes
    :return: (trainds, validds, testds, fldic, inpdic)
    """
    tt = q.ticktock("data")
    tt.tick(f"loading '{dataset}'")
    if bertname.startswith("none"):
        bertname = "bert" + bertname[4:]
    if dataset.startswith("cfq/") or dataset.startswith("scan/mcd"):
        key = f"{dataset}|bertname={bertname}"
        print(f"validfrac is ineffective with dataset '{dataset}'")
    else:
        key = f"{dataset}|validfrac={validfrac}|bertname={bertname}"

    shelfname = os.path.basename(__file__) + ".cache.shelve"
    if not recompute:
        tt.tick(f"loading from shelf (key '{key}')")
        with shelve.open(shelfname) as shelf:
            if key not in shelf:
                # cache miss: fall through to the recompute branch below
                recompute = True
                tt.tock("couldn't load from shelf")
            else:
                shelved = shelf[key]
                trainex, validex, testex, fldic = shelved["trainex"], shelved[
                    "validex"], shelved["testex"], shelved["fldic"]
                inpdic = shelved["inpdic"] if "inpdic" in shelved else None
                trainds, validds, testds = Dataset(trainex), Dataset(
                    validex), Dataset(testex)
                tt.tock("loaded from shelf")

    if recompute:
        tt.tick("loading data")
        splits = dataset.split("/")
        dataset, splits = splits[0], splits[1:]
        split = "/".join(splits)
        if dataset == "scan":
            ds = SCANDatasetLoader().load(split, validfrac=validfrac)
        elif dataset == "cfq":
            ds = CFQDatasetLoader().load(split + "/modent")
        else:
            raise Exception(f"Unknown dataset: '{dataset}'")
        tt.tock("loaded data")

        tt.tick("creating tokenizer")
        tokenizer = Tokenizer(bertname=bertname)
        tt.tock("created tokenizer")

        print(len(ds))

        tt.tick("dictionaries")
        inpdic = Vocab()
        inplens, outlens = [0], []
        fldic = Vocab()
        for x in ds:
            outtoks = tokenizer.get_out_toks(x[1])
            outlens.append(len(outtoks))
            for tok in outtoks:
                fldic.add_token(tok, seen=x[2] == "train")
            inptoks = tokenizer.get_toks(x[0])
            # BUGFIX: record input lengths too — previously inplens stayed
            # [0], so the reported input avg/max length was always 0/0
            inplens.append(len(inptoks))
            for tok in inptoks:
                inpdic.add_token(tok, seen=x[2] == "train")
        # np.inf instead of np.infty: the latter was removed in NumPy 2.0
        inpdic.finalize(min_freq=0, top_k=np.inf)
        fldic.finalize(min_freq=0, top_k=np.inf)
        print(
            f"input avg/max length is {np.mean(inplens):.1f}/{max(inplens)}, output avg/max length is {np.mean(outlens):.1f}/{max(outlens)}"
        )
        print(
            f"output vocabulary size: {len(fldic.D)} at output, {len(inpdic.D)} at input"
        )
        tt.tock()

        tt.tick("tensorizing")
        tokenizer.inpvocab = inpdic
        tokenizer.outvocab = fldic
        trainds = ds.filter(lambda x: x[-1] == "train").map(
            lambda x: x[:-1]).map(
                lambda x: tokenizer.tokenize(x[0], x[1])).cache(True)
        validds = ds.filter(lambda x: x[-1] == "valid").map(
            lambda x: x[:-1]).map(
                lambda x: tokenizer.tokenize(x[0], x[1])).cache(True)
        testds = ds.filter(lambda x: x[-1] == "test").map(
            lambda x: x[:-1]).map(
                lambda x: tokenizer.tokenize(x[0], x[1])).cache(True)
        tt.tock("tensorized")

        tt.tick("shelving")
        with shelve.open(shelfname) as shelf:
            shelved = {
                "trainex": trainds.examples,
                "validex": validds.examples,
                "testex": testds.examples,
                "fldic": fldic,
                "inpdic": inpdic,
            }
            shelf[key] = shelved
        tt.tock("shelved")

    tt.tock(f"loaded '{dataset}'")
    tt.msg(
        f"#train={len(trainds)}, #valid={len(validds)}, #test={len(testds)}")

    tt.msg("Overlap of validation with train:")
    overlaps = compute_overlaps(trainds, validds)
    print(json.dumps(overlaps, indent=4))

    tt.msg("Overlap of test with train:")
    overlaps = compute_overlaps(trainds, testds)
    print(json.dumps(overlaps, indent=4))

    return trainds, validds, testds, fldic, inpdic
Ejemplo n.º 8
0
def load_ds(traindomains=("restaurants",),
            testdomain="housing",
            min_freq=1,
            mincoverage=1,
            top_k=np.inf,
            nl_mode="bert-base-uncased",
            fullsimplify=False,
            onlyabstract=False,
            pretrainsetting="all+lex",    # "all", "lex" or "all+lex"
            finetunesetting="lex",        # "lex", "all", "min"
            ):
    """
    :param traindomains: source domains used for pretraining
    :param testdomain: held-out target domain used for finetuning/testing
    :param min_freq: minimum output-token frequency when finalizing the vocab
    :param mincoverage: minimum lexicon coverage for finetunesetting "min"
    :param top_k: maximum output vocab size (np.inf = unlimited; np.infty
                  was removed in NumPy 2.0)
    :param nl_mode: huggingface tokenizer name for the NL side
    :param fullsimplify: use "full" instead of "light" LF simplification
    :param onlyabstract: abstract LFs via get_lf_abstract_transform
    :param pretrainsetting:     "all": use all examples from every domain
                                "lex": use only lexical examples
                                "all+lex": use both
    :param finetunesetting:     "lex": use lexical examples
                                "all": use all training examples
                                "min": use minimal lexicon-covering set of examples
                            ! Test is always over the same original test set.
                            ! Validation is over a fraction of training data
    :return:
    """
    general_tokens = {
        "(", ")", "arg:~type", "arg:type", "op:and", "SW:concat", "cond:has",
        "arg:<=", "arg:<", "arg:>=", "arg:>", "arg:!=", "arg:=", "SW:superlative",
        "SW:CNT-arg:min", "SW:CNT-arg:<", "SW:CNT-arg:<=", "SW:CNT-arg:>=", "SW:CNT-arg:>",
        "SW:CNT-arg:max", "SW:CNT-arg:=", "arg:max",
    }

    def tokenize_and_add_start(t):
        # linearize the LF tree and prepend the start token
        tokens = tree_to_lisp_tokens(t)
        starttok = "@START@"
        tokens = [starttok] + tokens
        return tokens

    sourceex = []
    for traindomain in traindomains:
        ds = OvernightDatasetLoader(simplify_mode="light" if not fullsimplify else "full", simplify_blocks=True,
                                    restore_reverse=DATA_RESTORE_REVERSE, validfrac=.10)\
            .load(domain=traindomain)
        sourceex += ds[(None, None, lambda x: x in ("train", "valid", "lexicon"))].map(lambda x: (x[0], x[1], x[2], traindomain)).examples       # don't use test examples

    testds = OvernightDatasetLoader(simplify_mode="light" if not fullsimplify else "full", simplify_blocks=True, restore_reverse=DATA_RESTORE_REVERSE)\
        .load(domain=testdomain)

    targetex = testds.map(lambda x: x + (testdomain,)).examples

    pretrainex = []
    if "all" in pretrainsetting.split("+"):
        pretrainex += [(a, tokenize_and_add_start(b), "pretrain", d) for a, b, c, d in sourceex if c == "train"]
    if "lex" in pretrainsetting.split("+"):
        pretrainex += [(a, tokenize_and_add_start(b), "pretrain", d) for a, b, c, d in sourceex if c == "lexicon"]

    pretrainvalidex = [(a, tokenize_and_add_start(b), "pretrainvalid", d) for a, b, c, d in sourceex if c == "valid"]

    if finetunesetting == "all":
        finetunetrainex = [(a, tokenize_and_add_start(b), "fttrain", d) for a, b, c, d in targetex if c == "train"]
    elif finetunesetting == "lex":
        finetunetrainex = [(a, tokenize_and_add_start(b), "fttrain", d) for a, b, c, d in targetex if c == "lexicon"]
    elif finetunesetting == "min":
        finetunetrainex = get_maximum_spanning_examples([(a, b, c, d) for a, b, c, d in targetex if c == "train"],
                                      mincoverage=mincoverage,
                                      loadedex=[e for e in pretrainex if e[2] == "pretrain"])
        finetunetrainex = [(a, tokenize_and_add_start(b), "fttrain", d) for a, b, c, d in finetunetrainex]
    else:
        # fail fast instead of the NameError that would otherwise follow
        raise ValueError(f"Unknown finetunesetting: '{finetunesetting}'")
    finetunevalidex = [(a, tokenize_and_add_start(b), "ftvalid", d) for a, b, c, d in targetex if c == "valid"]
    finetunetestex = [(a, tokenize_and_add_start(b), "fttest", d) for a, b, c, d in targetex if c == "test"]
    print(f"Using mode \"{finetunesetting}\" for finetuning data: "
          f"\n\t{len(finetunetrainex)} training examples")

    allex = pretrainex + pretrainvalidex + finetunetrainex + finetunevalidex + finetunetestex
    ds = Dataset(allex)

    if onlyabstract:
        # abstract away domain-specific LF tokens using source-domain examples
        et = get_lf_abstract_transform(ds[lambda x: x[3] != testdomain].examples)
        ds = ds.map(lambda x: (x[0], et(x[1]), x[2], x[3]))

    seqenc_vocab = Vocab(padid=0, startid=1, endid=2, unkid=UNKID)
    seqenc = SequenceEncoder(vocab=seqenc_vocab, tokenizer=lambda x: x,
                             add_start_token=False, add_end_token=True)
    for example in ds.examples:
        query = example[1]
        seqenc.inc_build_vocab(query, seen=example[2] in ("pretrain", "fttrain"))
    seqenc.finalize_vocab(min_freq=min_freq, top_k=top_k)

    # mask marking domain-independent tokens in the output vocabulary
    generaltokenmask = torch.zeros(seqenc_vocab.number_of_ids(), dtype=torch.long)
    for token, tokenid in seqenc_vocab.D.items():
        if token in general_tokens:
            generaltokenmask[tokenid] = 1

    nl_tokenizer = AutoTokenizer.from_pretrained(nl_mode)
    def tokenize(x):
        # (NL tensor, FL tensor, split name, raw NL, raw FL, domain)
        ret = (nl_tokenizer.encode(x[0], return_tensors="pt")[0],
               seqenc.convert(x[1], return_what="tensor"),
               x[2],
               x[0], x[1], x[3])
        return ret
    tds, ftds, vds, fvds, xds = ds[(None, None, "pretrain", None)].map(tokenize), \
                          ds[(None, None, "fttrain", None)].map(tokenize), \
                          ds[(None, None, "pretrainvalid", None)].map(tokenize), \
                          ds[(None, None, "ftvalid", None)].map(tokenize), \
                          ds[(None, None, "fttest", None)].map(tokenize)
    return tds, ftds, vds, fvds, xds, nl_tokenizer, seqenc, generaltokenmask
def load_ds(domain="restaurants",
            nl_mode="bert-base-uncased",
            trainonvalid=False,
            noreorder=False):
    """
    Creates a dataset of examples which have
    * NL question and tensor
    * original FL tree
    * reduced FL tree with slots (this is randomly generated)
    * tensor corresponding to reduced FL tree with slots
    * mask specifying which elements in reduced FL tree are terminated
    * 2D gold that specifies whether a token/action is in gold for every position (compatibility with MML!)
    """
    orderless = {"op:and", "SW:concat"}  # only use in eval!!

    ds = OvernightDatasetLoader().load(domain=domain,
                                       trainonvalid=trainonvalid)
    # wrap every FL tree under an @START@ root
    ds = ds.map(lambda x: (x[0], ATree("@START@", [x[1]]), x[2]))

    if not noreorder:
        ds = ds.map(lambda x:
                    (x[0], reorder_tree(x[1], orderless=orderless), x[2]))

    vocab = Vocab(padid=0, startid=2, endid=3, unkid=1)
    # special action tokens are given infinite counts so they always survive
    # vocab finalization (np.inf: the np.infty alias was removed in NumPy 2.0)
    vocab.add_token("@START@", seen=np.inf)
    vocab.add_token(
        "@CLOSE@", seen=np.inf
    )  # only here for the action of closing an open position, will not be seen at input
    vocab.add_token(
        "@OPEN@", seen=np.inf
    )  # only here for the action of opening a closed position, will not be seen at input
    vocab.add_token(
        "@REMOVE@", seen=np.inf
    )  # only here for deletion operations, won't be seen at input
    vocab.add_token(
        "@REMOVESUBTREE@", seen=np.inf
    )  # only here for deletion operations, won't be seen at input
    vocab.add_token("@SLOT@",
                    seen=np.inf)  # will be seen at input, can't be produced!

    nl_tokenizer = BertTokenizer.from_pretrained(nl_mode)

    tds, vds, xds = ds[lambda x: x[2] == "train"], \
                    ds[lambda x: x[2] == "valid"], \
                    ds[lambda x: x[2] == "test"]

    seqenc = SequenceEncoder(
        vocab=vocab,
        tokenizer=lambda x: extract_info(x, onlytokens=True),
        add_start_token=False,
        add_end_token=False)
    # only training queries mark tokens as "seen"; valid/test tokens are
    # registered but unseen (same order as building each split in turn)
    for split_ds, seen in ((tds, True), (vds, False), (xds, False)):
        for example in split_ds.examples:
            seqenc.inc_build_vocab(example[1], seen=seen)
    seqenc.finalize_vocab(min_freq=0)

    def mapper(x):
        # tensorize one example: (NL token ids, FL token ids)
        nl = x[0]
        fl = x[1]
        fltoks = extract_info(fl, onlytokens=True)
        seq = seqenc.convert(fltoks, return_what="tensor")
        ret = (nl_tokenizer.encode(nl, return_tensors="pt")[0], seq)
        return ret

    tds_seq = tds.map(mapper)
    vds_seq = vds.map(mapper)
    xds_seq = xds.map(mapper)
    return tds_seq, vds_seq, xds_seq, nl_tokenizer, seqenc, orderless
def load_ds(traindomains=("restaurants", ),
            testdomain="housing",
            min_freq=1,
            mincoverage=1,
            top_k=np.inf,
            nl_mode="bert-base-uncased",
            fullsimplify=False,
            add_domain_start=True,
            useall=False):
    """Load source domains plus a held-out target domain and build both an
    abstract-LF and a full-LF sequence encoder.

    :param traindomains: source domains used for training
    :param testdomain: held-out target domain used for finetuning/testing
    :param min_freq: minimum token frequency when finalizing vocabularies
    :param mincoverage: minimum lexicon coverage for spanning-set selection
    :param top_k: maximum vocab size (np.inf = unlimited; the np.infty alias
        was removed in NumPy 2.0)
    :param nl_mode: huggingface tokenizer name for the NL side
    :param fullsimplify: use "full" instead of "light" LF simplification
    :param add_domain_start: prepend a per-domain @START/<domain>@ token
    :param useall: use all target-domain training examples instead of the
        minimal lexicon-covering set
    :return: (tds, ftds, vds, fvds, xds, nl_tokenizer, fullseqenc, absseqenc)
    """
    def tokenize_and_add_start(t, _domain):
        tokens = tree_to_lisp_tokens(t)
        starttok = f"@START/{_domain}@" if add_domain_start else "@START@"
        tokens = [starttok] + tokens
        return tokens

    allex = []
    for traindomain in traindomains:
        ds = OvernightDatasetLoader(simplify_mode="light" if not fullsimplify else "full", simplify_blocks=True, restore_reverse=DATA_RESTORE_REVERSE, validfrac=.10)\
            .load(domain=traindomain)
        allex += ds[(None, None, lambda x: x in
                     ("train", "valid"))].map(lambda x: (x[0], x[1], x[
                         2], traindomain)).examples  # don't use test examples

    testds = OvernightDatasetLoader(simplify_mode="light" if not fullsimplify else "full", simplify_blocks=True, restore_reverse=DATA_RESTORE_REVERSE)\
        .load(domain=testdomain)
    if useall:
        print("using all training examples")
        sortedexamples = testds[(None, None, "train")].examples
    else:
        # pick a minimal set of target-domain examples covering the lexicon
        sortedexamples = get_maximum_spanning_examples(
            testds[(None, None, "train")].examples,
            mincoverage=mincoverage,
            loadedex=[e for e in allex if e[2] == "train"])

    allex += testds[(
        None, None,
        "valid")].map(lambda x: (x[0], x[1], "ftvalid", testdomain)).examples
    allex += testds[(
        None, None,
        "test")].map(lambda x: (x[0], x[1], x[2], testdomain)).examples
    allex += [(ex[0], ex[1], "fttrain", testdomain) for ex in sortedexamples]

    _ds = Dataset(allex)
    ds = _ds.map(lambda x:
                 (x[0], tokenize_and_add_start(x[1], x[3]), x[2], x[3]))

    # example becomes (NL, abstract FL, full FL, split, domain)
    et = get_lf_abstract_transform(ds[lambda x: x[3] != testdomain].examples)
    ds = ds.map(lambda x: (x[0], et(x[1]), x[1], x[2], x[3]))

    # FIX: vocab locals were cross-wired (absseqenc got "seqenc_vocab",
    # fullseqenc got "absseqenc_vocab"); both are identical fresh Vocabs so
    # behavior is unchanged — names now match their users
    absseqenc_vocab = Vocab(padid=0, startid=1, endid=2, unkid=UNKID)
    fullseqenc_vocab = Vocab(padid=0, startid=1, endid=2, unkid=UNKID)
    absseqenc = SequenceEncoder(vocab=absseqenc_vocab,
                                tokenizer=lambda x: x,
                                add_start_token=False,
                                add_end_token=True)
    fullseqenc = SequenceEncoder(vocab=fullseqenc_vocab,
                                 tokenizer=lambda x: x,
                                 add_start_token=False,
                                 add_end_token=True)
    for example in ds.examples:
        absseqenc.inc_build_vocab(example[1],
                                  seen=example[3] in ("train", "fttrain"))
        fullseqenc.inc_build_vocab(example[2],
                                   seen=example[3] in ("train", "fttrain"))
    absseqenc.finalize_vocab(min_freq=min_freq, top_k=top_k)
    fullseqenc.finalize_vocab(min_freq=min_freq, top_k=top_k)

    nl_tokenizer = AutoTokenizer.from_pretrained(nl_mode)

    def tokenize(x):
        # (NL tensor, abstract FL tensor, full FL tensor, split, raw NL,
        #  abstract FL tokens, domain)
        ret = (nl_tokenizer.encode(x[0], return_tensors="pt")[0],
               absseqenc.convert(x[1], return_what="tensor"),
               fullseqenc.convert(x[2], return_what="tensor"), x[3], x[0],
               x[1], x[4])
        return ret
    tds, ftds, vds, fvds, xds = ds[(None, None, None, "train", None)].map(tokenize), \
                          ds[(None, None, None, "fttrain", None)].map(tokenize), \
                          ds[(None, None, None, "valid", None)].map(tokenize), \
                          ds[(None, None, None, "ftvalid", None)].map(tokenize), \
                          ds[(None, None, None, "test", None)].map(tokenize)
    return tds, ftds, vds, fvds, xds, nl_tokenizer, fullseqenc, absseqenc
Ejemplo n.º 11
0
def load_ds(domain="restaurants",
            nl_mode="bert-base-uncased",
            trainonvalid=False,
            noreorder=False,
            numbered=False):
    """
    Creates a dataset of examples which have
    * NL question and tensor
    * original FL tree
    * reduced FL tree with slots (this is randomly generated)
    * tensor corresponding to reduced FL tree with slots
    * mask specifying which elements in reduced FL tree are terminated
    * 2D gold that specifies whether a token/action is in gold for every position (compatibility with MML!)
    """
    orderless = ORDERLESS

    ds = OvernightDatasetLoader(simplify_mode="none").load(
        domain=domain, trainonvalid=trainonvalid)
    # ds contains 3-tuples of (input, output tree, split name)

    if not noreorder:
        ds = ds.map(lambda x:
                    (x[0], reorder_tree(x[1], orderless=orderless), x[2]))
    ds = ds.map(lambda x: (x[0], tree_to_seq(x[1]), x[2]))

    if numbered:
        ds = ds.map(lambda x: (x[0], make_numbered_tokens(x[1]), x[2]))

    vocab = Vocab(padid=0, startid=2, endid=3, unkid=1)
    # special tokens get infinite counts so they always survive finalization
    # (np.inf: the np.infty alias was removed in NumPy 2.0)
    vocab.add_token("@BOS@", seen=np.inf)
    vocab.add_token("@EOS@", seen=np.inf)
    vocab.add_token("@STOP@", seen=np.inf)

    nl_tokenizer = BertTokenizer.from_pretrained(nl_mode)

    tds, vds, xds = ds[lambda x: x[2] == "train"], \
                    ds[lambda x: x[2] == "valid"], \
                    ds[lambda x: x[2] == "test"]

    seqenc = SequenceEncoder(vocab=vocab,
                             tokenizer=lambda x: x,
                             add_start_token=False,
                             add_end_token=False)
    # only training queries mark tokens as "seen"; valid/test tokens are
    # registered but unseen (splits processed in the same order as before)
    for split_ds, seen in ((tds, True), (vds, False), (xds, False)):
        for example in split_ds.examples:
            seqenc.inc_build_vocab(example[1], seen=seen)
    seqenc.finalize_vocab(min_freq=0)

    def mapper(x):
        # tensorize one example: (NL token ids, FL token ids)
        seq = seqenc.convert(x[1], return_what="tensor")
        ret = (nl_tokenizer.encode(x[0], return_tensors="pt")[0], seq)
        return ret

    tds_seq = tds.map(mapper)
    vds_seq = vds.map(mapper)
    xds_seq = xds.map(mapper)
    return tds_seq, vds_seq, xds_seq, nl_tokenizer, seqenc, orderless
Ejemplo n.º 12
0
    def __init__(self,
                 dim,
                 vocab: Vocab = None,
                 inpvocab: Vocab = None,
                 numlayers: int = 6,
                 numheads: int = 6,
                 userelpos=False,
                 useabspos=True,
                 relposmode="basic",
                 relposrng=10,
                 mode="normal",
                 dropout: float = 0.,
                 worddropout: float = 0.,
                 maxpos=512,
                 bertname="bert-base-uncased",
                 **kw):
        """Build the decoder cell: a transformer decoder stack over either a
        BERT encoder or a from-scratch transformer encoder.

        :param dim: model dimension of the decoder (and scratch encoder)
        :param vocab: output (decoder-side) vocabulary
        :param inpvocab: input vocabulary; only used for the "vanilla"
            (non-BERT) encoder — with a real BERT it is rebuilt from the
            tokenizer below
        :param numlayers: number of transformer layers (encoder and decoder)
        :param numheads: number of attention heads
        :param userelpos: use relative position embeddings
        :param useabspos: use absolute position embeddings
        :param relposmode: relative-position flavor; only "basic" is supported
        :param relposrng: clipping range for relative positions
        :param mode: stored on self; semantics not visible in this block
        :param dropout: dropout rate (capped for the pretrained BERT encoder)
        :param worddropout: rate for replacing input words with the mask token
        :param maxpos: maximum absolute position
        :param bertname: encoder spec; "none..."/"vanilla" selects a scratch
            transformer encoder, anything else a pretrained BertModel
        """
        super(TransformerDecoderCell, self).__init__(**kw)
        self.vocab = vocab
        self.inpvocab = inpvocab
        self.vocabsize = vocab.number_of_ids()
        self.dim = dim
        self.userelpos = userelpos
        self.relposrng = relposrng
        self.useabspos = useabspos
        self.mode = mode

        # decoder transformer configuration (T5-style config object)
        decconfig = TransformerConfig(vocab_size=self.vocabsize,
                                      d_model=self.dim,
                                      d_ff=self.dim * 4,
                                      d_kv=int(self.dim / numheads),
                                      num_layers=numlayers,
                                      num_heads=numheads,
                                      dropout_rate=dropout)

        self.dec_emb = torch.nn.Embedding(self.vocabsize, decconfig.d_model)
        # single learned embedding used for slot positions
        self.slot_emb = torch.nn.Embedding(1, decconfig.d_model)

        self.relposemb = None
        if self.userelpos is True:
            if relposmode == "basic":
                self.relposemb = BasicRelPosEmb(self.dim, relposrng)
            # elif relposmode == "mod":
            #     self.relposemb = ModRelPosEmb(self.dim, relposrng, levels=4)
            else:
                raise Exception(f"Unrecognized relposmode '{relposmode}'")

        self.absposemb = None
        # absolute positions are used when requested, or as fallback when no
        # relative-position embedding is configured
        if self.relposemb is None or self.useabspos is True:
            self.absposemb = torch.nn.Embedding(maxpos, decconfig.d_model)

        decoder_config = deepcopy(decconfig)
        decoder_config.is_decoder = True
        decoder_config.use_causal_mask = True
        self.decoder = TransformerStackDecoder(decoder_config,
                                               rel_emb=self.relposemb)

        # output projection to vocabulary logits
        self.out = torch.nn.Linear(self.dim, self.vocabsize)

        # all-ones mask over the output vocabulary (exclusion list disabled)
        vocab_mask = torch.ones(self.vocabsize)
        # for excl_token in self.exclude:
        #     if excl_token in self.vocab:
        #         vocab_mask[self.vocab[excl_token]] = 0
        self.register_buffer("vocab_mask", vocab_mask)

        self.bertname = bertname
        self.encrelposemb = None
        if self.bertname.startswith("none") or self.bertname == "vanilla":
            # from-scratch transformer encoder instead of pretrained BERT
            if self.userelpos is True:
                if relposmode == "basic":
                    self.encrelposemb = BasicRelPosEmb(self.dim, relposrng)
                # elif relposmode == "mod":
                #     self.relposemb = ModRelPosEmb(self.dim, relposrng, levels=4)
                else:
                    raise Exception(f"Unrecognized relposmode '{relposmode}'")
            # map "noneXXX" back onto "bertXXX" to pick a tokenizer
            bname = "bert" + self.bertname[4:]
            if self.bertname == "vanilla":
                inpvocabsize = inpvocab.number_of_ids()
                self.inpworddropout = WordDropout(
                    worddropout, self.inpvocab[self.inpvocab.masktoken],
                    [self.inpvocab[self.inpvocab.padtoken]])
            else:
                tokenizer = AutoTokenizer.from_pretrained(bname)
                inpvocabsize = tokenizer.vocab_size
                # never drop out [CLS], [SEP] or padding
                self.inpworddropout = WordDropout(
                    worddropout, self.inpvocab[self.inpvocab.masktoken], [
                        self.inpvocab["[CLS]"], self.inpvocab["[SEP]"],
                        self.inpvocab[self.inpvocab.padtoken]
                    ])
            encconfig = TransformerConfig(vocab_size=inpvocabsize,
                                          d_model=self.dim,
                                          d_ff=self.dim * 4,
                                          d_kv=int(self.dim / numheads),
                                          num_layers=numlayers,
                                          num_heads=numheads,
                                          dropout_rate=dropout)
            encemb = TransformerEmbeddings(encconfig.vocab_size,
                                           encconfig.d_model,
                                           dropout=dropout,
                                           max_position_embeddings=maxpos,
                                           useabspos=useabspos)
            self.encoder_model = TransformerStack(encconfig,
                                                  encemb,
                                                  rel_emb=self.encrelposemb)
        else:
            # pretrained BERT encoder; dropout is capped to keep pretrained
            # weights usable
            self.encoder_model = BertModel.from_pretrained(
                self.bertname,
                hidden_dropout_prob=min(dropout, 0.2),
                attention_probs_dropout_prob=min(dropout, 0.1))
            tokenizer = AutoTokenizer.from_pretrained(self.bertname)
            inpvocabsize = tokenizer.vocab_size
            # rebuild the input vocab from the BERT tokenizer (overrides the
            # inpvocab argument)
            self.inpvocab = Vocab()
            for tok, id in tokenizer.vocab.items():
                self.inpvocab.D[tok] = id
            self.inpvocab.masktoken = "[MASK]"
            self.inpvocab.unktoken = "[UNK]"
            self.inpvocab.padtoken = "[PAD]"
            self.inpworddropout = WordDropout(
                worddropout, self.inpvocab[self.inpvocab.masktoken], [
                    self.inpvocab["[CLS]"], self.inpvocab["[SEP]"],
                    self.inpvocab[self.inpvocab.padtoken]
                ])

        # linear adapter when encoder and decoder widths differ
        self.adapter = None
        if self.encoder_model.config.hidden_size != decoder_config.d_model:
            self.adapter = torch.nn.Linear(
                self.encoder_model.config.hidden_size,
                decoder_config.d_model,
                bias=False)

        # word dropout on the decoder side (never drops padding)
        self.worddropout = WordDropout(worddropout,
                                       self.vocab[self.vocab.masktoken],
                                       [self.vocab[self.vocab.padtoken]])

        self.reset_parameters()