Example #1
    def _initialize(self, p, sentence_encoder:SequenceEncoder, min_freq:int):
        self.data = {}
        self.sentence_encoder = sentence_encoder

        jp = os.path.join(p, "lcquad_dataset.json")
        with open(jp, "r") as f:
            examples = ujson.load(f)

        examples = self.lines_to_examples(examples)

        questions, queries = tuple(zip(*examples))
        trainlen = int(round(0.8 * len(examples)))
        validlen = int(round(0.1 * len(examples)))
        testlen = int(round(0.1 * len(examples)))
        splits = ["train"] * trainlen + ["valid"] * validlen + ["test"] * testlen
        random.seed(123456)
        random.shuffle(splits)
        assert(len(splits) == len(examples))

        self.query_encoder = SequenceEncoder(tokenizer=partial(tree_query_tokenizer, strtok=sentence_encoder.tokenizer), add_end_token=True)

        # build vocabularies
        for i, (question, query, split) in enumerate(zip(questions, queries, splits)):
            self.sentence_encoder.inc_build_vocab(question, seen=split=="train")
            self.query_encoder.inc_build_vocab(query, seen=split=="train")
        for word, wordid in self.sentence_encoder.vocab.D.items():
            self.query_encoder.vocab.add_token(word, seen=False)
        self.sentence_encoder.finalize_vocab(min_freq=min_freq)
        self.query_encoder.finalize_vocab(min_freq=min_freq)

        self.build_data(questions, queries, splits)
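Example #1 shows the vocabulary-building lifecycle that recurs throughout this page: construct a SequenceEncoder with a tokenizer, feed every example through inc_build_vocab (only train occurrences count as "seen"), finalize the vocabulary with a frequency threshold, then convert strings to ids. The condensed sketch below isolates that pattern with made-up placeholder data; it assumes the parseq-style SequenceEncoder API used in these examples and is illustrative only.

# Minimal sketch of the recurring SequenceEncoder lifecycle (placeholder data,
# parseq-style API as used in the examples on this page).
from parseq.vocab import SequenceEncoder

questions = ["what is the capital of idaho", "how many rivers are in texas"]
queries = ["answer ( capital ( idaho ) )", "answer ( count ( river ( texas ) ) )"]
splits = ["train", "test"]

sentence_encoder = SequenceEncoder(tokenizer=lambda x: x.split())
query_encoder = SequenceEncoder(tokenizer=lambda x: x.split(), add_end_token=True)

# accumulate token counts; only "seen" (train) occurrences count toward the kept vocab
for question, query, split in zip(questions, queries, splits):
    sentence_encoder.inc_build_vocab(question, seen=split == "train")
    query_encoder.inc_build_vocab(query, seen=split == "train")

# freeze both vocabularies, dropping tokens below the frequency threshold
sentence_encoder.finalize_vocab(min_freq=1)
query_encoder.finalize_vocab(min_freq=1)

# after finalization, convert() maps a string to a tensor of ids and/or tokens
inp_tensor, inp_tokens = sentence_encoder.convert(questions[0], return_what="tensor,tokens")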
Example #2
    def _initialize(self, p, sentence_encoder:SequenceEncoder, min_freq:int):
        self.data = {}
        self.sentence_encoder = sentence_encoder
        trainlines = [x.strip() for x in open(os.path.join(p, "train.txt"), "r").readlines()]
        testlines = [x.strip() for x in open(os.path.join(p, "test.txt"), "r").readlines()]
        if self.cvfolds is None:
            splits = ["train"]*len(trainlines) + ["test"] * len(testlines)
        else:
            cvsplit_len = len(trainlines)/self.cvfolds
            splits = []
            for i in range(0, self.cvfolds):
                splits += [i] * round(cvsplit_len * (i+1) - len(splits))
            random.shuffle(splits)
            splits = ["valid" if x == self.testfold else "train" for x in splits]
            splits = splits + ["test"] * len(testlines)
        questions, queries = zip(*[x.split("\t") for x in trainlines])
        testqs, testxs = zip(*[x.split("\t") for x in testlines])
        questions += testqs
        queries += testxs

        self.query_encoder = SequenceEncoder(tokenizer=partial(basic_query_tokenizer, strtok=sentence_encoder.tokenizer), add_end_token=True)

        # build vocabularies
        for i, (question, query, split) in enumerate(zip(questions, queries, splits)):
            self.sentence_encoder.inc_build_vocab(question, seen=split=="train")
            self.query_encoder.inc_build_vocab(query, seen=split=="train")
        # for word, wordid in self.sentence_encoder.vocab.D.items():
        #     self.query_encoder.vocab.add_token(word, seen=False)
        self.sentence_encoder.finalize_vocab(min_freq=min_freq, keep_rare=True)
        self.query_encoder.finalize_vocab(min_freq=min_freq)

        token_specs = self.build_token_specs(queries)
        self.token_specs = token_specs

        self.build_data(questions, queries, splits)
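Example #2 assigns cross-validation folds with a cumulative-rounding trick: fold i gets round(cvsplit_len * (i + 1) - len(splits)) entries, so the fold sizes always add up to the number of training lines even when it is not divisible by the number of folds. A self-contained illustration (plain Python, hypothetical helper name make_folds) follows.

# Standalone illustration of the fold-assignment arithmetic from Example #2.
def make_folds(n_examples: int, cvfolds: int):
    cvsplit_len = n_examples / cvfolds
    splits = []
    for i in range(cvfolds):
        # cumulative target round(cvsplit_len * (i + 1)) keeps the total exact
        splits += [i] * round(cvsplit_len * (i + 1) - len(splits))
    return splits

folds = make_folds(10, 3)
print(folds)              # [0, 0, 0, 1, 1, 1, 1, 2, 2, 2]
print(len(folds) == 10)   # True: fold sizes always sum to n_examples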
Example #3
    def _initialize(self, p, sentence_encoder: SequenceEncoder, min_freq: int):
        self.data = {}
        self.sentence_encoder = sentence_encoder
        trainlines = [
            x.strip()
            for x in open(os.path.join(p, "train.txt"), "r").readlines()
        ]
        testlines = [
            x.strip()
            for x in open(os.path.join(p, "test.txt"), "r").readlines()
        ]
        splits = ["train"] * len(trainlines) + ["test"] * len(testlines)
        questions, queries = zip(*[x.split("\t") for x in trainlines])
        testqs, testxs = zip(*[x.split("\t") for x in testlines])
        questions += testqs
        queries += testxs

        self.query_encoder = SequenceEncoder(tokenizer=partial(
            basic_query_tokenizer, strtok=sentence_encoder.tokenizer),
                                             add_end_token=True)

        # build vocabularies
        for i, (question, query,
                split) in enumerate(zip(questions, queries, splits)):
            self.sentence_encoder.inc_build_vocab(question,
                                                  seen=split == "train")
            self.query_encoder.inc_build_vocab(query, seen=split == "train")
        # for word, wordid in self.sentence_encoder.vocab.D.items():
        #     self.query_encoder.vocab.add_token(word, seen=False)
        self.sentence_encoder.finalize_vocab(min_freq=min_freq, keep_rare=True)
        self.query_encoder.finalize_vocab(min_freq=min_freq)

        self.build_data(questions, queries, splits)
Example #4
    def _initialize(self, p):
        self.data = {}
        with open(os.path.join(p, "trainpreds.json")) as f:
            trainpreds = ujson.load(f)
        with open(os.path.join(p, "testpreds.json")) as f:
            testpreds = ujson.load(f)
        splits = ["train"] * len(trainpreds) + ["test"] * len(testpreds)
        preds = trainpreds + testpreds

        self.sentence_encoder = SequenceEncoder(tokenizer=lambda x: x.split())
        self.query_encoder = SequenceEncoder(tokenizer=lambda x: x.split())

        # build vocabularies
        for i, (example, split) in enumerate(zip(preds, splits)):
            self.sentence_encoder.inc_build_vocab(" ".join(
                example["sentence"]),
                                                  seen=split == "train")
            self.query_encoder.inc_build_vocab(" ".join(example["gold"]),
                                               seen=split == "train")
            for can in example["candidates"]:
                self.query_encoder.inc_build_vocab(" ".join(can["tokens"]),
                                                   seen=False)
        # for word, wordid in self.sentence_encoder.vocab.D.items():
        #     self.query_encoder.vocab.add_token(word, seen=False)
        self.sentence_encoder.finalize_vocab()
        self.query_encoder.finalize_vocab()

        self.build_data(preds, splits)
Example #5
    def _initialize(self, p, bert_tokenizer, min_freq: int):
        self.data = {}
        self.bert_vocab = Vocab()
        self.bert_vocab.set_dict(bert_tokenizer.vocab)
        self.sentence_encoder = SequenceEncoder(
            lambda x: bert_tokenizer.tokenize(f"[CLS] {x} [SEP]"),
            vocab=self.bert_vocab)
        trainlines = [
            x for x in ujson.load(
                open(os.path.join(p, f"geo-{self.train_lang}.json"), "r"))
        ]
        testlines = [
            x for x in ujson.load(
                open(os.path.join(p, f"geo-{self.train_lang}.json"), "r"))
        ]
        trainlines = [x for x in trainlines if x["split"] == "train"]
        testlines = [x for x in testlines if x["split"] == "test"]
        if self.cvfolds is None:
            splits = ["train"] * len(trainlines) + ["test"] * len(testlines)
        else:
            cvsplit_len = len(trainlines) / self.cvfolds
            splits = []
            for i in range(0, self.cvfolds):
                splits += [i] * round(cvsplit_len * (i + 1) - len(splits))
            random.shuffle(splits)
            splits = [
                "valid" if x == self.testfold else "train" for x in splits
            ]
            splits = splits + ["test"] * len(testlines)
        questions = [x["nl"] for x in trainlines]
        queries = [x["mrl"] for x in trainlines]
        xquestions = [x["nl"] for x in testlines]
        xqueries = [x["mrl"] for x in testlines]
        questions += xquestions
        queries += xqueries

        # initialize output vocabulary
        outvocab = Vocab()
        for token, bertid in self.bert_vocab.D.items():
            outvocab.add_token(token, seen=False)

        self.query_encoder = SequenceEncoder(tokenizer=partial(
            basic_query_tokenizer, strtok=bert_tokenizer),
                                             vocab=outvocab,
                                             add_end_token=True)

        # build vocabularies
        for i, (question, query,
                split) in enumerate(zip(questions, queries, splits)):
            self.query_encoder.inc_build_vocab(query, seen=split == "train")
        keeptokens = set(self.bert_vocab.D.keys())
        self.query_encoder.finalize_vocab(min_freq=min_freq,
                                          keep_tokens=keeptokens)

        token_specs = self.build_token_specs(queries)
        self.token_specs = token_specs

        self.build_data(questions, queries, splits)
Example #6
    def _initialize(self, p, domain, sentence_encoder: SequenceEncoder,
                    min_freq: int):
        self.data = {}
        self.sentence_encoder = sentence_encoder

        trainexamples, testexamples = None, None
        if self._usecache:
            try:
                trainexamples, testexamples = self._load_cached()
            except (IOError, ValueError) as e:
                pass

        if trainexamples is None:

            trainlines = [
                x.strip() for x in open(
                    os.path.join(p, f"{domain}.paraphrases.train.examples"),
                    "r").readlines()
            ]
            testlines = [
                x.strip() for x in open(
                    os.path.join(p, f"{domain}.paraphrases.test.examples"),
                    "r").readlines()
            ]

            trainexamples = self.lines_to_examples(trainlines)
            testexamples = self.lines_to_examples(testlines)

            if self._usecache:
                self._cache(trainexamples, testexamples)

        questions, queries = tuple(zip(*(trainexamples + testexamples)))
        trainlen = int(round(0.8 * len(trainexamples)))
        validlen = int(round(0.2 * len(trainexamples)))
        splits = ["train"] * trainlen + ["valid"] * validlen
        # random.seed(1223)
        random.shuffle(splits)
        assert (len(splits) == len(trainexamples))
        splits = splits + ["test"] * len(testexamples)

        self.query_encoder = SequenceEncoder(tokenizer=partial(
            tree_query_tokenizer, strtok=sentence_encoder.tokenizer),
                                             add_end_token=True)

        # build vocabularies
        for i, (question, query,
                split) in enumerate(zip(questions, queries, splits)):
            self.sentence_encoder.inc_build_vocab(question,
                                                  seen=split == "train")
            self.query_encoder.inc_build_vocab(query, seen=split == "train")
        for word, wordid in self.sentence_encoder.vocab.D.items():
            self.query_encoder.vocab.add_token(word, seen=False)
        self.sentence_encoder.finalize_vocab(min_freq=min_freq)
        self.query_encoder.finalize_vocab(min_freq=min_freq)

        self.build_data(questions, queries, splits)
Example #7
class GeoQueryDatasetSub(GeoQueryDatasetFunQL):
    def __init__(self,
                 p="../../datasets/geo880dong/",
                 sentence_encoder: SequenceEncoder = None,
                 min_freq: int = 2,
                 **kw):
        super(GeoQueryDatasetSub, self).__init__(p, sentence_encoder, min_freq,
                                                 **kw)

    def _initialize(self, p, sentence_encoder: SequenceEncoder, min_freq: int):
        self.data = {}
        self.sentence_encoder = sentence_encoder
        trainlines = [
            x.strip()
            for x in open(os.path.join(p, "train.txt"), "r").readlines()
        ]
        testlines = [
            x.strip()
            for x in open(os.path.join(p, "test.txt"), "r").readlines()
        ]
        splits = ["train"] * len(trainlines) + ["test"] * len(testlines)
        questions, queries = zip(*[x.split("\t") for x in trainlines])
        testqs, testxs = zip(*[x.split("\t") for x in testlines])
        questions += testqs
        queries += testxs

        queries = self.lisp2prolog(queries)

        self.query_encoder = SequenceEncoder(tokenizer=partial(
            basic_query_tokenizer, strtok=sentence_encoder.tokenizer),
                                             add_end_token=True)

        # build vocabularies
        for i, (question, query,
                split) in enumerate(zip(questions, queries, splits)):
            self.sentence_encoder.inc_build_vocab(question,
                                                  seen=split == "train")
            self.query_encoder.inc_build_vocab(query, seen=split == "train")
        for word, wordid in self.sentence_encoder.vocab.D.items():
            self.query_encoder.vocab.add_token(word, seen=False)
        self.sentence_encoder.finalize_vocab(min_freq=min_freq)
        self.query_encoder.finalize_vocab(min_freq=min_freq)

        self.build_data(questions, queries, splits)

    def lisp2prolog(self, data: List[str]):
        ret = []
        for x in data:
            pas = lisp_to_pas(x)
            prolog = pas_to_prolog(pas)
            ret.append(prolog)
        return ret
Example #8
    def _initialize(self, p, sentence_encoder: SequenceEncoder, min_freq: int):
        self.data = {}
        self.sentence_encoder = sentence_encoder
        questions = [
            x.strip()
            for x in open(os.path.join(p, "questions.txt"), "r").readlines()
        ]
        queries = [
            x.strip()
            for x in open(os.path.join(p, "queries.funql"), "r").readlines()
        ]
        trainidxs = set([
            int(x.strip()) for x in open(os.path.join(p, "train_indexes.txt"),
                                         "r").readlines()
        ])
        testidxs = set([
            int(x.strip()) for x in open(os.path.join(p, "test_indexes.txt"),
                                         "r").readlines()
        ])
        splits = [None] * len(questions)
        for trainidx in trainidxs:
            splits[trainidx] = "train"
        for testidx in testidxs:
            splits[testidx] = "test"
        if any(split is None for split in splits):
            print(
                f"{len([split for split in splits if split is None])} examples not assigned to any split"
            )

        self.query_encoder = SequenceEncoder(tokenizer=partial(
            basic_query_tokenizer, strtok=sentence_encoder.tokenizer),
                                             add_end_token=True)

        # build vocabularies
        unktokens = set()
        for i, (question, query,
                split) in enumerate(zip(questions, queries, splits)):
            question_tokens = self.sentence_encoder.inc_build_vocab(
                question, seen=split == "train")
            query_tokens = self.query_encoder.inc_build_vocab(
                query, seen=split == "train")
            unktokens |= set(query_tokens) - set(question_tokens)
        for word in self.sentence_encoder.vocab.counts.keys():
            self.query_encoder.vocab.add_token(word, seen=False)
        self.sentence_encoder.finalize_vocab(min_freq=min_freq, keep_rare=True)
        self.query_encoder.finalize_vocab(min_freq=min_freq, keep_rare=True)
        unktokens = unktokens & self.query_encoder.vocab.rare_tokens

        self.build_data(questions, queries, splits, unktokens=unktokens)
Example #9
    def __init__(self,
                 p="../../datasets/geoquery/",
                 sentence_encoder: SequenceEncoder = None,
                 min_freq: int = 2,
                 **kw):
        super(GeoQueryDataset, self).__init__(**kw)
        self.data = {}
        self.sentence_encoder = sentence_encoder
        questions = [
            x.strip()
            for x in open(os.path.join(p, "questions.txt"), "r").readlines()
        ]
        queries = [
            x.strip()
            for x in open(os.path.join(p, "queries.funql"), "r").readlines()
        ]
        trainidxs = set([
            int(x.strip()) for x in open(os.path.join(p, "train_indexes.txt"),
                                         "r").readlines()
        ])
        testidxs = set([
            int(x.strip()) for x in open(os.path.join(p, "test_indexes.txt"),
                                         "r").readlines()
        ])
        splits = [None] * len(questions)
        for trainidx in trainidxs:
            splits[trainidx] = "train"
        for testidx in testidxs:
            splits[testidx] = "test"
        if any(split is None for split in splits):
            print(
                f"{len([split for split in splits if split is None])} examples not assigned to any split"
            )

        self.query_encoder = SequenceEncoder(tokenizer=partial(
            basic_query_tokenizer, strtok=sentence_encoder.tokenizer),
                                             add_end_token=True)

        # build vocabularies
        for i, (question, query,
                split) in enumerate(zip(questions, queries, splits)):
            self.sentence_encoder.inc_build_vocab(question,
                                                  seen=split == "train")
            self.query_encoder.inc_build_vocab(query, seen=split == "train")
        self.sentence_encoder.finalize_vocab(min_freq=min_freq)
        self.query_encoder.finalize_vocab(min_freq=min_freq)

        self.build_data(questions, queries, splits)
Example #10
def try_dataset():
    tt = q.ticktock("dataset")
    tt.tick("building dataset")
    ds = GeoDataset(sentence_encoder=SequenceEncoder(tokenizer=lambda x: x.split()))
    train_dl = ds.dataloader("train", batsize=20)
    test_dl = ds.dataloader("test", batsize=20)
    examples = set()
    examples_list = []
    duplicates = []
    testexamples = set()
    testexamples_list = []
    testduplicates = []
    for b in train_dl:
        for i in range(len(b)):
            example = b.inp_strings[i] + " --> " + str(b.gold_trees[i])
            if example in examples:
                duplicates.append(example)
            examples.add(example)
            examples_list.append(example)
            # print(example)
    for b in test_dl:
        for i in range(len(b)):
            example = b.inp_strings[i] + " --> " + str(b.gold_trees[i])
            if example in examples:
                testduplicates.append(example)
            testexamples.add(example)
            testexamples_list.append(example)

    print(f"duplicates within train: {len(duplicates)} from {len(examples_list)} total")
    print(f"duplicates from test to train: {len(testduplicates)} from {len(testexamples_list)} total:")
    for x in testduplicates:
        print(x)
    tt.tock("dataset built")
Example #11
    def _initialize(self, p, xlmr, min_freq:int):
        self.data = {}
        self.xlmr = xlmr
        self.xlmr_vocab = Vocab()
        self.xlmr_vocab.set_dict(xlmr.model.decoder.dictionary.indices)
        self.sentence_encoder = SequenceEncoder(lambda x: f"<s> {xlmr.bpe.encode(x)} </s>".split(), vocab=self.xlmr_vocab)
        trainlines = [x for x in ujson.load(open(os.path.join(p, f"geo-{self.train_lang}.json"), "r"))]
        testlines = [x for x in ujson.load(open(os.path.join(p, f"geo-{self.test_lang}.json"), "r"))]
        trainlines = [x for x in trainlines if x["split"] == "train"]
        testlines = [x for x in testlines if x["split"] == "test"]
        if self.cvfolds is None:
            splits = ["train"]*len(trainlines) + ["test"] * len(testlines)
        else:
            cvsplit_len = len(trainlines)/self.cvfolds
            splits = []
            for i in range(0, self.cvfolds):
                splits += [i] * round(cvsplit_len * (i+1) - len(splits))
            random.shuffle(splits)
            splits = ["valid" if x == self.testfold else "train" for x in splits]
            splits = splits + ["test"] * len(testlines)
        questions = [x["nl"] for x in trainlines]
        queries = [x["mrl"] for x in trainlines]
        xquestions = [x["nl"] for x in testlines]
        xqueries = [x["mrl"] for x in testlines]
        questions += xquestions
        queries += xqueries

        # initialize output vocabulary
        outvocab = Vocab()
        # for token, bertid in self.xlmr_vocab.D.items():
        #     outvocab.add_token(token, seen=False)

        self.query_encoder = SequenceEncoder(tokenizer=partial(basic_query_tokenizer, strtok=lambda x: xlmr.bpe.encode(x).split()), vocab=outvocab, add_end_token=True)

        # build vocabularies
        for i, (question, query, split) in enumerate(zip(questions, queries, splits)):
            question_tokens = self.sentence_encoder.convert(question, return_what="tokens")[0]
            for token in question_tokens:
                self.query_encoder.vocab.add_token(token, seen=False)
            self.query_encoder.inc_build_vocab(query, seen=split=="train")
        keeptokens = set(self.xlmr_vocab.D.keys())
        self.query_encoder.finalize_vocab(min_freq=min_freq, keep_tokens=keeptokens)

        token_specs = self.build_token_specs(queries)
        self.token_specs = token_specs

        self.build_data(questions, queries, splits)
Example #12
    def __init__(self,
                 inp_strings:List[str]=None,
                 gold_strings:List[str]=None,
                 inp_tensor:torch.Tensor=None,
                 gold_tensor:torch.Tensor=None,
                 inp_tokens:List[List[str]]=None,
                 gold_tokens:List[List[str]]=None,
                 sentence_encoder:SequenceEncoder=None,
                 query_encoder:SequenceEncoder=None,
                 **kw):
        if inp_strings is None:
            super(BasicDecoderState, self).__init__(**kw)
        else:
            kw = kw.copy()
            kw.update({"inp_strings": np.asarray(inp_strings), "gold_strings": np.asarray(gold_strings)})
            super(BasicDecoderState, self).__init__(**kw)

            self.sentence_encoder = sentence_encoder
            self.query_encoder = query_encoder

            # self.set(followed_actions_str = np.asarray([None for _ in self.inp_strings]))
            # for i in range(len(self.followed_actions_str)):
            #     self.followed_actions_str[i] = []
            self.set(followed_actions = torch.zeros(len(inp_strings), 0, dtype=torch.long))
            self.set(_is_terminated = np.asarray([False for _ in self.inp_strings]))
            self.set(_timesteps = np.asarray([0 for _ in self.inp_strings]))

            if sentence_encoder is not None:
                x = [sentence_encoder.convert(x, return_what="tensor,tokens") for x in self.inp_strings]
                x = list(zip(*x))
                inp_tokens = np.asarray([None for _ in range(len(x[1]))], dtype=object)
                for i, inp_tokens_e in enumerate(x[1]):
                    inp_tokens[i] = tuple(inp_tokens_e)
                x = {"inp_tensor": batchstack(x[0]),
                     "inp_tokens": inp_tokens}
                self.set(**x)
            if self.gold_strings is not None:
                if query_encoder is not None:
                    x = [query_encoder.convert(x, return_what="tensor,tokens") for x in self.gold_strings]
                    x = list(zip(*x))
                    gold_tokens = np.asarray([None for _ in range(len(x[1]))])
                    for i, gold_tokens_e in enumerate(x[1]):
                        gold_tokens[i] = tuple(gold_tokens_e)
                    x = {"gold_tensor": batchstack(x[0]),
                         "gold_tokens": gold_tokens}
                    self.set(**x)
Example #13
def try_perturbed_generated_dataset():
    torch.manual_seed(1234)
    ovd = OvernightDatasetLoader().load()
    govd = PCFGDataset(OvernightPCFGBuilder()
                       .build(ovd[(None, None, lambda x: x in {"train", "valid"})]
                              .map(lambda f: f[1]).examples),
                       N=10000)

    print(govd[0])
    # print(govd[lambda x: True][0])

    # print(govd[:])
    # create vocab from pcfg
    vocab = build_vocab_from_pcfg(govd._pcfg)
    seqenc = SequenceEncoder(vocab=vocab, tokenizer=tree_to_lisp_tokens)
    spanmasker = SpanMasker(seed=12345667)
    treemasker = SubtreeMasker(p=.05, seed=2345677)

    perturbed_govd = govd.cache()\
        .map(lambda x: (seqenc.convert(x, "tensor"), x)) \
        .map(lambda x: x + (seqenc.convert(x[-1], "tokens"),)) \
        .map(lambda x: x + (spanmasker(x[-1]),)) \
        .map(lambda x: x + (seqenc.convert(x[-1], "tensor"),)) \
        .map(lambda x: (x[-1], x[0]))

    dl = DataLoader(perturbed_govd, batch_size=10, shuffle=True, collate_fn=pad_and_default_collate)
    batch = next(iter(dl))
    print(batch)
    print(vocab.tostr(batch[0][1]))
    print(vocab.tostr(batch[1][1]))

    tt = q.ticktock()
    tt.tick("first run")
    for i in range(10000):
        y = perturbed_govd[i]
        if i < 10:
            print(f"{y[0]}\n{y[-2]}")
    tt.tock("first run done")
    tt.tick("second run")
    for i in range(10000):
        y = perturbed_govd[i]
        if i < 10:
            print(f"{y[0]}\n{y[-2]}")
    tt.tock("second run done")
Example #14
    def __init__(self, maxlen=10, NperY=10, **kw):
        super(ConditionalRecallDataset, self).__init__(**kw)
        self.data = {}
        self.NperY, self.maxlen = NperY, maxlen
        self._seqs, self._ys = gen_data(self.maxlen, self.NperY)
        self.encoder = SequenceEncoder(tokenizer=lambda x: list(x))

        for seq, y in zip(self._seqs, self._ys):
            self.encoder.inc_build_vocab(seq)
            self.encoder.inc_build_vocab(y)

        self.N = len(self._seqs)
        N = self.N

        splits = ["train"] * int(N * 0.8) + ["valid"] * int(
            N * 0.1) + ["test"] * int(N * 0.1)
        random.shuffle(splits)

        self.encoder.finalize_vocab()
        self.build_data(self._seqs, self._ys, splits)
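One detail worth noting in the int()-based split above: the three slices do not always sum to N, so len(splits) can be slightly smaller than the number of sequences, and the zip in build_data (shown in full in Example #20 below) then silently drops the trailing examples. A quick arithmetic check:

# Quick check of the 80/10/10 split arithmetic used above.
N = 99
splits = ["train"] * int(N * 0.8) + ["valid"] * int(N * 0.1) + ["test"] * int(N * 0.1)
print(len(splits))   # 97 -- two sequences end up without a split label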
Example #15
    def test_beam_transition(self):
        texts = [
            "i went to chocolate @END@", "awesome is @END@",
            "the meaning of life @END@"
        ]
        from parseq.vocab import SequenceEncoder
        se = SequenceEncoder(tokenizer=lambda x: x.split())
        for t in texts:
            se.inc_build_vocab(t)
        se.finalize_vocab()
        x = BasicDecoderState(texts, texts, se, se)
        x.start_decoding()

        class Model(TransitionModel):
            def forward(self, x: BasicDecoderState):
                outprobs = torch.randn(len(x),
                                       x.query_encoder.vocab.number_of_ids())
                outprobs = torch.nn.functional.log_softmax(outprobs, -1)
                return outprobs, x

        model = Model()

        beamsize = 50
        maxtime = 10
        beam_xs = [
            x.make_copy(detach=False, deep=True) for _ in range(beamsize)
        ]
        beam_states = BeamState(beam_xs)

        print(len(beam_xs))
        print(len(beam_states))

        bt = BeamTransition(model, beamsize, maxtime=maxtime)
        i = 0
        _, _, y, _ = bt(x, i)
        i += 1
        _, _, y, _ = bt(y, i)

        all_terminated = y.all_terminated()
        while not all_terminated:
            _, predactions, y, all_terminated = bt(y, i)
            i += 1

        print("timesteps done:")
        print(i)
        print(y)
        print(predactions[0])
        for i in range(beamsize):
            print("-")
            # print(y.bstates[0].get(i).followed_actions)
            # print(predactions[0, i, :])
            pa = predactions[0, i, :]
            # print((pa == se.vocab[se.vocab.endtoken]).cumsum(0))
            pa = ((pa == se.vocab[se.vocab.endtoken]).long().cumsum(0) <
                  1).long() * pa
            yb = y.bstates[0].get(i).followed_actions[0, :]
            yb = yb * (yb != se.vocab[se.vocab.endtoken]).long()
            print(pa)
            print(yb)
            self.assertTrue(torch.allclose(pa, yb))
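The assertions at the end of test_beam_transition compare the beam's followed actions against predactions only up to the first @END@ token, using a cumsum-based mask. A minimal, self-contained illustration of that masking trick (with a made-up END_ID) follows.

# Standalone illustration of the cumsum masking used in the assertions above:
# everything from the first end token onward is zeroed out.
import torch

END_ID = 3  # hypothetical id of the @END@ token
pa = torch.tensor([7, 9, 3, 5, 3, 0])

mask = ((pa == END_ID).long().cumsum(0) < 1).long()
print(mask)        # tensor([1, 1, 0, 0, 0, 0])
print(mask * pa)   # tensor([7, 9, 0, 0, 0, 0])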
Example #16
    def test_beam_search(self):
        texts = [
            "i went to chocolate @END@", "awesome is @END@",
            "the meaning of life @END@"
        ]
        from parseq.vocab import SequenceEncoder
        se = SequenceEncoder(tokenizer=lambda x: x.split())
        for t in texts:
            se.inc_build_vocab(t)
        se.finalize_vocab()
        x = BasicDecoderState(texts, texts, se, se)
        x.start_decoding()

        class Model(TransitionModel):
            def forward(self, x: BasicDecoderState):
                outprobs = torch.randn(len(x),
                                       x.query_encoder.vocab.number_of_ids())
                outprobs = torch.nn.functional.log_softmax(outprobs, -1)
                return outprobs, x

        model = Model()

        beamsize = 50
        maxtime = 10
        bs = BeamDecoder(model,
                         eval=[CELoss(ignore_index=0),
                               SeqAccuracies()],
                         eval_beam=[BeamSeqAccuracies()],
                         beamsize=beamsize,
                         maxtime=maxtime)

        y = bs(x)
        print(y)
Example #17
    def test_free_decoder(self):
        texts = [
            "i went to chocolate a b c d e f g h i j k l m n o p q r @END@",
            "awesome is @END@", "the meaning of life @END@"
        ]
        se = SequenceEncoder(tokenizer=lambda x: x.split())
        for t in texts:
            se.inc_build_vocab(t)
        se.finalize_vocab()

        texts = ["@END@"] * 100

        x = BasicDecoderState(texts, texts, se, se)

        class Model(TransitionModel):
            def forward(self, x: BasicDecoderState):
                outprobs = torch.rand(len(x),
                                      x.query_encoder.vocab.number_of_ids())
                return outprobs, x

        MAXTIME = 10
        dec = SeqDecoder(FreerunningTransition(Model(), maxtime=MAXTIME))

        y = dec(x)
        print(y[1].followed_actions)
        print(max([len(y[1].followed_actions[i]) for i in range(len(y[1]))]))
        print(min([len(y[1].followed_actions[i]) for i in range(len(y[1]))]))
        self.assertTrue(
            max([len(y[1].followed_actions[i])
                 for i in range(len(y[1]))]) <= MAXTIME + 1)
Example #18
def load_ds(domain="restaurants",
            min_freq=0,
            top_k=np.inf,
            nl_mode="bart-large",
            trainonvalid=False):
    ds = OvernightDatasetLoader(simplify_mode="light").load(
        domain=domain, trainonvalid=trainonvalid)

    seqenc_vocab = Vocab(padid=1, startid=0, endid=2, unkid=UNKID)
    seqenc = SequenceEncoder(vocab=seqenc_vocab,
                             tokenizer=tree_to_lisp_tokens,
                             add_start_token=True,
                             add_end_token=True)
    for example in ds.examples:
        query = example[1]
        seqenc.inc_build_vocab(query, seen=example[2] == "train")
    seqenc.finalize_vocab(min_freq=min_freq, top_k=top_k)

    nl_tokenizer = AutoTokenizer.from_pretrained(nl_mode)

    def tokenize(x):
        ret = (nl_tokenizer.encode(x[0], return_tensors="pt")[0],
               seqenc.convert(x[1], return_what="tensor"), x[2], x[0], x[1])
        return ret
    tds, vds, xds = ds[(None, None, "train")].map(tokenize), \
                    ds[(None, None, "valid")].map(tokenize), \
                    ds[(None, None, "test")].map(tokenize)
    return tds, vds, xds, nl_tokenizer, seqenc
Example #19
    def test_tf_decoder(self):
        texts = [
            "i went to chocolate @END@", "awesome is @END@",
            "the meaning of life @END@"
        ]
        se = SequenceEncoder(tokenizer=lambda x: x.split())
        for t in texts:
            se.inc_build_vocab(t)
        se.finalize_vocab()
        x = BasicDecoderState(texts, texts, se, se)

        class Model(TransitionModel):
            def forward(self, x: BasicDecoderState):
                outprobs = torch.rand(len(x),
                                      x.query_encoder.vocab.number_of_ids())
                return outprobs, x

        dec = SeqDecoder(TFTransition(Model()))

        y = dec(x)
        print(y[1].followed_actions)
        outactions = y[1].followed_actions.detach().cpu().numpy()
        print(outactions[0])
        print(se.vocab.print(outactions[0]))
        print(se.vocab.print(outactions[1]))
        print(se.vocab.print(outactions[2]))
        self.assertTrue(se.vocab.print(outactions[0]) == texts[0])
        self.assertTrue(se.vocab.print(outactions[1]) == texts[1])
        self.assertTrue(se.vocab.print(outactions[2]) == texts[2])
Example #20
class ConditionalRecallDataset(object):
    def __init__(self, maxlen=10, NperY=10, **kw):
        super(ConditionalRecallDataset, self).__init__(**kw)
        self.data = {}
        self.NperY, self.maxlen = NperY, maxlen
        self._seqs, self._ys = gen_data(self.maxlen, self.NperY)
        self.encoder = SequenceEncoder(tokenizer=lambda x: list(x))

        for seq, y in zip(self._seqs, self._ys):
            self.encoder.inc_build_vocab(seq)
            self.encoder.inc_build_vocab(y)

        self.N = len(self._seqs)
        N = self.N

        splits = ["train"] * int(N * 0.8) + ["valid"] * int(
            N * 0.1) + ["test"] * int(N * 0.1)
        random.shuffle(splits)

        self.encoder.finalize_vocab()
        self.build_data(self._seqs, self._ys, splits)

    def build_data(self, seqs, ys, splits):
        for seq, y, split in zip(seqs, ys, splits):
            seq_tensor = self.encoder.convert(seq, return_what="tensor")
            y_tensor = self.encoder.convert(y, return_what="tensor")
            if split not in self.data:
                self.data[split] = []
            self.data[split].append((seq_tensor[0], y_tensor[0][0]))

    def get_split(self, split: str):
        return DatasetSplitProxy(self.data[split])

    def dataloader(self, split: str = None, batsize: int = 5, shuffle=None):
        if split is None:  # return all splits
            ret = {}
            for split in self.data.keys():
                ret[split] = self.dataloader(batsize=batsize,
                                             split=split,
                                             shuffle=shuffle)
            return ret
        else:
            assert (split in self.data.keys())
            shuffle = shuffle if shuffle is not None else split in (
                "train", "train+valid")
            dl = DataLoader(self.get_split(split),
                            batch_size=batsize,
                            shuffle=shuffle)
            return dl
Example #21
    def test_beam_search_vs_greedy(self):
        with torch.no_grad():
            texts = ["a b"] * 10
            from parseq.vocab import SequenceEncoder
            se = SequenceEncoder(tokenizer=lambda x: x.split())
            for t in texts:
                se.inc_build_vocab(t)
            se.finalize_vocab()
            x = BasicDecoderState(texts, texts, se, se)
            x.start_decoding()

            class Model(TransitionModel):
                transition_tensor = torch.tensor([[0, 0, 0, 0, .51, .49],
                                                  [0, 0, 0, 0, .51, .49],
                                                  [0, 0, 0, 0, .51, .49],
                                                  [0, 0, 0, 0, .51, .49],
                                                  [0, 0, 0, 0, .51, .49],
                                                  [0, 0, 0, 0, .01, .99]])

                def forward(self, x: BasicDecoderState):
                    prev = x.prev_actions
                    outprobs = self.transition_tensor[prev]
                    outprobs = torch.log(outprobs)
                    return outprobs, x

            model = Model()

            beamsize = 50
            maxtime = 10
            beam_xs = [
                x.make_copy(detach=False, deep=True) for _ in range(beamsize)
            ]
            beam_states = BeamState(beam_xs)

            print(len(beam_xs))
            print(len(beam_states))

            bt = BeamTransition(model, beamsize, maxtime=maxtime)
            i = 0
            _, _, y, _ = bt(x, i)
            i += 1
            _, _, y, _ = bt(y, i)

            all_terminated = y.all_terminated()
            while not all_terminated:
                start_time = time.time()
                _, _, y, all_terminated = bt(y, i)
                i += 1
                # print(i)
                end_time = time.time()
                print(f"{i}: {end_time - start_time}")

            print(y)
            print(y.bstates.get(0).followed_actions)
Example #22
def try_tokenizer_dataset():
    from transformers import BartTokenizer

    ovd = OvernightDatasetLoader().load()
    seqenc = SequenceEncoder(tokenizer=tree_to_lisp_tokens)
    for example in ovd.examples:
        query = example[1]
        seqenc.inc_build_vocab(query, seen=example[2] == "train")
    seqenc.finalize_vocab()
    nl_tokenizer = BartTokenizer.from_pretrained("bart-large")
    def tokenize(x):
        ret = [xe for xe in x]
        ret.append(nl_tokenizer.tokenize(ret[0]))
        ret.append(nl_tokenizer.encode(ret[0], return_tensors="pt"))
        ret.append(seqenc.convert(ret[1], return_what="tensor")[0][None])
        return ret
    ovd = ovd.map(tokenize)
    print(ovd[0])
Example #23
    def test_decoder_API(self):
        texts = ["i went to chocolate", "awesome is", "the meaning of life"]
        se = SequenceEncoder(tokenizer=lambda x: x.split())
        for t in texts:
            se.inc_build_vocab(t)
        se.finalize_vocab()
        x = BasicDecoderState(texts, texts, se, se)
        print(x.inp_tensor)
        print("terminated")
        print(x.is_terminated())
        print(x.all_terminated())
        print("prev_actions")
        x.start_decoding()
        print(x.prev_actions)
        print("step")
        x.step(["i", torch.tensor([7]), "the"])
        print(x.prev_actions)
        print(x.followed_actions)
Example #24
    def test_tf_decoder_with_losses_with_gold(self):
        texts = [
            "i went to chocolate @END@", "awesome is @END@",
            "the meaning of life @END@"
        ]
        se = SequenceEncoder(tokenizer=lambda x: x.split())
        for t in texts:
            se.inc_build_vocab(t)
        se.finalize_vocab()
        x = BasicDecoderState(texts, texts, se, se)

        class Model(TransitionModel):
            def forward(self, x: BasicDecoderState):
                outprobs = torch.zeros(len(x),
                                       x.query_encoder.vocab.number_of_ids())
                golds = x.get_gold().gather(
                    1,
                    torch.tensor(x._timesteps).to(torch.long)[:, None])
                outprobs.scatter_(1, golds, 1)
                return outprobs, x

        celoss = CELoss(ignore_index=0)
        accs = SeqAccuracies()

        dec = SeqDecoder(TFTransition(Model()), eval=[celoss, accs])

        y = dec(x)

        print(y[0])
        print(y[1].followed_actions)
        print(y[1].get_gold())

        self.assertEqual(y[0]["seq_acc"], 1)
        self.assertEqual(y[0]["elem_acc"], 1)

        # print(y[1].followed_actions)
        outactions = y[1].followed_actions.detach().cpu().numpy()
        # print(outactions[0])
        # print(se.vocab.print(outactions[0]))
        # print(se.vocab.print(outactions[1]))
        # print(se.vocab.print(outactions[2]))
        self.assertTrue(se.vocab.print(outactions[0]) == texts[0])
        self.assertTrue(se.vocab.print(outactions[1]) == texts[1])
        self.assertTrue(se.vocab.print(outactions[2]) == texts[2])
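The mock Model in test_tf_decoder_with_losses_with_gold forces the decoder onto the gold sequence: gather() picks each example's gold token id at its current timestep and scatter_() turns it into a one-hot score row, which is why seq_acc and elem_acc come out as 1. A standalone sketch with made-up ids:

# Standalone sketch of the gather/scatter_ trick used by the mock Model above.
import torch

vocab_size = 8
gold = torch.tensor([[4, 2, 6],       # gold token ids per timestep (batch of 2)
                     [5, 1, 3]])
timesteps = torch.tensor([1, 2])      # current decoding step of each example

golds = gold.gather(1, timesteps[:, None])   # -> tensor([[2], [3]])
outprobs = torch.zeros(2, vocab_size)
outprobs.scatter_(1, golds, 1)               # one-hot row per example
print(outprobs.argmax(-1))                   # tensor([2, 3]) == the gold ids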
Example #25
def try_dataset():
    tt = q.ticktock("dataset")
    tt.tick("building dataset")
    ds = GeoQueryDatasetFunQL(sentence_encoder=SequenceEncoder(
        tokenizer=lambda x: x.split()))
    train_dl = ds.dataloader("train", batsize=19)
    test_dl = ds.dataloader("test", batsize=20)
    examples = set()
    examples_list = []
    duplicates = []
    for b in train_dl:
        print(len(b))
        for i in range(len(b)):
            example = b.inp_strings[i] + " --> " + b.gold_strings[i]
            if example in examples:
                duplicates.append(example)
            examples.add(example)
            examples_list.append(example)
            # print(example)
        pass
    print(
        f"duplicates within train: {len(duplicates)} from {len(examples_list)} total"
    )
    tt.tock("dataset built")
Example #26
    def test_create(self):
        se = SequenceEncoder(tokenizer=lambda x: x.split())
        texts = [
            "i went to chocolate", "awesome is @PAD@ @PAD@",
            "the meaning of life"
        ]
        for t in texts:
            se.inc_build_vocab(t)
        se.finalize_vocab()
        x = [BasicDecoderState([t], [t], se, se) for t in texts]
        merged_x = x[0].merge(x)
        texts = ["i went to chocolate", "awesome is", "the meaning of life"]
        batch_x = BasicDecoderState(texts, texts, se, se)
        print(merged_x.inp_tensor)
        print(batch_x.inp_tensor)
        self.assertTrue(torch.allclose(merged_x.inp_tensor,
                                       batch_x.inp_tensor))
        self.assertTrue(
            torch.allclose(merged_x.gold_tensor, batch_x.gold_tensor))
Example #27
    def test_tf_decoder_with_losses(self):
        texts = [
            "i went to chocolate @END@", "awesome is @END@",
            "the meaning of life @END@"
        ]
        se = SequenceEncoder(tokenizer=lambda x: x.split())
        for t in texts:
            se.inc_build_vocab(t)
        se.finalize_vocab()
        x = BasicDecoderState(texts, texts, se, se)

        class Model(TransitionModel):
            def forward(self, x: BasicDecoderState):
                outprobs = torch.rand(len(x),
                                      x.query_encoder.vocab.number_of_ids())
                outprobs = torch.nn.functional.log_softmax(outprobs, -1)
                return outprobs, x

        celoss = CELoss(ignore_index=0)
        accs = SeqAccuracies()

        dec = SeqDecoder(TFTransition(Model()), eval=[celoss, accs])

        y = dec(x)

        print(y[0])
        print(y[1].followed_actions)
        print(y[1].get_gold())

        # print(y[1].followed_actions)
        outactions = y[1].followed_actions.detach().cpu().numpy()
        # print(outactions[0])
        # print(se.vocab.print(outactions[0]))
        # print(se.vocab.print(outactions[1]))
        # print(se.vocab.print(outactions[2]))
        self.assertTrue(se.vocab.print(outactions[0]) == texts[0])
        self.assertTrue(se.vocab.print(outactions[1]) == texts[1])
        self.assertTrue(se.vocab.print(outactions[2]) == texts[2])
Example #28
class GeoDatasetRank(object):
    def __init__(self,
                 p="geoquery_gen/run4/",
                 min_freq: int = 2,
                 splits=None,
                 **kw):
        super(GeoDatasetRank, self).__init__(**kw)
        self._initialize(p)
        self.splits_proportions = splits

    def _initialize(self, p):
        self.data = {}
        with open(os.path.join(p, "trainpreds.json")) as f:
            trainpreds = ujson.load(f)
        with open(os.path.join(p, "testpreds.json")) as f:
            testpreds = ujson.load(f)
        splits = ["train"] * len(trainpreds) + ["test"] * len(testpreds)
        preds = trainpreds + testpreds

        self.sentence_encoder = SequenceEncoder(tokenizer=lambda x: x.split())
        self.query_encoder = SequenceEncoder(tokenizer=lambda x: x.split())

        # build vocabularies
        for i, (example, split) in enumerate(zip(preds, splits)):
            self.sentence_encoder.inc_build_vocab(" ".join(
                example["sentence"]),
                                                  seen=split == "train")
            self.query_encoder.inc_build_vocab(" ".join(example["gold"]),
                                               seen=split == "train")
            for can in example["candidates"]:
                self.query_encoder.inc_build_vocab(" ".join(can["tokens"]),
                                                   seen=False)
        # for word, wordid in self.sentence_encoder.vocab.D.items():
        #     self.query_encoder.vocab.add_token(word, seen=False)
        self.sentence_encoder.finalize_vocab()
        self.query_encoder.finalize_vocab()

        self.build_data(preds, splits)

    def build_data(self, examples: Iterable[dict], splits: Iterable[str]):
        maxlen_in, maxlen_out = 0, 0
        for example, split in zip(examples, splits):
            inp, out = " ".join(example["sentence"]), " ".join(example["gold"])
            inp_tensor, inp_tokens = self.sentence_encoder.convert(
                inp, return_what="tensor,tokens")
            gold_tree = lisp_to_tree(" ".join(example["gold"][:-1]))
            if not isinstance(gold_tree, Tree):
                assert (gold_tree is not None)
            gold_tensor, gold_tokens = self.query_encoder.convert(
                out, return_what="tensor,tokens")

            candidate_tensors, candidate_tokens, candidate_align_tensors = [], [], []
            candidate_align_entropies = []
            candidate_trees = []
            candidate_same = []
            for cand in example["candidates"]:
                cand_tree, _ = lisp_to_tree(" ".join(cand["tokens"][:-1]),
                                            None)
                if cand_tree is None:
                    cand_tree = Tree("@UNK@", [])
                assert (cand_tree is not None)
                cand_tensor, cand_tokens = self.query_encoder.convert(
                    " ".join(cand["tokens"]), return_what="tensor,tokens")
                candidate_tensors.append(cand_tensor)
                candidate_tokens.append(cand_tokens)
                candidate_align_tensors.append(torch.tensor(
                    cand["alignments"]))
                candidate_align_entropies.append(
                    torch.tensor(cand["align_entropies"]))
                candidate_trees.append(cand_tree)
                candidate_same.append(
                    are_equal_trees(cand_tree,
                                    gold_tree,
                                    orderless={"and", "or"},
                                    unktoken="@NOUNKTOKENHERE@"))

            candidate_tensor = torch.stack(q.pad_tensors(candidate_tensors, 0),
                                           0)
            candidate_align_tensor = torch.stack(
                q.pad_tensors(candidate_align_tensors, 0), 0)
            candidate_align_entropy = torch.stack(
                q.pad_tensors(candidate_align_entropies, 0), 0)
            candidate_same = torch.tensor(candidate_same)

            state = RankState(
                inp_tensor[None, :],
                gold_tensor[None, :],
                candidate_tensor[None, :, :],
                candidate_same[None, :],
                candidate_align_tensor[None, :],
                candidate_align_entropy[None, :],
                self.sentence_encoder.vocab,
                self.query_encoder.vocab,
            )
            if split not in self.data:
                self.data[split] = []
            self.data[split].append(state)
            maxlen_in = max(maxlen_in, len(inp_tokens))
            maxlen_out = max(maxlen_out, candidate_tensor.size(-1),
                             gold_tensor.size(-1))
        self.maxlen_input = maxlen_in
        self.maxlen_output = maxlen_out

    def get_split(self, split: str):
        return DatasetSplitProxy(self.data[split])

    @staticmethod
    def collate_fn(data: Iterable):
        goldmaxlen = 0
        inpmaxlen = 0
        data = [state.make_copy(detach=True, deep=True) for state in data]
        for state in data:
            goldmaxlen = max(goldmaxlen, state.gold_tensor.size(1))
            inpmaxlen = max(inpmaxlen, state.inp_tensor.size(1))
            goldmaxlen = max(goldmaxlen, state.candtensors.size(-1))
        inp_tensors = q.pad_tensors([state.inp_tensor for state in data], 1, 0)
        gold_tensors = q.pad_tensors([state.gold_tensor for state in data], 1,
                                     0)
        candtensors = q.pad_tensors([state.candtensors for state in data], 2,
                                    0)
        alignments = q.pad_tensors([state.alignments for state in data], 2, 0)
        alignment_entropies = q.pad_tensors(
            [state.alignment_entropies for state in data], 2, 0)

        for i, state in enumerate(data):
            state.inp_tensor = inp_tensors[i]
            state.gold_tensor = gold_tensors[i]
            state.candtensors = candtensors[i]
            state.alignments = alignments[i]
            state.alignment_entropies = alignment_entropies[i]
        ret = data[0].merge(data)
        return ret

    def dataloader(self, split: str = None, batsize: int = 5, shuffle=None):
        if split is None:  # return all splits
            ret = {}
            for split in self.data.keys():
                ret[split] = self.dataloader(batsize=batsize,
                                             split=split,
                                             shuffle=shuffle)
            return ret
        else:
            assert (split in self.data.keys())
            shuffle = shuffle if shuffle is not None else split in (
                "train", "train+valid")
            dl = DataLoader(self.get_split(split),
                            batch_size=batsize,
                            shuffle=shuffle,
                            collate_fn=type(self).collate_fn)
            return dl
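GeoDatasetRank's collate_fn pads every field to a common length with a pad_tensors helper before merging the states back into one batch. For comparison, a generic standalone collate for variable-length id sequences can be written with the standard pad_sequence utility; this is an illustrative alternative, not the collate used above.

# Illustrative standalone collate for variable-length id sequences,
# using torch.nn.utils.rnn.pad_sequence instead of the helper above.
import torch
from torch.nn.utils.rnn import pad_sequence

def collate_ids(batch):
    # batch: list of 1-D LongTensors with different lengths
    return pad_sequence(batch, batch_first=True, padding_value=0)

print(collate_ids([torch.tensor([5, 7, 2]), torch.tensor([9, 2])]))
# tensor([[5, 7, 2],
#         [9, 2, 0]])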
Example #29
def run(
    lr=0.001,
    batsize=50,
    epochs=50,
    embdim=100,
    encdim=100,
    numlayers=1,
    beamsize=1,
    dropout=.2,
    wreg=1e-10,
    cuda=False,
    gpu=0,
    minfreq=3,
    gradnorm=3.,
    cosine_restarts=1.,
    beta=0.001,
    vib_init=True,
    vib_enc=True,
):
    localargs = locals().copy()
    print(locals())
    tt = q.ticktock("script")
    device = torch.device("cpu") if not cuda else torch.device("cuda", gpu)
    tt.tick("loading data")
    ds = LCQuaDnoENTDataset(
        sentence_encoder=SequenceEncoder(tokenizer=split_tokenizer),
        min_freq=minfreq)
    print(
        f"max lens: {ds.maxlen_input} (input) and {ds.maxlen_output} (output)")
    tt.tock("data loaded")

    do_rare_stats(ds)
    # batch = next(iter(train_dl))
    # print(batch)
    # print("input graph")
    # print(batch.batched_states)

    model = BasicGenModel_VIB(embdim=embdim,
                              hdim=encdim,
                              dropout=dropout,
                              numlayers=numlayers,
                              sentence_encoder=ds.sentence_encoder,
                              query_encoder=ds.query_encoder,
                              feedatt=True,
                              vib_init=vib_init,
                              vib_enc=vib_enc)

    # sentence_rare_tokens = set([ds.sentence_encoder.vocab(i) for i in model.inp_emb.rare_token_ids])
    # do_rare_stats(ds, sentence_rare_tokens=sentence_rare_tokens)
    losses = [CELoss(ignore_index=0, mode="logprobs")]
    if vib_init:
        losses.append(
            StatePenalty(lambda state: sum(state.mstate.vib.init),
                         weight=beta))
    if vib_enc:
        losses.append(StatePenalty("mstate.vib.enc", weight=beta))

    tfdecoder = SeqDecoder(
        model,
        tf_ratio=1.,
        eval=losses + [
            SeqAccuracies(),
            TreeAccuracy(tensor2tree=partial(tensor2tree,
                                             D=ds.query_encoder.vocab),
                         orderless={"select", "count", "ask"})
        ])
    # beamdecoder = BeamActionSeqDecoder(tfdecoder.model, beamsize=beamsize, maxsteps=50)
    if beamsize == 1:
        freedecoder = SeqDecoder(
            model,
            maxtime=40,
            tf_ratio=0.,
            eval=[
                SeqAccuracies(),
                TreeAccuracy(tensor2tree=partial(tensor2tree,
                                                 D=ds.query_encoder.vocab),
                             orderless={"select", "count", "ask"})
            ])
    else:

        freedecoder = BeamDecoder(
            model,
            maxtime=30,
            beamsize=beamsize,
            eval=[
                SeqAccuracies(),
                TreeAccuracy(tensor2tree=partial(tensor2tree,
                                                 D=ds.query_encoder.vocab),
                             orderless={"select", "count", "ask"})
            ])

    # # test
    # tt.tick("doing one epoch")
    # for batch in iter(train_dl):
    #     batch = batch.to(device)
    #     ttt.tick("start batch")
    #     # with torch.no_grad():
    #     out = tfdecoder(batch)
    #     ttt.tock("end batch")
    # tt.tock("done one epoch")
    # print(out)
    # sys.exit()

    # beamdecoder(next(iter(train_dl)))

    # print(dict(tfdecoder.named_parameters()).keys())

    losses = make_array_of_metrics("loss", "elem_acc", "seq_acc", "tree_acc")
    vlosses = make_array_of_metrics("seq_acc", "tree_acc")
    # if beamsize >= 3:
    #     vlosses = make_loss_array("seq_acc", "tree_acc", "tree_acc_at3", "tree_acc_at_last")
    # else:
    #     vlosses = make_loss_array("seq_acc", "tree_acc", "tree_acc_at_last")

    # trainable_params = tfdecoder.named_parameters()
    # exclude_params = set()
    # exclude_params.add("model.model.inp_emb.emb.weight")   # don't train input embeddings if doing glove
    # trainable_params = [v for k, v in trainable_params if k not in exclude_params]

    # 4. define optim
    # optim = torch.optim.Adam(trainable_params, lr=lr, weight_decay=wreg)
    optim = torch.optim.Adam(tfdecoder.parameters(), lr=lr, weight_decay=wreg)

    # lr schedule
    if cosine_restarts >= 0:
        # t_max = epochs * len(train_dl)
        t_max = epochs
        print(f"Total number of updates: {t_max}")
        lr_schedule = q.WarmupCosineWithHardRestartsSchedule(
            optim, 0, t_max, cycles=cosine_restarts)
        reduce_lr = [lambda: lr_schedule.step()]
    else:
        reduce_lr = []

    # 6. define training function
    clipgradnorm = lambda: torch.nn.utils.clip_grad_norm_(
        tfdecoder.parameters(), gradnorm)
    # clipgradnorm = lambda: None
    trainbatch = partial(q.train_batch, on_before_optim_step=[clipgradnorm])
    trainepoch = partial(q.train_epoch,
                         model=tfdecoder,
                         dataloader=ds.dataloader("train", batsize),
                         optim=optim,
                         losses=losses,
                         _train_batch=trainbatch,
                         device=device,
                         on_end=reduce_lr)

    # 7. define validation function (using partial)
    validepoch = partial(q.test_epoch,
                         model=freedecoder,
                         dataloader=ds.dataloader("test", batsize),
                         losses=vlosses,
                         device=device)
    # validepoch = partial(q.test_epoch, model=freedecoder, dataloader=valid_dl, losses=vlosses, device=device)

    # p = q.save_run(freedecoder, localargs, filepath=__file__)
    # q.save_dataset(ds, p)
    # _freedecoder, _localargs = q.load_run(p)
    # _ds = q.load_dataset(p)
    # sys.exit()

    # 7. run training
    tt.tick("training")
    q.run_training(run_train_epoch=trainepoch,
                   run_valid_epoch=validepoch,
                   max_epochs=epochs)
    tt.tock("done training")

    # testing
    tt.tick("testing")
    testresults = q.test_epoch(model=freedecoder,
                               dataloader=ds.dataloader("valid", batsize),
                               losses=vlosses,
                               device=device)
    print("validation test results: ", testresults)
    tt.tock("tested")
    tt.tick("testing")
    testresults = q.test_epoch(model=freedecoder,
                               dataloader=ds.dataloader("test", batsize),
                               losses=vlosses,
                               device=device)
    print("test results: ", testresults)
    tt.tock("tested")

    # save model?
    tosave = input(
        "Save this model? 'y(es)'=Yes, <int>=overwrite previous, otherwise=No) \n>"
    )
    if tosave.lower() == "y" or tosave.lower() == "yes" or re.match(
            "\d+", tosave.lower()):
        overwrite = int(tosave) if re.match("\d+", tosave) else None
        p = q.save_run(model,
                       localargs,
                       filepath=__file__,
                       overwrite=overwrite)
        q.save_dataset(ds, p)
        _model, _localargs = q.load_run(p)
        _ds = q.load_dataset(p)

        _freedecoder = BeamDecoder(
            _model,
            maxtime=50,
            beamsize=beamsize,
            eval_beam=[
                TreeAccuracy(tensor2tree=partial(tensor2tree,
                                                 D=ds.query_encoder.vocab),
                             orderless={"op:and", "SW:concat"})
            ])
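        # (the reloaded model is wrapped in a fresh beam decoder; TreeAccuracy treats the
        # children of "op:and" and "SW:concat" nodes as orderless when comparing trees)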

        # testing
        tt.tick("testing reloaded")
        _testresults = q.test_epoch(model=_freedecoder,
                                    dataloader=_ds.dataloader("test", batsize),
                                    losses=vlosses,
                                    device=device)
        print(_testresults)
        assert (testresults == _testresults)
        tt.tock("tested")
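
# --- Illustrative sketch (not part of the example above) ---
# A minimal, generic version of the wiring pattern used above, in plain PyTorch +
# functools: pre-bind a per-epoch training function with functools.partial, clip
# gradients before each optimizer step, and step the LR schedule once per epoch via
# an on_end hook. All names here (run_epoch, train_dl, lr_schedule, ...) are
# hypothetical; this is not the q.* API itself.

import torch
from functools import partial

def generic_train_epoch(model, dataloader, optim, device, max_grad_norm=5.0, on_end=()):
    model.train()
    for batch_x, batch_y in dataloader:
        batch_x, batch_y = batch_x.to(device), batch_y.to(device)
        loss = torch.nn.functional.cross_entropy(model(batch_x), batch_y)
        optim.zero_grad()
        loss.backward()
        # clip before the optimizer step, analogous to the on_before_optim_step hook above
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
        optim.step()
    for fn in on_end:          # e.g. [lr_schedule.step] to decay the LR once per epoch
        fn()

# run_epoch = partial(generic_train_epoch, model=model, dataloader=train_dl,
#                     optim=optim, device=device, on_end=[lr_schedule.step])
# for _ in range(epochs):
#     run_epoch()
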
Beispiel #30
0
class LCQuaDnoENTDataset(object):
    def __init__(self,
                 p="../../datasets/lcquad/",
                 sentence_encoder: SequenceEncoder = None,
                 min_freq: int = 2,
                 splits=None,
                 **kw):
        super(LCQuaDnoENTDataset, self).__init__(**kw)
        self._simplify_filters = True  # if True, filter expressions are converted to orderless and-expressions
        self._initialize(p, sentence_encoder, min_freq)
        self.splits_proportions = splits

    def lines_to_examples(self, lines: List[str]):
        maxsize_before = 0
        avgsize_before = []
        maxsize_after = 0
        avgsize_after = []
        afterstring = set()

        def convert_to_lispstr(_x):
            splits = _x.split()
            assert (sum([1 if xe == "~" else 0 for xe in splits]) == 1)
            assert (splits[1] == "~")
            splits = ["," if xe == "&" else xe for xe in splits]
            pstr = f"{splits[0]} ({' '.join(splits[2:])})"
            return pstr
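        # e.g. (illustrative input) convert_to_lispstr("op:f ~ a & b") -> "op:f (a , b)"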

        ret = []
        ltp = None
        j = 0
        for i, line in enumerate(lines):
            question = line["question"]
            query = line["logical_form"]
            query = convert_to_lispstr(query)
            z, ltp = prolog_to_pas(query, ltp)
            if z is not None:
                ztree = pas_to_tree(z)
                maxsize_before = max(maxsize_before, tree_size(ztree))
                avgsize_before.append(tree_size(ztree))
                lf = ztree
                ret.append((question, lf))
                # print(f"Example {j}:")
                # print(ret[-1][0])
                # print(ret[-1][1])
                # print()
                ltp = None
                maxsize_after = max(maxsize_after, tree_size(lf))
                avgsize_after.append(tree_size(lf))
                j += 1

        avgsize_before = sum(avgsize_before) / len(avgsize_before)
        avgsize_after = sum(avgsize_after) / len(avgsize_after)

        print("Sizes ({j} examples):")
        # print(f"\t Max, Avg size before: {maxsize_before}, {avgsize_before}")
        print(f"\t Max, Avg size: {maxsize_after}, {avgsize_after}")

        return ret

    def _initialize(self, p, sentence_encoder: SequenceEncoder, min_freq: int):
        self.data = {}
        self.sentence_encoder = sentence_encoder

        jp = os.path.join(p, "lcquad_dataset.json")
        with open(jp, "r") as f:
            examples = ujson.load(f)

        examples = self.lines_to_examples(examples)

        questions, queries = tuple(zip(*examples))
        trainlen = int(round(0.8 * len(examples)))
        validlen = int(round(0.1 * len(examples)))
        testlen = len(examples) - trainlen - validlen  # remainder, so the three splits always cover all examples
        splits = ["train"] * trainlen + ["valid"] * validlen + ["test"] * testlen
        random.seed(1337)
        random.shuffle(splits)
        assert (len(splits) == len(examples))

        self.query_encoder = SequenceEncoder(
            tokenizer=partial(tree_query_tokenizer, strtok=sentence_encoder.tokenizer),
            add_end_token=True)
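        # (tree_query_tokenizer reuses the sentence tokenizer via strtok; an end token is
        # appended to every encoded query)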

        # build vocabularies
        for i, (question, query, split) in enumerate(zip(questions, queries, splits)):
            self.sentence_encoder.inc_build_vocab(question, seen=split == "train")
            self.query_encoder.inc_build_vocab(query, seen=split == "train")
        for word, wordid in self.sentence_encoder.vocab.D.items():
            self.query_encoder.vocab.add_token(word, seen=False)
        self.sentence_encoder.finalize_vocab(min_freq=min_freq)
        self.query_encoder.finalize_vocab(min_freq=min_freq)

        self.build_data(questions, queries, splits)

    def build_data(self, inputs: Iterable[str], outputs: Iterable[str],
                   splits: Iterable[str]):
        maxlen_in, maxlen_out = 0, 0
        eid = 0

        # remap query-vocab tokens that are rare and never occur in the input vocabulary
        # to the UNK id; this map is applied to the gold output tensors below
        gold_map = torch.arange(0, self.query_encoder.vocab.number_of_ids(last_nonrare=False))
        rare_tokens = self.query_encoder.vocab.rare_tokens - set(self.sentence_encoder.vocab.D.keys())
        for rare_token in rare_tokens:
            gold_map[self.query_encoder.vocab[rare_token]] = \
                self.query_encoder.vocab[self.query_encoder.vocab.unktoken]

        for inp, out, split in zip(inputs, outputs, splits):
            inp_tensor, inp_tokens = self.sentence_encoder.convert(
                inp, return_what="tensor,tokens")
            out_tensor, out_tokens = self.query_encoder.convert(
                out, return_what="tensor,tokens")
            out_tensor = gold_map[out_tensor]

            state = TreeDecoderState([inp], [out], inp_tensor[None, :],
                                     out_tensor[None, :], [inp_tokens],
                                     [out_tokens], self.sentence_encoder.vocab,
                                     self.query_encoder.vocab)
            state.eids = np.asarray([eid], dtype="int64")
            maxlen_in = max(maxlen_in, len(state.inp_tokens[0]))
            maxlen_out = max(maxlen_out, len(state.gold_tokens[0]))
            if split not in self.data:
                self.data[split] = []
            self.data[split].append(state)
            eid += 1
        self.maxlen_input, self.maxlen_output = maxlen_in, maxlen_out

    def get_split(self, split: str):
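        # `split` may name several sub-splits joined with "+", e.g. "train+valid"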
        splits = split.split("+")
        data = []
        for split in splits:
            data += self.data[split]
        return DatasetSplitProxy(data)

    @staticmethod
    def collate_fn(data: Iterable):
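        # deep-copies the states, right-pads gold/input tensors with zeros to the longest
        # sequence in the batch, then merges everything into a single batched state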
        goldmaxlen = 0
        inpmaxlen = 0
        data = [state.make_copy(detach=True, deep=True) for state in data]
        for state in data:
            goldmaxlen = max(goldmaxlen, state.gold_tensor.size(1))
            inpmaxlen = max(inpmaxlen, state.inp_tensor.size(1))
        for state in data:
            state.gold_tensor = torch.cat([
                state.gold_tensor,
                state.gold_tensor.new_zeros(
                    1, goldmaxlen - state.gold_tensor.size(1))
            ], 1)
            state.inp_tensor = torch.cat([
                state.inp_tensor,
                state.inp_tensor.new_zeros(
                    1, inpmaxlen - state.inp_tensor.size(1))
            ], 1)
        ret = data[0].merge(data)
        return ret

    def dataloader(self, split: str = None, batsize: int = 5):
        if split is None:  # return all splits
            ret = {}
            for split in self.data.keys():
                ret[split] = self.dataloader(batsize=batsize, split=split)
            return ret
        else:
            dl = DataLoader(self.get_split(split),
                            batch_size=batsize,
                            shuffle=split in ("train", "train+valid"),
                            collate_fn=type(self).collate_fn)
            return dl
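
# --- Illustrative usage sketch (not part of the example above) ---
# How this dataset class is typically driven. The whitespace tokenizer below is a
# placeholder assumption, and the batsize/min_freq values are arbitrary.
#
# sentence_enc = SequenceEncoder(tokenizer=lambda s: s.split())   # placeholder tokenizer
# ds = LCQuaDnoENTDataset(p="../../datasets/lcquad/",
#                         sentence_encoder=sentence_enc, min_freq=2)
# dls = ds.dataloader(batsize=16)            # dict with one DataLoader per split
# for batch in dls["train"]:                 # each batch is a merged TreeDecoderState
#     ...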