Code example #1
File: lcquad_vib.py | Project: saist1993/parseq
    def build_data(self, inputs: Iterable[str], outputs: Iterable[str],
                   splits: Iterable[str]):
        maxlen_in, maxlen_out = 0, 0
        eid = 0

        # build an id-to-id lookup that collapses rare query tokens
        # (those that cannot be copied from the input vocabulary) to UNK
        gold_map = torch.arange(
            0, self.query_encoder.vocab.number_of_ids(last_nonrare=False))
        rare_tokens = self.query_encoder.vocab.rare_tokens - set(
            self.sentence_encoder.vocab.D.keys())
        for rare_token in rare_tokens:
            gold_map[self.query_encoder.vocab[rare_token]] = \
                self.query_encoder.vocab[self.query_encoder.vocab.unktoken]

        for inp, out, split in zip(inputs, outputs, splits):
            inp_tensor, inp_tokens = self.sentence_encoder.convert(
                inp, return_what="tensor,tokens")
            out_tensor, out_tokens = self.query_encoder.convert(
                out, return_what="tensor,tokens")
            out_tensor = gold_map[out_tensor]

            state = TreeDecoderState([inp], [out], inp_tensor[None, :],
                                     out_tensor[None, :], [inp_tokens],
                                     [out_tokens], self.sentence_encoder.vocab,
                                     self.query_encoder.vocab)
            state.eids = np.asarray([eid], dtype="int64")
            maxlen_in = max(maxlen_in, len(state.inp_tokens[0]))
            maxlen_out = max(maxlen_out, len(state.gold_tokens[0]))
            if split not in self.data:
                self.data[split] = []
            self.data[split].append(state)
            eid += 1
        self.maxlen_input, self.maxlen_output = maxlen_in, maxlen_out
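The gold_map above is a plain lookup tensor: building it once turns "map every rare id to the UNK id" into a single indexing operation on any output tensor. A minimal standalone sketch of the same trick, with a made-up six-id vocabulary (the ids here are purely illustrative):

import torch

# toy vocab: ids 0..5, where UNK is id 1 and ids 4 and 5 are "rare"
unk_id = 1
rare_ids = [4, 5]

gold_map = torch.arange(0, 6)   # identity mapping over all ids
for rid in rare_ids:
    gold_map[rid] = unk_id      # rare ids now redirect to UNK

out_tensor = torch.tensor([0, 4, 2, 5, 3])
print(gold_map[out_tensor])     # tensor([0, 1, 2, 1, 3])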
Code example #2
    def build_data(self, inputs: Iterable[str], outputs: Iterable[str],
                   splits: Iterable[str], unktokens: Set[str] = None):
        gold_map = None
        maxlen_in, maxlen_out = 0, 0
        maxlins = 0
        numlins_counts = [0] * (self.max_lins_allowed + 1)
        if unktokens is not None:
            gold_map = torch.arange(0, self.query_encoder.vocab.number_of_ids(last_nonrare=False))
            for rare_token in unktokens:
                gold_map[self.query_encoder.vocab[rare_token]] = \
                    self.query_encoder.vocab[self.query_encoder.vocab.unktoken]
        for inp, out, split in zip(inputs, outputs, splits):

            inp_tensor, inp_tokens = self.sentence_encoder.convert(inp, return_what="tensor,tokens")
            gold_tree = lisp_to_tree(out)
            assert gold_tree is not None
            out_tensor, out_tokens = self.query_encoder.convert(out, return_what="tensor,tokens")

            if split == "train":
                gold_tree_ = tensor2tree(out_tensor, self.query_encoder.vocab)
                numlins = 0
                for gold_tree_reordered in get_tree_permutations(gold_tree_, orderless={"and", "or"}):
                    if numlins >= self.max_lins_allowed:
                        break
                    out_ = tree_to_lisp(gold_tree_reordered)
                    out_tensor_, out_tokens_ = self.query_encoder.convert(out_, return_what="tensor,tokens")
                    if gold_map is not None:
                        out_tensor = gold_map[out_tensor]

                    state = TreeDecoderState([inp], [gold_tree_reordered],
                                              inp_tensor[None, :], out_tensor_[None, :],
                                              [inp_tokens], [out_tokens_],
                                              self.sentence_encoder.vocab, self.query_encoder.vocab,
                                             token_specs=self.token_specs)
                    if split not in self.data:
                        self.data[split] = []
                    self.data[split].append(state)
                    numlins += 1
                numlins_counts[numlins] += 1
                maxlins = max(maxlins, numlins)
            else:
                if gold_map is not None:
                    out_tensor = gold_map[out_tensor]

                state = TreeDecoderState([inp], [gold_tree],
                                         inp_tensor[None, :], out_tensor[None, :],
                                         [inp_tokens], [out_tokens],
                                         self.sentence_encoder.vocab, self.query_encoder.vocab,
                                         token_specs=self.token_specs)
                if split not in self.data:
                    self.data[split] = []
                self.data[split].append(state)
            maxlen_in = max(maxlen_in, len(inp_tokens))
            maxlen_out = max(maxlen_out, len(out_tensor))
        self.maxlen_input = maxlen_in
        self.maxlen_output = maxlen_out
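This variant augments the training split: each gold tree is linearized in up to max_lins_allowed different child orders of its orderless operators ("and", "or"), and every linearization becomes its own training state. The project's get_tree_permutations performs that enumeration; the toy helper below (tuple trees, illustrative only, not the actual parseq implementation) shows the idea:

from itertools import islice, permutations

def tree_permutations(tree, orderless={"and", "or"}):
    # yield all reorderings of `tree`, permuting children of orderless nodes
    if not isinstance(tree, tuple):  # leaf
        yield tree
        return
    label, children = tree[0], tree[1:]
    child_options = [list(tree_permutations(c, orderless)) for c in children]
    if label in orderless:
        orders = permutations(range(len(children)))
    else:
        orders = [tuple(range(len(children)))]
    for order in orders:
        def combine(idx):
            # cartesian product of one variant per child, in this child order
            if idx == len(order):
                yield ()
                return
            for variant in child_options[order[idx]]:
                for rest in combine(idx + 1):
                    yield (variant,) + rest
        for kids in combine(0):
            yield (label,) + kids

tree = ("and", ("p", "x"), ("or", "a", "b"))
max_lins_allowed = 3
for lin in islice(tree_permutations(tree), max_lins_allowed):
    print(lin)  # first 3 of the 4 possible reorderings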
Code example #3
File: geoquery_gen.py | Project: saist1993/parseq
    def build_data(self,
                   inputs: Iterable[str],
                   outputs: Iterable[str],
                   splits: Iterable[str],
                   unktokens: Set[str] = None):
        gold_map = None
        maxlen_in, maxlen_out = 0, 0
        if unktokens is not None:
            gold_map = torch.arange(0,
                                    self.query_encoder.vocab.number_of_ids())
            for rare_token in unktokens:
                gold_map[self.query_encoder.vocab[rare_token]] = \
                    self.query_encoder.vocab[self.query_encoder.vocab.unktoken]
        for inp, out, split in zip(inputs, outputs, splits):

            inp_tensor, inp_tokens = self.sentence_encoder.convert(
                inp, return_what="tensor,tokens")
            gold_tree = lisp_to_tree(out)
            assert gold_tree is not None
            out_tensor, out_tokens = self.query_encoder.convert(
                out, return_what="tensor,tokens")
            if gold_map is not None:
                out_tensor = gold_map[out_tensor]

            state = TreeDecoderState([inp], [gold_tree],
                                     inp_tensor[None, :],
                                     out_tensor[None, :], [inp_tokens],
                                     [out_tokens],
                                     self.sentence_encoder.vocab,
                                     self.query_encoder.vocab,
                                     token_specs=self.token_specs)
            if split == "train" and self.reorder_random is True:
                gold_tree_ = tensor2tree(out_tensor, self.query_encoder.vocab)
                random_gold_tree = random.choice(
                    get_tree_permutations(gold_tree_, orderless={"and"}))
                out_ = tree_to_lisp(random_gold_tree)
                out_tensor_, out_tokens_ = self.query_encoder.convert(
                    out_, return_what="tensor,tokens")
                if gold_map is not None:
                    out_tensor_ = gold_map[out_tensor_]
                state.gold_tensor = out_tensor_[None]

            if split not in self.data:
                self.data[split] = []
            self.data[split].append(state)
            maxlen_in = max(maxlen_in, len(inp_tokens))
            maxlen_out = max(maxlen_out, len(out_tensor))
        self.maxlen_input = maxlen_in
        self.maxlen_output = maxlen_out
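Unlike code example #2, which stores one state per permutation, this version keeps a single state per example and merely swaps its gold tensor for one randomly picked reordering (and only "and" is treated as orderless here). With the toy tree_permutations helper sketched after example #2, the selection step reduces to:

import random

tree = ("and", ("p", "x"), ("or", "a", "b"))
random_gold_tree = random.choice(list(tree_permutations(tree, orderless={"and"})))
print(random_gold_tree)  # one of the two "and"-reorderings, picked at random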
Code example #4
    def build_data(self, inputs: Iterable[str], outputs: Iterable[str],
                   splits: Iterable[str], unktokens: Set[str] = None):
        gold_map = None
        maxlen_in, maxlen_out = 0, 0
        if unktokens is not None:
            gold_map = torch.arange(0, self.query_encoder.vocab.number_of_ids(last_nonrare=False))
            for rare_token in unktokens:
                gold_map[self.query_encoder.vocab[rare_token]] = \
                    self.query_encoder.vocab[self.query_encoder.vocab.unktoken]
        for inp, out, split in zip(inputs, outputs, splits):

            inp_tensor, inp_tokens = self.sentence_encoder.convert(inp, return_what="tensor,tokens")
            gold_tree = lisp_to_tree(out)
            assert gold_tree is not None
            out_tensor, out_tokens = self.query_encoder.convert(out, return_what="tensor,tokens")
            if gold_map is not None:
                out_tensor = gold_map[out_tensor]

            state = TreeDecoderState([inp], [gold_tree],
                                      inp_tensor[None, :], out_tensor[None, :],
                                      [inp_tokens], [out_tokens],
                                      self.sentence_encoder.vocab, self.query_encoder.vocab)
            if split not in self.data:
                self.data[split] = []
            self.data[split].append(state)
            maxlen_in = max(maxlen_in, len(inp_tokens))
            maxlen_out = max(maxlen_out, len(out_tensor))
        self.maxlen_input = maxlen_in
        self.maxlen_output = maxlen_out
Code example #5
File: overnight_basic.py | Project: saist1993/parseq
    def build_data(self, inputs: Iterable[str], outputs: Iterable[str],
                   splits: Iterable[str]):
        maxlen_in, maxlen_out = 0, 0
        eid = 0
        for inp, out, split in zip(inputs, outputs, splits):
            state = TreeDecoderState([inp], [out], self.sentence_encoder,
                                     self.query_encoder)
            state.eids = np.asarray([eid], dtype="int64")
            maxlen_in = max(maxlen_in, len(state.inp_tokens[0]))
            maxlen_out = max(maxlen_out, len(state.gold_tokens[0]))
            if split not in self.data:
                self.data[split] = []
            self.data[split].append(state)
            eid += 1
        self.maxlen_input, self.maxlen_output = maxlen_in, maxlen_out
Code example #6
    def build_data(self, inputs: Iterable[str], outputs: Iterable[str],
                   splits: Iterable[str]):
        maxlen_in, maxlen_out = 0, 0
        for inp, out, split in zip(inputs, outputs, splits):
            # tokenize both input and output
            inp_tokens = self.sentence_encoder.convert(inp,
                                                       return_what="tokens")[0]
            out_tokens = self.query_encoder.convert(out,
                                                    return_what="tokens")[0]
            # get gold tree, parsing all but the last output token
            gold_tree = lisp_to_tree(" ".join(out_tokens[:-1]))
            assert gold_tree is not None
            # replace output tokens that can't be copied from the given input
            # and aren't non-rare vocab entries with the UNK token
            unktoken = self.query_encoder.vocab.unktoken
            inp_tokens_ = set(inp_tokens)
            out_tokens = [
                out_token
                if out_token in inp_tokens_
                or (out_token in self.query_encoder.vocab
                    and out_token not in self.query_encoder.vocab.rare_tokens)
                else unktoken
                for out_token in out_tokens
            ]
            # convert token sequences to ids
            inp_tensor = self.sentence_encoder.convert(inp_tokens,
                                                       return_what="tensor")[0]
            out_tensor = self.query_encoder.convert(out_tokens,
                                                    return_what="tensor")[0]

            state = TreeDecoderState([inp], [gold_tree],
                                     inp_tensor[None, :],
                                     out_tensor[None, :], [inp_tokens],
                                     [out_tokens],
                                     self.sentence_encoder.vocab,
                                     self.query_encoder.vocab,
                                     token_specs=self.token_specs)

            if split not in self.data:
                self.data[split] = []
            self.data[split].append(state)
            maxlen_in = max(maxlen_in, len(inp_tokens))
            maxlen_out = max(maxlen_out, len(out_tensor))
        self.maxlen_input = maxlen_in
        self.maxlen_output = maxlen_out
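The comprehension above encodes the copy-mechanism convention: an output token is kept only if it can be copied from the input or is a non-rare entry of the output vocabulary; anything else becomes UNK. A minimal sketch with made-up token sets (unkify and the example vocab are illustrative, not part of parseq):

def unkify(out_tokens, inp_tokens, vocab, rare_tokens, unktoken="@UNK@"):
    # keep a token if it is copyable from the input, or in-vocab and not rare
    copyable = set(inp_tokens)
    return [
        t if t in copyable or (t in vocab and t not in rare_tokens)
        else unktoken
        for t in out_tokens
    ]

vocab = {"answer", "city", "@UNK@", "springfield"}
rare = {"springfield"}  # in vocab, but too rare to be predicted directly
out = ["answer", "city", "springfield", "paris"]
print(unkify(out, ["what", "city", "is", "springfield"], vocab, rare))
# -> ['answer', 'city', 'springfield', '@UNK@']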