Ejemplo n.º 1
0
    def build_data(self, inputs: Iterable[str], outputs: Iterable[str],
                   splits: Iterable[str]):
        """Encode the examples and store them per split in ``self.data``.

        Each (input, output, split) triple is converted with the sentence
        and query encoders, wrapped in a one-example ``TreeDecoderState``
        tagged with a running example id, and appended to
        ``self.data[split]``. The largest input and gold token counts seen
        are recorded in ``self.maxlen_input`` and ``self.maxlen_output``.

        Args:
            inputs: Source strings, converted by ``self.sentence_encoder``.
            outputs: Target strings, converted by ``self.query_encoder``.
            splits: Split key of each example; iterated in lockstep with
                ``inputs`` and ``outputs``.
        """
        query_vocab = self.query_encoder.vocab

        # Identity lookup table over the query vocabulary. Entries of rare
        # query tokens that are not keys of the sentence vocabulary are
        # redirected to the entry of the unknown token.
        gold_map = torch.arange(
            0, query_vocab.number_of_ids(last_nonrare=False))
        unseen_rare = query_vocab.rare_tokens - set(
            self.sentence_encoder.vocab.D.keys())
        for rare_token in unseen_rare:
            unk_id = query_vocab[query_vocab.unktoken]
            gold_map[query_vocab[rare_token]] = unk_id

        longest_in = longest_out = 0
        for eid, (inp, out, split) in enumerate(zip(inputs, outputs, splits)):
            inp_tensor, inp_tokens = self.sentence_encoder.convert(
                inp, return_what="tensor,tokens")
            out_tensor, out_tokens = self.query_encoder.convert(
                out, return_what="tensor,tokens")
            # Apply the rare-token redirection to the gold ids.
            out_tensor = gold_map[out_tensor]

            # Every state holds exactly one example, hence the singleton
            # lists and the added leading dimension on both tensors.
            state = TreeDecoderState(
                [inp], [out],
                inp_tensor[None, :], out_tensor[None, :],
                [inp_tokens], [out_tokens],
                self.sentence_encoder.vocab, query_vocab)
            state.eids = np.asarray([eid], dtype="int64")

            longest_in = max(longest_in, len(state.inp_tokens[0]))
            longest_out = max(longest_out, len(state.gold_tokens[0]))

            if split in self.data:
                self.data[split].append(state)
            else:
                self.data[split] = [state]

        self.maxlen_input = longest_in
        self.maxlen_output = longest_out
Ejemplo n.º 2
0
 def build_data(self, inputs: Iterable[str], outputs: Iterable[str],
                splits: Iterable[str]):
     """Wrap every example in a state object and store it per split.

     Each (input, output, split) triple becomes a one-example
     ``TreeDecoderState`` built with the sentence and query encoders,
     tagged with a running example id, and appended to
     ``self.data[split]``. The largest input and gold token counts seen
     are recorded in ``self.maxlen_input`` and ``self.maxlen_output``.

     Args:
         inputs: Source strings.
         outputs: Target strings.
         splits: Split key of each example; iterated in lockstep with
             ``inputs`` and ``outputs``.
     """
     longest_in = longest_out = 0
     for eid, (inp, out, split) in enumerate(zip(inputs, outputs, splits)):
         # Every state holds exactly one example, hence the singleton lists.
         state = TreeDecoderState(
             [inp], [out], self.sentence_encoder, self.query_encoder)
         state.eids = np.asarray([eid], dtype="int64")

         longest_in = max(longest_in, len(state.inp_tokens[0]))
         longest_out = max(longest_out, len(state.gold_tokens[0]))

         if split in self.data:
             self.data[split].append(state)
         else:
             self.data[split] = [state]

     self.maxlen_input = longest_in
     self.maxlen_output = longest_out