Example 1
    def generate_data(self):
        """Generate data for offline training."""

        text, label = load_cls_raw_data(paths=self.paths_after_pre_process,
                                        mode=self.mode)

        # Feed the in-memory arrays through placeholders so the raw data is
        # not baked into the graph; the values are bound via init_feed_dict
        # when the dataset iterator is initialized.
        text_placeholder = tf.placeholder(tf.string,
                                          shape=(None, ),
                                          name="text")
        label_placeholder = tf.placeholder(tf.string, name="label")
        self.init_feed_dict[text_placeholder] = text
        self.init_feed_dict[label_placeholder] = label
        # logging.debug("init_feed_dict: {}".format(self.init_feed_dict))

        text_ds = tf.data.Dataset.from_tensor_slices(text_placeholder)
        input_pipeline_func = self.get_input_pipeline(for_export=False)

        text_ds = text_ds.map(input_pipeline_func,
                              num_parallel_calls=self.num_parallel_calls)

        # Compute the true (unpadded) length of each processed sentence.
        text_size_ds = text_ds.map(
            lambda x: compute_sen_lens(x, padding_token=utils.PAD_IDX),
            num_parallel_calls=self.num_parallel_calls)

        text_ds = tf.data.Dataset.zip((text_ds, text_size_ds))

        # Optionally attach precomputed dense features loaded from disk.
        if self.use_dense:
            dense = load_npy(self.dense_npy)
            dense_ds = load_dense_dataset(dense)

        # At inference time there are no labels, so zip only the text (and
        # dense) datasets; otherwise attach the label dataset as well.
        if self.infer_without_label:
            if self.use_dense:
                data_set = tf.data.Dataset.zip((text_ds, dense_ds))
            else:
                data_set = text_ds
        else:
            label_ds = load_one_label_dataset(label_placeholder, self.config)
            if self.use_dense:
                data_set = tf.data.Dataset.zip((text_ds, dense_ds, label_ds))
            else:
                data_set = tf.data.Dataset.zip((text_ds, label_ds))

        # Record the vocabulary size, data size, and split-token id in the
        # config for later stages.
        vocab_dict = load_vocab_dict(self.text_vocab_file_path)
        vocab_size = len(vocab_dict)
        data_size = len(text)
        if self.split_token != "":
            if self.split_token not in vocab_dict:
                raise ValueError(
                    "The model uses split token: {}, which is not in the "
                    "corpus.".format(self.split_token))
            self.config['data']['split_token'] = int(
                vocab_dict[self.split_token])
        self.config['data']['vocab_size'] = vocab_size
        self.config['data']['{}_data_size'.format(self.mode)] = data_size

        return data_set
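
A minimal consumption sketch for this variant, assuming TensorFlow 1.x graph
mode and an illustrative "task" object exposing the generate_data method and
init_feed_dict above (task, next_example, and example are names invented for
the sketch, not part of the snippet). Because the raw arrays enter the
pipeline through placeholders, the iterator must be initialized with
init_feed_dict:

import tensorflow as tf

data_set = task.generate_data()
iterator = data_set.make_initializable_iterator()
next_example = iterator.get_next()

with tf.Session() as sess:
    # Initialize any vocab lookup tables created inside the input pipeline.
    sess.run(tf.tables_initializer())
    # Bind the placeholders created in generate_data() to the raw arrays.
    sess.run(iterator.initializer, feed_dict=task.init_feed_dict)
    example = sess.run(next_example)

Feeding through placeholders keeps the corpus out of the serialized graph, at
the cost of having to supply the feed dict every time the iterator is
re-initialized.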
Example 2
    def generate_data(self):
        """Generate data for offline training."""
        # Read records directly from the preprocessed text files: one column
        # (text) for inference, two columns (label, text) for training/eval.
        if self.infer_without_label:
            column_num = 1
            text_ds = load_textline_dataset(self.paths_after_pre_process,
                                            column_num)
        else:
            column_num = 2
            label_ds, text_ds = load_textline_dataset(
                self.paths_after_pre_process, column_num)

        input_pipeline_func = self.get_input_pipeline(for_export=False)

        text_ds = text_ds.map(input_pipeline_func,
                              num_parallel_calls=self.num_parallel_calls)

        text_size_ds = text_ds.map(
            lambda x: compute_sen_lens(x, padding_token=utils.PAD_IDX),
            num_parallel_calls=self.num_parallel_calls)

        text_ds = tf.data.Dataset.zip((text_ds, text_size_ds))

        if self.use_dense:
            dense = load_npy(self.dense_npy)
            dense_ds = load_dense_dataset(dense)

        if self.infer_without_label:
            if self.use_dense:
                data_set = tf.data.Dataset.zip((text_ds, dense_ds))
            else:
                data_set = text_ds
        else:
            label_ds = process_one_label_dataset(label_ds, self.config)
            if self.use_dense:
                data_set = tf.data.Dataset.zip((text_ds, dense_ds, label_ds))
            else:
                data_set = tf.data.Dataset.zip((text_ds, label_ds))

        vocab_dict = load_vocab_dict(self.text_vocab_file_path)
        vocab_size = len(vocab_dict)
        if self.split_token != "":
            if self.split_token not in vocab_dict:
                raise ValueError(
                    "The model uses split token: {}, which is not in the "
                    "corpus.".format(self.split_token))
            self.config['data']['split_token'] = int(
                vocab_dict[self.split_token])
        self.config['data']['vocab_size'] = vocab_size
        # The data size comes from the files themselves, since records are
        # streamed rather than held in memory.
        self.config['data']['{}_data_size'.format(self.mode)] = get_file_len(
            self.paths_after_pre_process)

        return data_set
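
Unlike Example 1, this variant streams records straight from the preprocessed
files via load_textline_dataset, so initializing the iterator needs no feed
dict. A sketch under the same illustrative assumptions as above:

import tensorflow as tf

data_set = task.generate_data()
iterator = data_set.make_initializable_iterator()
next_example = iterator.get_next()

with tf.Session() as sess:
    sess.run(tf.tables_initializer())  # vocab lookup tables, if any
    sess.run(iterator.initializer)     # no feed_dict needed here
    example = sess.run(next_example)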