Example 1
    def sequentialize_data_no_padding(self, train_contents, feature_mode, val_contents=None,
                                      max_length=None,
                                      Max_Vocab_Size=None):
        if Max_Vocab_Size is None:
            Vocab_Size = MAX_VOCAB_SIZE
        else:
            Vocab_Size = Max_Vocab_Size
        info("Max Vocab Size is {}".format(Vocab_Size))
        if tokenizer is None:
            if feature_mode == 0:
                tokenizer = text.Tokenizer(num_words=Vocab_Size,
                                           char_level=True,
                                           oov_token="UNK")
            elif feature_mode == 1:
                tokenizer = text.Tokenizer(num_words=Vocab_Size)

            tokenizer.fit_on_texts(train_contents)

        _max_length = max_length
        word_index = tokenizer.word_index
        num_features = min(len(word_index) + 1, Vocab_Size)
        info("vacab_word:", len(word_index))

        return word_index, num_features, tokenizer, _max_length
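For context, a minimal usage sketch of the tokenizer setup performed above, assuming `text` is `tensorflow.keras.preprocessing.text`; the corpus and the vocabulary cap are illustrative:

    from tensorflow.keras.preprocessing import text

    MAX_VOCAB_SIZE = 20000                      # illustrative cap
    train_contents = ["hello world", "hello there"]

    # feature_mode == 1: word-level tokenizer (feature_mode == 0 would add
    # char_level=True and oov_token="UNK" for character-level features)
    tokenizer = text.Tokenizer(num_words=MAX_VOCAB_SIZE)
    tokenizer.fit_on_texts(train_contents)

    word_index = tokenizer.word_index           # e.g. {'hello': 1, 'world': 2, 'there': 3}
    num_features = min(len(word_index) + 1, MAX_VOCAB_SIZE)
    print(num_features)                         # 4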
Example 2
    def get_nlp_test_numpy_prefetch(self):
        if self.test_tfds is None:
            error("Error: test_tfds is None.")
            return self.accum_test_x, self.accum_test_y
        X, Y = [], []

        if len(self.accum_test_x) == 0:
            time_test_np_start = time.time()
            tfds_test_os_iterator = self.test_tfds.make_one_shot_iterator()
            tfds_test_iter_next = tfds_test_os_iterator.get_next()

            with tf.Session(config=tf.ConfigProto(log_device_placement=False)) as sess:
                while True:
                    try:
                        example, labels = sess.run(tfds_test_iter_next)
                        # drop the singleton spatial dims, then the channel dim
                        example = np.squeeze(example, (2, 3))
                        example = np.squeeze(example, axis=-1)
                        example = example.astype(int)
                        X.extend(example)
                        Y.extend(labels)
                        self.accum_test_x.extend(example)
                        self.accum_test_y.extend(labels)
                        self.accm_test_cnt += example.shape[0]  # count rows, not batches

                    except tf.errors.OutOfRangeError:
                        break

            time_test_np_end = time.time()
            info("note: now take test accm_test_cnt={}, cost_time={}s".format(self.accm_test_cnt, round(
                time_test_np_end - time_test_np_start, 3)))
            # self.accum_test_y = np.array(self.accum_test_y)

        return X, Y
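The loop above is the standard TF 1.x pattern for draining a `tf.data` pipeline into NumPy; a self-contained sketch with a toy dataset standing in for `self.test_tfds`:

    import numpy as np
    import tensorflow as tf  # TF 1.x API

    dataset = tf.data.Dataset.from_tensor_slices(
        (np.arange(10, dtype=np.int64), np.arange(10, dtype=np.int64))).batch(4)
    next_batch = dataset.make_one_shot_iterator().get_next()

    X, Y = [], []
    with tf.Session() as sess:
        while True:
            try:
                example, labels = sess.run(next_batch)
                X.extend(example)
                Y.extend(labels)
            except tf.errors.OutOfRangeError:  # pipeline exhausted
                break
    print(len(X), len(Y))  # 10 10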
Example 3
    def sample_train_index_add(self, add_index):
        train_label_distribution = np.sum(np.array(self.update_y), 0)
        info("new sample train_distribution: {}".format(train_label_distribution))  # per-label distribution
        self.max_sample_num_per_class = int(
            np.max(train_label_distribution) * 4 / 5)

        if self.sample_num_per_class is None:
            if self.num_samples_train < MAX_SAMPLE_TRIAN:
                self.sample_num_per_class = self.max_sample_num_per_class
            else:
                self.sample_num_per_class = min(self.max_sample_num_per_class, self.MAX_TRAIN_PERCLASS_SAMPLE)

        info("start sample data")
        max_sample_num = min(self.sample_num_per_class, int(np.mean(train_label_distribution)))
        if self.imbalance_flg:
            max_sample_num = int(max_sample_num * self.normal_std)
        info("max_sample_num is {}".format(max_sample_num))

        meta_train_add_index = []
        for i in range(self.num_classes):  # sample per label class
            if len(add_index[i]) == 0:
                continue
            elif len(add_index[i]) < self.sample_num_per_class:
                if self.imbalance_flg:
                    if len(add_index[i]) < max_sample_num:

                        # repeat the whole list, then top up with a random partial sample
                        tmp = add_index[i] * (max_sample_num // len(add_index[i]))
                        tmp += random.sample(add_index[i],
                                             max_sample_num - len(tmp))

                    else:
                        tmp = random.sample(
                            add_index[i], max_sample_num)
                else:
                    tmp = add_index[i] * (self.sample_num_per_class // len(add_index[i]))
                    tmp += random.sample(add_index[i],
                                         self.sample_num_per_class - len(tmp))

                meta_train_add_index += tmp
            else:  # enough samples: plain random sampling
                if self.imbalance_flg:
                    meta_train_add_index += random.sample(
                        add_index[i], max_sample_num)
                else:
                    meta_train_add_index += random.sample(
                        add_index[i], self.sample_num_per_class)

        info("end sample data")
        random.shuffle(meta_train_add_index)
        self.meta_train_add_index = meta_train_add_index
        return meta_train_add_index
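The core balancing trick above is oversampling by whole-list repetition plus a partial random top-up; isolated as a small helper (the names are mine, not the author's):

    import random

    def oversample(index_list, target):
        """Repeat the whole list, then top up with a random partial sample."""
        tmp = index_list * (target // len(index_list))
        tmp += random.sample(index_list, target - len(tmp))
        return tmp

    print(len(oversample([10, 11, 12], 8)))  # 8: [10, 11, 12] twice, plus 2 random picks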
Example 4
    def get_train_numpy(self, update_train_num):
        if self.train_tfds is None:
            error("Error: train_tfds is None.")
            return self.accum_train_x, self.accum_train_y

        if self.tfds_train_os_iterator is None:
            self.tfds_train_os_iterator = self.train_tfds.make_one_shot_iterator()
            self.tfds_train_iter_next = self.tfds_train_os_iterator.get_next()

        X, Y = [], []
        cur_get_cnt = 0
        cur_data_y = list()

        if self.accm_train_cnt < self.train_num:
            info("show the accum_train_cnt {}".format(self.accm_train_cnt))
            time_train_np_start = time.time()
            while True:
                try:
                    example, labels = self.tfds_convertor_sess.run(self.tfds_train_iter_next)
                    # drop the singleton spatial dims, then the channel dim
                    example = np.squeeze(example, (2, 3))
                    example = np.squeeze(example, axis=-1)

                    # print("Note:time example shape={}".format(example.shape))
                    example = example.astype(np.int)
                    self.accum_train_x.extend(example)
                    # cur_data_y.extend(labels)
                    X.extend(example)
                    Y.extend(labels)
                    # cur_get_cnt += 1
                    # self.accm_train_cnt += 1
                    cur_get_cnt += example.shape[0]
                    self.accm_train_cnt += example.shape[0]
                    # example_batch_num += 1

                    if cur_get_cnt >= update_train_num or self.accm_train_cnt >= self.train_num:
                        time_train_np_end = time.time()
                        info("note: now take train update={}, accm_train_cnt={}, cost_time={}s".format(cur_get_cnt,
                                                                                                       self.accm_train_cnt,
                                                                                                       time_train_np_end - time_train_np_start))
                        break

                except tf.errors.OutOfRangeError:
                    break

            if self.accum_train_y is None:
                self.accum_train_y = np.array(cur_data_y)
            else:
                info("check accum_train_y shape {}, cur_data_y shape {}".format(
                    self.accum_train_y.shape, np.array(cur_data_y).shape))
                self.accum_train_y = np.concatenate((self.accum_train_y, np.array(cur_data_y)), axis=0)
        else:
            self.tfds_convertor_sess.close()

        return X, Y
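Stripped of the TensorFlow plumbing, the loop above implements a budgeted fetch: stop once either the per-call budget (`update_train_num`) or the total train count is hit. A minimal sketch of that control flow (names hypothetical):

    def fetch_until(batches, budget, seen, total_limit):
        """Drain batches until `budget` new rows are taken or `total_limit` is reached."""
        out, taken = [], 0
        for batch in batches:
            out.extend(batch)
            taken += len(batch)
            seen += len(batch)
            if taken >= budget or seen >= total_limit:
                break
        return out, seen

    rows, seen = fetch_until([[1, 2], [3, 4], [5, 6]], budget=3, seen=0, total_limit=100)
    print(rows, seen)  # [1, 2, 3, 4] 4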
Example 5
    def __init__(self,
                 x_train, y_train,
                 metadata,
                 imbalance_level=-1,
                 multi_label=False):

        self.meta_data_x, self.meta_data_y = x_train, y_train
        # the first incremental batch is just the initial data
        self.update_x, self.update_y = x_train, y_train

        self.metadata = metadata

        self.num_classes = self.metadata['class_num']
        self.num_samples_train = self.metadata['train_num']
        self.language = metadata['language']
        self.multi_label = multi_label

        print("num_samples_train:", self.num_samples_train)
        print("num_class_train:", self.num_classes)

        self.val_index = None
        self.tokenizer = None
        self.max_length = None
        self.sample_num_per_class = None
        self.data_feature = {}
        self.eda_feature = {}
        self.pseudo_x_train_size = 0
        self.full_x = []
        self.full_y = np.array([])

        self.x_dict = {i: [] for i in range(self.num_classes)}
        self.imbalance_flg = False
        self.do_generate_sample = False
        self.empty_class_ = []
        self.meta_train_x = []
        self.meta_train_y = np.array([])

        self.full_index = None
        self.imbalance_level = imbalance_level
        self.MAX_TRAIN_PERCLASS_SAMPLE = MAX_TRAIN_PERCLASS_SAMPLE
        info("Init Data Manager! Imbalance_level is {}".format(self.imbalance_level))
        if 2 < self.num_classes <= 5 and self.imbalance_level <= 1:
            self.MAX_TRAIN_PERCLASS_SAMPLE = 3000
        elif self.num_classes == 2 and self.imbalance_level <= 1:
            self.MAX_TRAIN_PERCLASS_SAMPLE = 3500

        if self.multi_label:
            if self.num_classes < 50:
                # not too many classes: take 100 samples per class
                self.MAX_TRAIN_PERCLASS_SAMPLE = 100
            elif self.num_classes < 100:
                self.MAX_TRAIN_PERCLASS_SAMPLE = 50
            else:
                self.MAX_TRAIN_PERCLASS_SAMPLE = 20

        info("Init Data Manager! MAX_TRAIN_PERCLASS_SAMPLE is {}".format(self.MAX_TRAIN_PERCLASS_SAMPLE))
Example 6
    def generate_pseudo_samples(self, train_x, train_y):
        info("Do Radam Create Samples!")
        for i in range(self.num_classes):
            new_samples = self.new_generate_samples_idx[i]
            if len(new_samples) == 0:
                continue
            train_x.extend(new_samples)
            new_label = np.zeros((len(new_samples), self.num_classes))
            new_label[:, i] = 1
            train_y = np.concatenate([train_y, new_label], axis=0)

        return train_x, train_y
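The label construction above is plain one-hot stacking in NumPy; a minimal demonstration:

    import numpy as np

    num_classes = 4
    train_y = np.zeros((0, num_classes))        # start empty
    new_samples = ["pseudo_1", "pseudo_2"]      # two pseudo samples for class 2
    new_label = np.zeros((len(new_samples), num_classes))
    new_label[:, 2] = 1                         # one-hot column for class 2
    train_y = np.concatenate([train_y, new_label], axis=0)
    print(train_y)
    # [[0. 0. 1. 0.]
    #  [0. 0. 1. 0.]]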
Example 7
    def sample_dataset_pipeline(self, use_val=False, update_train=True, data_x=None, data_y=None):
        """
        全局采样pipeline
        :param use_val: 是否采用val数据
        :param update_train: 是否更新全量train
        :param data_x: 采样数据来源x:增量数据或者全量原始数据
        :param data_y: 采样数据来源y:增量数据或者全量原始数据
        :return: 均衡采样后的训练集/评估集,use_val为True时,评估集为空
        """
        val_diff_x, val_diff_y = None, None
        ############################ sampling preparation ###################################
        if update_train:
            # incremental update (the first batch of samples is itself the increment)
            self.add_index, self.add_val_index = self.sample_val_index(data_y)

            val_diff_x, val_diff_y = map_x_y(self.add_val_index, data_x, data_y)
            # at this point the training set has not been sampled yet
            train_diff_x, train_diff_y = flat_map_x_y(index=self.add_index, x=data_x, y=data_y)

            if use_val:  # if using val, skip the train/valid split and update meta_train with all data
                info(color_msg(msg="use val is True", color='blue'))
                train_diff_x = train_diff_x + val_diff_x
                train_diff_y = np.concatenate([train_diff_y, val_diff_y], axis=0)
                val_diff_x = None
                val_diff_y = None

            self._update_train_meta(train_diff_x, train_diff_y)

        if val_diff_x:
            val_label_distribution = np.sum(np.array(val_diff_y), 0)
            info("val_distribution: {}".format(val_label_distribution))
            info("Check val_diff_x size {}, val_diff_y size {}".format(len(val_diff_x),
                                                                       val_diff_y.shape[0]))

        info("Check meta_train_x size {}, meta_train_y size {}".format(len(self.meta_train_x),
                                                                       self.meta_train_y.shape[0]))
        info("Check meta_data_x size {}, meta_data_y size {}".format(len(self.meta_data_x),
                                                                     self.meta_data_y.shape[0]))

        ############################ sampling stage ###################################
        train_x, train_y = self.get_sampling_data_frm_full_train()
        return train_x, train_y, val_diff_x, val_diff_y
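`map_x_y` and `flat_map_x_y` are helpers defined elsewhere in the codebase; hypothetical minimal versions consistent with how they are called here (a flat index list versus per-class index lists), purely to make the data flow readable:

    import numpy as np

    def map_x_y(index, x, y):
        """Hypothetical: select rows by a flat list of indices."""
        return [x[i] for i in index], np.array(y)[index]

    def flat_map_x_y(index, x, y):
        """Hypothetical: `index` holds one index list per class; flatten first."""
        flat = [i for per_class in index for i in per_class]
        return [x[i] for i in flat], np.array(y)[flat]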
Example 8
    def get_sampling_data_frm_full_train(self):
        """
        Sample from the full train data; only looks at the current meta_train_x, meta_train_y.
        :return:
        """
        sample_index = get_sample_index(self.meta_train_y, self.num_classes)
        train_label_distribution = np.sum(np.array(self.meta_train_y), 0)
        info(color_msg("before sampling--train_distribution: {}".format(train_label_distribution),
                       color='yellow'))  # per-label distribution
        self.balance_sampling_index(sample_index, train_label_distribution)
        # each call only checks whether the data being sampled is balanced and whether pseudo samples are needed
        self.normal_std, self.empty_class_ = get_imbalance_statistic(train_label_distribution)
        self.check_imbalance_level(train_label_distribution)
        self.new_generate_samples_idx = self.generate_presudo_samples(sample_index)
        self.show_data_info()
        self.imbalance_flg = False
        train_x, train_y = self.extend_train_data(x=self.meta_train_x, y=self.meta_train_y)
        train_label_distribution = np.sum(np.array(train_y), 0)
        info(color_msg("after sampling--train_distribution: {}".format(train_label_distribution),
                       color='yellow'))  # per-label distribution
        return train_x, train_y
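`get_sample_index` groups row indices by class; a hypothetical minimal version for one-hot labels, alongside the distribution computed from them:

    import numpy as np

    def get_sample_index(y, num_classes):
        """Hypothetical: one list of row indices per class, for one-hot labels."""
        y = np.array(y)
        return [np.where(y[:, c] == 1)[0].tolist() for c in range(num_classes)]

    y = np.array([[1, 0], [0, 1], [1, 0]])
    print(get_sample_index(y, 2))  # [[0, 2], [1]]
    print(np.sum(y, 0))            # [2 1] -- the label distribution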
Example 9
    def _set_max_train_sample_num(self, train_label_distribution):
        self.max_sample_num_per_class = int(
            np.max(train_label_distribution) * 4 / 5)

        if self.sample_num_per_class is None:
            if self.num_samples_train < MAX_SAMPLE_TRIAN:
                self.sample_num_per_class = self.max_sample_num_per_class
            else:
                self.sample_num_per_class = min(self.max_sample_num_per_class, self.MAX_TRAIN_PERCLASS_SAMPLE)
        else:
            # avoid under-sampling later rounds when there are many classes and the first batch is small
            self.sample_num_per_class = max(self.max_sample_num_per_class, int(np.mean(train_label_distribution)))
        info("check sample_num_per_class:{}".format(self.sample_num_per_class))

        if self.imbalance_flg:
            max_sample_num = min(self.sample_num_per_class, int(np.mean(train_label_distribution)))
            max_sample_num = min(max_sample_num, self.MAX_TRAIN_PERCLASS_SAMPLE)
        else:
            max_sample_num = min(self.sample_num_per_class, self.MAX_TRAIN_PERCLASS_SAMPLE)

        info("max_sample_num is {}".format(max_sample_num))
        return max_sample_num
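Worked through on a concrete (illustrative) distribution, to make the cap arithmetic visible:

    import numpy as np

    train_label_distribution = np.array([100, 40, 10])
    max_per_class = int(np.max(train_label_distribution) * 4 / 5)  # 80
    mean_cnt = int(np.mean(train_label_distribution))              # 50
    PERCLASS_CAP = 60                                              # stand-in for MAX_TRAIN_PERCLASS_SAMPLE
    # imbalanced path: clip by the mean first, then by the per-class cap
    print(min(min(max_per_class, mean_cnt), PERCLASS_CAP))         # 50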
Example 10
    def show_data_info(self):
        info("check empty class {}, imbalance_flg is {}, normalized std is {} and do generate sample flg is {}".format(
            self.empty_class_, self.imbalance_flg,
            round(self.normal_std, 6), self.do_generate_sample))
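`normal_std` comes from `get_imbalance_statistic` (Example 8); a hypothetical reading, assuming it returns the mean-normalized standard deviation of the label distribution plus the indices of empty classes:

    import numpy as np

    def get_imbalance_statistic(train_label_distribution):
        """Hypothetical: (std / mean) of the distribution, and empty-class indices."""
        dist = np.asarray(train_label_distribution, dtype=float)
        normal_std = float(np.std(dist) / np.mean(dist))
        empty_class_ = np.where(dist == 0)[0].tolist()
        return normal_std, empty_class_

    print(get_imbalance_statistic([100, 40, 10, 0]))  # (~1.04, [3])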