def construct_correlation(data_source):
    dataset_attributes = Dataset.get_attributes(data_source)
    attribute_object = []
    for arr in dataset_attributes:
        # one record per attribute: (name, num) plus empty prediction/correlation buffers
        attribute_object.append({
            'name': arr[0],
            'predict': [],
            'correlation': [],
            'num': arr[1],
        })
    return attribute_object
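
# A minimal illustration of the structure construct_correlation builds, using a
# stubbed attribute list; the attribute names below are hypothetical, not taken
# from any real Dataset.
_example_attributes = [('PM2.5', 0), ('DEWP', 1)]
_example_result = [
    {'name': name, 'predict': [], 'correlation': [], 'num': num}
    for name, num in _example_attributes
]
# _example_result mirrors construct_correlation's output: one record per
# attribute, with empty 'predict'/'correlation' buffers to be filled per batch.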
import copy as cp
import datetime
import threading
import time

from data_stream_xu.data import Data
from data_stream_xu.dataset import Dataset
from filters.project_creator import Project

# 1.data generator

# 2.test

# 3.evaluate

# 4.train

if __name__ == '__main__':
    for dataset in Dataset.DATASET:
        for sub_dataset in Dataset.get_sub_dataset(dataset):
            # Set variables
            __instance_count = 0
            __window_size = 500
            __step = 1000

            detection = True
            warning_status = False
            drift_status = False

            # classifier flag
            flag = False

            # Creating a data stream
            data = Data(dataset, sub_dataset)
            labels, attributes = data.get_attributes()
    def sub_thread(self, dataset, sub_dataset, spilt_inform, folder_create,
                   length):

        global global_count
        global naive_bayes_batch_count

        self.nb_set[sub_dataset] = NaiveBayes([0, 1], self.attributes)
        self.last_wl_status[sub_dataset] = dict(r_l=0, hit=[])
        self.instance_set[sub_dataset] = []
        self.nb_classifier_accuracy[sub_dataset] = dict(all_count=0,
                                                        right_count=0,
                                                        accuracy=[])
        self.nb_batch_count[sub_dataset] = 0
        self.nb_drift_prob[sub_dataset] = dict(prob=[], ground_truth=[])
        self.plot_risk_level[sub_dataset] = 0
        self.data_statistics[sub_dataset] = dict(
            name=sub_dataset,
            delay=dict(time=[],
                       accuracy=[],
                       nb_prob=[],
                       bingo=[],
                       hit=[],
                       warning=[],
                       drift=[],
                       warningLevel=[],
                       attributes=self.construct_correlation(dataset),
                       batch_delay=2),
            online=dict(weight=[], time=[], dataNum=[]))
        self.configure[sub_dataset] = {}
        self.warning_level_max[sub_dataset] = 0

        # Set variables
        date_time_flag = False
        __batch_size = 24 * 3600

        __instance_count = 0
        __window_size = Dataset.get_online_learning_batch_interval(
            dataset, sub_dataset)
        __step = 1000
        __start_point = 0
        __count = 0
        __last_unix_time = 0
        __last_warning_status = False
        __last_drift_status = False
        __data_length = Dataset.get_length(dataset, sub_dataset)
        __detect_interval = Dataset.get_detect_batch_interval(
            dataset, sub_dataset)

        lc = []
        bc = []

        current_warning_status = {}
        warning_level_set = []
        current_drift_status = {}
        predict_corr = []
        last_batch_attributions = None

        detection = True
        drift_status = False

        # classifier flag
        prsa_flag = False

        # Creating a data stream
        data = Data(dataset, sub_dataset)
        labels, attributes = data.get_attributes()
        attributes_scheme = AttributeScheme.get_scheme(attributes)
        __numeric_attribute_scheme = attributes_scheme['numeric']

        # Creating a project for saving results
        project = Project('./projects/single/{}'.format(dataset), sub_dataset)
        self.sub_file_path[sub_dataset] = folder_create.sub_folder(sub_dataset)

        # Initializing a learner
        learner = Logistic(labels, attributes_scheme['numeric'])
        learner = OnlineAccuracyUpdatedEnsemble(labels,
                                                attributes_scheme['numeric'],
                                                learner,
                                                windowSize=__window_size,
                                                classifierLimit=10)

        # Initializing a drift detector
        drift_detector = DDM(min_instance=__detect_interval)

        # Initializing an evaluator
        evaluator = EvaluateWithWindowSize(learner, drift_detector, project,
                                           __window_size)

        # train & test
        for x, y, attribute in data.data(batch_size=1):
            if attribute is not None:
                attributes_scheme = AttributeScheme.get_scheme(attributes)
                __numeric_attribute_scheme = attributes_scheme['numeric']
                continue

            instance = x.tolist()[0] + [int(y.tolist()[0][0])]
            # print(instance)

            # Unix timestamp for each instance
            # prsa data
            if dataset == 'prsa_data':
                date_time_flag = True
                date_time = list(map(int, instance[:4]))
                d = datetime.date(date_time[0], date_time[1], date_time[2])
                tt = datetime.time(date_time[3])
                datetime_str = str(d) + ' ' + str(tt)
                unix_time = int(
                    time.mktime(
                        time.strptime(datetime_str, '%Y-%m-%d %H:%M:%S')))
                if unix_time >= 1363881600:  # 2013-03-22 00:00 in UTC+8, assuming mktime runs in that timezone
                    prsa_flag = True
            elif dataset == 'movie_data':
                # movie data
                if instance[-2] > 62091:
                    date_time_flag = True
                    prsa_flag = True
                    date_time = DistributedOnlineLearning.get_date(
                        instance[-2])
                    datetime_str = str(date_time) + ' ' + '00:00:00'
                    unix_time = int(
                        time.mktime(
                            time.strptime(datetime_str, '%Y-%m-%d %H:%M:%S')))
                    instance.pop(-2)  # drop the time field
            elif dataset == 'netease_data':
                date_time_flag = True
                prsa_flag = True
                unix_time = int(instance[0])
                instance.pop(0)  # drop the time field
            instance[0:len(instance) - 1] = Normalizer.normalize(
                instance[0:len(instance) - 1], __numeric_attribute_scheme)
            # print(instance)
            __instance_count += 1

            if __instance_count % 10000 == 0 or __instance_count == __data_length:
                percentage = (__instance_count / __data_length) * 100
                print(
                    sub_dataset, "%0.2f" % percentage +
                    "% of instances are prequentially processed!")

            if __instance_count > __window_size and date_time_flag and prsa_flag:
                if dataset == 'prsa_data':
                    if unix_time == 1363881600:
                        print(__instance_count)
                        __start_point = unix_time
                        self.data_statistics[sub_dataset]['delay'][
                            'time'].append(__start_point)
                elif dataset == 'movie_data':
                    if __instance_count == __window_size + 1:
                        print(__instance_count)
                        __start_point = unix_time
                        self.data_statistics[sub_dataset]['delay'][
                            'time'].append(__start_point)
                elif dataset == 'netease_data':
                    if __instance_count == __window_size + 1:
                        print(__instance_count)
                        __start_point = unix_time
                        self.data_statistics[sub_dataset]['delay'][
                            'time'].append(__start_point)

                difference_value = unix_time - __start_point
                if difference_value >= __batch_size and __instance_count != __data_length:
                    batch_interval = int(difference_value / __batch_size)
                    for cc in range(batch_interval):
                        if cc == 0:
                            r_f = False
                        else:
                            r_f = True
                        self.con.acquire()  # acquire the lock
                        __start_point += __batch_size
                        # for every batch
                        # classifier weights
                        self.data_statistics[sub_dataset]['online'][
                            'weight'].append(
                                learner.currentClassifierWeights.tolist())
                        # attribute correlation record
                        self.data_statistics[sub_dataset]['delay'][
                            'attributes'], last_batch_attributions, lc, bc = self.calculate_correlation(
                                predict_corr,
                                last_batch_attributions,
                                self.data_statistics[sub_dataset]['delay']
                                ['attributes'],
                                spilt_inform,
                                lc,
                                bc,
                                repeat_flag=r_f)
                        # accuracy (may change; currently the drift accuracy computed on the backend)
                        self.data_statistics[sub_dataset]['delay'][
                            'accuracy'].append(drift_detector.accuracy)
                        # batch start time
                        self.data_statistics[sub_dataset]['delay'][
                            'time'].append(__start_point)
                        # instance count for this batch
                        self.data_statistics[sub_dataset]['online'][
                            'dataNum'].append(__count)
                        # warning level
                        self.data_statistics[sub_dataset]['delay'][
                            'warningLevel'], warning_level_set, self.data_statistics[
                                sub_dataset]['delay'][
                                    'hit'], self.warning_level_max[
                                        sub_dataset] = self.calculate_warning_level(
                                            warning_level_set,
                                            self.data_statistics[sub_dataset]
                                            ['delay']['warningLevel'],
                                            self.data_statistics[sub_dataset]
                                            ['delay']['hit'],
                                            self.
                                            warning_level_max[sub_dataset],
                                            repeated_flag=r_f)

                        __count = 0

                        # self.last_wl_status[sub_dataset]['r_l'] = self.wl_transformer(self.data_statistics[sub_dataset]['delay']['warningLevel'][-1]['max'])
                        if len(self.last_wl_status[sub_dataset]['hit']) > 2:
                            self.last_wl_status[sub_dataset]['hit'].pop(0)
                            self.last_wl_status[sub_dataset]['hit'].append(
                                self.data_statistics[sub_dataset]['delay']
                                ['hit'][-1])
                        else:
                            self.last_wl_status[sub_dataset]['hit'].append(
                                self.data_statistics[sub_dataset]['delay']
                                ['hit'][-1])

                        global_count += 1

                        # self.plot_risk_level[sub_dataset] = self.data_statistics[sub_dataset]['delay']['warningLevel'][-1]['max']

                        if global_count == length:
                            global_count = 0
                            # train and test each model's Naive Bayes classifier
                            d_s_n = self.nb_set.keys()
                            # train the Naive Bayes classifiers
                            if len(self.data_statistics[sub_dataset]['delay']
                                   ['warningLevel']) > 2:
                                for data_set_name in d_s_n:
                                    self.instance_set[data_set_name] = [self.wl_transformer(self.data_statistics[value]['delay']['warningLevel'][-2]['max'])
                                                                        for value in d_s_n if value != data_set_name] \
                                                                       + [max(self.last_wl_status[data_set_name]['hit'])]
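                                # each instance_set entry: the previous batch's
                                # (transformed) warning levels of all *other*
                                # sources as features, plus this source's most
                                # recent max hit as the last element (used as
                                # the target when testing below)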
                                if len(self.data_statistics[sub_dataset]
                                       ['delay']['warningLevel']) > 3:
                                    # testing
                                    for temple_name in d_s_n:
                                        self.nb_set[temple_name].set_ready()
                                        predict = self.nb_set[
                                            temple_name].do_testing(
                                                self.instance_set[temple_name])
                                        self.data_statistics[temple_name][
                                            'delay']['nb_prob'].append(
                                                self.nb_set[temple_name].
                                                drift_prob)
                                        self.nb_drift_prob[temple_name][
                                            'prob'].append(
                                                self.nb_set[temple_name].
                                                drift_prob)
                                        self.nb_drift_prob[temple_name][
                                            'ground_truth'].append(
                                                self.data_statistics[
                                                    temple_name]['delay']
                                                ['warningLevel'][-2]['max'])

                                        # testing with the maximum drift_level over the last 3 batches
                                        # self.nb_drift_prob[temple_name]['ground_truth'].append(
                                        #     max(
                                        #         self.data_statistics[temple_name]['delay']['warningLevel'][-3]['max'],
                                        #         self.data_statistics[temple_name]['delay']['warningLevel'][-2]['max'],
                                        #         self.data_statistics[temple_name]['delay']['warningLevel'][-1]['max']
                                        #     )
                                        # )

                                        if predict == self.instance_set[
                                                temple_name][-1]:
                                            self.nb_classifier_accuracy[
                                                temple_name][
                                                    'right_count'] += 1
                                        self.nb_classifier_accuracy[
                                            temple_name]['all_count'] += 1
                                        self.nb_classifier_accuracy[
                                            temple_name]['accuracy'].append(
                                                round(
                                                    self.
                                                    nb_classifier_accuracy[
                                                        temple_name]
                                                    ['right_count'] / self.
                                                    nb_classifier_accuracy[
                                                        temple_name]
                                                    ['all_count'], 4))
                                    # training
                                    for temple_name in d_s_n:
                                        self.nb_set[temple_name].do_training(
                                            self.instance_set[temple_name],
                                            drift_status)
                                else:
                                    for temple_name in d_s_n:
                                        self.nb_set[temple_name].do_training(
                                            self.instance_set[temple_name],
                                            drift_status)
                            self.con.notifyAll()
                        else:
                            self.con.wait()

                    predict_corr = [instance]
                    __count = 1
                else:
                    __count += 1
                    predict_corr.append(instance)

                predicted_value = learner.do_testing(instance)  # prequential: test on each instance, then train on it

                prediction_status = evaluator.calculate_accuracy(
                    predicted_value,
                    instance[-1],
                    output_size=__step,
                    output_flag=False)

                if detection is True:
                    warning_status, drift_status = drift_detector.detect(
                        prediction_status)
                    if warning_status is not __last_warning_status:
                        if warning_status:
                            current_warning_status['start'] = unix_time
                            current_warning_status['max_accuracy'] = [
                                drift_detector.o_s_d_min
                            ]
                            current_warning_status['max_accuracy_time'] = [
                                unix_time
                            ]
                            current_warning_status['backend_accuracy'] = [
                                drift_detector.accuracy
                            ]
                        else:
                            current_warning_status['end'] = __last_unix_time
                            self.data_statistics[sub_dataset]['delay'][
                                'warning'].append(current_warning_status)
                            current_warning_status = {}
                    else:
                        if warning_status:
                            current_warning_status['max_accuracy'].append(
                                drift_detector.o_s_d_min)
                            current_warning_status['max_accuracy_time'].append(
                                unix_time)
                            current_warning_status['backend_accuracy'].append(
                                drift_detector.accuracy)
                    if drift_status is not __last_drift_status:
                        if drift_status:
                            current_drift_status['start'] = unix_time
                            current_drift_status['max_accuracy'] = [
                                drift_detector.o_s_d_min
                            ]
                            current_drift_status['max_accuracy_time'] = [
                                unix_time
                            ]
                            current_drift_status['backend_accuracy'] = [
                                drift_detector.accuracy
                            ]
                        else:
                            current_drift_status['end'] = __last_unix_time
                            self.data_statistics[sub_dataset]['delay'][
                                'drift'].append(current_drift_status)
                            current_drift_status = {}
                    else:
                        if drift_status:
                            current_drift_status['max_accuracy'].append(
                                drift_detector.o_s_d_min)
                            current_drift_status['max_accuracy_time'].append(
                                unix_time)
                            current_drift_status['backend_accuracy'].append(
                                drift_detector.accuracy)

                    __last_warning_status = warning_status
                    __last_drift_status = drift_status
                    __last_unix_time = unix_time
                else:
                    # warning_status = False
                    drift_status = False

                warning_level_set.append(
                    drift_detector.risk)  # record the warning level reported by DDM
                # if 1365912000-12*3600 <= unix_time <= 1365998400-12*3600:
                #     if unix_time == 1365998400 - 13*3600:
                #         print(sub_dataset, len(warning_level_set), warning_level_set)
                #     if unix_time == 1365998400-12*3600:
                #         print(sub_dataset, self.data_statistics[sub_dataset]['delay']['warningLevel'][-1])
                #     if drift_status:
                #         print('dataset {} true drift point ->'.format(sub_dataset), unix_time, drift_detector.risk)
                # if 1366401600 <= unix_time <= 1366488000:
                #     # print(sub_dataset, len(warning_level_set), warning_level_set)
                #     if unix_time == 1366488000:
                #         print(sub_dataset, self.data_statistics[sub_dataset]['delay']['warningLevel'][-1])
                #     if drift_status:
                #         print('dataset {} true drift point ->'.format(sub_dataset), unix_time, drift_detector.risk)

                if __instance_count == __data_length:  # the last batch may contain only a few instances and needs handling
                    print(sub_dataset, global_count)
                    self.con.acquire()  # acquire the lock
                    # classifier weights
                    self.data_statistics[sub_dataset]['online'][
                        'weight'].append(
                            learner.currentClassifierWeights.tolist())
                    # attribute correlation record
                    self.data_statistics[sub_dataset]['delay'][
                        'attributes'], last_batch_attributions, lc, bc = self.calculate_correlation(
                            predict_corr,
                            last_batch_attributions,
                            self.data_statistics[sub_dataset]['delay']
                            ['attributes'],
                            spilt_inform,
                            lc,
                            bc,
                            repeat_flag=False)
                    # accuracy (may change; currently the drift accuracy computed on the backend)
                    self.data_statistics[sub_dataset]['delay'][
                        'accuracy'].append(drift_detector.accuracy)
                    # instance count for this batch
                    self.data_statistics[sub_dataset]['online'][
                        'dataNum'].append(__count)
                    # warning level
                    self.data_statistics[sub_dataset]['delay'][
                        'warningLevel'], warning_level_set, self.data_statistics[
                            sub_dataset]['delay'][
                                'hit'], self.warning_level_max[
                                    sub_dataset] = self.calculate_warning_level(
                                        warning_level_set,
                                        self.data_statistics[sub_dataset]
                                        ['delay']['warningLevel'],
                                        self.data_statistics[sub_dataset]
                                        ['delay']['hit'],
                                        self.warning_level_max[sub_dataset],
                                        repeated_flag=False)

                    self.last_wl_status[sub_dataset][
                        'r_l'] = self.wl_transformer(
                            self.data_statistics[sub_dataset]['delay']
                            ['warningLevel'][-1]['max'])
                    if len(self.last_wl_status[sub_dataset]['hit']) > 2:
                        self.last_wl_status[sub_dataset]['hit'].pop(0)
                        self.last_wl_status[sub_dataset]['hit'].append(
                            self.data_statistics[sub_dataset]['delay']['hit']
                            [-1])
                    else:
                        self.last_wl_status[sub_dataset]['hit'].append(
                            self.data_statistics[sub_dataset]['delay']['hit']
                            [-1])

                    global_count += 1

                    # plot the drift probability
                    # Zip.plot_multi_1(self.nb_drift_prob[sub_dataset], sub_dataset)

                    if global_count == length:
                        global_count = 0
                        # train and test each model's Naive Bayes classifier
                        d_s_n = self.nb_set.keys()
                        for data_set_name in d_s_n:
                            self.instance_set[data_set_name] = [self.wl_transformer(self.data_statistics[value]['delay']['warningLevel'][-2]['max'])
                                                                for value in d_s_n if value != data_set_name] \
                                                               + [max(self.last_wl_status[data_set_name]['hit'])]
                        # testing
                        for temple_name in d_s_n:
                            self.nb_set[temple_name].set_ready()
                            predict = self.nb_set[temple_name].do_testing(
                                self.instance_set[temple_name])
                            self.data_statistics[temple_name]['delay'][
                                'nb_prob'].append(
                                    self.nb_set[temple_name].drift_prob)
                            self.nb_drift_prob[temple_name]['prob'].append(
                                self.nb_set[temple_name].drift_prob)
                            self.nb_drift_prob[temple_name][
                                'ground_truth'].append(
                                    self.data_statistics[temple_name]['delay']
                                    ['warningLevel'][-2]['max'])

                            # testing with the maximum drift_level over the last 3 batches
                            # self.nb_drift_prob[temple_name]['ground_truth'].append(
                            #     max(
                            #         self.data_statistics[temple_name]['delay']['warningLevel'][-3]['max'],
                            #         self.data_statistics[temple_name]['delay']['warningLevel'][-2]['max'],
                            #         self.data_statistics[temple_name]['delay']['warningLevel'][-1]['max']
                            #     )
                            # )

                            if predict == self.instance_set[temple_name][-1]:
                                self.nb_classifier_accuracy[temple_name][
                                    'right_count'] += 1
                            self.nb_classifier_accuracy[temple_name][
                                'all_count'] += 1
                            self.nb_classifier_accuracy[temple_name][
                                'accuracy'].append(
                                    round(
                                        self.nb_classifier_accuracy[
                                            temple_name]['right_count'] /
                                        self.nb_classifier_accuracy[
                                            temple_name]['all_count'], 4))
                        # training
                        for temple_name in d_s_n:
                            self.nb_set[temple_name].do_training(
                                self.instance_set[temple_name], drift_status)

                        # Save the state of every data source:
                        # (1) per-source concept drift detection + Naive Bayes drift probability + configure

                        for key_name in self.data_statistics.keys():
                            self.configure[key_name][
                                'timeStart'] = self.data_statistics[key_name][
                                    'delay']['time'][0]
                            self.configure[key_name][
                                'timeEnd'] = self.data_statistics[key_name][
                                    'delay']['time'][-1]
                            self.configure[key_name]['timeUnit'] = __batch_size
                            self.configure[key_name]['dataNumMax'] = max(
                                self.data_statistics[key_name]['online']
                                ['dataNum'])
                            self.configure[key_name][
                                'warningLevelMax'] = self.warning_level_max[
                                    key_name]
                            self.configure[key_name]['warningLevel'] = [[
                                0, 2
                            ], [2, 3], [3, 10000]]
                            self.data_statistics[key_name]['delay'][
                                'hit'] = self.data_statistics[key_name][
                                    'delay']['hit'][:-1]

                        self.save_file_1(self.configure,
                                         self.sub_file_path,
                                         type_name='configure')
                        self.save_file_1(self.data_statistics,
                                         self.sub_file_path,
                                         type_name=None)

                        self.save_file(self.nb_drift_prob,
                                       folder_create.get_path(),
                                       type_name='experiment_with_the_figure')
                        Zip.plot_multi(self.nb_classifier_accuracy)
                        Zip(folder_create.get_path())
                        # all data has been trained; the main process can be ended manually
                        print(
                            'All data has been trained. Please finish the main process manually!'
                        )
                        self.con.notifyAll()
                    else:
                        self.con.wait()

                # training
                learner.do_training(instance, drift_status)
            else:
                # training
                learner.do_training(instance, drift_status)
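
# A standalone sketch of the interval-recording pattern used above for the
# warning/drift statuses: when the detector's status flips on, open an
# interval; when it flips off, close it at the previous timestamp. The
# function and variable names here are illustrative, not part of the project.
def record_intervals(statuses, timestamps):
    intervals, current = [], {}
    last_status, last_time = False, 0
    for status, now in zip(statuses, timestamps):
        if status is not last_status:
            if status:
                current = {'start': now}
            else:
                current['end'] = last_time
                intervals.append(current)
                current = {}
        last_status, last_time = status, now
    return intervals

# record_intervals([False, True, True, False], [1, 2, 3, 4])
# -> [{'start': 2, 'end': 3}]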


if __name__ == "__main__":
    data_path = '/home/hdwu/xu/movie_data'
    Dataset.PATH = data_path
    global_count = 0
    naive_bayes_batch_count = 0
    global_test_flag = False
    lock_con = threading.Condition()
    threads = []
    for ds in Dataset.DATASET:
        information = Dataset.get_spilt_inform(ds)
        f_c = Folder('../result/{}'.format(ds))
        sub_data_set_list = Dataset.get_sub_dataset(ds)
        print(sub_data_set_list)
        dol = DistributedOnlineLearning(lock_con,
                                        data_length=len(sub_data_set_list) - 1)
        for sds_id, sds in enumerate(sub_data_set_list):
            t = threading.Thread(target=dol.sub_thread,
                                 args=(ds, sds, information, f_c,
                                       len(sub_data_set_list)))
            t.start()
            threads.append(t)
        for thread in threads:
            thread.join()
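
# A minimal standalone sketch of the Condition-based barrier the sub_thread
# workers rely on: each thread bumps a shared counter at the end of a batch;
# the last one to arrive resets it and wakes the rest. Names are illustrative;
# the real code uses the module-level global_count with notifyAll/wait.
def batch_barrier(con, counter, length):
    # counter is a one-element list so the increment is shared across threads
    with con:
        counter[0] += 1
        if counter[0] == length:
            counter[0] = 0
            con.notify_all()  # last thread of this batch releases everyone
        else:
            con.wait()        # earlier arrivals block until the batch is full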
Example #5
    def run(self, data_set, sub_data_set):

        if data_set == 'prsa_data':
            self.__batch_size = 24 * 3600  # one day (3600 seconds = 1 hour)
        elif data_set == 'movie_data':
            self.__batch_size = 24 * 7 * 3600  # one week

        self.__data_length = Dataset.get_length(data_set, sub_data_set)
        self.attributes_set = SaveDifferentData.construct_correlation(data_set)
        self.spilt_inform = Dataset.get_spilt_inform(data_set)

        data = Data(data_set, sub_data_set)
        labels, attributes = data.get_attributes()
        attributes_scheme = AttributeScheme.get_scheme(attributes)
        self.__numeric_attribute_scheme = attributes_scheme['numeric']

        # Initializing a learner
        learner = Logistic(labels, attributes_scheme['numeric'])
        learner = OnlineAccuracyUpdatedEnsemble(
            labels,
            attributes_scheme['numeric'],
            learner,
            windowSize=self.__window_size,
            classifierLimit=self.__classifier_limit)

        # Initializing a drift detector
        drift_detector = DDM()

        # Creating a project for saving results
        project = Project('./projects/distributed/{}'.format(data_set),
                          sub_data_set)

        # Initializing an evaluator
        evaluator = EvaluateWithWindowSize(learner, drift_detector, project,
                                           self.__window_size)

        # train & test
        for x, y, attribute in data.data(batch_size=1):
            if attribute is not None:
                attributes_scheme = AttributeScheme.get_scheme(attributes)
                self.__numeric_attribute_scheme = attributes_scheme['numeric']
                continue

            instance = x.tolist()[0] + [int(y.tolist()[0][0])]

            # Unix timestamp for each instance
            # prsa data
            if data_set == 'prsa_data':
                self.date_time_flag = True
                date_time = list(map(int, instance[:4]))
                d = datetime.date(date_time[0], date_time[1], date_time[2])
                t = datetime.time(date_time[3])
                datetime_str = str(d) + ' ' + str(t)
                self.unix_time = int(
                    time.mktime(
                        time.strptime(datetime_str, '%Y-%m-%d %H:%M:%S')))
                if self.unix_time >= 1363881600:
                    self.prsa_flag = True
            elif data_set == 'movie_data':
                # movie data
                if instance[-2] > 62091:
                    self.date_time_flag = True
                    self.prsa_flag = True
                    date_time = self._get_date(instance[-2])
                    datetime_str = str(date_time) + ' ' + '00:00:00'
                    self.unix_time = int(
                        time.mktime(
                            time.strptime(datetime_str, '%Y-%m-%d %H:%M:%S')))

            instance[0:len(instance) - 1] = Normalizer.normalize(
                instance[0:len(instance) - 1], self.__numeric_attribute_scheme)
            self.__instance_count += 1

            if self.__instance_count > self.__window_size and self.date_time_flag and self.prsa_flag:
                if self.unix_time == 1363881600:
                    self.__start_point = self.unix_time
                    self.batch_start_time.append(self.__start_point)
                if self.unix_time - self.__start_point >= self.__batch_size:
                    self.__start_point = self.unix_time
                    # for every batch
                    # classifier weights
                    self.weights.append(
                        learner.currentClassifierWeights.tolist())
                    # attribute correlation record
                    self.calculate_correlation()
                    # accuracy (may change; currently the drift accuracy computed on the backend)
                    self.accuracy.append(drift_detector.accuracy)
                    # batch start time
                    self.batch_start_time.append(self.__start_point)
                    # instance count for this batch
                    self.batch_count.append(self.__count)
                    # currently the number of correct predictions in this batch; later this becomes the number of correctly predicted concept drifts
                    self.right_count.append(self.__bingo)
                    # warning level
                    self.calculate_warning_level()

                    # print(batch_start_time, batch_count)
                    self.predict_corr = [instance]
                    self.__count = 1
                    self.__bingo = 0
                else:
                    self.__count += 1
                    self.predict_corr.append(instance)
                self.warning_level_set.append(drift_detector.risk)

                predicted_value = learner.do_testing(instance)

                prediction_status = evaluator.calculate_accuracy(
                    predicted_value,
                    instance[-1],
                    output_size=self.__step,
                    output_flag=False)
                if prediction_status:
                    self.__bingo += 1

                if self.detection is True:
                    self.warning_status, self.drift_status = drift_detector.detect(
                        prediction_status)
                    if self.warning_status is not self.__last_warning_status:
                        if self.warning_status:
                            self.current_warning_status[
                                'start'] = self.unix_time
                            self.current_warning_status['max_accuracy'] = [
                                drift_detector.o_s_d_min
                            ]
                            self.current_warning_status[
                                'max_accuracy_time'] = [self.unix_time]
                        else:
                            self.current_warning_status[
                                'end'] = self.__last_unix_time
                            self.warning.append(self.current_warning_status)
                            self.current_warning_status = {}
                    else:
                        if self.warning_status:
                            self.current_warning_status['max_accuracy'].append(
                                drift_detector.o_s_d_min)
                            self.current_warning_status[
                                'max_accuracy_time'].append(self.unix_time)
                    if self.drift_status is not self.__last_drift_status:
                        if self.drift_status:
                            self.current_drift_status['start'] = self.unix_time
                            self.current_drift_status['max_accuracy'] = [
                                drift_detector.o_s_d_min
                            ]
                            self.current_drift_status['max_accuracy_time'] = [
                                self.unix_time
                            ]
                        else:
                            self.current_drift_status[
                                'end'] = self.__last_unix_time
                            self.drift.append(self.current_drift_status)
                            self.current_drift_status = {}
                    else:
                        if self.drift_status:
                            self.current_drift_status['max_accuracy'].append(
                                drift_detector.o_s_d_min)
                            self.current_drift_status[
                                'max_accuracy_time'].append(self.unix_time)

                    self.__last_warning_status = self.warning_status
                    self.__last_drift_status = self.drift_status
                    self.__last_unix_time = self.unix_time
                else:
                    self.warning_status = False
                    self.drift_status = False

                if self.__instance_count == self.__data_length:  # the last batch may contain only a few instances and needs handling
                    # classifier weights
                    self.weights.append(
                        learner.currentClassifierWeights.tolist())
                    # attribute correlation record
                    self.calculate_correlation()
                    # accuracy (may change; currently the drift accuracy computed on the backend)
                    self.accuracy.append(drift_detector.accuracy)
                    # instance count for this batch
                    self.batch_count.append(self.__count)
                    # currently the number of correct predictions in this batch; later this becomes the number of correctly predicted concept drifts
                    self.right_count.append(self.__bingo)
                    # warning level
                    self.calculate_warning_level()

                # training
                learner.do_training(instance, self.drift_status)
            else:
                # training
                learner.do_training(instance, self.drift_status)

        self.save_file()


def wl_transformer(num):
    # map a continuous warning level onto a discrete risk class:
    # 1 = normal, 2 = warning, 3 = drift
    if num < 2:
        return 1
    elif num < 3:
        return 2
    else:
        return 3
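
# A quick sanity check of wl_transformer; the input values are illustrative.
assert wl_transformer(0.5) == 1  # below the warning zone
assert wl_transformer(2.4) == 2  # warning zone
assert wl_transformer(7.0) == 3  # drift zone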


if __name__ == '__main__':
    for dataset in Dataset.DATASET:
        spilt_inform = Dataset.get_spilt_inform(dataset)
        folder_create = Folder('E:/zju/result/{}'.format(dataset))
        accuracy_nb = {}
        for sub_dataset in Dataset.get_sub_dataset(dataset):

            # Set variables
            date_time = 0
            date_time_flag = False

            if dataset == 'prsa_data':
                __batch_size = 24 * 3600  # one day (3600 seconds = 1 hour)
            elif dataset == 'movie_data':
                __batch_size = 24 * 7 * 3600  # one week

            accuracy_nb[sub_dataset] = []
Example #7
    def sub_thread(self, dataset, d_id, sub_dataset, spilt_inform,
                   folder_create, length):

        global global_count

        # Set variables
        date_time_flag = False
        data_set_id = d_id

        if dataset == 'prsa_data':
            __batch_size = 24 * 3600  # one day (3600 seconds = 1 hour)
        elif dataset == 'movie_data':
            __batch_size = 24 * 7 * 3600  # one week
        else:
            __batch_size = 0

        __instance_count = 0
        __window_size = 500
        __step = 1000
        __start_point = 0
        __count = 0
        __bingo = 0
        __last_unix_time = 0
        __last_warning_status = False
        __last_drift_status = False
        __data_length = Dataset.get_length(dataset, sub_dataset)

        configure = {}
        data_statistics = dict()
        delay = {}
        online = {}

        weights = []
        accuracy = []
        batch_start_time = []
        batch_count = []
        right_count = []
        warning = []
        current_warning_status = {}
        warning_level = []
        warning_level_set = []
        drift = []
        current_drift_status = {}
        predict_corr = []
        hit = []
        last_batch_attributions = None

        detection = True
        drift_status = False

        # classifier flag
        prsa_flag = False

        attributes_set = DistributedOnlineLearning.construct_correlation(
            dataset)

        # Creating a data stream
        data = Data(dataset, sub_dataset)
        labels, attributes = data.get_attributes()
        attributes_scheme = AttributeScheme.get_scheme(attributes)
        __numeric_attribute_scheme = attributes_scheme['numeric']

        # Creating a project for saving results
        project = Project('projects/single/{}'.format(dataset), sub_dataset)
        sub_folder_path = folder_create.sub_folder(sub_dataset)

        # Initializing a learner
        learner = Logistic(labels, attributes_scheme['numeric'])
        learner = OnlineAccuracyUpdatedEnsemble(labels,
                                                attributes_scheme['numeric'],
                                                learner,
                                                windowSize=__window_size,
                                                classifierLimit=10)

        # Initializing a drift detector
        drift_detector = DDM()

        # Initializing an evaluator
        evaluator = EvaluateWithWindowSize(learner, drift_detector, project,
                                           __window_size)

        # train & test
        for x, y, attribute in data.data(batch_size=1):
            if attribute is not None:
                attributes_scheme = AttributeScheme.get_scheme(attributes)
                __numeric_attribute_scheme = attributes_scheme['numeric']
                continue

            instance = x.tolist()[0] + [int(y.tolist()[0][0])]

            # Unix timestamp for each instance
            # prsa data
            if dataset == 'prsa_data':
                date_time_flag = True
                date_time = list(map(int, instance[:4]))
                d = datetime.date(date_time[0], date_time[1], date_time[2])
                t = datetime.time(date_time[3])
                datetime_str = str(d) + ' ' + str(t)
                unix_time = int(
                    time.mktime(
                        time.strptime(datetime_str, '%Y-%m-%d %H:%M:%S')))
                if unix_time >= 1363881600:
                    prsa_flag = True
            elif dataset == 'movie_data':
                # movie data
                if instance[-2] > 62091:
                    date_time_flag = True
                    prsa_flag = True
                    date_time = DistributedOnlineLearning.get_date(
                        instance[-2])
                    datetime_str = str(date_time) + ' ' + '00:00:00'
                    unix_time = int(
                        time.mktime(
                            time.strptime(datetime_str, '%Y-%m-%d %H:%M:%S')))

            instance[0:len(instance) - 1] = Normalizer.normalize(
                instance[0:len(instance) - 1], __numeric_attribute_scheme)
            __instance_count += 1

            if __instance_count > __window_size and date_time_flag and prsa_flag:
                if dataset == 'prsa_data':
                    if unix_time == 1363881600:
                        __start_point = unix_time
                        batch_start_time.append(__start_point)
                elif dataset == 'movie_data':
                    if __instance_count == __window_size + 1:
                        __start_point = unix_time
                        batch_start_time.append(__start_point)
                if unix_time - __start_point >= __batch_size:

                    self.con.acquire()  # acquire the lock

                    __start_point = unix_time
                    # for every batch
                    # classifier weights
                    weights.append(learner.currentClassifierWeights.tolist())
                    # attribute correlation record
                    attributes_set, last_batch_attributions = DistributedOnlineLearning.calculate_correlation(
                        predict_corr, last_batch_attributions, attributes_set,
                        spilt_inform)
                    # accuracy (may change; currently the drift accuracy computed on the backend)
                    accuracy.append(drift_detector.accuracy)
                    # batch start time
                    batch_start_time.append(__start_point)
                    # instance count for this batch
                    batch_count.append(__count)
                    # currently the number of correct predictions in this batch; later this becomes the number of correctly predicted concept drifts
                    right_count.append(__bingo)
                    # warning level
                    warning_level, warning_level_set, hit = DistributedOnlineLearning.calculate_warning_level(
                        warning_level_set, warning_level, hit)

                    predict_corr = [instance]
                    __count = 1
                    __bingo = 0

                    # newly added Naive Bayes step
                    if len(warning_level) > 1:
                        inst = [
                            self.wl_transformer(warning_level[-2]['max']),
                            hit[-1]
                        ]
                        if len(warning_level) > 2:
                            self.nb.set_ready()
                            predicted = self.nb.do_testing(inst)
                            print(data_set_id, predicted)
                            self.nb.do_training(inst)
                        else:
                            self.nb.do_training(inst)
                    global_count += 1

                    if global_count == length:
                        global_count = 0
                        self.con.notifyAll()
                    else:
                        self.con.wait()

                else:
                    __count += 1
                    predict_corr.append(instance)

                warning_level_set.append(drift_detector.risk)

                predicted_value = learner.do_testing(instance)

                prediction_status = evaluator.calculate_accuracy(
                    predicted_value,
                    instance[-1],
                    output_size=__step,
                    output_flag=False)
                if prediction_status:
                    __bingo += 1

                if detection is True:
                    warning_status, drift_status = drift_detector.detect(
                        prediction_status)
                    if warning_status is not __last_warning_status:
                        if warning_status:
                            current_warning_status['start'] = unix_time
                            current_warning_status['max_accuracy'] = [
                                drift_detector.o_s_d_min
                            ]
                            current_warning_status['max_accuracy_time'] = [
                                unix_time
                            ]
                            current_warning_status['backend_accuracy'] = [
                                drift_detector.accuracy
                            ]
                        else:
                            current_warning_status['end'] = __last_unix_time
                            warning.append(current_warning_status)
                            current_warning_status = {}
                    else:
                        if warning_status:
                            current_warning_status['max_accuracy'].append(
                                drift_detector.o_s_d_min)
                            current_warning_status['max_accuracy_time'].append(
                                unix_time)
                            current_warning_status['backend_accuracy'].append(
                                drift_detector.accuracy)
                    if drift_status is not __last_drift_status:
                        if drift_status:
                            current_drift_status['start'] = unix_time
                            current_drift_status['max_accuracy'] = [
                                drift_detector.o_s_d_min
                            ]
                            current_drift_status['max_accuracy_time'] = [
                                unix_time
                            ]
                            current_drift_status['backend_accuracy'] = [
                                drift_detector.accuracy
                            ]
                        else:
                            current_drift_status['end'] = __last_unix_time
                            drift.append(current_drift_status)
                            current_drift_status = {}
                    else:
                        if drift_status:
                            current_drift_status['max_accuracy'].append(
                                drift_detector.o_s_d_min)
                            current_drift_status['max_accuracy_time'].append(
                                unix_time)
                            current_drift_status['backend_accuracy'].append(
                                drift_detector.accuracy)

                    __last_warning_status = warning_status
                    __last_drift_status = drift_status
                    __last_unix_time = unix_time
                else:
                    warning_status = False
                    drift_status = False
                # if 1393401600 - 12*3600 <= unix_time <= 1393401600 + 12*3600:
                #     print("准确率为, S, P", evaluator.accuracy, drift_detector.S, drift_detector.P)

                if __instance_count == __data_length:  # the last batch may contain only a few instances and needs handling

                    self.con.acquire()  # acquire the lock

                    # classifier weights
                    weights.append(learner.currentClassifierWeights.tolist())
                    # attribute correlation record
                    attributes_set, last_batch_attributions = DistributedOnlineLearning.calculate_correlation(
                        predict_corr, last_batch_attributions, attributes_set,
                        spilt_inform)
                    # accuracy (may change; currently the drift accuracy computed on the backend)
                    accuracy.append(drift_detector.accuracy)
                    # instance count for this batch
                    batch_count.append(__count)
                    # currently the number of correct predictions in this batch; later this becomes the number of correctly predicted concept drifts
                    right_count.append(__bingo)
                    # warning level
                    warning_level, warning_level_set, hit = DistributedOnlineLearning.calculate_warning_level(
                        warning_level_set, warning_level, hit)

                    # newly added Naive Bayes step
                    if len(warning_level) > 1:
                        inst = [
                            self.wl_transformer(warning_level[-2]['max']),
                            hit[-1]
                        ]
                        if len(warning_level) > 2:
                            self.nb.do_testing(inst)
                            self.nb.do_training(inst)
                        else:
                            self.nb.do_training(inst)
                    global_count += 1

                    if global_count == length:
                        global_count = 0
                        if __instance_count == __data_length:
                            # save all the data
                            pass
                            self.con.notifyAll()
                    else:
                        if __instance_count == __data_length:
                            # save all the data
                            pass
                        self.con.wait()

                # training
                learner.do_training(instance, drift_status)
            else:
                # training
                learner.do_training(instance, drift_status)

        configure['timeStart'] = batch_start_time[0]
        configure['timeEnd'] = batch_start_time[-1]
        configure['timeUnit'] = __batch_size
        configure['dataNumMax'] = max(batch_count)

        data_statistics['name'] = sub_dataset
        delay['time'] = batch_start_time
        delay['accuracy'] = accuracy
        delay['bingo'] = []
        delay['hit'] = hit
        delay['warning'] = warning
        delay['drift'] = drift
        delay['warningLevel'] = warning_level
        delay['attributes'] = attributes_set

        online['weight'] = weights
        online['time'] = batch_start_time
        online['dataNum'] = batch_count

        data_statistics['delay'] = delay
        data_statistics['online'] = online
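
        # For reference, the assembled data_statistics dict has this shape
        # (field names taken from the assignments above, values abbreviated):
        # {'name': sub_dataset,
        #  'delay': {'time': [...], 'accuracy': [...], 'bingo': [], 'hit': [...],
        #            'warning': [...], 'drift': [...], 'warningLevel': [...],
        #            'attributes': [...]},
        #  'online': {'weight': [...], 'time': [...], 'dataNum': [...]}}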
    def sub_thread(self, dataset, d_id, sub_dataset, spilt_inform,
                   folder_create, length):

        global global_count

        self.nb_set[sub_dataset] = NaiveBayes([0, 1], self.attributes)
        self.last_wl_status[sub_dataset] = []
        self.instance_set[sub_dataset] = []
        self.nb_classifier_accuracy[sub_dataset] = dict(all_count=0,
                                                        right_count=0,
                                                        accuracy=[])

        # Set variables
        date_time_flag = False
        data_set_id = d_id

        if dataset == 'prsa_data':
            __batch_size = 24 * 3600  # one day (3600 seconds = 1 hour)
        elif dataset == 'movie_data':
            __batch_size = 24 * 7 * 3600  # one week
        else:
            __batch_size = 0

        __instance_count = 0
        __window_size = 500
        __step = 1000
        __start_point = 0
        __count = 0
        # __bingo = 0
        __last_unix_time = 0
        __last_warning_status = False
        __last_drift_status = False
        __warning_level_max = 0
        __data_length = Dataset.get_length(dataset, sub_dataset)

        lc = []
        bc = []

        configure = {}
        data_statistics = dict()
        delay = {}
        online = {}

        weights = []
        accuracy = []
        batch_start_time = []
        batch_count = []
        # right_count = []
        warning = []
        current_warning_status = {}
        warning_level = []
        warning_level_set = []
        drift = []
        current_drift_status = {}
        predict_corr = []
        hit = []
        last_batch_attributions = None

        detection = True
        drift_status = False

        # classifier flag
        prsa_flag = False

        attributes_set = DistributedOnlineLearning.construct_correlation(
            dataset)

        # Creating a data stream
        data = Data(dataset, sub_dataset)
        labels, attributes = data.get_attributes()
        attributes_scheme = AttributeScheme.get_scheme(attributes)
        __numeric_attribute_scheme = attributes_scheme['numeric']

        # Creating a project for saving results
        project = Project('projects/single/{}'.format(dataset), sub_dataset)
        sub_folder_path = folder_create.sub_folder(sub_dataset)

        # Initializing a learner
        learner = Logistic(labels, attributes_scheme['numeric'])
        learner = OnlineAccuracyUpdatedEnsemble(labels,
                                                attributes_scheme['numeric'],
                                                learner,
                                                windowSize=__window_size,
                                                classifierLimit=10)

        # Initializing a drift detector
        drift_detector = DDM()

        # Initializing an evaluator
        evaluator = EvaluateWithWindowSize(learner, drift_detector, project,
                                           __window_size)

        # train & test
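        # Per-instance flow: the ensemble predicts, the evaluator turns the
        # prediction into a hit/miss signal, and DDM consumes that signal to
        # raise warning / drift statuses.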
        for x, y, attribute in data.data(batch_size=1):
            if attribute is not None:
                attributes_scheme = AttributeScheme.get_scheme(attributes)
                __numeric_attribute_scheme = attributes_scheme['numeric']
                continue

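            # Flatten the (x, y) pair into one row: feature values followed
            # by the integer class label.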
            instance = x.tolist()[0] + [int(y.tolist()[0][0])]

            # Unix timestamp for every instance
            # prsa data
            if dataset == 'prsa_data':
                date_time_flag = True
                date_time = list(map(int, instance[:4]))
                d = datetime.date(date_time[0], date_time[1], date_time[2])
                tt = datetime.time(date_time[3])
                datetime_str = str(d) + ' ' + str(tt)
                unix_time = int(
                    time.mktime(
                        time.strptime(datetime_str, '%Y-%m-%d %H:%M:%S')))
                if unix_time >= 1363881600:  # 2013-03-22 00:00 (UTC+8)
                    prsa_flag = True
            elif dataset == 'movie_data':
                # movie data
                if instance[-2] > 62091:  # 62091 days after 1800-01-01 is 1970-01-01
                    date_time_flag = True
                    prsa_flag = True
                    date_time = DistributedOnlineLearning._get_date(
                        instance[-2])
                    datetime_str = str(date_time) + ' ' + '00:00:00'
                    unix_time = int(
                        time.mktime(
                            time.strptime(datetime_str, '%Y-%m-%d %H:%M:%S')))

            instance[0:len(instance) - 1] = Normalizer.normalize(
                instance[0:len(instance) - 1], __numeric_attribute_scheme)
            __instance_count += 1

            if __instance_count > __window_size and date_time_flag and prsa_flag:
                if dataset == 'prsa_data':
                    if unix_time == 1363881600:
                        __start_point = unix_time
                        batch_start_time.append(__start_point)
                elif dataset == 'movie_data':
                    if __instance_count == __window_size + 1:
                        __start_point = unix_time
                        batch_start_time.append(__start_point)

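                # Once the elapsed time since the batch start reaches the
                # batch size, flush one statistics record per elapsed batch
                # (a sparse stream may span several batch intervals at once).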
                difference_value = unix_time - __start_point
                if difference_value >= __batch_size:
                    self.con.acquire()  # acquire the shared lock
                    batch_interval = int(difference_value / __batch_size)
                    for cc in range(batch_interval):
                        if cc == 0:
                            r_f = False
                        else:
                            r_f = True
                        __start_point += __batch_size
                        # for every batch
                        # classifier weights
                        weights.append(
                            learner.currentClassifierWeights.tolist())
                        # attribute correlation records
                        attributes_set, last_batch_attributions, lc, bc = self.calculate_correlation(
                            predict_corr,
                            last_batch_attributions,
                            attributes_set,
                            spilt_inform,
                            lc,
                            bc,
                            repeat_flag=r_f)
                        # accuracy (may change; currently the drift accuracy computed by the backend)
                        accuracy.append(drift_detector.accuracy)
                        # batch start time
                        batch_start_time.append(__start_point)
                        # number of instances in this batch
                        batch_count.append(__count)
                        # warning level
                        warning_level, warning_level_set, hit, __warning_level_max = self.calculate_warning_level(
                            warning_level_set,
                            warning_level,
                            hit,
                            __warning_level_max,
                            repeated_flag=r_f)
                        # save the current status of each data source
                        self.last_wl_status[sub_dataset] = [
                            self.wl_transformer(warning_level[-1]['max']),
                            hit[-1]
                        ]
                        __count = 0

                        global_count += 1

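                        # Barrier-style synchronization on self.con: the last
                        # worker to finish its batch (global_count == length)
                        # updates the shared Naive Bayes models and wakes the
                        # rest; every other worker blocks in wait().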
                        if global_count == length:
                            global_count = 0
                            # train and test each source's Naive Bayes model
                            if len(warning_level) > 1:
                                d_s_n = self.nb_set.keys()
                                for data_set_name in d_s_n:
                                    self.instance_set[data_set_name] = [self.last_wl_status[value][0]
                                                                        for value in d_s_n if value != data_set_name]\
                                                                       + [self.last_wl_status[data_set_name][-1]]
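                                    # instance_set[s]: the latest warning
                                    # levels of every other source plus s's
                                    # own hit flag as the label.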
                                if len(warning_level) > 2:
                                    # testing
                                    for temple_name in d_s_n:
                                        self.nb_set[temple_name].set_ready()
                                        predict = self.nb_set[
                                            temple_name].do_testing(
                                                self.instance_set[temple_name])
                                        if predict == self.instance_set[
                                                temple_name][-1]:
                                            self.nb_classifier_accuracy[
                                                temple_name][
                                                    'right_count'] += 1
                                        self.nb_classifier_accuracy[
                                            temple_name]['all_count'] += 1
                                        self.nb_classifier_accuracy[
                                            temple_name]['accuracy'].append(
                                                round(
                                                    self.
                                                    nb_classifier_accuracy[
                                                        temple_name]
                                                    ['right_count'] / self.
                                                    nb_classifier_accuracy[
                                                        temple_name]
                                                    ['all_count'], 4))
                                    # training
                                    for temple_name in d_s_n:
                                        self.nb_set[temple_name].do_training(
                                            self.instance_set[temple_name],
                                            drift_status)
                                else:
                                    for temple_name in d_s_n:
                                        self.nb_set[temple_name].do_training(
                                            self.instance_set[temple_name],
                                            drift_status)
                            # print(self.nb_classifier_accuracy)
                            self.con.notify_all()
                        else:
                            self.con.wait()

                    predict_corr = [instance]
                    __count = 1
                else:
                    __count += 1
                    predict_corr.append(instance)

                warning_level_set.append(drift_detector.risk)

                predicted_value = learner.do_testing(instance)

                prediction_status = evaluator.calculate_accuracy(
                    predicted_value,
                    instance[-1],
                    output_size=__step,
                    output_flag=False)
                # if prediction_status:
                #     __bingo += 1

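                # Track contiguous warning / drift intervals: open a record
                # on a False -> True transition, extend it while the status
                # holds, and close it with the previous timestamp on a
                # True -> False transition.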
                if detection is True:
                    warning_status, drift_status = drift_detector.detect(
                        prediction_status)
                    if warning_status is not __last_warning_status:
                        if warning_status:
                            current_warning_status['start'] = unix_time
                            current_warning_status['max_accuracy'] = [
                                drift_detector.o_s_d_min
                            ]
                            current_warning_status['max_accuracy_time'] = [
                                unix_time
                            ]
                            current_warning_status['backend_accuracy'] = [
                                drift_detector.accuracy
                            ]
                        else:
                            current_warning_status['end'] = __last_unix_time
                            warning.append(current_warning_status)
                            current_warning_status = {}
                    else:
                        if warning_status:
                            current_warning_status['max_accuracy'].append(
                                drift_detector.o_s_d_min)
                            current_warning_status['max_accuracy_time'].append(
                                unix_time)
                            current_warning_status['backend_accuracy'].append(
                                drift_detector.accuracy)
                    if drift_status is not __last_drift_status:
                        if drift_status:
                            current_drift_status['start'] = unix_time
                            current_drift_status['max_accuracy'] = [
                                drift_detector.o_s_d_min
                            ]
                            current_drift_status['max_accuracy_time'] = [
                                unix_time
                            ]
                            current_drift_status['backend_accuracy'] = [
                                drift_detector.accuracy
                            ]
                        else:
                            current_drift_status['end'] = __last_unix_time
                            drift.append(current_drift_status)
                            current_drift_status = {}
                    else:
                        if drift_status:
                            current_drift_status['max_accuracy'].append(
                                drift_detector.o_s_d_min)
                            current_drift_status['max_accuracy_time'].append(
                                unix_time)
                            current_drift_status['backend_accuracy'].append(
                                drift_detector.accuracy)

                    __last_warning_status = warning_status
                    __last_drift_status = drift_status
                    __last_unix_time = unix_time
                else:
                    # warning_status = False
                    drift_status = False

                if __instance_count == __data_length:  # the last batch may hold only a few instances, so flush it here

                    self.con.acquire()  # acquire the shared lock

                    # classifier weights
                    weights.append(learner.currentClassifierWeights.tolist())
                    # attribute correlation records
                    attributes_set, last_batch_attributions, lc, bc = self.calculate_correlation(
                        predict_corr,
                        last_batch_attributions,
                        attributes_set,
                        spilt_inform,
                        lc,
                        bc,
                        repeat_flag=False)
                    # accuracy (may change; currently the drift accuracy computed by the backend)
                    accuracy.append(drift_detector.accuracy)
                    # number of instances in this batch
                    batch_count.append(__count)
                    # # currently the number of correct predictions in the
                    # # batch; later to become the number of correctly
                    # # predicted concept drifts
                    # right_count.append(__bingo)
                    # warning level
                    warning_level, warning_level_set, hit, __warning_level_max = self.calculate_warning_level(
                        warning_level_set,
                        warning_level,
                        hit,
                        __warning_level_max,
                        repeated_flag=False)

                    global_count += 1

                    if global_count == length:
                        global_count = 0
                        # train and test each source's Naive Bayes model
                        if len(warning_level) > 1:
                            d_s_n = self.nb_set.keys()
                            for data_set_name in d_s_n:
                                self.instance_set[data_set_name] = [self.last_wl_status[value][0]
                                                                    for value in d_s_n if value != data_set_name] \
                                                                   + [self.last_wl_status[data_set_name][-1]]
                            if len(warning_level) > 2:
                                # testing
                                for temple_name in d_s_n:
                                    predict = self.nb_set[
                                        temple_name].do_testing(
                                            self.instance_set[temple_name])
                                    if predict == self.instance_set[
                                            temple_name][-1]:
                                        self.nb_classifier_accuracy[
                                            temple_name]['right_count'] += 1
                                    self.nb_classifier_accuracy[temple_name][
                                        'all_count'] += 1
                                    self.nb_classifier_accuracy[temple_name][
                                        'accuracy'].append(
                                            round(
                                                self.nb_classifier_accuracy[
                                                    temple_name]['right_count']
                                                / self.nb_classifier_accuracy[
                                                    temple_name]['all_count'],
                                                4))
                                # training
                                for temple_name in d_s_n:
                                    self.nb_set[temple_name].do_training(
                                        self.instance_set[temple_name],
                                        drift_status)
                            else:
                                for temple_name in d_s_n:
                                    self.nb_set[temple_name].do_training(
                                        self.instance_set[temple_name],
                                        drift_status)
                        # save the state of the last data source
                        configure['timeStart'] = batch_start_time[0]
                        configure['timeEnd'] = batch_start_time[-1]
                        configure['timeUnit'] = __batch_size
                        configure['dataNumMax'] = max(batch_count)
                        configure['warningLevelMax'] = __warning_level_max

                        data_statistics['name'] = sub_dataset
                        delay['time'] = batch_start_time
                        delay['accuracy'] = accuracy
                        delay['bingo'] = []
                        delay['hit'] = hit[1:]
                        delay['warning'] = warning
                        delay['drift'] = drift
                        delay['warningLevel'] = warning_level
                        delay['attributes'] = attributes_set

                        online['weight'] = weights
                        online['time'] = batch_start_time
                        online['dataNum'] = batch_count

                        data_statistics['delay'] = delay
                        data_statistics['online'] = online

                        save_path = sub_folder_path + '/'
                        self.save_file(configure,
                                       save_path,
                                       type_name='configure')
                        self.save_file(data_statistics,
                                       save_path,
                                       type_name=None)

                        # all data trained; the main process can be ended manually
                        print(
                            'All data has been trained. Please stop the main process manually!'
                        )
                        Zip.plot_multi(self.nb_classifier_accuracy)
                        Zip(folder_create.get_path())
                        self.con.notify_all()
                    else:
                        configure['timeStart'] = batch_start_time[0]
                        configure['timeEnd'] = batch_start_time[-1]
                        configure['timeUnit'] = __batch_size
                        configure['dataNumMax'] = max(batch_count)

                        data_statistics['name'] = sub_dataset
                        delay['time'] = batch_start_time
                        delay['accuracy'] = accuracy
                        delay['bingo'] = []
                        delay['hit'] = hit[1:]
                        delay['warning'] = warning
                        delay['drift'] = drift
                        delay['warningLevel'] = warning_level
                        delay['attributes'] = attributes_set

                        online['weight'] = weights
                        online['time'] = batch_start_time
                        online['dataNum'] = batch_count

                        data_statistics['delay'] = delay
                        data_statistics['online'] = online

                        save_path = sub_folder_path + '/'
                        self.save_file(configure,
                                       save_path,
                                       type_name='configure')
                        self.save_file(data_statistics,
                                       save_path,
                                       type_name=None)

                        self.con.wait()

                # training
                learner.do_training(instance, drift_status)
            else:
                # training
                learner.do_training(instance, drift_status)

    @staticmethod
    def _get_date(days):
        # Convert a day offset from 1800-01-01 into a calendar date.
        base = time.strptime('1800-01-01', "%Y-%m-%d")
        return datetime.date(base[0], base[1],
                             base[2]) + datetime.timedelta(days=days)

    @staticmethod
    def construct_correlation(data_source):
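        # Build one record per attribute of the data source: its name and
        # size plus empty predict / correlation lists filled in per batch.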
        dataset_attributes = Dataset.get_attributes(data_source)
        attribute_object = []
        a = dict()
        for arr in dataset_attributes:
            a['name'] = arr[0]
            a['predict'] = []
            a['correlation'] = []
            a['num'] = arr[1]
            attribute_object.append(a)
            a = dict()
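        # For example, hypothetical attributes [('PM2.5', 1), ('TEMP', 1)]
        # would yield:
        #   [{'name': 'PM2.5', 'predict': [], 'correlation': [], 'num': 1},
        #    {'name': 'TEMP', 'predict': [], 'correlation': [], 'num': 1}]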
        return attribute_object


if __name__ == '__main__':
    global_count = 0
    lock_con = threading.Condition()
    threads = []
    dol = DistributedOnlineLearning()
    for dataset in Dataset.DATASET:
        spilt_inform = Dataset.get_spilt_inform(dataset)
        for sub_dataset in Dataset.get_sub_dataset(dataset):
            pass
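            # A minimal sketch of the intended per-source fan-out, assuming
            # d_id, length and folder_create are prepared to match
            # sub_thread's signature (their construction is not shown here):
            #
            #     t = threading.Thread(
            #         target=dol.sub_thread,
            #         args=(dataset, d_id, sub_dataset, spilt_inform,
            #               folder_create, length))
            #     threads.append(t)
            #     t.start()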