Example #1

# Imports inferred from the other examples in this listing (os and copy are
# required by the function body below).
import os
from copy import copy

from data_structures.attribute_scheme import AttributeScheme
from filters.project_creator import Project
from streams.readers.arff_reader import ARFFReader
from tasks.prequential_learner_detector_pairs import PrequentialMultiPairs

def run_trial(stream_path, target_dir, drift_points, pairs):

    # for example, stream_path='./benchmark_data/circles/circles_0.arff'

    # 1. Creating a project
    stream_name = os.path.splitext(os.path.basename(stream_path))[0]
    project = Project(target_dir, stream_name)

    # 2. Loading an arff file
    labels, attributes, stream_records = ARFFReader.read(stream_path)
    attributes_scheme = AttributeScheme.get_scheme(attributes)

    # 3. Creating a color set for plotting results
    #     colors = [Color.Indigo[1], Color.Blue[1], Color.Green[1], Color.Lime[1], Color.Yellow[1],
    #           Color.Amber[1], Color.Orange[1], Color.Red[1], Color.Purple[1], Color.Pink[1],
    #           Color.Indigo[2], Color.Blue[2], Color.Green[2], Color.Lime[2], Color.Yellow[2],
    #           Color.Amber[2], Color.Orange[2], Color.Red[2], Color.Purple[2], Color.Pink[2],
    #           Color.Indigo[3], Color.Blue[3], Color.Green[3], Color.Lime[3], Color.Yellow[3],
    #           Color.Amber[3], Color.Orange[3], Color.Red[3], Color.Purple[3], Color.Pink[3]][:len(pairs)]

    # 4. Defining actual locations of drifts, acceptance delay interval, and vector of weights
    actual_drift_points = drift_points
    drift_acceptance_interval = 250
    w_vec = [1, 1, 1, 1, 1, 1]

    # 5. Creating a Prequential Evaluation Process
    pairs = [[pair[0](labels, attributes_scheme[pair[1]]),
              copy(pair[2])] for pair in pairs]
    prequential = PrequentialMultiPairs(
        pairs,
        attributes,
        attributes_scheme,
        actual_drift_points,
        drift_acceptance_interval,
        w_vec,
        project,
        legend_param=False)  # color_set=colors,

    prequential.run(stream_records, 1)
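
A minimal usage sketch, not part of the original listing: run_trial consumes (classifier class, scheme key, detector instance) triples, instantiating the classifier and copying the detector for each pair. The drift locations below are hypothetical placeholders; NaiveBayes, FHDDM and CUSUM are the framework classes imported in the later examples.

from classifier.__init__ import *       # provides NaiveBayes (see Example #3)
from drift_detection.__init__ import *  # provides FHDDM, CUSUM (see Example #3)

# Hedged sketch: drift points and pair choices are placeholders.
run_trial(stream_path='./benchmark_data/circles/circles_0.arff',
          target_dir='projects/multi',
          drift_points=[25000, 50000, 75000],
          pairs=[(NaiveBayes, 'nominal', FHDDM()),
                 (NaiveBayes, 'nominal', CUSUM())])

Example #2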
            # Initializing a learner
            learner = Logistic(labels, attributes_scheme['numeric'])
            learner_copy = cp.deepcopy(learner)

            # Initializing Classifier-Detector Pairs
            pairs = [[
                OnlineAccuracyUpdatedEnsemble(labels,
                                              attributes_scheme['numeric'],
                                              learner_copy,
                                              windowSize=__window_size,
                                              classifierLimit=10),
                DDM()
            ], [learner, DDM()]]

            # Creating a project for saving results
            project = Project('projects/multi/{}'.format(dataset), sub_dataset)

            # Creating a color set for plotting results
            colors = ['#FF0000', '#3E8ABF', '#1891FF', '#4A0083', '#00FFFF']

            # Initializing an evaluator
            evaluator = EvaluateWithWindowSizeMulti(pairs, project, colors,
                                                    __window_size)

            # train & test
            for x, y, attribute in data.data(batch_size=1):
                if attribute is not None:
                    attributes_scheme = AttributeScheme.get_scheme(attributes)
                    __numeric_attribute_scheme = attributes_scheme['numeric']
                    continue
Example #3

"""
The Tornado Framework
By Ali Pesaranghader
University of Ottawa, Ontario, Canada
E-mail: apesaran -at- uottawa -dot- ca / alipsgh -at- gmail -dot- com
"""

from data_structures.attribute_scheme import AttributeScheme
from classifier.__init__ import *
from drift_detection.__init__ import *
from filters.project_creator import Project
from streams.readers.arff_reader import ARFFReader
from tasks.__init__ import *


# 1. Creating a project
project = Project("projects/single", "sine1")

# 2. Loading an arff file
# labels, attributes, stream_records = ARFFReader.read("data_streams/sine1_w_50_n_0.1/sine1_w_50_n_0.1_101.arff")
labels, attributes, stream_records = ARFFReader.read("data_streams/catsdogs/catsdogs.arff")
attributes_scheme = AttributeScheme.get_scheme(attributes)

print(labels, attributes)
print(attributes_scheme)

# 3. Initializing a Learner
# learner = NaiveBayes(labels, attributes_scheme['nominal'])
learner = Catsdogs(labels, attributes_scheme['nominal'])

# 4. Initializing a drift detector
detector = FHDDM(n=50)
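
The listing ends after the detector is created; the remaining step would presumably mirror the prequential run in Example #1. A hedged sketch follows: the class name PrequentialDriftEvaluator and its argument order are assumptions inferred from the PrequentialMultiPairs call shown earlier, not confirmed by this listing.

# 5. Running a prequential evaluation with the learner-detector pair (sketch)
actual_drift_points = [20000, 40000, 60000, 80000]  # hypothetical locations
drift_acceptance_interval = 250                     # tolerance used in Example #1
prequential = PrequentialDriftEvaluator(learner, detector,
                                        attributes, attributes_scheme,
                                        actual_drift_points,
                                        drift_acceptance_interval, project)
prequential.run(stream_records, 1)

Example #4

"""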
The Tornado Framework
By Ali Pesaranghader
University of Ottawa, Ontario, Canada
E-mail: apesaran -at- uottawa -dot- ca / alipsgh -at- gmail -dot- com
"""

from data_structures.attribute_scheme import AttributeScheme
from classifier.__init__ import *
from drift_detection.__init__ import *
from filters.project_creator import Project
from graphic.hex_colors import Color
from streams.readers.arff_reader import ARFFReader
from tasks.prequential_learner_detector_pairs import PrequentialMultiPairs

# 1. Creating a project
project = Project("projects/multi", "sine1")

# 2. Loading an arff file
labels, attributes, stream_records = ARFFReader.read(
    "data_streams/sine1_w_50_n_0.1/sine1_w_50_n_0.1_101.arff")
attributes_scheme = AttributeScheme.get_scheme(attributes)

# 3. Initializing Classifier-Detector Pairs
pairs = [
    [NaiveBayes(labels, attributes_scheme['nominal']), FHDDM()],
    [NaiveBayes(labels, attributes_scheme['nominal']), FHDDMS()],
    [NaiveBayes(labels, attributes_scheme['nominal']), CUSUM()],
    [NaiveBayes(labels, attributes_scheme['nominal']), PH()],
    # The listing is truncated here; DDM() is an assumed completion of the fifth pair.
    [NaiveBayes(labels, attributes_scheme['nominal']), DDM()],
]

Example #5
    def sub_thread(self, dataset, sub_dataset, spilt_inform, folder_create,
                   length):

        global global_count
        global naive_bayes_batch_count

        self.nb_set[sub_dataset] = NaiveBayes([0, 1], self.attributes)
        self.last_wl_status[sub_dataset] = dict(r_l=0, hit=[])
        self.instance_set[sub_dataset] = []
        self.nb_classifier_accuracy[sub_dataset] = dict(all_count=0,
                                                        right_count=0,
                                                        accuracy=[])
        self.nb_batch_count[sub_dataset] = 0
        self.nb_drift_prob[sub_dataset] = dict(prob=[], ground_truth=[])
        self.plot_risk_level[sub_dataset] = 0
        self.data_statistics[sub_dataset] = dict(
            name=sub_dataset,
            delay=dict(time=[],
                       accuracy=[],
                       nb_prob=[],
                       bingo=[],
                       hit=[],
                       warning=[],
                       drift=[],
                       warningLevel=[],
                       attributes=self.construct_correlation(dataset),
                       batch_delay=2),
            online=dict(weight=[], time=[], dataNum=[]))
        self.configure[sub_dataset] = {}
        self.warning_level_max[sub_dataset] = 0

        # Set variables
        date_time_flag = False
        __batch_size = 24 * 3600

        __instance_count = 0
        __window_size = Dataset.get_online_learning_batch_interval(
            dataset, sub_dataset)
        __step = 1000
        __start_point = 0
        __count = 0
        __last_unix_time = 0
        __last_warning_status = False
        __last_drift_status = False
        __data_length = Dataset.get_length(dataset, sub_dataset)
        __detect_interval = Dataset.get_detect_batch_interval(
            dataset, sub_dataset)

        lc = []
        bc = []

        current_warning_status = {}
        warning_level_set = []
        current_drift_status = {}
        predict_corr = []
        last_batch_attributions = None

        detection = True
        drift_status = False

        # classifier flag
        prsa_flag = False

        # Creating a data stream
        data = Data(dataset, sub_dataset)
        labels, attributes = data.get_attributes()
        attributes_scheme = AttributeScheme.get_scheme(attributes)
        __numeric_attribute_scheme = attributes_scheme['numeric']

        # Creating a project for saving results
        project = Project('./projects/single/{}'.format(dataset), sub_dataset)
        self.sub_file_path[sub_dataset] = folder_create.sub_folder(sub_dataset)

        # Initializing a learner
        learner = Logistic(labels, attributes_scheme['numeric'])
        learner = OnlineAccuracyUpdatedEnsemble(labels,
                                                attributes_scheme['numeric'],
                                                learner,
                                                windowSize=__window_size,
                                                classifierLimit=10)

        # Initializing a drift detector
        drift_detector = DDM(min_instance=__detect_interval)

        # Initializing an evaluator
        evaluator = EvaluateWithWindowSize(learner, drift_detector, project,
                                           __window_size)

        # train & test
        for x, y, attribute in data.data(batch_size=1):
            if attribute is not None:
                attributes_scheme = AttributeScheme.get_scheme(attributes)
                __numeric_attribute_scheme = attributes_scheme['numeric']
                continue

            instance = x.tolist()[0] + [int(y.tolist()[0][0])]
            # print(instance)

            # Unix timestamp of each record
            # prsa data
            if dataset == 'prsa_data':
                date_time_flag = True
                date_time = list(map(int, instance[:4]))
                d = datetime.date(date_time[0], date_time[1], date_time[2])
                tt = datetime.time(date_time[3])
                datetime_str = str(d) + ' ' + str(tt)
                unix_time = int(
                    time.mktime(
                        time.strptime(datetime_str, '%Y-%m-%d %H:%M:%S')))
                if unix_time >= 1363881600:
                    prsa_flag = True
            elif dataset == 'movie_data':
                # movie data
                if instance[-2] > 62091:
                    date_time_flag = True
                    prsa_flag = True
                    date_time = DistributedOnlineLearning.get_date(
                        instance[-2])
                    datetime_str = str(date_time) + ' ' + '00:00:00'
                    unix_time = int(
                        time.mktime(
                            time.strptime(datetime_str, '%Y-%m-%d %H:%M:%S')))
                    instance.pop(-2)  # drop the time field
            elif dataset == 'netease_data':
                date_time_flag = True
                prsa_flag = True
                unix_time = int(instance[0])
                instance.pop(0)  # drop the time field
            instance[0:len(instance) - 1] = Normalizer.normalize(
                instance[0:len(instance) - 1], __numeric_attribute_scheme)
            # print(instance)
            __instance_count += 1

            if __instance_count % 10000 == 0 or __instance_count == __data_length:
                percentage = (__instance_count / __data_length) * 100
                print(
                    sub_dataset, "%0.2f" % percentage +
                    "% of instances are prequentially processed!")

            if __instance_count > __window_size and date_time_flag and prsa_flag:
                if dataset == 'prsa_data':
                    if unix_time == 1363881600:
                        print(__instance_count)
                        __start_point = unix_time
                        self.data_statistics[sub_dataset]['delay'][
                            'time'].append(__start_point)
                elif dataset == 'movie_data':
                    if __instance_count == __window_size + 1:
                        print(__instance_count)
                        __start_point = unix_time
                        self.data_statistics[sub_dataset]['delay'][
                            'time'].append(__start_point)
                elif dataset == 'netease_data':
                    if __instance_count == __window_size + 1:
                        print(__instance_count)
                        __start_point = unix_time
                        self.data_statistics[sub_dataset]['delay'][
                            'time'].append(__start_point)

                difference_value = unix_time - __start_point
                if difference_value >= __batch_size and __instance_count != __data_length:
                    batch_interval = int(difference_value / __batch_size)
                    for cc in range(batch_interval):
                        if cc == 0:
                            r_f = False
                        else:
                            r_f = True
                        self.con.acquire()  # acquire the lock
                        __start_point += __batch_size
                        # for every batch
                        # classifier weights
                        self.data_statistics[sub_dataset]['online'][
                            'weight'].append(
                                learner.currentClassifierWeights.tolist())
                        # attribute correlation record
                        self.data_statistics[sub_dataset]['delay'][
                            'attributes'], last_batch_attributions, lc, bc = self.calculate_correlation(
                                predict_corr,
                                last_batch_attributions,
                                self.data_statistics[sub_dataset]['delay']
                                ['attributes'],
                                spilt_inform,
                                lc,
                                bc,
                                repeat_flag=r_f)
                        # accuracy (may change; currently the backend-computed drift-detection accuracy)
                        self.data_statistics[sub_dataset]['delay'][
                            'accuracy'].append(drift_detector.accuracy)
                        # batch start time
                        self.data_statistics[sub_dataset]['delay'][
                            'time'].append(__start_point)
                        # number of records in the batch
                        self.data_statistics[sub_dataset]['online'][
                            'dataNum'].append(__count)
                        # warning level
                        self.data_statistics[sub_dataset]['delay'][
                            'warningLevel'], warning_level_set, self.data_statistics[
                                sub_dataset]['delay'][
                                    'hit'], self.warning_level_max[
                                        sub_dataset] = self.calculate_warning_level(
                                            warning_level_set,
                                            self.data_statistics[sub_dataset]
                                            ['delay']['warningLevel'],
                                            self.data_statistics[sub_dataset]
                                            ['delay']['hit'],
                                            self.
                                            warning_level_max[sub_dataset],
                                            repeated_flag=r_f)

                        __count = 0

                        # self.last_wl_status[sub_dataset]['r_l'] = self.wl_transformer(self.data_statistics[sub_dataset]['delay']['warningLevel'][-1]['max'])
                        if len(self.last_wl_status[sub_dataset]['hit']) > 2:
                            self.last_wl_status[sub_dataset]['hit'].pop(0)
                            self.last_wl_status[sub_dataset]['hit'].append(
                                self.data_statistics[sub_dataset]['delay']
                                ['hit'][-1])
                        else:
                            self.last_wl_status[sub_dataset]['hit'].append(
                                self.data_statistics[sub_dataset]['delay']
                                ['hit'][-1])

                        global_count += 1

                        # self.plot_risk_level[sub_dataset] = self.data_statistics[sub_dataset]['delay']['warningLevel'][-1]['max']

                        if global_count == length:
                            global_count = 0
                            # train and test the naive Bayes of every model
                            d_s_n = self.nb_set.keys()
                            # train the naive Bayes
                            if len(self.data_statistics[sub_dataset]['delay']
                                   ['warningLevel']) > 2:
                                for data_set_name in d_s_n:
                                    self.instance_set[data_set_name] = [self.wl_transformer(self.data_statistics[value]['delay']['warningLevel'][-2]['max'])
                                                                        for value in d_s_n if value != data_set_name] \
                                                                       + [max(self.last_wl_status[data_set_name]['hit'])]
                                if len(self.data_statistics[sub_dataset]
                                       ['delay']['warningLevel']) > 3:
                                    # testing
                                    for temple_name in d_s_n:
                                        self.nb_set[temple_name].set_ready()
                                        predict = self.nb_set[
                                            temple_name].do_testing(
                                                self.instance_set[temple_name])
                                        self.data_statistics[temple_name][
                                            'delay']['nb_prob'].append(
                                                self.nb_set[temple_name].
                                                drift_prob)
                                        self.nb_drift_prob[temple_name][
                                            'prob'].append(
                                                self.nb_set[temple_name].
                                                drift_prob)
                                        self.nb_drift_prob[temple_name][
                                            'ground_truth'].append(
                                                self.data_statistics[
                                                    temple_name]['delay']
                                                ['warningLevel'][-2]['max'])

                                        #  testing with the maximum drift_level over the last 3 batches
                                        # self.nb_drift_prob[temple_name]['ground_truth'].append(
                                        #     max(
                                        #         self.data_statistics[temple_name]['delay']['warningLevel'][-3]['max'],
                                        #         self.data_statistics[temple_name]['delay']['warningLevel'][-2]['max'],
                                        #         self.data_statistics[temple_name]['delay']['warningLevel'][-1]['max']
                                        #     )
                                        # )

                                        if predict == self.instance_set[
                                                temple_name][-1]:
                                            self.nb_classifier_accuracy[
                                                temple_name][
                                                    'right_count'] += 1
                                        self.nb_classifier_accuracy[
                                            temple_name]['all_count'] += 1
                                        self.nb_classifier_accuracy[
                                            temple_name]['accuracy'].append(
                                                round(
                                                    self.
                                                    nb_classifier_accuracy[
                                                        temple_name]
                                                    ['right_count'] / self.
                                                    nb_classifier_accuracy[
                                                        temple_name]
                                                    ['all_count'], 4))
                                    # training
                                    for temple_name in d_s_n:
                                        self.nb_set[temple_name].do_training(
                                            self.instance_set[temple_name],
                                            drift_status)
                                else:
                                    for temple_name in d_s_n:
                                        self.nb_set[temple_name].do_training(
                                            self.instance_set[temple_name],
                                            drift_status)
                            self.con.notifyAll()
                        else:
                            self.con.wait()

                    predict_corr = [instance]
                    __count = 1
                else:
                    __count += 1
                    predict_corr.append(instance)

                predicted_value = learner.do_testing(instance)  # one test per instance

                prediction_status = evaluator.calculate_accuracy(
                    predicted_value,
                    instance[-1],
                    output_size=__step,
                    output_flag=False)

                if detection is True:
                    warning_status, drift_status = drift_detector.detect(
                        prediction_status)
                    if warning_status is not __last_warning_status:
                        if warning_status:
                            current_warning_status['start'] = unix_time
                            current_warning_status['max_accuracy'] = [
                                drift_detector.o_s_d_min
                            ]
                            current_warning_status['max_accuracy_time'] = [
                                unix_time
                            ]
                            current_warning_status['backend_accuracy'] = [
                                drift_detector.accuracy
                            ]
                        else:
                            current_warning_status['end'] = __last_unix_time
                            self.data_statistics[sub_dataset]['delay'][
                                'warning'].append(current_warning_status)
                            current_warning_status = {}
                    else:
                        if warning_status:
                            current_warning_status['max_accuracy'].append(
                                drift_detector.o_s_d_min)
                            current_warning_status['max_accuracy_time'].append(
                                unix_time)
                            current_warning_status['backend_accuracy'].append(
                                drift_detector.accuracy)
                    if drift_status is not __last_drift_status:
                        if drift_status:
                            current_drift_status['start'] = unix_time
                            current_drift_status['max_accuracy'] = [
                                drift_detector.o_s_d_min
                            ]
                            current_drift_status['max_accuracy_time'] = [
                                unix_time
                            ]
                            current_drift_status['backend_accuracy'] = [
                                drift_detector.accuracy
                            ]
                        else:
                            current_drift_status['end'] = __last_unix_time
                            self.data_statistics[sub_dataset]['delay'][
                                'drift'].append(current_drift_status)
                            current_drift_status = {}
                    else:
                        if drift_status:
                            current_drift_status['max_accuracy'].append(
                                drift_detector.o_s_d_min)
                            current_drift_status['max_accuracy_time'].append(
                                unix_time)
                            current_drift_status['backend_accuracy'].append(
                                drift_detector.accuracy)

                    __last_warning_status = warning_status
                    __last_drift_status = drift_status
                    __last_unix_time = unix_time
                else:
                    # warning_status = False
                    drift_status = False

                warning_level_set.append(
                    drift_detector.risk)  # record the warning level after DDM detection
                # if 1365912000-12*3600 <= unix_time <= 1365998400-12*3600:
                #     if unix_time == 1365998400 - 13*3600:
                #         print(sub_dataset, len(warning_level_set), warning_level_set)
                #     if unix_time == 1365998400-12*3600:
                #         print(sub_dataset, self.data_statistics[sub_dataset]['delay']['warningLevel'][-1])
                #     if drift_status:
                #         print('actual drift point in dataset {} ->'.format(sub_dataset), unix_time, drift_detector.risk)
                # if 1366401600 <= unix_time <= 1366488000:
                #     # print(sub_dataset, len(warning_level_set), warning_level_set)
                #     if unix_time == 1366488000:
                #         print(sub_dataset, self.data_statistics[sub_dataset]['delay']['warningLevel'][-1])
                #     if drift_status:
                #         print('actual drift point in dataset {} ->'.format(sub_dataset), unix_time, drift_detector.risk)

                if __instance_count == __data_length:  # the last batch may hold only a few records; handle it here
                    print(sub_dataset, global_count)
                    self.con.acquire()  # acquire the lock
                    # classifier weights
                    self.data_statistics[sub_dataset]['online'][
                        'weight'].append(
                            learner.currentClassifierWeights.tolist())
                    # attribute correlation record
                    self.data_statistics[sub_dataset]['delay'][
                        'attributes'], last_batch_attributions, lc, bc = self.calculate_correlation(
                            predict_corr,
                            last_batch_attributions,
                            self.data_statistics[sub_dataset]['delay']
                            ['attributes'],
                            spilt_inform,
                            lc,
                            bc,
                            repeat_flag=False)
                    # accuracy (may change; currently the backend-computed drift-detection accuracy)
                    self.data_statistics[sub_dataset]['delay'][
                        'accuracy'].append(drift_detector.accuracy)
                    # number of records in the batch
                    self.data_statistics[sub_dataset]['online'][
                        'dataNum'].append(__count)
                    # warning level
                    self.data_statistics[sub_dataset]['delay'][
                        'warningLevel'], warning_level_set, self.data_statistics[
                            sub_dataset]['delay'][
                                'hit'], self.warning_level_max[
                                    sub_dataset] = self.calculate_warning_level(
                                        warning_level_set,
                                        self.data_statistics[sub_dataset]
                                        ['delay']['warningLevel'],
                                        self.data_statistics[sub_dataset]
                                        ['delay']['hit'],
                                        self.warning_level_max[sub_dataset],
                                        repeated_flag=False)

                    self.last_wl_status[sub_dataset][
                        'r_l'] = self.wl_transformer(
                            self.data_statistics[sub_dataset]['delay']
                            ['warningLevel'][-1]['max'])
                    if len(self.last_wl_status[sub_dataset]['hit']) > 2:
                        self.last_wl_status[sub_dataset]['hit'].pop(0)
                        self.last_wl_status[sub_dataset]['hit'].append(
                            self.data_statistics[sub_dataset]['delay']['hit']
                            [-1])
                    else:
                        self.last_wl_status[sub_dataset]['hit'].append(
                            self.data_statistics[sub_dataset]['delay']['hit']
                            [-1])

                    global_count += 1

                    # plot the drift probability
                    # Zip.plot_multi_1(self.nb_drift_prob[sub_dataset], sub_dataset)

                    if global_count == length:
                        global_count = 0
                        # train and test the naive Bayes of every model
                        d_s_n = self.nb_set.keys()
                        for data_set_name in d_s_n:
                            self.instance_set[data_set_name] = [self.wl_transformer(self.data_statistics[value]['delay']['warningLevel'][-2]['max'])
                                                                for value in d_s_n if value != data_set_name] \
                                                               + [max(self.last_wl_status[data_set_name]['hit'])]
                        # testing
                        for temple_name in d_s_n:
                            self.nb_set[temple_name].set_ready()
                            predict = self.nb_set[temple_name].do_testing(
                                self.instance_set[temple_name])
                            self.data_statistics[temple_name]['delay'][
                                'nb_prob'].append(
                                    self.nb_set[temple_name].drift_prob)
                            self.nb_drift_prob[temple_name]['prob'].append(
                                self.nb_set[temple_name].drift_prob)
                            self.nb_drift_prob[temple_name][
                                'ground_truth'].append(
                                    self.data_statistics[temple_name]['delay']
                                    ['warningLevel'][-2]['max'])

                            #  testing with the maximum drift_level over the last 3 batches
                            # self.nb_drift_prob[temple_name]['ground_truth'].append(
                            #     max(
                            #         self.data_statistics[temple_name]['delay']['warningLevel'][-3]['max'],
                            #         self.data_statistics[temple_name]['delay']['warningLevel'][-2]['max'],
                            #         self.data_statistics[temple_name]['delay']['warningLevel'][-1]['max']
                            #     )
                            # )

                            if predict == self.instance_set[temple_name][-1]:
                                self.nb_classifier_accuracy[temple_name][
                                    'right_count'] += 1
                            self.nb_classifier_accuracy[temple_name][
                                'all_count'] += 1
                            self.nb_classifier_accuracy[temple_name][
                                'accuracy'].append(
                                    round(
                                        self.nb_classifier_accuracy[
                                            temple_name]['right_count'] /
                                        self.nb_classifier_accuracy[
                                            temple_name]['all_count'], 4))
                        # training
                        for temple_name in d_s_n:
                            self.nb_set[temple_name].do_training(
                                self.instance_set[temple_name], drift_status)

                        # Save the state of every data source
                        # (1) per-source concept drift detection + naive Bayes drift probability + configure

                        for key_name in self.data_statistics.keys():
                            self.configure[key_name][
                                'timeStart'] = self.data_statistics[key_name][
                                    'delay']['time'][0]
                            self.configure[key_name][
                                'timeEnd'] = self.data_statistics[key_name][
                                    'delay']['time'][-1]
                            self.configure[key_name]['timeUnit'] = __batch_size
                            self.configure[key_name]['dataNumMax'] = max(
                                self.data_statistics[key_name]['online']
                                ['dataNum'])
                            self.configure[key_name][
                                'warningLevelMax'] = self.warning_level_max[
                                    key_name]
                            self.configure[key_name]['warningLevel'] = [[
                                0, 2
                            ], [2, 3], [3, 10000]]
                            self.data_statistics[key_name]['delay'][
                                'hit'] = self.data_statistics[key_name][
                                    'delay']['hit'][:-1]

                        self.save_file_1(self.configure,
                                         self.sub_file_path,
                                         type_name='configure')
                        self.save_file_1(self.data_statistics,
                                         self.sub_file_path,
                                         type_name=None)

                        self.save_file(self.nb_drift_prob,
                                       folder_create.get_path(),
                                       type_name='experiment_with_the_figure')
                        Zip.plot_multi(self.nb_classifier_accuracy)
                        Zip(folder_create.get_path())
                        # all data has been trained; the main process can now be ended
                        print(
                            'All data has been trained. Please finish the main process manually!'
                        )
                        self.con.notifyAll()
                    else:
                        self.con.wait()

                # training
                learner.do_training(instance, drift_status)
            else:
                # training
                learner.do_training(instance, drift_status)
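
The sub_thread body above synchronizes per-dataset worker threads through a shared threading.Condition (self.con) and the module-level global_count, blocking each thread at a batch boundary until all `length` workers arrive. A minimal driver consistent with that protocol might look like the sketch below; DistributedOnlineLearning as the owning class, the sub-dataset names, and the spilt_inform/folder_create placeholders are assumptions.

import threading

global_count = 0       # module-level barrier counter shared by the workers

spilt_inform = {}      # placeholder: split information, cf. Dataset.get_spilt_inform(...)
folder_create = None   # placeholder: folder-creation helper used for output paths

learning = DistributedOnlineLearning()   # assumed owner of sub_thread
learning.con = threading.Condition()     # shared condition used by every worker

sub_datasets = ['station_a', 'station_b', 'station_c']  # hypothetical names
threads = [threading.Thread(target=learning.sub_thread,
                            args=('prsa_data', name, spilt_inform,
                                  folder_create, len(sub_datasets)))
           for name in sub_datasets]
for t in threads:
    t.start()
for t in threads:
    t.join()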
Example #6

    def run(self, data_set, sub_data_set):

        if data_set == 'prsa_data':
            self.__batch_size = 24 * 3600  # one day (3600 seconds = 1 hour)
        elif data_set == 'movie_data':
            self.__batch_size = 24 * 7 * 3600  # one week

        self.__data_length = Dataset.get_length(data_set, sub_data_set)
        self.attributes_set = SaveDifferentData.construct_correlation(data_set)
        self.spilt_inform = Dataset.get_spilt_inform(data_set)

        data = Data(data_set, sub_data_set)
        labels, attributes = data.get_attributes()
        attributes_scheme = AttributeScheme.get_scheme(attributes)
        self.__numeric_attribute_scheme = attributes_scheme['numeric']

        # Initializing a learner
        learner = Logistic(labels, attributes_scheme['numeric'])
        learner = OnlineAccuracyUpdatedEnsemble(
            labels,
            attributes_scheme['numeric'],
            learner,
            windowSize=self.__window_size,
            classifierLimit=self.__classifier_limit)

        # Initializing a drift detector
        drift_detector = DDM()

        # Creating a project for saving results
        project = Project('./projects/distributed/{}'.format(data_set),
                          sub_data_set)

        # Initializing an evaluator
        evaluator = EvaluateWithWindowSize(learner, drift_detector, project,
                                           self.__window_size)

        # train & test
        for x, y, attribute in data.data(batch_size=1):
            if attribute is not None:
                attributes_scheme = AttributeScheme.get_scheme(attributes)
                self.__numeric_attribute_scheme = attributes_scheme['numeric']
                continue

            instance = x.tolist()[0] + [int(y.tolist()[0][0])]

            # Unix timestamp of each record
            # prsa data
            if data_set == 'prsa_data':
                self.date_time_flag = True
                date_time = list(map(int, instance[:4]))
                d = datetime.date(date_time[0], date_time[1], date_time[2])
                t = datetime.time(date_time[3])
                datetime_str = str(d) + ' ' + str(t)
                self.unix_time = int(
                    time.mktime(
                        time.strptime(datetime_str, '%Y-%m-%d %H:%M:%S')))
                if self.unix_time >= 1363881600:
                    self.prsa_flag = True
            elif data_set == 'movie_data':
                # movie data
                if instance[-2] > 62091:
                    self.date_time_flag = True
                    self.prsa_flag = True
                    date_time = self._get_date(instance[-2])
                    datetime_str = str(date_time) + ' ' + '00:00:00'
                    self.unix_time = int(
                        time.mktime(
                            time.strptime(datetime_str, '%Y-%m-%d %H:%M:%S')))

            instance[0:len(instance) - 1] = Normalizer.normalize(
                instance[0:len(instance) - 1], self.__numeric_attribute_scheme)
            self.__instance_count += 1

            if self.__instance_count > self.__window_size and self.date_time_flag and self.prsa_flag:
                if self.unix_time == 1363881600:
                    self.__start_point = self.unix_time
                    self.batch_start_time.append(self.__start_point)
                if self.unix_time - self.__start_point >= self.__batch_size:
                    self.__start_point = self.unix_time
                    # for every batch
                    # classifier weights
                    self.weights.append(
                        learner.currentClassifierWeights.tolist())
                    # attribute correlation record
                    self.calculate_correlation()
                    # accuracy (may change; currently the backend-computed drift-detection accuracy)
                    self.accuracy.append(drift_detector.accuracy)
                    # batch start time
                    self.batch_start_time.append(self.__start_point)
                    # number of records in the batch
                    self.batch_count.append(self.__count)
                    # currently the number of correct predictions in the batch; to be changed to the number of correctly predicted concept drifts
                    self.right_count.append(self.__bingo)
                    # warning level
                    self.calculate_warning_level()

                    # print(batch_start_time, batch_count)
                    self.predict_corr = [instance]
                    self.__count = 1
                    self.__bingo = 0
                else:
                    self.__count += 1
                    self.predict_corr.append(instance)
                self.warning_level_set.append(drift_detector.risk)

                predicted_value = learner.do_testing(instance)

                prediction_status = evaluator.calculate_accuracy(
                    predicted_value,
                    instance[-1],
                    output_size=self.__step,
                    output_flag=False)
                if prediction_status:
                    self.__bingo += 1

                if self.detection is True:
                    self.warning_status, self.drift_status = drift_detector.detect(
                        prediction_status)
                    if self.warning_status is not self.__last_warning_status:
                        if self.warning_status:
                            self.current_warning_status[
                                'start'] = self.unix_time
                            self.current_warning_status['max_accuracy'] = [
                                drift_detector.o_s_d_min
                            ]
                            self.current_warning_status[
                                'max_accuracy_time'] = [self.unix_time]
                        else:
                            self.current_warning_status[
                                'end'] = self.__last_unix_time
                            self.warning.append(self.current_warning_status)
                            self.current_warning_status = {}
                    else:
                        if self.warning_status:
                            self.current_warning_status['max_accuracy'].append(
                                drift_detector.o_s_d_min)
                            self.current_warning_status[
                                'max_accuracy_time'].append(self.unix_time)
                    if self.drift_status is not self.__last_drift_status:
                        if self.drift_status:
                            self.current_drift_status['start'] = self.unix_time
                            self.current_drift_status['max_accuracy'] = [
                                drift_detector.o_s_d_min
                            ]
                            self.current_drift_status['max_accuracy_time'] = [
                                self.unix_time
                            ]
                        else:
                            self.current_drift_status[
                                'end'] = self.__last_unix_time
                            self.drift.append(self.current_drift_status)
                            self.current_drift_status = {}
                    else:
                        if self.drift_status:
                            self.current_drift_status['max_accuracy'].append(
                                drift_detector.o_s_d_min)
                            self.current_drift_status[
                                'max_accuracy_time'].append(self.unix_time)

                    self.__last_warning_status = self.warning_status
                    self.__last_drift_status = self.drift_status
                    self.__last_unix_time = self.unix_time
                else:
                    self.warning_status = False
                    self.drift_status = False

                if self.__instance_count == self.__data_length:  # the last batch may hold only a few records; handle it here
                    # classifier weights
                    self.weights.append(
                        learner.currentClassifierWeights.tolist())
                    # attribute correlation record
                    self.calculate_correlation()
                    # accuracy (may change; currently the backend-computed drift-detection accuracy)
                    self.accuracy.append(drift_detector.accuracy)
                    # number of records in the batch
                    self.batch_count.append(self.__count)
                    # currently the number of correct predictions in the batch; to be changed to the number of correctly predicted concept drifts
                    self.right_count.append(self.__bingo)
                    # warning level
                    self.calculate_warning_level()

                # training
                learner.do_training(instance, self.drift_status)
            else:
                # training
                learner.do_training(instance, self.drift_status)

        self.save_file()
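
A hedged usage sketch: the class that defines run() is not shown in this listing, so OnlineLearningRunner is a hypothetical name, as are the dataset identifiers.

runner = OnlineLearningRunner()
runner.run('prsa_data', 'station_a')  # results are written via save_file() at the end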
Example #7

            # classifier flag
            flag = False
            prsa_flag = False

            attributes_set = construct_correlation(dataset)

            # Creating a data stream
            data = Data(dataset, sub_dataset)
            labels, attributes = data.get_attributes()
            attributes_scheme = AttributeScheme.get_scheme(attributes)
            __numeric_attribute_scheme = attributes_scheme['numeric']

            # Creating a project for saving results
            project = Project('projects/single/{}'.format(dataset),
                              sub_dataset)
            sub_folder_path = folder_create.sub_folder(sub_dataset)

            # Initializing a learner
            learner = Logistic(labels, attributes_scheme['numeric'])
            learner = OnlineAccuracyUpdatedEnsemble(
                labels,
                attributes_scheme['numeric'],
                learner,
                windowSize=__window_size,
                classifierLimit=10)

            # Initializing naive Bayes
            nb_attributes = construct_attribute()
            nb_learner = NaiveBayes([0, 1], nb_attributes)
Example #8

    def sub_thread(self, dataset, d_id, sub_dataset, spilt_inform,
                   folder_create, length):

        global global_count

        # Set variables
        date_time_flag = False
        data_set_id = d_id

        if dataset == 'prsa_data':
            __batch_size = 24 * 3600  # one day (3600 seconds = 1 hour)
        elif dataset == 'movie_data':
            __batch_size = 24 * 7 * 3600  # one week
        else:
            __batch_size = 0

        __instance_count = 0
        __window_size = 500
        __step = 1000
        __start_point = 0
        __count = 0
        __bingo = 0
        __last_unix_time = 0
        __last_warning_status = False
        __last_drift_status = False
        __data_length = Dataset.get_length(dataset, sub_dataset)

        configure = {}
        data_statistics = dict()
        delay = {}
        online = {}

        weights = []
        accuracy = []
        batch_start_time = []
        batch_count = []
        right_count = []
        warning = []
        current_warning_status = {}
        warning_level = []
        warning_level_set = []
        drift = []
        current_drift_status = {}
        predict_corr = []
        hit = []
        last_batch_attributions = None

        detection = True
        drift_status = False

        # classifier flag
        prsa_flag = False

        attributes_set = DistributedOnlineLearning.construct_correlation(
            dataset)

        # Creating a data stream
        data = Data(dataset, sub_dataset)
        labels, attributes = data.get_attributes()
        attributes_scheme = AttributeScheme.get_scheme(attributes)
        __numeric_attribute_scheme = attributes_scheme['numeric']

        # Creating a project for saving results
        project = Project('projects/single/{}'.format(dataset), sub_dataset)
        sub_folder_path = folder_create.sub_folder(sub_dataset)

        # Initializing a learner
        learner = Logistic(labels, attributes_scheme['numeric'])
        learner = OnlineAccuracyUpdatedEnsemble(labels,
                                                attributes_scheme['numeric'],
                                                learner,
                                                windowSize=__window_size,
                                                classifierLimit=10)

        # Initializing a drift detector
        drift_detector = DDM()

        # Initializing an evaluator
        evaluator = EvaluateWithWindowSize(learner, drift_detector, project,
                                           __window_size)

        # train & test
        for x, y, attribute in data.data(batch_size=1):
            if attribute is not None:
                attributes_scheme = AttributeScheme.get_scheme(attributes)
                __numeric_attribute_scheme = attributes_scheme['numeric']
                continue

            instance = x.tolist()[0] + [int(y.tolist()[0][0])]

            # Unix timestamp of each record
            # prsa data
            if dataset == 'prsa_data':
                date_time_flag = True
                date_time = list(map(int, instance[:4]))
                d = datetime.date(date_time[0], date_time[1], date_time[2])
                t = datetime.time(date_time[3])
                datetime_str = str(d) + ' ' + str(t)
                unix_time = int(
                    time.mktime(
                        time.strptime(datetime_str, '%Y-%m-%d %H:%M:%S')))
                if unix_time >= 1363881600:
                    prsa_flag = True
            elif dataset == 'movie_data':
                # movie data
                if instance[-2] > 62091:
                    date_time_flag = True
                    prsa_flag = True
                    date_time = DistributedOnlineLearning.get_date(
                        instance[-2])
                    datetime_str = str(date_time) + ' ' + '00:00:00'
                    unix_time = int(
                        time.mktime(
                            time.strptime(datetime_str, '%Y-%m-%d %H:%M:%S')))

            instance[0:len(instance) - 1] = Normalizer.normalize(
                instance[0:len(instance) - 1], __numeric_attribute_scheme)
            __instance_count += 1

            if __instance_count > __window_size and date_time_flag and prsa_flag:
                if dataset == 'prsa_data':
                    if unix_time == 1363881600:
                        __start_point = unix_time
                        batch_start_time.append(__start_point)
                elif dataset == 'movie_data':
                    if __instance_count == __window_size + 1:
                        __start_point = unix_time
                        batch_start_time.append(__start_point)
                if unix_time - __start_point >= __batch_size:

                    self.con.acquire()  # acquire the lock

                    __start_point = unix_time
                    # for every batch
                    # classifier weights
                    weights.append(learner.currentClassifierWeights.tolist())
                    # attribute correlation record
                    attributes_set, last_batch_attributions = DistributedOnlineLearning.calculate_correlation(
                        predict_corr, last_batch_attributions, attributes_set,
                        spilt_inform)
                    # accuracy (may change; currently the backend-computed drift-detection accuracy)
                    accuracy.append(drift_detector.accuracy)
                    # batch start time
                    batch_start_time.append(__start_point)
                    # number of records in the batch
                    batch_count.append(__count)
                    # currently the number of correct predictions in the batch; to be changed to the number of correctly predicted concept drifts
                    right_count.append(__bingo)
                    # warning level
                    warning_level, warning_level_set, hit = DistributedOnlineLearning.calculate_warning_level(
                        warning_level_set, warning_level, hit)

                    predict_corr = [instance]
                    __count = 1
                    __bingo = 0

                    # newly added naive Bayes step
                    if len(warning_level) > 1:
                        inst = [
                            self.wl_transformer(warning_level[-2]['max']),
                            hit[-1]
                        ]
                        if len(warning_level) > 2:
                            self.nb.set_ready()
                            predicted = self.nb.do_testing(inst)
                            print(data_set_id, predicted)
                            self.nb.do_training(inst)
                        else:
                            self.nb.do_training(inst)
                    global_count += 1

                    if global_count == length:
                        global_count = 0
                        self.con.notifyAll()
                    else:
                        self.con.wait()

                else:
                    __count += 1
                    predict_corr.append(instance)

                warning_level_set.append(drift_detector.risk)

                predicted_value = learner.do_testing(instance)

                prediction_status = evaluator.calculate_accuracy(
                    predicted_value,
                    instance[-1],
                    output_size=__step,
                    output_flag=False)
                if prediction_status:
                    __bingo += 1

                if detection is True:
                    warning_status, drift_status = drift_detector.detect(
                        prediction_status)
                    if warning_status is not __last_warning_status:
                        if warning_status:
                            current_warning_status['start'] = unix_time
                            current_warning_status['max_accuracy'] = [
                                drift_detector.o_s_d_min
                            ]
                            current_warning_status['max_accuracy_time'] = [
                                unix_time
                            ]
                            current_warning_status['backend_accuracy'] = [
                                drift_detector.accuracy
                            ]
                        else:
                            current_warning_status['end'] = __last_unix_time
                            warning.append(current_warning_status)
                            current_warning_status = {}
                    else:
                        if warning_status:
                            current_warning_status['max_accuracy'].append(
                                drift_detector.o_s_d_min)
                            current_warning_status['max_accuracy_time'].append(
                                unix_time)
                            current_warning_status['backend_accuracy'].append(
                                drift_detector.accuracy)
                    if drift_status is not __last_drift_status:
                        if drift_status:
                            current_drift_status['start'] = unix_time
                            current_drift_status['max_accuracy'] = [
                                drift_detector.o_s_d_min
                            ]
                            current_drift_status['max_accuracy_time'] = [
                                unix_time
                            ]
                            current_drift_status['backend_accuracy'] = [
                                drift_detector.accuracy
                            ]
                        else:
                            current_drift_status['end'] = __last_unix_time
                            drift.append(current_drift_status)
                            current_drift_status = {}
                    else:
                        if drift_status:
                            current_drift_status['max_accuracy'].append(
                                drift_detector.o_s_d_min)
                            current_drift_status['max_accuracy_time'].append(
                                unix_time)
                            current_drift_status['backend_accuracy'].append(
                                drift_detector.accuracy)

                    __last_warning_status = warning_status
                    __last_drift_status = drift_status
                    __last_unix_time = unix_time
                else:
                    warning_status = False
                    drift_status = False
                # if 1393401600 - 12*3600 <= unix_time <= 1393401600 + 12*3600:
                #     print("accuracy, S, P:", evaluator.accuracy, drift_detector.S, drift_detector.P)

                if __instance_count == __data_length:  # the last batch may hold only a few records; handle it here

                    self.con.acquire()  # acquire the lock

                    # classifier weights
                    weights.append(learner.currentClassifierWeights.tolist())
                    # attribute correlation record
                    attributes_set, last_batch_attributions = DistributedOnlineLearning.calculate_correlation(
                        predict_corr, last_batch_attributions, attributes_set,
                        spilt_inform)
                    # accuracy (may change; currently the backend-computed drift-detection accuracy)
                    accuracy.append(drift_detector.accuracy)
                    # number of records in the batch
                    batch_count.append(__count)
                    # currently the number of correct predictions in the batch; to be changed to the number of correctly predicted concept drifts
                    right_count.append(__bingo)
                    # warning level
                    warning_level, warning_level_set, hit = DistributedOnlineLearning.calculate_warning_level(
                        warning_level_set, warning_level, hit)

                    # newly added naive Bayes step
                    if len(warning_level) > 1:
                        inst = [
                            self.wl_transformer(warning_level[-2]['max']),
                            hit[-1]
                        ]
                        if len(warning_level) > 2:
                            self.nb.do_testing(inst)
                            self.nb.do_training(inst)
                        else:
                            self.nb.do_training(inst)
                    global_count += 1

                    if global_count == length:
                        global_count = 0
                        if __instance_count == __data_length:
                            # save all the collected data (left unimplemented in the original)
                            pass
                            self.con.notifyAll()
                    else:
                        if __instance_count == __data_length:
                            # save all the collected data (left unimplemented in the original)
                            pass
                        self.con.wait()

                # training
                learner.do_training(instance, drift_status)
            else:
                # training
                learner.do_training(instance, drift_status)

        configure['timeStart'] = batch_start_time[0]
        configure['timeEnd'] = batch_start_time[-1]
        configure['timeUnit'] = __batch_size
        configure['dataNumMax'] = max(batch_count)

        data_statistics['name'] = sub_dataset
        delay['time'] = batch_start_time
        delay['accuracy'] = accuracy
        delay['bingo'] = []
        delay['hit'] = hit
        delay['warning'] = warning
        delay['drift'] = drift
        delay['warningLevel'] = warning_level
        delay['attributes'] = attributes_set

        online['weight'] = weights
        online['time'] = batch_start_time
        online['dataNum'] = batch_count

        data_statistics['delay'] = delay
        data_statistics['online'] = online
Example No. 9
    def sub_thread(self, dataset, d_id, sub_dataset, spilt_inform,
                   folder_create, length):

        global global_count

        self.nb_set[sub_dataset] = NaiveBayes([0, 1], self.attributes)
        self.last_wl_status[sub_dataset] = []
        self.instance_set[sub_dataset] = []
        self.nb_classifier_accuracy[sub_dataset] = dict(all_count=0,
                                                        right_count=0,
                                                        accuracy=[])

        # Set variables
        date_time_flag = False
        data_set_id = d_id

        __batch_size = 0  # batch interval in seconds; must be set to a positive value before the batch logic below divides by it

        __instance_count = 0
        __window_size = 500
        __step = 1000
        __start_point = 0
        __count = 0
        # __bingo = 0
        __last_unix_time = 0
        __last_warning_status = False
        __last_drift_status = False
        __warning_level_max = 0
        __data_length = Dataset.get_length(dataset, sub_dataset)

        lc = []
        bc = []

        configure = {}
        data_statistics = dict()
        delay = {}
        online = {}

        weights = []
        accuracy = []
        batch_start_time = []
        batch_count = []
        # right_count = []
        warning = []
        current_warning_status = {}
        warning_level = []
        warning_level_set = []
        drift = []
        current_drift_status = {}
        predict_corr = []
        hit = []
        last_batch_attributions = None

        detection = True
        drift_status = False

        # classifier flag
        prsa_flag = False

        attributes_set = DistributedOnlineLearning.construct_correlation(
            dataset)

        # Creating a data stream
        data = Data(dataset, sub_dataset)
        labels, attributes = data.get_attributes()
        attributes_scheme = AttributeScheme.get_scheme(attributes)
        __numeric_attribute_scheme = attributes_scheme['numeric']

        # Creating a save content
        project = Project('projects/single/{}'.format(dataset), sub_dataset)
        sub_folder_path = folder_create.sub_folder(sub_dataset)

        # Initializing a learner: a Logistic base model wrapped in an
        # OnlineAccuracyUpdatedEnsemble
        base_learner = Logistic(labels, attributes_scheme['numeric'])
        learner = OnlineAccuracyUpdatedEnsemble(labels,
                                                attributes_scheme['numeric'],
                                                base_learner,
                                                windowSize=__window_size,
                                                classifierLimit=10)

        # Initializing a drift detector
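        # DDM raises a warning once p + s >= p_min + 2*s_min and signals a drift at p_min + 3*s_min (Gama et al., 2004)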
        drift_detector = DDM()

        # Initializing an evaluator
        evaluator = EvaluateWithWindowSize(learner, drift_detector, project,
                                           __window_size)

        # train & test
        for x, y, attribute in data.data(batch_size=1):
            if attribute is not None:
                attributes_scheme = AttributeScheme.get_scheme(attributes)
                __numeric_attribute_scheme = attributes_scheme['numeric']
                continue

            instance = x.tolist()[0] + [int(y.tolist()[0][0])]

            # Unix timestamp for each record
            # prsa data
            if dataset == 'prsa_data':
                date_time_flag = True
                date_time = list(map(int, instance[:4]))
                d = datetime.date(date_time[0], date_time[1], date_time[2])
                tt = datetime.time(date_time[3])
                datetime_str = str(d) + ' ' + str(tt)
                unix_time = int(
                    time.mktime(
                        time.strptime(datetime_str, '%Y-%m-%d %H:%M:%S')))
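                # assumption: 1363881600 is 2013-03-22 00:00 in UTC+8; mktime interprets the timestamp in the host's local time zone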
                if unix_time >= 1363881600:
                    prsa_flag = True
            elif dataset == 'movie_data':
                # movie data
                if instance[-2] > 62091:
                    date_time_flag = True
                    prsa_flag = True
                    date_time = DistributedOnlineLearning.get_date(
                        instance[-2])
                    datetime_str = str(date_time) + ' ' + '00:00:00'
                    unix_time = int(
                        time.mktime(
                            time.strptime(datetime_str, '%Y-%m-%d %H:%M:%S')))

            instance[:-1] = Normalizer.normalize(instance[:-1],
                                                 __numeric_attribute_scheme)
            __instance_count += 1

            if __instance_count > __window_size and date_time_flag and prsa_flag:
                if dataset == 'prsa_data':
                    if unix_time == 1363881600:
                        __start_point = unix_time
                        batch_start_time.append(__start_point)
                elif dataset == 'movie_data':
                    if __instance_count == __window_size + 1:
                        __start_point = unix_time
                        batch_start_time.append(__start_point)

                difference_value = unix_time - __start_point
                if difference_value >= __batch_size:
                    self.con.acquire()  # acquire the lock
                    batch_interval = int(difference_value / __batch_size)
                    for cc in range(batch_interval):
                        if cc == 0:
                            r_f = False
                        else:
                            r_f = True
                        __start_point += __batch_size
                        # for every batch
                        # classifier weights
                        weights.append(
                            learner.currentClassifierWeights.tolist())
                        # attribute records
                        attributes_set, last_batch_attributions, lc, bc = self.calculate_correlation(
                            predict_corr,
                            last_batch_attributions,
                            attributes_set,
                            spilt_inform,
                            lc,
                            bc,
                            repeat_flag=r_f)
                        # accuracy (may change; currently the drift accuracy computed by the backend)
                        accuracy.append(drift_detector.accuracy)
                        # batch start time
                        batch_start_time.append(__start_point)
                        # number of records in the batch
                        batch_count.append(__count)
                        # warning level
                        warning_level, warning_level_set, hit, __warning_level_max = self.calculate_warning_level(
                            warning_level_set,
                            warning_level,
                            hit,
                            __warning_level_max,
                            repeated_flag=r_f)
                        # save the current status of each data source
                        self.last_wl_status[sub_dataset] = [
                            self.wl_transformer(warning_level[-1]['max']),
                            hit[-1]
                        ]
                        __count = 0

                        global_count += 1

                        if global_count == length:
                            global_count = 0
                            # train and test each model's Naive Bayes classifier
                            if len(warning_level) > 1:
                                d_s_n = self.nb_set.keys()
                                # each source's instance: the other sources' warning
                                # levels plus this source's latest hit flag as label
                                for data_set_name in d_s_n:
                                    self.instance_set[data_set_name] = [
                                        self.last_wl_status[value][0]
                                        for value in d_s_n
                                        if value != data_set_name
                                    ] + [self.last_wl_status[data_set_name][-1]]
                                if len(warning_level) > 2:
                                    # testing
                                    for temple_name in d_s_n:
                                        self.nb_set[temple_name].set_ready()
                                        predict = self.nb_set[temple_name].do_testing(
                                            self.instance_set[temple_name])
                                        stats = self.nb_classifier_accuracy[temple_name]
                                        if predict == self.instance_set[temple_name][-1]:
                                            stats['right_count'] += 1
                                        stats['all_count'] += 1
                                        stats['accuracy'].append(
                                            round(stats['right_count'] /
                                                  stats['all_count'], 4))
                                    # training
                                    for temple_name in d_s_n:
                                        self.nb_set[temple_name].do_training(
                                            self.instance_set[temple_name],
                                            drift_status)
                                else:
                                    for temple_name in d_s_n:
                                        self.nb_set[temple_name].do_training(
                                            self.instance_set[temple_name],
                                            drift_status)
                            # print(self.nb_classifier_accuracy)
                            self.con.notifyAll()
                        else:
                            self.con.wait()

                    predict_corr = [instance]
                    __count = 1
                else:
                    __count += 1
                    predict_corr.append(instance)

                warning_level_set.append(drift_detector.risk)

                predicted_value = learner.do_testing(instance)

                prediction_status = evaluator.calculate_accuracy(
                    predicted_value,
                    instance[-1],
                    output_size=__step,
                    output_flag=False)
                # if prediction_status:
                #     __bingo += 1

                if detection is True:
                    warning_status, drift_status = drift_detector.detect(
                        prediction_status)
                    if warning_status is not __last_warning_status:
                        if warning_status:
                            current_warning_status['start'] = unix_time
                            current_warning_status['max_accuracy'] = [
                                drift_detector.o_s_d_min
                            ]
                            current_warning_status['max_accuracy_time'] = [
                                unix_time
                            ]
                            current_warning_status['backend_accuracy'] = [
                                drift_detector.accuracy
                            ]
                        else:
                            current_warning_status['end'] = __last_unix_time
                            warning.append(current_warning_status)
                            current_warning_status = {}
                    else:
                        if warning_status:
                            current_warning_status['max_accuracy'].append(
                                drift_detector.o_s_d_min)
                            current_warning_status['max_accuracy_time'].append(
                                unix_time)
                            current_warning_status['backend_accuracy'].append(
                                drift_detector.accuracy)
                    if drift_status is not __last_drift_status:
                        if drift_status:
                            current_drift_status['start'] = unix_time
                            current_drift_status['max_accuracy'] = [
                                drift_detector.o_s_d_min
                            ]
                            current_drift_status['max_accuracy_time'] = [
                                unix_time
                            ]
                            current_drift_status['backend_accuracy'] = [
                                drift_detector.accuracy
                            ]
                        else:
                            current_drift_status['end'] = __last_unix_time
                            drift.append(current_drift_status)
                            current_drift_status = {}
                    else:
                        if drift_status:
                            current_drift_status['max_accuracy'].append(
                                drift_detector.o_s_d_min)
                            current_drift_status['max_accuracy_time'].append(
                                unix_time)
                            current_drift_status['backend_accuracy'].append(
                                drift_detector.accuracy)

                    __last_warning_status = warning_status
                    __last_drift_status = drift_status
                    __last_unix_time = unix_time
                else:
                    # warning_status = False
                    drift_status = False

                if __instance_count == __data_length:  # the last batch may hold only a few records and needs special handling

                    self.con.acquire()  # acquire the lock

                    # classifier weights
                    weights.append(learner.currentClassifierWeights.tolist())
                    # attribute records
                    attributes_set, last_batch_attributions, lc, bc = self.calculate_correlation(
                        predict_corr,
                        last_batch_attributions,
                        attributes_set,
                        spilt_inform,
                        lc,
                        bc,
                        repeat_flag=False)
                    # accuracy (may change; currently the drift accuracy computed by the backend)
                    accuracy.append(drift_detector.accuracy)
                    # number of records in the batch
                    batch_count.append(__count)
                    # # currently the number of correct predictions in the batch;
                    # # to be replaced by the number of correctly predicted concept drifts
                    # right_count.append(__bingo)
                    # warning level
                    warning_level, warning_level_set, hit, __warning_level_max = self.calculate_warning_level(
                        warning_level_set,
                        warning_level,
                        hit,
                        __warning_level_max,
                        repeated_flag=False)

                    global_count += 1

                    if global_count == length:
                        global_count = 0
                        # train and test each model's Naive Bayes classifier
                        if len(warning_level) > 1:
                            d_s_n = self.nb_set.keys()
                            for data_set_name in d_s_n:
                                self.instance_set[data_set_name] = [
                                    self.last_wl_status[value][0]
                                    for value in d_s_n
                                    if value != data_set_name
                                ] + [self.last_wl_status[data_set_name][-1]]
                            if len(warning_level) > 2:
                                # testing
                                for temple_name in d_s_n:
                                    predict = self.nb_set[temple_name].do_testing(
                                        self.instance_set[temple_name])
                                    stats = self.nb_classifier_accuracy[temple_name]
                                    if predict == self.instance_set[temple_name][-1]:
                                        stats['right_count'] += 1
                                    stats['all_count'] += 1
                                    stats['accuracy'].append(
                                        round(stats['right_count'] /
                                              stats['all_count'], 4))
                                # training
                                for temple_name in d_s_n:
                                    self.nb_set[temple_name].do_training(
                                        self.instance_set[temple_name],
                                        drift_status)
                            else:
                                for temple_name in d_s_n:
                                    self.nb_set[temple_name].do_training(
                                        self.instance_set[temple_name],
                                        drift_status)
                        # save the state of the last data source
                        configure['timeStart'] = batch_start_time[0]
                        configure['timeEnd'] = batch_start_time[-1]
                        configure['timeUnit'] = __batch_size
                        configure['dataNumMax'] = max(batch_count)
                        configure['warningLevelMax'] = __warning_level_max

                        data_statistics['name'] = sub_dataset
                        delay['time'] = batch_start_time
                        delay['accuracy'] = accuracy
                        delay['bingo'] = []
                        delay['hit'] = hit[1:]
                        delay['warning'] = warning
                        delay['drift'] = drift
                        delay['warningLevel'] = warning_level
                        delay['attributes'] = attributes_set

                        online['weight'] = weights
                        online['time'] = batch_start_time
                        online['dataNum'] = batch_count

                        data_statistics['delay'] = delay
                        data_statistics['online'] = online

                        save_path = sub_folder_path + '/'
                        self.save_file(configure,
                                       save_path,
                                       type_name='configure')
                        self.save_file(data_statistics,
                                       save_path,
                                       type_name=None)

                        # all data has been trained; the main process can now be terminated
                        print(
                            'All data has been trained. Please finish the main process manually!'
                        )
                        Zip.plot_multi(self.nb_classifier_accuracy)
                        Zip(folder_create.get_path())
                        self.con.notifyAll()
                    else:
                        configure['timeStart'] = batch_start_time[0]
                        configure['timeEnd'] = batch_start_time[-1]
                        configure['timeUnit'] = __batch_size
                        configure['dataNumMax'] = max(batch_count)

                        data_statistics['name'] = sub_dataset
                        delay['time'] = batch_start_time
                        delay['accuracy'] = accuracy
                        delay['bingo'] = []
                        delay['hit'] = hit[1:]
                        delay['warning'] = warning
                        delay['drift'] = drift
                        delay['warningLevel'] = warning_level
                        delay['attributes'] = attributes_set

                        online['weight'] = weights
                        online['time'] = batch_start_time
                        online['dataNum'] = batch_count

                        data_statistics['delay'] = delay
                        data_statistics['online'] = online

                        save_path = sub_folder_path + '/'
                        self.save_file(configure,
                                       save_path,
                                       type_name='configure')
                        self.save_file(data_statistics,
                                       save_path,
                                       type_name=None)

                        self.con.wait()

                # training
                learner.do_training(instance, drift_status)
            else:
                # training
                learner.do_training(instance, drift_status)
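
For reference, the con.acquire() / notifyAll() / wait() calls above implement a per-batch barrier across the worker threads: every thread increments the shared global_count under the lock, and the last arrival in a round resets the counter and wakes the rest. A minimal self-contained sketch of the same pattern (names are hypothetical):

import threading

con = threading.Condition()
global_count = 0

def on_batch_done(length):
    """Block until all `length` workers have finished the current batch."""
    global global_count
    with con:  # pairs acquire() with a guaranteed release
        global_count += 1
        if global_count == length:
            global_count = 0   # last thread of the round resets the counter...
            con.notify_all()   # ...and wakes every waiting worker
        else:
            con.wait()         # release the lock and sleep until notified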
Example No. 10
from data_structures.attribute_scheme import AttributeScheme
from classifier.__init__ import *
from drift_detection.__init__ import *
from filters.project_creator import Project
from streams.readers.arff_reader import ARFFReader
from tasks.__init__ import *

project_name = "airlines"

project = Project("projects", f"{project_name}")

labels, attributes, stream_records = ARFFReader.read(
    f"data_streams/{project_name}.arff")
attributes_scheme = AttributeScheme.get_scheme(attributes)

actual_drift_points = [20000, 40000, 60000, 80000]
drift_acceptance_interval = 250

# 3. Creating classifier / drift-detector pairs
pairs = [
    (HoeffdingTree(labels, attributes_scheme['nominal']), DDM(), "ddm"),
    (NaiveBayes(labels, attributes_scheme['nominal']), DDM(), "ddm"),
    (HoeffdingTree(labels,
                   attributes_scheme['nominal']), HDDM_A_test(), "hddm_a"),
    (HoeffdingTree(labels,
                   attributes_scheme['nominal']), HDDM_W_test(), "hddm_w"),
    (HoeffdingTree(labels, attributes_scheme['nominal']), EDDM(), "eddm"),
    (HoeffdingTree(labels, attributes_scheme['nominal']),
     ADWINChangeDetector(), "adwin")
]
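
The snippet stops after building the pairs; presumably a prequential evaluation over the stream follows. A minimal sketch, assuming the PrequentialMultiPairs interface used by the framework's other scripts (w_vec and the pair layout are assumptions):

from copy import copy

w_vec = [1, 1, 1, 1, 1, 1]  # assumed weights for the evaluation measures

prequential = PrequentialMultiPairs(
    [[classifier, copy(detector)] for classifier, detector, _ in pairs],
    attributes, attributes_scheme,
    actual_drift_points, drift_acceptance_interval,
    w_vec, project, legend_param=False)
prequential.run(stream_records, 1)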
from classifier.__init__ import *
from drift_detection.__init__ import *
from filters.project_creator import Project
from streams.readers.arff_reader import ARFFReader
from tasks.__init__ import *
from regression.__init__ import *

# 1. Creating a project
project = Project("projects/single", "expo")

# 2. Loading an arff file
labels, attributes, stream_records = ARFFReader.read(
    "data_streams/_synthetic/EXPO/EXPO.arff")
# attributes_scheme = AttributeScheme.get_scheme(attributes)

# 3. Initializing a Learner
# learner = NaiveBayes(labels, attributes_scheme['nominal'])
learner = Expo()

# 4. Initializing a drift detector
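# FHDDMS slides stacked short and long windows over the prediction results and flags drift via the Hoeffding bound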
detector = FHDDMS(n=100)
# actual_drift_points = [20000, 40000, 60000, 80000]
actual_drift_points = [500, 1000, 1500, 2000]
drift_acceptance_interval = 100

# 5. Creating a Prequential Evaluation Process
prequential = PrequentialRegressionDriftEvaluator(learner, detector,
                                                  actual_drift_points,
                                                  drift_acceptance_interval,
                                                  project)
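
The excerpt ends before the evaluation is started; it presumably finishes by replaying the stream, assuming PrequentialRegressionDriftEvaluator shares the run interface of the framework's other prequential tasks:

prequential.run(stream_records, 1)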