for dataset in Dataset.DATASET:
    for sub_dataset in Dataset.get_sub_dataset(dataset):
        # Set variables
        __instance_count = 0
        __window_size = 500
        __step = 1000
        detection = True
        warning_status = False
        drift_status = False
        # classifier flag
        flag = False

        # Creating a data stream
        data = Data(dataset, sub_dataset)
        labels, attributes = data.get_attributes()
        attributes_scheme = AttributeScheme.get_scheme(attributes)
        __nominal_attribute_scheme = attributes_scheme['nominal']
        __numeric_attribute_scheme = attributes_scheme['numeric']

        # Initializing a learner
        learner = Logistic(labels, attributes_scheme['numeric'])
        learner_copy = cp.deepcopy(learner)

        # Initializing classifier-detector pairs
        pairs = [[
            OnlineAccuracyUpdatedEnsemble(labels,
                                          attributes_scheme['numeric'],
                                          learner_copy,
                                          windowSize=__window_size,
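# The fragment above deep-copies the base learner before handing it to each
# classifier-detector pair. A minimal, runnable sketch of why cp.deepcopy is
# used (the dict below is a hypothetical stand-in for a trained Logistic
# model, not project code):
import copy as cp

base_learner = {'weights': [0.0, 0.0, 0.0]}
pair_learners = [cp.deepcopy(base_learner) for _ in range(3)]
pair_learners[0]['weights'][0] = 1.0           # training one pair's copy...
assert base_learner['weights'][0] == 0.0       # ...does not leak into the base
assert pair_learners[1]['weights'][0] == 0.0   # ...or into sibling pairs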
def sub_thread(self, dataset, sub_dataset, spilt_inform, folder_create,
               length):
    global global_count
    global naive_bayes_batch_count

    self.nb_set[sub_dataset] = NaiveBayes([0, 1], self.attributes)
    self.last_wl_status[sub_dataset] = dict(r_l=0, hit=[])
    self.instance_set[sub_dataset] = []
    self.nb_classifier_accuracy[sub_dataset] = dict(all_count=0,
                                                    right_count=0,
                                                    accuracy=[])
    self.nb_batch_count[sub_dataset] = 0
    self.nb_drift_prob[sub_dataset] = dict(prob=[], ground_truth=[])
    self.plot_risk_level[sub_dataset] = 0
    self.data_statistics[sub_dataset] = dict(
        name=sub_dataset,
        delay=dict(time=[],
                   accuracy=[],
                   nb_prob=[],
                   bingo=[],
                   hit=[],
                   warning=[],
                   drift=[],
                   warningLevel=[],
                   attributes=self.construct_correlation(dataset),
                   batch_delay=2),
        online=dict(weight=[], time=[], dataNum=[]))
    self.configure[sub_dataset] = {}
    self.warning_level_max[sub_dataset] = 0

    # Set variables
    date_time_flag = False
    __batch_size = 24 * 3600
    __instance_count = 0
    __window_size = Dataset.get_online_learning_batch_interval(
        dataset, sub_dataset)
    __step = 1000
    __start_point = 0
    __count = 0
    __last_unix_time = 0
    __last_warning_status = False
    __last_drift_status = False
    __data_length = Dataset.get_length(dataset, sub_dataset)
    __detect_interval = Dataset.get_detect_batch_interval(
        dataset, sub_dataset)
    lc = []
    bc = []
    current_warning_status = {}
    warning_level_set = []
    current_drift_status = {}
    predict_corr = []
    last_batch_attributions = None
    detection = True
    drift_status = False
    # classifier flag
    prsa_flag = False

    # Creating a data stream
    data = Data(dataset, sub_dataset)
    labels, attributes = data.get_attributes()
    attributes_scheme = AttributeScheme.get_scheme(attributes)
    __numeric_attribute_scheme = attributes_scheme['numeric']

    # Creating save content
    project = Project('./projects/single/{}'.format(dataset), sub_dataset)
    self.sub_file_path[sub_dataset] = folder_create.sub_folder(sub_dataset)

    # Initializing a learner
    learner = Logistic(labels, attributes_scheme['numeric'])
    learner = OnlineAccuracyUpdatedEnsemble(labels,
                                            attributes_scheme['numeric'],
                                            learner,
                                            windowSize=__window_size,
                                            classifierLimit=10)

    # Initializing a drift detector
    drift_detector = DDM(min_instance=__detect_interval)

    # Initializing an evaluator
    evaluator = EvaluateWithWindowSize(learner, drift_detector, project,
                                       __window_size)

    # train & test
    for x, y, attribute in data.data(batch_size=1):
        if attribute is not None:
            attributes_scheme = AttributeScheme.get_scheme(attributes)
            __numeric_attribute_scheme = attributes_scheme['numeric']
            continue

        instance = x.tolist()[0] + [int(y.tolist()[0][0])]

        # Unix timestamp of each record
        # prsa data
        if dataset == 'prsa_data':
            date_time_flag = True
            date_time = list(map(int, instance[:4]))
            d = datetime.date(date_time[0], date_time[1], date_time[2])
            tt = datetime.time(date_time[3])
            datetime_str = str(d) + ' ' + str(tt)
            unix_time = int(
                time.mktime(
                    time.strptime(datetime_str, '%Y-%m-%d %H:%M:%S')))
            if unix_time >= 1363881600:
                prsa_flag = True
        elif dataset == 'movie_data':
            # movie data
            if instance[-2] > 62091:
                date_time_flag = True
                prsa_flag = True
                date_time = DistributedOnlineLearning.get_date(instance[-2])
                datetime_str = str(date_time) + ' ' + '00:00:00'
                unix_time = int(
                    time.mktime(
                        time.strptime(datetime_str, '%Y-%m-%d %H:%M:%S')))
            instance.pop(-2)  # drop the time attribute
        elif dataset == 'netease_data':
            date_time_flag = True
            prsa_flag = True
            unix_time = int(instance[0])
            instance.pop(0)  # drop the time attribute

        instance[0:len(instance) - 1] = Normalizer.normalize(
            instance[0:len(instance) - 1], __numeric_attribute_scheme)
        __instance_count += 1
        if __instance_count % 10000 == 0 or __instance_count == __data_length:
            percentage = (__instance_count / __data_length) * 100
            print(sub_dataset, "%0.2f" % percentage +
                  "% of instances are prequentially processed!")

        if __instance_count > __window_size and date_time_flag and prsa_flag:
            if dataset == 'prsa_data':
                if unix_time == 1363881600:
                    print(__instance_count)
                    __start_point = unix_time
                    self.data_statistics[sub_dataset]['delay']['time'].append(
                        __start_point)
            elif dataset == 'movie_data':
                if __instance_count == __window_size + 1:
                    print(__instance_count)
                    __start_point = unix_time
                    self.data_statistics[sub_dataset]['delay']['time'].append(
                        __start_point)
            elif dataset == 'netease_data':
                if __instance_count == __window_size + 1:
                    print(__instance_count)
                    __start_point = unix_time
                    self.data_statistics[sub_dataset]['delay']['time'].append(
                        __start_point)

            difference_value = unix_time - __start_point
            if difference_value >= __batch_size and __instance_count != __data_length:
                batch_interval = int(difference_value / __batch_size)
                for cc in range(batch_interval):
                    # repeat_flag is False only for the first elapsed batch
                    if cc == 0:
                        r_f = False
                    else:
                        r_f = True
                    self.con.acquire()  # acquire the lock
                    __start_point += __batch_size
                    # for every batch
                    # classifier weights
                    self.data_statistics[sub_dataset]['online'][
                        'weight'].append(
                            learner.currentClassifierWeights.tolist())
                    # attribute record
                    (self.data_statistics[sub_dataset]['delay']['attributes'],
                     last_batch_attributions, lc,
                     bc) = self.calculate_correlation(
                         predict_corr, last_batch_attributions,
                         self.data_statistics[sub_dataset]['delay']
                         ['attributes'], spilt_inform, lc, bc,
                         repeat_flag=r_f)
                    # accuracy (may change; currently the backend computes
                    # the drift accuracy)
                    self.data_statistics[sub_dataset]['delay'][
                        'accuracy'].append(drift_detector.accuracy)
                    # batch start time
                    self.data_statistics[sub_dataset]['delay']['time'].append(
                        __start_point)
                    # number of instances in the batch
                    self.data_statistics[sub_dataset]['online'][
                        'dataNum'].append(__count)
                    # warning level
                    (self.data_statistics[sub_dataset]['delay']
                     ['warningLevel'], warning_level_set,
                     self.data_statistics[sub_dataset]['delay']['hit'],
                     self.warning_level_max[
                         sub_dataset]) = self.calculate_warning_level(
                             warning_level_set,
                             self.data_statistics[sub_dataset]['delay']
                             ['warningLevel'],
                             self.data_statistics[sub_dataset]['delay']
                             ['hit'],
                             self.warning_level_max[sub_dataset],
                             repeated_flag=r_f)
                    __count = 0
                    # keep only the last three hit values of this data source
                    if len(self.last_wl_status[sub_dataset]['hit']) > 2:
                        self.last_wl_status[sub_dataset]['hit'].pop(0)
                    self.last_wl_status[sub_dataset]['hit'].append(
                        self.data_statistics[sub_dataset]['delay']['hit'][-1])
                    global_count += 1
                    if global_count == length:
                        global_count = 0
                        # train and test the Naive Bayes of every data source
                        d_s_n = self.nb_set.keys()
                        if len(self.data_statistics[sub_dataset]['delay']
                               ['warningLevel']) > 2:
                            # features: the other sources' last warning
                            # levels plus this source's best recent hit
                            for data_set_name in d_s_n:
                                self.instance_set[data_set_name] = [
                                    self.wl_transformer(
                                        self.data_statistics[value]['delay']
                                        ['warningLevel'][-2]['max'])
                                    for value in d_s_n
                                    if value != data_set_name
                                ] + [max(self.last_wl_status[data_set_name]
                                         ['hit'])]
                            if len(self.data_statistics[sub_dataset]['delay']
                                   ['warningLevel']) > 3:
                                # testing
                                for temple_name in d_s_n:
                                    self.nb_set[temple_name].set_ready()
                                    predict = self.nb_set[
                                        temple_name].do_testing(
                                            self.instance_set[temple_name])
                                    self.data_statistics[temple_name][
                                        'delay']['nb_prob'].append(
                                            self.nb_set[temple_name]
                                            .drift_prob)
                                    self.nb_drift_prob[temple_name][
                                        'prob'].append(
                                            self.nb_set[temple_name]
                                            .drift_prob)
                                    self.nb_drift_prob[temple_name][
                                        'ground_truth'].append(
                                            self.data_statistics[temple_name]
                                            ['delay']['warningLevel'][-2]
                                            ['max'])
                                    # alternative ground truth: the maximum
                                    # drift level of the last three batches
                                    if predict == self.instance_set[
                                            temple_name][-1]:
                                        self.nb_classifier_accuracy[
                                            temple_name]['right_count'] += 1
                                    self.nb_classifier_accuracy[temple_name][
                                        'all_count'] += 1
                                    self.nb_classifier_accuracy[temple_name][
                                        'accuracy'].append(
                                            round(
                                                self.nb_classifier_accuracy
                                                [temple_name]['right_count'] /
                                                self.nb_classifier_accuracy
                                                [temple_name]['all_count'],
                                                4))
                                # training
                                for temple_name in d_s_n:
                                    self.nb_set[temple_name].do_training(
                                        self.instance_set[temple_name],
                                        drift_status)
                            else:
                                for temple_name in d_s_n:
                                    self.nb_set[temple_name].do_training(
                                        self.instance_set[temple_name],
                                        drift_status)
                        self.con.notifyAll()
                    else:
                        self.con.wait()
                predict_corr = [instance]
                __count = 1
            else:
                __count += 1
                predict_corr.append(instance)

            # test on every instance
            predicted_value = learner.do_testing(instance)
            prediction_status = evaluator.calculate_accuracy(
                predicted_value, instance[-1], output_size=__step,
                output_flag=False)
            if detection is True:
                warning_status, drift_status = drift_detector.detect(
                    prediction_status)
                if warning_status is not __last_warning_status:
                    if warning_status:
                        current_warning_status['start'] = unix_time
                        current_warning_status['max_accuracy'] = [
                            drift_detector.o_s_d_min
                        ]
                        current_warning_status['max_accuracy_time'] = [
                            unix_time
                        ]
                        current_warning_status['backend_accuracy'] = [
                            drift_detector.accuracy
                        ]
                    else:
                        current_warning_status['end'] = __last_unix_time
                        self.data_statistics[sub_dataset]['delay'][
                            'warning'].append(current_warning_status)
                        current_warning_status = {}
                else:
                    if warning_status:
                        current_warning_status['max_accuracy'].append(
                            drift_detector.o_s_d_min)
                        current_warning_status['max_accuracy_time'].append(
                            unix_time)
                        current_warning_status['backend_accuracy'].append(
                            drift_detector.accuracy)
                if drift_status is not __last_drift_status:
                    if drift_status:
                        current_drift_status['start'] = unix_time
                        current_drift_status['max_accuracy'] = [
                            drift_detector.o_s_d_min
                        ]
                        current_drift_status['max_accuracy_time'] = [
                            unix_time
                        ]
                        current_drift_status['backend_accuracy'] = [
                            drift_detector.accuracy
                        ]
                    else:
                        current_drift_status['end'] = __last_unix_time
                        self.data_statistics[sub_dataset]['delay'][
                            'drift'].append(current_drift_status)
                        current_drift_status = {}
                else:
                    if drift_status:
                        current_drift_status['max_accuracy'].append(
                            drift_detector.o_s_d_min)
                        current_drift_status['max_accuracy_time'].append(
                            unix_time)
                        current_drift_status['backend_accuracy'].append(
                            drift_detector.accuracy)
                __last_warning_status = warning_status
                __last_drift_status = drift_status
                __last_unix_time = unix_time
            else:
                # warning_status = False
                drift_status = False

            # record the warning level after DDM detection
            warning_level_set.append(drift_detector.risk)

            if __instance_count == __data_length:
                # the last batch may hold only a few instances; close it here
                print(sub_dataset, global_count)
                self.con.acquire()  # acquire the lock
                # classifier weights
                self.data_statistics[sub_dataset]['online']['weight'].append(
                    learner.currentClassifierWeights.tolist())
                # attribute record
                (self.data_statistics[sub_dataset]['delay']['attributes'],
                 last_batch_attributions, lc,
                 bc) = self.calculate_correlation(
                     predict_corr, last_batch_attributions,
                     self.data_statistics[sub_dataset]['delay']['attributes'],
                     spilt_inform, lc, bc, repeat_flag=False)
                # accuracy (may change; currently the backend computes the
                # drift accuracy)
                self.data_statistics[sub_dataset]['delay']['accuracy'].append(
                    drift_detector.accuracy)
                # number of instances in the batch
                self.data_statistics[sub_dataset]['online']['dataNum'].append(
                    __count)
                # warning level
                (self.data_statistics[sub_dataset]['delay']['warningLevel'],
                 warning_level_set,
                 self.data_statistics[sub_dataset]['delay']['hit'],
                 self.warning_level_max[
                     sub_dataset]) = self.calculate_warning_level(
                         warning_level_set,
                         self.data_statistics[sub_dataset]['delay']
                         ['warningLevel'],
                         self.data_statistics[sub_dataset]['delay']['hit'],
                         self.warning_level_max[sub_dataset],
                         repeated_flag=False)
                self.last_wl_status[sub_dataset]['r_l'] = self.wl_transformer(
                    self.data_statistics[sub_dataset]['delay']
                    ['warningLevel'][-1]['max'])
                # keep only the last three hit values of this data source
                if len(self.last_wl_status[sub_dataset]['hit']) > 2:
                    self.last_wl_status[sub_dataset]['hit'].pop(0)
                self.last_wl_status[sub_dataset]['hit'].append(
                    self.data_statistics[sub_dataset]['delay']['hit'][-1])
                global_count += 1
                # plot the drift probability
                # Zip.plot_multi_1(self.nb_drift_prob[sub_dataset], sub_dataset)
                if global_count == length:
                    global_count = 0
                    # train and test the Naive Bayes of every data source
                    d_s_n = self.nb_set.keys()
                    for data_set_name in d_s_n:
                        self.instance_set[data_set_name] = [
                            self.wl_transformer(
                                self.data_statistics[value]['delay']
                                ['warningLevel'][-2]['max'])
                            for value in d_s_n if value != data_set_name
                        ] + [max(self.last_wl_status[data_set_name]['hit'])]
                    # testing
                    for temple_name in d_s_n:
                        self.nb_set[temple_name].set_ready()
                        predict = self.nb_set[temple_name].do_testing(
                            self.instance_set[temple_name])
                        self.data_statistics[temple_name]['delay'][
                            'nb_prob'].append(
                                self.nb_set[temple_name].drift_prob)
                        self.nb_drift_prob[temple_name]['prob'].append(
                            self.nb_set[temple_name].drift_prob)
                        self.nb_drift_prob[temple_name][
                            'ground_truth'].append(
                                self.data_statistics[temple_name]['delay']
                                ['warningLevel'][-2]['max'])
                        # alternative ground truth: the maximum drift level
                        # of the last three batches
                        if predict == self.instance_set[temple_name][-1]:
                            self.nb_classifier_accuracy[temple_name][
                                'right_count'] += 1
                        self.nb_classifier_accuracy[temple_name][
                            'all_count'] += 1
                        self.nb_classifier_accuracy[temple_name][
                            'accuracy'].append(
                                round(
                                    self.nb_classifier_accuracy[temple_name]
                                    ['right_count'] /
                                    self.nb_classifier_accuracy[temple_name]
                                    ['all_count'], 4))
                    # training
                    for temple_name in d_s_n:
                        self.nb_set[temple_name].do_training(
                            self.instance_set[temple_name], drift_status)

                    # Save the state of every data source: concept drift
                    # detection results, Bayesian drift probabilities, and
                    # the configure dict
                    for key_name in self.data_statistics.keys():
                        self.configure[key_name]['timeStart'] = \
                            self.data_statistics[key_name]['delay']['time'][0]
                        self.configure[key_name]['timeEnd'] = \
                            self.data_statistics[key_name]['delay']['time'][-1]
                        self.configure[key_name]['timeUnit'] = __batch_size
                        self.configure[key_name]['dataNumMax'] = max(
                            self.data_statistics[key_name]['online']
                            ['dataNum'])
                        self.configure[key_name]['warningLevelMax'] = \
                            self.warning_level_max[key_name]
                        self.configure[key_name]['warningLevel'] = [
                            [0, 2], [2, 3], [3, 10000]
                        ]
                        self.data_statistics[key_name]['delay']['hit'] = \
                            self.data_statistics[key_name]['delay']['hit'][:-1]
                    self.save_file_1(self.configure, self.sub_file_path,
                                     type_name='configure')
                    self.save_file_1(self.data_statistics, self.sub_file_path,
                                     type_name=None)
                    self.save_file(self.nb_drift_prob,
                                   folder_create.get_path(),
                                   type_name='experiment_with_the_figure')
                    Zip.plot_multi(self.nb_classifier_accuracy)
                    Zip(folder_create.get_path())
                    # all data sources are finished; the main process can be
                    # stopped manually
                    print('All data has been trained. '
                          'Please finish the main process manually!')
                    self.con.notifyAll()
                else:
                    self.con.wait()
            # training
            learner.do_training(instance, drift_status)
        else:
            # training
            learner.do_training(instance, drift_status)
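# The detector bookkeeping above turns a per-instance boolean status stream
# (warning or drift) into timestamped [start, end] intervals by watching for
# status edges against the previous instance. A minimal, self-contained
# sketch of that edge-detection pattern (the function name and toy data are
# illustrative, not the project's API):
def status_to_intervals(statuses, timestamps):
    """Collect {'start', 'end'} dicts for each maximal run of True statuses."""
    intervals, current = [], {}
    last_status, last_time = False, None
    for status, now in zip(statuses, timestamps):
        if status is not last_status:        # a status edge
            if status:
                current['start'] = now       # rising edge opens an interval
            else:
                current['end'] = last_time   # falling edge closes it
                intervals.append(current)
                current = {}
        last_status, last_time = status, now
    return intervals


# Example: two warning episodes in a ten-step stream.
print(status_to_intervals(
    [False, True, True, False, False, True, True, True, False, False],
    list(range(10))))
# -> [{'start': 1, 'end': 2}, {'start': 5, 'end': 7}]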
def sub_thread(self, dataset, d_id, sub_dataset, spilt_inform, folder_create,
               length):
    global global_count

    # Set variables
    date_time_flag = False
    data_set_id = d_id
    if dataset == 'prsa_data':
        __batch_size = 24 * 3600  # 3600 seconds represent 1 hour
    elif dataset == 'movie_data':
        __batch_size = 24 * 7 * 3600
    else:
        __batch_size = 0
    __instance_count = 0
    __window_size = 500
    __step = 1000
    __start_point = 0
    __count = 0
    __bingo = 0
    __last_unix_time = 0
    __last_warning_status = False
    __last_drift_status = False
    __data_length = Dataset.get_length(dataset, sub_dataset)
    configure = {}
    data_statistics = dict()
    delay = {}
    online = {}
    weights = []
    accuracy = []
    batch_start_time = []
    batch_count = []
    right_count = []
    warning = []
    current_warning_status = {}
    warning_level = []
    warning_level_set = []
    drift = []
    current_drift_status = {}
    predict_corr = []
    hit = []
    last_batch_attributions = None
    detection = True
    drift_status = False
    # classifier flag
    prsa_flag = False
    attributes_set = DistributedOnlineLearning.construct_correlation(dataset)

    # Creating a data stream
    data = Data(dataset, sub_dataset)
    labels, attributes = data.get_attributes()
    attributes_scheme = AttributeScheme.get_scheme(attributes)
    __numeric_attribute_scheme = attributes_scheme['numeric']

    # Creating save content
    project = Project('projects/single/{}'.format(dataset), sub_dataset)
    sub_folder_path = folder_create.sub_folder(sub_dataset)

    # Initializing a learner
    learner = Logistic(labels, attributes_scheme['numeric'])
    learner = OnlineAccuracyUpdatedEnsemble(labels,
                                            attributes_scheme['numeric'],
                                            learner,
                                            windowSize=__window_size,
                                            classifierLimit=10)

    # Initializing a drift detector
    drift_detector = DDM()

    # Initializing an evaluator
    evaluator = EvaluateWithWindowSize(learner, drift_detector, project,
                                       __window_size)

    # train & test
    for x, y, attribute in data.data(batch_size=1):
        if attribute is not None:
            attributes_scheme = AttributeScheme.get_scheme(attributes)
            __numeric_attribute_scheme = attributes_scheme['numeric']
            continue

        instance = x.tolist()[0] + [int(y.tolist()[0][0])]

        # Unix timestamp of each record
        # prsa data
        if dataset == 'prsa_data':
            date_time_flag = True
            date_time = list(map(int, instance[:4]))
            d = datetime.date(date_time[0], date_time[1], date_time[2])
            t = datetime.time(date_time[3])
            datetime_str = str(d) + ' ' + str(t)
            unix_time = int(
                time.mktime(
                    time.strptime(datetime_str, '%Y-%m-%d %H:%M:%S')))
            if unix_time >= 1363881600:
                prsa_flag = True
        elif dataset == 'movie_data':
            # movie data
            if instance[-2] > 62091:
                date_time_flag = True
                prsa_flag = True
                date_time = DistributedOnlineLearning.get_date(instance[-2])
                datetime_str = str(date_time) + ' ' + '00:00:00'
                unix_time = int(
                    time.mktime(
                        time.strptime(datetime_str, '%Y-%m-%d %H:%M:%S')))

        instance[0:len(instance) - 1] = Normalizer.normalize(
            instance[0:len(instance) - 1], __numeric_attribute_scheme)
        __instance_count += 1

        if __instance_count > __window_size and date_time_flag and prsa_flag:
            if dataset == 'prsa_data':
                if unix_time == 1363881600:
                    __start_point = unix_time
                    batch_start_time.append(__start_point)
            elif dataset == 'movie_data':
                if __instance_count == __window_size + 1:
                    __start_point = unix_time
                    batch_start_time.append(__start_point)

            if unix_time - __start_point >= __batch_size:
                self.con.acquire()  # acquire the lock
                __start_point = unix_time
                # for every batch
                # classifier weights
                weights.append(learner.currentClassifierWeights.tolist())
                # attribute record
                attributes_set, last_batch_attributions = \
                    DistributedOnlineLearning.calculate_correlation(
                        predict_corr, last_batch_attributions,
                        attributes_set, spilt_inform)
                # accuracy (may change; currently the backend computes the
                # drift accuracy)
                accuracy.append(drift_detector.accuracy)
                # batch start time
                batch_start_time.append(__start_point)
                # number of instances in the batch
                batch_count.append(__count)
                # currently the number of correct predictions in the batch;
                # later: the number of correctly predicted concept drifts
                right_count.append(__bingo)
                # warning level
                warning_level, warning_level_set, hit = \
                    DistributedOnlineLearning.calculate_warning_level(
                        warning_level_set, warning_level, hit)
                predict_corr = [instance]
                __count = 1
                __bingo = 0
                # newly added Bayes step
                if len(warning_level) > 1:
                    inst = [
                        self.wl_transformer(warning_level[-2]['max']),
                        hit[-1]
                    ]
                    if len(warning_level) > 2:
                        self.nb.set_ready()
                        predicted = self.nb.do_testing(inst)
                        print(data_set_id, predicted)
                        self.nb.do_training(inst)
                    else:
                        self.nb.do_training(inst)
                global_count += 1
                if global_count == length:
                    global_count = 0
                    self.con.notifyAll()
                else:
                    self.con.wait()
            else:
                __count += 1
                predict_corr.append(instance)

            warning_level_set.append(drift_detector.risk)
            predicted_value = learner.do_testing(instance)
            prediction_status = evaluator.calculate_accuracy(
                predicted_value, instance[-1], output_size=__step,
                output_flag=False)
            if prediction_status:
                __bingo += 1
            if detection is True:
                warning_status, drift_status = drift_detector.detect(
                    prediction_status)
                if warning_status is not __last_warning_status:
                    if warning_status:
                        current_warning_status['start'] = unix_time
                        current_warning_status['max_accuracy'] = [
                            drift_detector.o_s_d_min
                        ]
                        current_warning_status['max_accuracy_time'] = [
                            unix_time
                        ]
                        current_warning_status['backend_accuracy'] = [
                            drift_detector.accuracy
                        ]
                    else:
                        current_warning_status['end'] = __last_unix_time
                        warning.append(current_warning_status)
                        current_warning_status = {}
                else:
                    if warning_status:
                        current_warning_status['max_accuracy'].append(
                            drift_detector.o_s_d_min)
                        current_warning_status['max_accuracy_time'].append(
                            unix_time)
                        current_warning_status['backend_accuracy'].append(
                            drift_detector.accuracy)
                if drift_status is not __last_drift_status:
                    if drift_status:
                        current_drift_status['start'] = unix_time
                        current_drift_status['max_accuracy'] = [
                            drift_detector.o_s_d_min
                        ]
                        current_drift_status['max_accuracy_time'] = [
                            unix_time
                        ]
                        current_drift_status['backend_accuracy'] = [
                            drift_detector.accuracy
                        ]
                    else:
                        current_drift_status['end'] = __last_unix_time
                        drift.append(current_drift_status)
                        current_drift_status = {}
                else:
                    if drift_status:
                        current_drift_status['max_accuracy'].append(
                            drift_detector.o_s_d_min)
                        current_drift_status['max_accuracy_time'].append(
                            unix_time)
                        current_drift_status['backend_accuracy'].append(
                            drift_detector.accuracy)
                __last_warning_status = warning_status
                __last_drift_status = drift_status
                __last_unix_time = unix_time
            else:
                warning_status = False
                drift_status = False

            if __instance_count == __data_length:
                # the last batch may hold only a few instances; close it here
                self.con.acquire()  # acquire the lock
                # classifier weights
                weights.append(learner.currentClassifierWeights.tolist())
                # attribute record
                attributes_set, last_batch_attributions = \
                    DistributedOnlineLearning.calculate_correlation(
                        predict_corr, last_batch_attributions,
                        attributes_set, spilt_inform)
                # accuracy (may change; currently the backend computes the
                # drift accuracy)
                accuracy.append(drift_detector.accuracy)
                # number of instances in the batch
                batch_count.append(__count)
                # currently the number of correct predictions in the batch;
                # later: the number of correctly predicted concept drifts
                right_count.append(__bingo)
                # warning level
                warning_level, warning_level_set, hit = \
                    DistributedOnlineLearning.calculate_warning_level(
                        warning_level_set, warning_level, hit)
                # newly added Bayes step
                if len(warning_level) > 1:
                    inst = [
                        self.wl_transformer(warning_level[-2]['max']),
                        hit[-1]
                    ]
                    if len(warning_level) > 2:
                        self.nb.do_testing(inst)
                        self.nb.do_training(inst)
                    else:
                        self.nb.do_training(inst)
                global_count += 1
                if global_count == length:
                    global_count = 0
                    # save all outputs here (left as a placeholder)
                    self.con.notifyAll()
                else:
                    # save all outputs here (left as a placeholder)
                    self.con.wait()

            # training
            learner.do_training(instance, drift_status)
        else:
            # training
            learner.do_training(instance, drift_status)

    configure['timeStart'] = batch_start_time[0]
    configure['timeEnd'] = batch_start_time[-1]
    configure['timeUnit'] = __batch_size
    configure['dataNumMax'] = max(batch_count)
    data_statistics['name'] = sub_dataset
    delay['time'] = batch_start_time
    delay['accuracy'] = accuracy
    delay['bingo'] = []
    delay['hit'] = hit
    delay['warning'] = warning
    delay['drift'] = drift
    delay['warningLevel'] = warning_level
    delay['attributes'] = attributes_set
    online['weight'] = weights
    online['time'] = batch_start_time
    online['dataNum'] = batch_count
    data_statistics['delay'] = delay
    data_statistics['online'] = online
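# Batch boundaries above are plain Unix-time arithmetic: each record's
# timestamp is compared against the current batch start point, and a batch
# closes once __batch_size seconds have elapsed. A runnable sketch of that
# test (the toy stream and helper below are illustrative, not project code):
import time

BATCH_SIZE = 24 * 3600  # one day, as used for prsa_data


def to_unix(date_str):
    """Mirror the strptime/mktime conversion used in the training loop."""
    return int(time.mktime(time.strptime(date_str, '%Y-%m-%d %H:%M:%S')))


base = to_unix('2013-03-22 00:00:00')
start_point = base
for hour in range(0, 54, 6):  # a toy stream of six-hourly records
    now = base + hour * 3600
    if now - start_point >= BATCH_SIZE:  # same test as the loop above
        print('batch closed at', now)
        start_point = now  # this version resets to the current record's time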
def run(self, data_set, sub_data_set):
    if data_set == 'prsa_data':
        self.__batch_size = 24 * 3600  # 3600 seconds represent 1 hour
    elif data_set == 'movie_data':
        self.__batch_size = 24 * 7 * 3600
    self.__data_length = Dataset.get_length(data_set, sub_data_set)
    self.attributes_set = SaveDifferentData.construct_correlation(data_set)
    self.spilt_inform = Dataset.get_spilt_inform(data_set)

    data = Data(data_set, sub_data_set)
    labels, attributes = data.get_attributes()
    attributes_scheme = AttributeScheme.get_scheme(attributes)
    self.__nominal_attribute_scheme = attributes_scheme['nominal']
    self.__numeric_attribute_scheme = attributes_scheme['numeric']

    # Initializing a learner
    learner = Logistic(labels, attributes_scheme['numeric'])
    learner = OnlineAccuracyUpdatedEnsemble(
        labels, attributes_scheme['numeric'], learner,
        windowSize=self.__window_size,
        classifierLimit=self.__classifier_limit)

    # Initializing a drift detector
    drift_detector = DDM()

    # Creating save content
    project = Project('./projects/distributed/{}'.format(data_set),
                      sub_data_set)

    # Initializing an evaluator
    evaluator = EvaluateWithWindowSize(learner, drift_detector, project,
                                       self.__window_size)

    # train & test
    for x, y, attribute in data.data(batch_size=1):
        if attribute is not None:
            attributes_scheme = AttributeScheme.get_scheme(attributes)
            self.__nominal_attribute_scheme = attributes_scheme['nominal']
            self.__numeric_attribute_scheme = attributes_scheme['numeric']
            continue

        instance = x.tolist()[0] + [int(y.tolist()[0][0])]

        # Unix timestamp of each record
        # prsa data
        if data_set == 'prsa_data':
            self.date_time_flag = True
            date_time = list(map(int, instance[:4]))
            d = datetime.date(date_time[0], date_time[1], date_time[2])
            t = datetime.time(date_time[3])
            datetime_str = str(d) + ' ' + str(t)
            self.unix_time = int(
                time.mktime(
                    time.strptime(datetime_str, '%Y-%m-%d %H:%M:%S')))
            if self.unix_time >= 1363881600:
                self.prsa_flag = True
        elif data_set == 'movie_data':
            # movie data
            if instance[-2] > 62091:
                self.date_time_flag = True
                self.prsa_flag = True
                date_time = self._get_date(instance[-2])
                datetime_str = str(date_time) + ' ' + '00:00:00'
                self.unix_time = int(
                    time.mktime(
                        time.strptime(datetime_str, '%Y-%m-%d %H:%M:%S')))

        instance[0:len(instance) - 1] = Normalizer.normalize(
            instance[0:len(instance) - 1], self.__numeric_attribute_scheme)
        self.__instance_count += 1

        if (self.__instance_count > self.__window_size
                and self.date_time_flag and self.prsa_flag):
            if self.unix_time == 1363881600:
                self.__start_point = self.unix_time
                self.batch_start_time.append(self.__start_point)

            if self.unix_time - self.__start_point >= self.__batch_size:
                self.__start_point = self.unix_time
                # for every batch
                # classifier weights
                self.weights.append(
                    learner.currentClassifierWeights.tolist())
                # attribute record
                self.calculate_correlation()
                # accuracy (may change; currently the backend computes the
                # drift accuracy)
                self.accuracy.append(drift_detector.accuracy)
                # batch start time
                self.batch_start_time.append(self.__start_point)
                # number of instances in the batch
                self.batch_count.append(self.__count)
                # currently the number of correct predictions in the batch;
                # later: the number of correctly predicted drifts
                self.right_count.append(self.__bingo)
                # warning level
                self.calculate_warning_level()
                self.predict_corr = [instance]
                self.__count = 1
                self.__bingo = 0
            else:
                self.__count += 1
                self.predict_corr.append(instance)

            self.warning_level_set.append(drift_detector.risk)
            predicted_value = learner.do_testing(instance)
            prediction_status = evaluator.calculate_accuracy(
                predicted_value, instance[-1], output_size=self.__step,
                output_flag=False)
            if prediction_status:
                self.__bingo += 1
            if self.detection is True:
                self.warning_status, self.drift_status = \
                    drift_detector.detect(prediction_status)
                if self.warning_status is not self.__last_warning_status:
                    if self.warning_status:
                        self.current_warning_status['start'] = self.unix_time
                        self.current_warning_status['max_accuracy'] = [
                            drift_detector.o_s_d_min
                        ]
                        self.current_warning_status['max_accuracy_time'] = [
                            self.unix_time
                        ]
                    else:
                        self.current_warning_status['end'] = \
                            self.__last_unix_time
                        self.warning.append(self.current_warning_status)
                        self.current_warning_status = {}
                else:
                    if self.warning_status:
                        self.current_warning_status['max_accuracy'].append(
                            drift_detector.o_s_d_min)
                        self.current_warning_status[
                            'max_accuracy_time'].append(self.unix_time)
                if self.drift_status is not self.__last_drift_status:
                    if self.drift_status:
                        self.current_drift_status['start'] = self.unix_time
                        self.current_drift_status['max_accuracy'] = [
                            drift_detector.o_s_d_min
                        ]
                        self.current_drift_status['max_accuracy_time'] = [
                            self.unix_time
                        ]
                    else:
                        self.current_drift_status['end'] = \
                            self.__last_unix_time
                        self.drift.append(self.current_drift_status)
                        self.current_drift_status = {}
                else:
                    if self.drift_status:
                        self.current_drift_status['max_accuracy'].append(
                            drift_detector.o_s_d_min)
                        self.current_drift_status[
                            'max_accuracy_time'].append(self.unix_time)
                self.__last_warning_status = self.warning_status
                self.__last_drift_status = self.drift_status
                self.__last_unix_time = self.unix_time
            else:
                self.warning_status = False
                self.drift_status = False

            if self.__instance_count == self.__data_length:
                # the last batch may hold only a few instances; close it here
                # classifier weights
                self.weights.append(
                    learner.currentClassifierWeights.tolist())
                # attribute record
                self.calculate_correlation()
                # accuracy (may change; currently the backend computes the
                # drift accuracy)
                self.accuracy.append(drift_detector.accuracy)
                # number of instances in the batch
                self.batch_count.append(self.__count)
                # currently the number of correct predictions in the batch;
                # later: the number of correctly predicted drifts
                self.right_count.append(self.__bingo)
                # warning level
                self.calculate_warning_level()

            # training
            learner.do_training(instance, self.drift_status)
        else:
            # training
            learner.do_training(instance, self.drift_status)

    self.save_file()
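# run() follows the standard prequential (test-then-train) protocol: each
# arriving instance is predicted first, the outcome updates the evaluator and
# drift detector, and only then is the instance used for training. A compact,
# self-contained sketch of that ordering (the stand-in learner and toy stream
# are illustrative only; the drift detector is omitted for brevity):
class MajorityLearner:
    """Stand-in learner that predicts the majority label seen so far."""

    def __init__(self):
        self.counts = {0: 0, 1: 0}

    def predict(self, x):
        return max(self.counts, key=self.counts.get)

    def train(self, y, reset=False):
        if reset:  # forget the old concept on drift, as DDM-style loops do
            self.counts = {0: 0, 1: 0}
        self.counts[y] += 1


def prequential(stream, learner):
    correct = 0
    for n, (x, y) in enumerate(stream, start=1):
        correct += (learner.predict(x) == y)  # 1. test first
        learner.train(y)                      # 2. then train
    return correct / n


# A 0-concept followed by a 1-concept: accuracy drops after the switch,
# which is exactly the signal a drift detector watches for.
stream = [(None, 0)] * 50 + [(None, 1)] * 50
print(prequential(stream, MajorityLearner()))  # -> 0.5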
def sub_thread(self, dataset, d_id, sub_dataset, spilt_inform, folder_create,
               length):
    global global_count

    self.nb_set[sub_dataset] = NaiveBayes([0, 1], self.attributes)
    self.last_wl_status[sub_dataset] = []
    self.instance_set[sub_dataset] = []
    self.nb_classifier_accuracy[sub_dataset] = dict(all_count=0,
                                                    right_count=0,
                                                    accuracy=[])

    # Set variables
    date_time_flag = False
    data_set_id = d_id
    # NOTE: the original left __batch_size at 0, which divides by zero in the
    # batch_interval computation below; set it per dataset as in the sibling
    # sub_thread implementation
    if dataset == 'prsa_data':
        __batch_size = 24 * 3600  # 3600 seconds represent 1 hour
    elif dataset == 'movie_data':
        __batch_size = 24 * 7 * 3600
    else:
        __batch_size = 0
    __instance_count = 0
    __window_size = 500
    __step = 1000
    __start_point = 0
    __count = 0
    __last_unix_time = 0
    __last_warning_status = False
    __last_drift_status = False
    __warning_level_max = 0
    __data_length = Dataset.get_length(dataset, sub_dataset)
    lc = []
    bc = []
    configure = {}
    data_statistics = dict()
    delay = {}
    online = {}
    weights = []
    accuracy = []
    batch_start_time = []
    batch_count = []
    warning = []
    current_warning_status = {}
    warning_level = []
    warning_level_set = []
    drift = []
    current_drift_status = {}
    predict_corr = []
    hit = []
    last_batch_attributions = None
    detection = True
    drift_status = False
    # classifier flag
    prsa_flag = False
    attributes_set = DistributedOnlineLearning.construct_correlation(dataset)

    # Creating a data stream
    data = Data(dataset, sub_dataset)
    labels, attributes = data.get_attributes()
    attributes_scheme = AttributeScheme.get_scheme(attributes)
    __numeric_attribute_scheme = attributes_scheme['numeric']

    # Creating save content
    project = Project('projects/single/{}'.format(dataset), sub_dataset)
    sub_folder_path = folder_create.sub_folder(sub_dataset)

    # Initializing a learner
    learner = Logistic(labels, attributes_scheme['numeric'])
    learner = OnlineAccuracyUpdatedEnsemble(labels,
                                            attributes_scheme['numeric'],
                                            learner,
                                            windowSize=__window_size,
                                            classifierLimit=10)

    # Initializing a drift detector
    drift_detector = DDM()

    # Initializing an evaluator
    evaluator = EvaluateWithWindowSize(learner, drift_detector, project,
                                       __window_size)

    # train & test
    for x, y, attribute in data.data(batch_size=1):
        if attribute is not None:
            attributes_scheme = AttributeScheme.get_scheme(attributes)
            __numeric_attribute_scheme = attributes_scheme['numeric']
            continue

        instance = x.tolist()[0] + [int(y.tolist()[0][0])]

        # Unix timestamp of each record
        # prsa data
        if dataset == 'prsa_data':
            date_time_flag = True
            date_time = list(map(int, instance[:4]))
            d = datetime.date(date_time[0], date_time[1], date_time[2])
            tt = datetime.time(date_time[3])
            datetime_str = str(d) + ' ' + str(tt)
            unix_time = int(
                time.mktime(
                    time.strptime(datetime_str, '%Y-%m-%d %H:%M:%S')))
            if unix_time >= 1363881600:
                prsa_flag = True
        elif dataset == 'movie_data':
            # movie data
            if instance[-2] > 62091:
                date_time_flag = True
                prsa_flag = True
                date_time = DistributedOnlineLearning.get_date(instance[-2])
                datetime_str = str(date_time) + ' ' + '00:00:00'
                unix_time = int(
                    time.mktime(
                        time.strptime(datetime_str, '%Y-%m-%d %H:%M:%S')))

        instance[0:len(instance) - 1] = Normalizer.normalize(
            instance[0:len(instance) - 1], __numeric_attribute_scheme)
        __instance_count += 1

        if __instance_count > __window_size and date_time_flag and prsa_flag:
            if dataset == 'prsa_data':
                if unix_time == 1363881600:
                    __start_point = unix_time
                    batch_start_time.append(__start_point)
            elif dataset == 'movie_data':
                if __instance_count == __window_size + 1:
                    __start_point = unix_time
                    batch_start_time.append(__start_point)

            difference_value = unix_time - __start_point
            if difference_value >= __batch_size:
                self.con.acquire()  # acquire the lock
                batch_interval = int(difference_value / __batch_size)
                for cc in range(batch_interval):
                    # repeat_flag is False only for the first elapsed batch
                    if cc == 0:
                        r_f = False
                    else:
                        r_f = True
                    __start_point += __batch_size
                    # for every batch
                    # classifier weights
                    weights.append(
                        learner.currentClassifierWeights.tolist())
                    # attribute record
                    (attributes_set, last_batch_attributions, lc,
                     bc) = self.calculate_correlation(
                         predict_corr, last_batch_attributions,
                         attributes_set, spilt_inform, lc, bc,
                         repeat_flag=r_f)
                    # accuracy (may change; currently the backend computes
                    # the drift accuracy)
                    accuracy.append(drift_detector.accuracy)
                    # batch start time
                    batch_start_time.append(__start_point)
                    # number of instances in the batch
                    batch_count.append(__count)
                    # warning level
                    (warning_level, warning_level_set, hit,
                     __warning_level_max) = self.calculate_warning_level(
                         warning_level_set, warning_level, hit,
                         __warning_level_max, repeated_flag=r_f)
                    # save the current status of this data source
                    self.last_wl_status[sub_dataset] = [
                        self.wl_transformer(warning_level[-1]['max']),
                        hit[-1]
                    ]
                    __count = 0
                    global_count += 1
                    if global_count == length:
                        global_count = 0
                        # train and test the Naive Bayes of every data source
                        if len(warning_level) > 1:
                            d_s_n = self.nb_set.keys()
                            for data_set_name in d_s_n:
                                self.instance_set[data_set_name] = [
                                    self.last_wl_status[value][0]
                                    for value in d_s_n
                                    if value != data_set_name
                                ] + [self.last_wl_status[data_set_name][-1]]
                            if len(warning_level) > 2:
                                # testing
                                for temple_name in d_s_n:
                                    self.nb_set[temple_name].set_ready()
                                    predict = self.nb_set[
                                        temple_name].do_testing(
                                            self.instance_set[temple_name])
                                    if predict == self.instance_set[
                                            temple_name][-1]:
                                        self.nb_classifier_accuracy[
                                            temple_name]['right_count'] += 1
                                    self.nb_classifier_accuracy[temple_name][
                                        'all_count'] += 1
                                    self.nb_classifier_accuracy[temple_name][
                                        'accuracy'].append(
                                            round(
                                                self.nb_classifier_accuracy
                                                [temple_name]['right_count'] /
                                                self.nb_classifier_accuracy
                                                [temple_name]['all_count'],
                                                4))
                                # training
                                for temple_name in d_s_n:
                                    self.nb_set[temple_name].do_training(
                                        self.instance_set[temple_name],
                                        drift_status)
                            else:
                                for temple_name in d_s_n:
                                    self.nb_set[temple_name].do_training(
                                        self.instance_set[temple_name],
                                        drift_status)
                        self.con.notifyAll()
                    else:
                        self.con.wait()
                predict_corr = [instance]
                __count = 1
            else:
                __count += 1
                predict_corr.append(instance)

            warning_level_set.append(drift_detector.risk)
            predicted_value = learner.do_testing(instance)
            prediction_status = evaluator.calculate_accuracy(
                predicted_value, instance[-1], output_size=__step,
                output_flag=False)
            if detection is True:
                warning_status, drift_status = drift_detector.detect(
                    prediction_status)
                if warning_status is not __last_warning_status:
                    if warning_status:
                        current_warning_status['start'] = unix_time
                        current_warning_status['max_accuracy'] = [
                            drift_detector.o_s_d_min
                        ]
                        current_warning_status['max_accuracy_time'] = [
                            unix_time
                        ]
                        current_warning_status['backend_accuracy'] = [
                            drift_detector.accuracy
                        ]
                    else:
                        current_warning_status['end'] = __last_unix_time
                        warning.append(current_warning_status)
                        current_warning_status = {}
                else:
                    if warning_status:
                        current_warning_status['max_accuracy'].append(
                            drift_detector.o_s_d_min)
                        current_warning_status['max_accuracy_time'].append(
                            unix_time)
                        current_warning_status['backend_accuracy'].append(
                            drift_detector.accuracy)
                if drift_status is not __last_drift_status:
                    if drift_status:
                        current_drift_status['start'] = unix_time
                        current_drift_status['max_accuracy'] = [
                            drift_detector.o_s_d_min
                        ]
                        current_drift_status['max_accuracy_time'] = [
                            unix_time
                        ]
                        current_drift_status['backend_accuracy'] = [
                            drift_detector.accuracy
                        ]
                    else:
                        current_drift_status['end'] = __last_unix_time
                        drift.append(current_drift_status)
                        current_drift_status = {}
                else:
                    if drift_status:
                        current_drift_status['max_accuracy'].append(
                            drift_detector.o_s_d_min)
                        current_drift_status['max_accuracy_time'].append(
                            unix_time)
                        current_drift_status['backend_accuracy'].append(
                            drift_detector.accuracy)
                __last_warning_status = warning_status
                __last_drift_status = drift_status
                __last_unix_time = unix_time
            else:
                # warning_status = False
                drift_status = False

            if __instance_count == __data_length:
                # the last batch may hold only a few instances; close it here
                self.con.acquire()  # acquire the lock
                # classifier weights
                weights.append(learner.currentClassifierWeights.tolist())
                # attribute record
                (attributes_set, last_batch_attributions, lc,
                 bc) = self.calculate_correlation(
                     predict_corr, last_batch_attributions, attributes_set,
                     spilt_inform, lc, bc, repeat_flag=False)
                # accuracy (may change; currently the backend computes the
                # drift accuracy)
                accuracy.append(drift_detector.accuracy)
                # number of instances in the batch
                batch_count.append(__count)
                # warning level
                (warning_level, warning_level_set, hit,
                 __warning_level_max) = self.calculate_warning_level(
                     warning_level_set, warning_level, hit,
                     __warning_level_max, repeated_flag=False)
                global_count += 1
                if global_count == length:
                    global_count = 0
                    # train and test the Naive Bayes of every data source
                    if len(warning_level) > 1:
                        d_s_n = self.nb_set.keys()
                        for data_set_name in d_s_n:
                            self.instance_set[data_set_name] = [
                                self.last_wl_status[value][0]
                                for value in d_s_n if value != data_set_name
                            ] + [self.last_wl_status[data_set_name][-1]]
                        if len(warning_level) > 2:
                            # testing
                            for temple_name in d_s_n:
                                predict = self.nb_set[temple_name].do_testing(
                                    self.instance_set[temple_name])
                                if predict == self.instance_set[
                                        temple_name][-1]:
                                    self.nb_classifier_accuracy[temple_name][
                                        'right_count'] += 1
                                self.nb_classifier_accuracy[temple_name][
                                    'all_count'] += 1
                                self.nb_classifier_accuracy[temple_name][
                                    'accuracy'].append(
                                        round(
                                            self.nb_classifier_accuracy
                                            [temple_name]['right_count'] /
                                            self.nb_classifier_accuracy
                                            [temple_name]['all_count'], 4))
                            # training
                            for temple_name in d_s_n:
                                self.nb_set[temple_name].do_training(
                                    self.instance_set[temple_name],
                                    drift_status)
                        else:
                            for temple_name in d_s_n:
                                self.nb_set[temple_name].do_training(
                                    self.instance_set[temple_name],
                                    drift_status)

                    # save the state of the last data source
                    configure['timeStart'] = batch_start_time[0]
                    configure['timeEnd'] = batch_start_time[-1]
                    configure['timeUnit'] = __batch_size
                    configure['dataNumMax'] = max(batch_count)
                    configure['warningLevelMax'] = __warning_level_max
                    data_statistics['name'] = sub_dataset
                    delay['time'] = batch_start_time
                    delay['accuracy'] = accuracy
                    delay['bingo'] = []
                    delay['hit'] = hit[1:]
                    delay['warning'] = warning
                    delay['drift'] = drift
                    delay['warningLevel'] = warning_level
                    delay['attributes'] = attributes_set
                    online['weight'] = weights
                    online['time'] = batch_start_time
                    online['dataNum'] = batch_count
                    data_statistics['delay'] = delay
                    data_statistics['online'] = online
                    save_path = sub_folder_path + '/'
                    self.save_file(configure, save_path,
                                   type_name='configure')
                    self.save_file(data_statistics, save_path, type_name=None)
                    # all data sources are finished; the main process can be
                    # stopped manually
                    print('All data has been trained. '
                          'Please finish the main process manually!')
                    Zip.plot_multi(self.nb_classifier_accuracy)
                    Zip(folder_create.get_path())
                    self.con.notifyAll()
                else:
                    configure['timeStart'] = batch_start_time[0]
                    configure['timeEnd'] = batch_start_time[-1]
                    configure['timeUnit'] = __batch_size
                    configure['dataNumMax'] = max(batch_count)
                    data_statistics['name'] = sub_dataset
                    delay['time'] = batch_start_time
                    delay['accuracy'] = accuracy
                    delay['bingo'] = []
                    delay['hit'] = hit[1:]
                    delay['warning'] = warning
                    delay['drift'] = drift
                    delay['warningLevel'] = warning_level
                    delay['attributes'] = attributes_set
                    online['weight'] = weights
                    online['time'] = batch_start_time
                    online['dataNum'] = batch_count
                    data_statistics['delay'] = delay
                    data_statistics['online'] = online
                    save_path = sub_folder_path + '/'
                    self.save_file(configure, save_path,
                                   type_name='configure')
                    self.save_file(data_statistics, save_path, type_name=None)
                    self.con.wait()
            # training
            learner.do_training(instance, drift_status)
        else:
            # training
            learner.do_training(instance, drift_status)
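# Each data source's Naive Bayes is fed a feature vector built from the OTHER
# sources' most recent transformed warning levels plus this source's own
# latest hit, which is what the instance_set construction above does. A
# minimal sketch of that feature assembly (the wl_transformer threshold and
# the toy sources below are assumptions for illustration, not the project's
# exact mapping):
def wl_transformer(level):
    """Assumed mapping of a raw warning level onto a discrete risk label."""
    return 0 if level < 2 else 1


def build_instances(last_wl_status):
    """last_wl_status: {source: (warning_level, hit)} -> {source: features}."""
    instances = {}
    for name in last_wl_status:
        others = [wl_transformer(last_wl_status[other][0])
                  for other in last_wl_status if other != name]
        instances[name] = others + [last_wl_status[name][1]]
    return instances


print(build_instances({'beijing': (3, 1), 'shanghai': (0, 0),
                       'chengdu': (2, 1)}))
# -> {'beijing': [0, 1, 1], 'shanghai': [1, 1, 0], 'chengdu': [1, 0, 1]}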