def pp_construct_data_dict(args):
    """Sample one sensor's readings onto the time slots.

    Takes a single tuple so it can be handed to a ``map``-style call.

    Args:
        args: 6-tuple ``(sensor_uuid, sensor_reading, time_slots,
            ans_start_t, ans_end_t, timelet_inv)``. Everything after the
            uuid is forwarded unchanged to ``get_val_timelet``.

    Returns:
        ``(sensor_uuid, payload)``. ``payload`` is an empty list when the
        sensor has to be purged; otherwise it is
        ``[slot_times, slot_values, utc_t_val]`` where the first two are
        arrays built from the per-slot results and the last one is
        ``np.asarray([utc_t, val])``.
    """
    (sensor_uuid, sensor_reading, time_slots,
     ans_start_t, ans_end_t, timelet_inv) = args

    log.info('sampling sensor uuid ' + sensor_uuid)
    log.info('-' * 20)

    len_time_slots = len(time_slots)

    # sensor value is read by time
    dict_sensor_val, dict_stime, utc_t, val = get_val_timelet(
        sensor_reading, time_slots, ans_start_t, ans_end_t, timelet_inv)

    # The purge checks must run before any array conversion: the -1
    # sentinel is not iterable, so converting first would raise a
    # TypeError and the sentinel branch could never be taken.
    if dict_sensor_val == -1:
        log.debug('append purge list: dict_sensor_val=-1 ' + sensor_uuid)
        # return an empty array to indicate that this uuid has to be purged
        return (sensor_uuid, [])

    if len(utc_t) < len_time_slots:
        log.debug('append purge list:len(utc_t)<len_time_slots' + sensor_uuid)
        return (sensor_uuid, [])

    if len(val) < len_time_slots:
        log.debug('append purge list:len(val)<len_time_slots' + sensor_uuid)
        return (sensor_uuid, [])

    # Only build the arrays for sensors that are actually kept.
    dict_sensor_val_temp = np.array([np.asarray(v) for v in dict_sensor_val])
    dict_stime_temp = np.array([np.asarray(t) for t in dict_stime])
    utc_t_val_temp = np.asarray([utc_t, val])

    return (sensor_uuid,
            [dict_stime_temp, dict_sensor_val_temp, utc_t_val_temp])
def x_input_to_states(xinput, CORR_VAL_OUT=0, PARALLEL = False):
    """Map every column of ``xinput`` to the discrete states -1 / 0 / +1.

    Each column is clustered with ``state_retrieval``. Samples labelled
    with the cluster of the largest centre become ``1``, samples labelled
    with the cluster of the smallest centre become ``-1``, and everything
    else stays ``0``.

    Args:
        xinput: 2-D array; rows are samples and columns are sensors
            (the code reads ``shape[0]`` as samples, ``shape[1]`` as sensors).
        CORR_VAL_OUT: when ``1``, also compute the Pearson correlation
            (rounded to 3 digits) between each state column and its input.
        PARALLEL: forwarded to ``state_retrieval``.

    Returns:
        ``(sinput, corr_state_val)``: the state matrix with the shape of
        ``xinput`` and an array of correlation scores (empty unless
        ``CORR_VAL_OUT == 1``).
    """
    states = np.zeros(xinput.shape)
    n_rows, n_cols = xinput.shape[0], xinput.shape[1]

    if n_rows < n_cols:
        log.warn('number of samplesa are smaller than number of sensors')

    log.info('Mapping ' + str(xinput.shape) + ' marix to discrete states ')

    for col, column_samples in enumerate(xinput.T):
        observations = column_samples[:, np.newaxis]
        label, _, model, _, _ = state_retrieval(
            observations, max_num_cluster=6, est_method='kmean',
            PARALLEL=PARALLEL)

        top_label = np.argmax(model.cluster_centers_)
        bottom_label = np.argmin(model.cluster_centers_)

        # Order matters: if both labels coincide, -1 overwrites +1.
        states[np.nonzero(label == top_label)[0], col] = 1
        states[np.nonzero(label == bottom_label)[0], col] = -1

    scores = []
    if CORR_VAL_OUT == 1:
        log.info('Compute Correlation Score....')
        scores = [round(stats.pearsonr(state_col, input_col)[0], 3)
                  for state_col, input_col in zip(states.T, xinput.T)]

    return states, np.array(scores)
def build_diff(args):
    """Compute and normalise the diff feature of one measurement.

    Takes a single tuple so it can be handed to a ``map``-style call.

    Args:
        args: 6-tuple ``(k, time_slots, conf_lev, set_val, set_name,
            num_type)``. ``k`` is echoed back so the caller can match the
            result to its input.

    Returns:
        ``(k, [output_status, norm_diff_mean])``. ``output_status`` is
        ``-1`` when the feature must be excluded: processing failed, the
        integer feature has at most one distinct value, or ``num_type``
        is neither ``FLOAT_TYPE`` nor ``INT_TYPE``.
    """
    (k, time_slots, conf_lev, set_val, set_name, num_type) = args
    log.info(set_name)

    try:
        diff_mean = get_diff(set_val, time_slots, num_type, conf_lev)

        if num_type == FLOAT_TYPE:
            norm_diff_mean, output_status = normalize_data(diff_mean)
        elif num_type == INT_TYPE:
            num_discrete_vals = len(set(list(diff_mean)))
            log.info('num_discrete_vals :' + str(num_discrete_vals))
            if num_discrete_vals > 1:
                output_status = 0
                norm_diff_mean = diff_mean
            else:
                # A single distinct value carries no information.
                output_status = -1
                norm_diff_mean = list(set(diff_mean))
        else:
            # Previously this branch left both results unbound, so the
            # final return raised UnboundLocalError. Exclude the feature
            # explicitly instead.
            log.error('Unknown num_type ' + str(num_type) +
                      ', excluded from analysis')
            output_status = -1
            norm_diff_mean = None
    except Exception as e:
        # format_exc() returns the traceback text; print_exc() returns
        # None, which used to be logged as the string "None".
        log.error(traceback.format_exc())
        log.error('Error in processing data feature, excluded from analysis ' + str(e))
        output_status = -1
        norm_diff_mean = None

    return (k, [output_status, norm_diff_mean])
def interpolation_measurement(data_dict, input_names, err_rate=1, sgm_bnd=20):
    """Resample each measurement onto an evenly spaced time grid.

    For every name in ``input_names`` the raw time stamps are read from
    ``data_dict[name][2][0]`` and the raw values from
    ``data_dict[name][2][1]``. Outliers reported by ``outlier_detect`` are
    removed, then the values are interpolated onto a grid whose step is
    ``ceil((t_last - t_first) / number_of_samples)``. Integer-type data
    uses ``fast_nearest_interp``, everything else ``np.interp``.

    Args:
        data_dict: mapping from measurement name to its record.
        input_names: names to process. Names with no time stamps are
            skipped, so the outputs can be shorter than this list.
        err_rate: forwarded to ``outlier_detect``.
        sgm_bnd: forwarded to ``outlier_detect``.

    Returns:
        ``(measurement_point_set, num_type_set)``: a list of 2-row arrays
        ``[t_new, val_new]`` and an array holding the detected type of
        each processed measurement.
    """
    log.info('-' * 40)
    log.info('interploattion starts....')
    log.info('-' * 40)

    measurement_point_set = list()
    num_type_set = list()

    # NOTE: err_rate and sgm_bnd used to be overwritten here with 1 and 20,
    # silently ignoring the caller's arguments. The defaults are the same
    # values, so callers relying on the defaults see no change.
    for key_name in input_names:
        log.info(key_name + '...')

        t_ = np.array(data_dict[key_name][2][0])
        if len(t_) == 0:
            continue

        # The grid step is derived from the raw (pre-filter) time stamps.
        intpl_intv = np.ceil((t_[-1] - t_[0]) / len(t_))

        val_ = np.array(data_dict[key_name][2][1])

        # filtering outlier
        outlier_idx = outlier_detect(val_, err_rate, sgm_bnd)
        if len(outlier_idx) > 0:
            log.info('outlier samples are detected: outlier_idx: ' + str(outlier_idx))
            t_ = np.delete(t_, outlier_idx)
            val_ = np.delete(val_, outlier_idx)

        t_new = np.r_[t_[0]:t_[-1]:intpl_intv]

        # The type is decided on the raw values, before outlier removal.
        num_type = check_data_type(data_dict[key_name][2][1])
        if num_type == INT_TYPE:
            val_new = fast_nearest_interp(t_new, t_, val_)
        else:
            val_new = np.interp(t_new, t_, val_)

        measurement_point_set.append(np.vstack([t_new, val_new]))
        num_type_set.append(num_type)

    return measurement_point_set, np.array(num_type_set)
def max_pack_cluster(DIST_MAT, min_dist=0.3, max_dist=1.0):
    """Greedily label nodes using a pairwise distance matrix.

    While more than two nodes remain unprocessed:

    * if ``udiag_min`` of the remaining matrix exceeds ``max_dist``, every
      unlabelled node becomes its own cluster and the loop stops;
    * if ``udiag_max`` is below ``min_dist``, all unlabelled nodes share
      one cluster and the loop stops;
    * otherwise ``max_diff_dist_idx`` picks an exemplar and its cluster,
      which are labelled and removed from the working matrix.

    Nodes still unlabelled afterwards each get their own label.

    Args:
        DIST_MAT: square distance matrix; it is copied, not modified.
        min_dist: lower distance bound used by the rules above.
        max_dist: upper distance bound used by the rules above.

    Returns:
        ``(exemplars, labels)`` as integer arrays.
    """
    # Initialize
    num_nodes = DIST_MAT.shape[0]
    label = np.inf * np.ones(num_nodes)  # inf marks "not labelled yet"
    label_num = 0
    remain_index = np.arange(num_nodes)  # working row -> original index
    dist_mat = DIST_MAT.copy()
    exemplar_list = list()

    while dist_mat.shape[0] > 2:
        if udiag_min(dist_mat) > max_dist:
            log.info('all samples are seperated further than max_dist')
            log.info('remaining samples will be individual clusters')
            # Assign different labels to all remaining samples
            inf_idx = np.nonzero(label == np.inf)[0]
            for r in inf_idx:
                exemplar_list.append(int(r))
            label[inf_idx] = np.int_(label_num + np.arange(len(inf_idx)))
            break
        elif udiag_max(dist_mat) < min_dist:
            # Assign the same label to all remaining samples
            log.info('all samples are seperated within min_dist')
            log.info('remaining samples will be the same')
            inf_idx = np.nonzero(label == np.inf)[0]
            exemplar_list.append(int(inf_idx[0]))
            label[inf_idx] = int(label_num)
            break
        else:
            exemplar_idx, max_cluster_idx = max_diff_dist_idx(
                dist_mat, min_dist, max_dist)
            dcluster_idx = remain_index[max_cluster_idx]
            exemplar_list.append(np.int_(remain_index[exemplar_idx]))

            # Update dist_mat and remain_index
            dist_mat = np.delete(dist_mat, max_cluster_idx, axis=0)
            dist_mat = np.delete(dist_mat, max_cluster_idx, axis=1)
            remain_index = np.delete(remain_index, max_cluster_idx, axis=0)

            # Adding label info
            label[dcluster_idx] = label_num
            label_num += 1

            # max() of a zero-size array raises ValueError, which happens
            # when the cluster just removed contained every remaining node.
            if dist_mat.size > 0:
                log.info('dist_mat.max()=' + str(dist_mat.max()))

    unassigned_idx = np.nonzero(label == np.inf)[0]
    if len(unassigned_idx) > 0:
        label[unassigned_idx] = label_num + np.arange(len(unassigned_idx))
        exemplar_list = exemplar_list + list(unassigned_idx)

    # Called as in the original; its two returned counts are not used here.
    check_bounded_distance_constraint_condition(
        DIST_MAT, label, min_dist, max_dist)

    return np.int_(exemplar_list), np.int_(label)
def __init__(self, config, dataset, session):
    """Set up the training directory, model, optimizer and checkpoint state.

    Args:
        config: configuration object; this method reads ``method``,
            ``alpha``, ``clean``, ``learning_rate``, ``lr_weight_decay``
            and ``checkpoint`` from it.
        dataset: stored on the instance as ``self.dataset``.
        session: TensorFlow session used to restore a checkpoint.
    """
    self.config = config
    self.session = session
    self.dataset = dataset

    self.filepath = '%s-%.1f' % (config.method, config.alpha)
    self.train_dir = './train_dir/%s' % self.filepath

    train_dir = self.train_dir
    if not os.path.exists(train_dir):
        os.makedirs(train_dir)
    # clean train folder
    if self.config.clean:
        for stale_file in glob.glob(train_dir + '/*'):
            os.remove(stale_file)

    # --- create model ---
    self.model = Model(config)

    # --- optimizer ---
    self.global_step = tf.Variable(0, name="global_step")

    self.learning_rate = config.learning_rate
    if config.lr_weight_decay:
        # Halve the learning rate every 10000 steps (staircase schedule).
        self.learning_rate = tf.train.exponential_decay(
            self.learning_rate,
            global_step=self.global_step,
            decay_steps=10000,
            decay_rate=0.5,
            staircase=True,
            name='decaying_learning_rate')

    self.summary_op = tf.summary.merge_all()
    self.saver = tf.train.Saver(max_to_keep=1)
    self.summary_writer = tf.summary.FileWriter(self.train_dir)

    self.checkpoint_secs = 300  # 5 min

    self.train_op = self.optimize_adam(self.model.kl_loss,
                                       lr=self.learning_rate)

    tf.global_variables_initializer().run()

    if config.checkpoint is None:
        return

    self.ckpt_path = tf.train.latest_checkpoint(self.config.checkpoint)
    if self.ckpt_path is not None:
        log.info("Checkpoint path: %s", self.ckpt_path)
        self.saver.restore(self.session, self.ckpt_path)
        log.info(
            "Loaded the pretrain parameters from the provided checkpoint path"
        )
def max_pack_cluster(DIST_MAT,min_dist=0.3,max_dist=1.0):
    """Greedily assign cluster labels from a pairwise distance matrix.

    The working matrix shrinks as clusters are removed. The loop runs
    while more than two nodes remain and stops early when either
    ``udiag_min`` exceeds ``max_dist`` (every unlabelled node becomes its
    own cluster) or ``udiag_max`` falls below ``min_dist`` (all unlabelled
    nodes share one cluster). Otherwise ``max_diff_dist_idx`` selects an
    exemplar and the cluster to label and remove. Anything left
    unlabelled at the end receives a label of its own.

    Args:
        DIST_MAT: square distance matrix; a copy is modified, not the input.
        min_dist: lower distance bound.
        max_dist: upper distance bound.

    Returns:
        ``(exemplars, labels)`` as integer arrays.
    """
    # Initialize
    num_nodes = DIST_MAT.shape[0]
    label = np.inf * np.ones(num_nodes)  # inf means "no label yet"
    label_num = 0
    remain_index = np.arange(num_nodes)  # maps working rows to node ids
    dist_mat = DIST_MAT.copy()
    exemplar_list = list()

    while dist_mat.shape[0] > 2:
        if udiag_min(dist_mat) > max_dist:
            log.info('all samples are seperated further than max_dist')
            log.info('remaining samples will be individual clusters')
            # Assign different labels to all remaining samples
            inf_idx = np.nonzero(label == np.inf)[0]
            for r in inf_idx:
                exemplar_list.append(int(r))
            label[inf_idx] = np.int_(label_num + np.arange(len(inf_idx)))
            break
        elif udiag_max(dist_mat) < min_dist:
            # Assign the same label to all remaining samples
            log.info('all samples are seperated within min_dist')
            log.info('remaining samples will be the same')
            inf_idx = np.nonzero(label == np.inf)[0]
            exemplar_list.append(int(inf_idx[0]))
            label[inf_idx] = int(label_num)
            break
        else:
            exemplar_idx, max_cluster_idx = max_diff_dist_idx(
                dist_mat, min_dist, max_dist)
            dcluster_idx = remain_index[max_cluster_idx]
            exemplar_list.append(np.int_(remain_index[exemplar_idx]))

            # Update dist_mat and remain_index
            dist_mat = np.delete(dist_mat, max_cluster_idx, axis=0)
            dist_mat = np.delete(dist_mat, max_cluster_idx, axis=1)
            remain_index = np.delete(remain_index, max_cluster_idx, axis=0)

            # Adding label info
            label[dcluster_idx] = label_num
            label_num += 1

            # Guard the log line: once the removed cluster has swallowed
            # every remaining node the matrix is empty and max() would
            # raise ValueError.
            if dist_mat.size > 0:
                log.info('dist_mat.max()=' + str(dist_mat.max()))

    unassigned_idx = np.nonzero(label == np.inf)[0]
    if len(unassigned_idx) > 0:
        label[unassigned_idx] = label_num + np.arange(len(unassigned_idx))
        exemplar_list = exemplar_list + list(unassigned_idx)

    # Called as in the original; its two returned counts are not used here.
    check_bounded_distance_constraint_condition(
        DIST_MAT, label, min_dist, max_dist)

    return np.int_(exemplar_list), np.int_(label)
def verify_data_mat(X):
    """Check a data matrix for NaN entries, +inf entries and constant rows.

    Each element yielded by iterating ``X`` is treated as one sample.

    Args:
        X: iterable of numeric samples (for a 2-D array, its rows).

    Raises:
        NameError: ``'nan entry found'``, ``'inf entry found'`` or
            ``'zero var found'``, checked in that order. ``NameError`` is
            kept because the original code constructed that type.
    """
    nan_count = 0
    inf_count = 0
    zero_var_count = 0

    for sample in X:
        sample = np.asarray(sample)
        nan_count += int(np.isnan(sample).sum())
        inf_count += int((sample == np.inf).sum())
        # The variance of a sample containing NaN/inf is NaN and can never
        # equal zero, so only finite samples need the variance check.
        if np.isfinite(sample).all():
            zero_var_count += int(np.var(sample) == 0)

    # The original built these exceptions without raising them, so the
    # function reported success for any input.
    if nan_count > 0:
        raise NameError('nan entry found')
    if inf_count > 0:
        raise NameError('inf entry found')
    if zero_var_count > 0:
        raise NameError('zero var found')

    log.info('all entry values of data matrix are verifed ok')
def _bn_anaylsis_all(bldg_obj, p_name, sig_tag='avg', num_picks_bn=15, learning_alg='hc'):
    """Learn one Bayesian network over sensor, weather and time causes.

    ``_bn_anaylsis`` is run once per attribute (sensor, time, weather) to
    obtain that attribute's cause labels. The matching columns are stacked
    behind the column of ``p_name`` and a structure is learned with
    ``rbn.bnlearn``, using a blacklist of arcs built by ``pair_in_idx``.

    Args:
        bldg_obj: building object exposing ``sigtags[sig_tag]``.
        p_name: name of the effect sensor; must be in the sensor names.
        sig_tag: key into ``bldg_obj.sigtags``.
        num_picks_bn: forwarded to ``_bn_anaylsis``.
        learning_alg: ``'tabu'`` or ``'mmhc'``; any other value uses ``hc``.

    Returns:
        ``(cause_label, cols, hc_b, amat, bndata_mat)``: labels of the
        columns whose entry in the first column of ``amat`` equals 1, all
        column names, the learned network, its adjacency matrix and the
        stacked data matrix.
    """
    p_idx = bldg_obj.sigtags[sig_tag].names['sensor'].index(p_name)

    log.info('power - sensors + weather + time ...')

    def _cause_labels(attr):
        # Only the first of the five returned values is needed here.
        labels, _, _, _, _ = _bn_anaylsis(
            bldg_obj, p_name, attr=attr, sig_tag=sig_tag,
            num_picks_bn=num_picks_bn, learning_alg=learning_alg)
        return labels

    s_cause_label = _cause_labels('sensor')
    t_cause_label = _cause_labels('time')
    w_cause_label = _cause_labels('weather')

    sigtag = bldg_obj.sigtags[sig_tag]
    s_cause_idx = [sigtag.names['sensor'].index(name) for name in s_cause_label]
    t_cause_idx = [sigtag.names['time'].index(name) for name in t_cause_label]
    w_cause_idx = [sigtag.names['weather'].index(name) for name in w_cause_label]

    # Column order: effect, sensor causes, weather causes, time causes.
    stacked_rows = (
        sigtag.data_state_mat[:, p_idx].T,
        sigtag.data_state_mat[:, s_cause_idx].T,
        sigtag.data_weather_mat_[:, w_cause_idx].T,
        sigtag.data_time_mat[:, t_cause_idx].T,
    )
    bndata_mat = np.vstack(stacked_rows).T

    cols = [p_name] + s_cause_label + w_cause_label + t_cause_label

    # Arcs placed on the blacklist passed to the structure learner.
    b_arc_list = (
        pair_in_idx([p_name], s_cause_label + w_cause_label + t_cause_label)
        + pair_in_idx(s_cause_label, w_cause_label + t_cause_label)
        + pair_in_idx(w_cause_label, t_cause_label)
        + pair_in_idx(t_cause_label, t_cause_label)
    )

    black_arc_frame = rbn.construct_arcs_frame(b_arc_list)
    factor_data_mat = rbn.convert_pymat_to_rfactor(bndata_mat)
    data_frame = rbn.construct_data_frame(factor_data_mat, cols)

    learner_name = learning_alg if learning_alg in ('tabu', 'mmhc') else 'hc'
    learner = getattr(rbn.bnlearn, learner_name)
    hc_b = learner(data_frame, blacklist=black_arc_frame, score='bic')

    amat = rbn.py_get_amat(hc_b)
    cause_label = list(np.array(cols)[np.nonzero(amat[:, 0] == 1)[0]])

    return cause_label, cols, hc_b, amat, bndata_mat
def create_bldg_object(data_dict, avgdata_dict, diffdata_dict, bldg_tag, pname_key, PARALLEL=False):
    """Build a ``BuildingObject`` holding both the 'avg' and 'diff' signal tags.

    Args:
        data_dict: must contain ``'Conditions_dict'`` and ``'Events_dict'``.
        avgdata_dict: input for the 'avg' signal tag.
        diffdata_dict: input for the 'diff' signal tag.
        bldg_tag: tag handed to ``BuildingObject``.
        pname_key: forwarded to ``_sigtag_property``.
        PARALLEL: forwarded to ``bldg_obj_weather_convert``.

    Returns:
        The populated building object.
    """
    banner = '-' * 40
    log.info(banner)
    log.info('create object for '+ bldg_tag)
    log.info(banner)

    bldg_object = BuildingObject(bldg_tag)

    tagged_sources = (('avg', avgdata_dict), ('diff', diffdata_dict))
    tags = tuple(tag for tag, _ in tagged_sources)

    # average data first, then variance data
    for tag, source in tagged_sources:
        bldg_object.sigtags[tag] = _sigtag_property(source, pname_key, tag)

    #TODO: Name correction for exemplar
    bldg_object.Conditions_dict = data_dict['Conditions_dict']
    bldg_object.Events_dict = data_dict['Events_dict']

    for tag in tags:
        bldg_obj_weather_convert(bldg_object, tag, PARALLEL=PARALLEL)

    # Create class structure for data analysis
    analysis = {}
    for tag in tags:
        analysis[tag] = [BuildingAnalysis(remove_dot(p_name))
                         for p_name in bldg_object.sigtags[tag].p_names]
    bldg_object.analysis = analysis

    for tag in tags:
        _compute_lh_value(bldg_object.sigtags[tag],
                          bldg_object.analysis[tag], tag)

    return bldg_object
def _compute_lh_value(blgd_property, bldg_analysis, sig_tag):
    """Fill the conditional-likelihood results of every analysis object.

    The sensor, time and weather state matrices are joined column-wise.
    For each effect index in ``blgd_property.p_idx`` the analysis objects
    whose ``sensor_tag`` equals the (dot-stripped) effect name receive:

    * ``peak_eff_state``: the largest state value in the effect column;
    * for each of the attributes 'sensor', 'weather' and 'time', the
      ``optstate_set`` / ``optprob_set`` returned by ``find_cond_lh_set``.

    Args:
        blgd_property: signal-tag property object with the state matrices.
        bldg_analysis: sequence of analysis objects, updated in place.
        sig_tag: only used in the log banner.
    """
    banner = '-' * 40
    log.info(banner)
    log.info('Compute LH values for ' + sig_tag)
    log.info(banner)

    # Column order of the joined matrix: sensor | time | weather.
    all_data_state_mat = np.vstack((
        blgd_property.data_state_mat.T,
        blgd_property.data_time_mat.T,
        blgd_property.data_weather_mat_.T,
    )).T

    p_idx = blgd_property.p_idx
    p_names = blgd_property.p_names

    n_sensor = blgd_property.data_state_mat.shape[1]
    n_time = blgd_property.data_time_mat.shape[1]
    # NOTE(review): the width is read from data_weather_mat while the
    # stacked block is data_weather_mat_ -- confirm both have the same
    # number of columns.
    n_weather = blgd_property.data_weather_mat.shape[1]

    time_start = n_sensor
    weather_start = n_sensor + n_time

    # Processed in this order: sensor, weather, time.
    cause_groups = (
        ('sensor', range(n_sensor)),
        ('weather', range(weather_start, weather_start + n_weather)),
        ('time', range(time_start, weather_start)),
    )

    for k, effect_idx in enumerate(p_idx):
        p_name = remove_dot(p_names[k])
        log.info('compute cond. prob of ' + p_name)

        for bldg_anal_obj in bldg_analysis:
            if bldg_anal_obj.sensor_tag != p_name:
                continue

            effect_state_set = np.array(
                list(set(all_data_state_mat[:, effect_idx])))
            eff_state = effect_state_set.max()
            bldg_anal_obj.peak_eff_state = eff_state

            for attr, cause_idx_set in cause_groups:
                optstate_set, optprob_set = find_cond_lh_set(
                    all_data_state_mat, cause_idx_set, effect_idx, eff_state)
                bldg_anal_obj.attrs[attr].optprob_set = optprob_set
                bldg_anal_obj.attrs[attr].optstate_set = optstate_set
def _compute_lh_value(blgd_property, bldg_analysis, sig_tag):
    """Compute conditional-likelihood sets for each effect sensor.

    Works on the column-wise join of the sensor, time and weather state
    matrices. Every analysis object whose ``sensor_tag`` matches an
    effect name (after ``remove_dot``) gets its ``peak_eff_state`` set to
    the maximum state in the effect column, and the 'sensor', 'weather'
    and 'time' entries of its ``attrs`` are filled with the state and
    probability sets returned by ``find_cond_lh_set``.

    Args:
        blgd_property: signal-tag property object with the state matrices.
        bldg_analysis: sequence of analysis objects, modified in place.
        sig_tag: label written to the log.
    """
    separator = '-' * 40
    log.info(separator)
    log.info('Compute LH values for ' + sig_tag)
    log.info(separator)

    # Joined column layout: sensor columns, then time, then weather.
    transposed_parts = (blgd_property.data_state_mat.T,
                        blgd_property.data_time_mat.T,
                        blgd_property.data_weather_mat_.T)
    all_data_state_mat = np.vstack(transposed_parts).T

    p_idx = blgd_property.p_idx
    p_names = blgd_property.p_names

    len_sensor = blgd_property.data_state_mat.shape[1]
    len_time = blgd_property.data_time_mat.shape[1]
    # NOTE(review): width comes from data_weather_mat, the data from
    # data_weather_mat_ -- verify the two agree in column count.
    len_weather = blgd_property.data_weather_mat.shape[1]

    index_sets = {
        'sensor': range(len_sensor),
        'time': range(len_sensor, len_sensor + len_time),
        'weather': range(len_sensor + len_time,
                         len_sensor + len_time + len_weather),
    }
    # Same processing order as before: sensor, weather, time.
    attr_order = ('sensor', 'weather', 'time')

    for position, effect_idx in enumerate(p_idx):
        p_name = remove_dot(p_names[position])
        log.info('compute cond. prob of ' + p_name)

        matching = (candidate for candidate in bldg_analysis
                    if candidate.sensor_tag == p_name)
        for bldg_anal_obj in matching:
            distinct_states = np.array(
                list(set(all_data_state_mat[:, effect_idx])))
            eff_state = distinct_states.max()
            bldg_anal_obj.peak_eff_state = eff_state

            for attr in attr_order:
                states, probs = find_cond_lh_set(
                    all_data_state_mat, index_sets[attr],
                    effect_idx, eff_state)
                target = bldg_anal_obj.attrs[attr]
                target.optprob_set = probs
                # Looked up again rather than reusing ``target`` so the
                # attribute access pattern stays the same as before.
                bldg_anal_obj.attrs[attr].optstate_set = states
def create_bldg_object(blgd_data, bldg_tag, pname_key, PARALLEL=False):
    """Build a ``BuildingObject`` from a summarised data bundle.

    The 'avg' and 'diff' parts are optional. A signal tag is created when
    its key (``'avgdata_dict'`` / ``'diffdata_dict'``) is present, and the
    weather conversion plus likelihood analysis run only when that
    dictionary is also non-empty.

    Args:
        blgd_data: mapping with ``'data_dict'`` and optionally
            ``'avgdata_dict'`` / ``'diffdata_dict'``.
        bldg_tag: tag handed to ``BuildingObject``.
        pname_key: forwarded to ``_sigtag_property``.
        PARALLEL: forwarded to ``bldg_obj_weather_convert``.

    Returns:
        The populated building object.
    """
    banner = '-' * 40
    log.info(banner)
    log.info('create object for ' + bldg_tag)
    log.info(banner)

    bldg_object = BuildingObject(bldg_tag)
    data_dict = blgd_data['data_dict']

    # (signal tag, key in blgd_data): average data first, then variance data
    tag_keys = (('avg', 'avgdata_dict'), ('diff', 'diffdata_dict'))

    sources = {}
    for tag, key in tag_keys:
        sources[tag] = None
        if key in blgd_data:
            sources[tag] = blgd_data[key]
            bldg_object.sigtags[tag] = _sigtag_property(
                sources[tag], pname_key, tag)

    #TODO: Name correction for exemplar
    bldg_object.Conditions_dict = data_dict['Conditions_dict']
    bldg_object.Events_dict = data_dict['Events_dict']

    # Create class structure for data analysis
    for tag, _ in tag_keys:
        if not sources[tag]:
            continue
        bldg_obj_weather_convert(bldg_object, tag, PARALLEL=PARALLEL)
        bldg_object.analysis[tag] = [
            BuildingAnalysis(remove_dot(p_name))
            for p_name in bldg_object.sigtags[tag].p_names
        ]
        _compute_lh_value(bldg_object.sigtags[tag],
                          bldg_object.analysis[tag], tag)

    return bldg_object
def signle_let_cluster_idx(dist_mat, max_dist):
    """Find nodes that are farther than ``max_dist`` from every other node.

    Args:
        dist_mat: 2-D distance matrix; row ``i`` holds the distances of
            node ``i``. The entry at column ``i`` is ignored.
        max_dist: distance threshold (strict ``>`` comparison).

    Returns:
        ``(exemplar_idx, max_cluster_idx)``: two separate lists holding
        the same isolated row indices in ascending order.
    """
    log.info(str(max_dist))

    num_nodes = dist_mat.shape[0]
    exemplar_idx = []
    max_cluster_idx = []

    for node, row in enumerate(dist_mat):
        # exclude its own distance
        other_nodes = np.array(
            [j for j in range(num_nodes) if j != node], dtype=int)
        far_count = int(np.sum(row[other_nodes] > max_dist))

        if far_count != num_nodes - 1:
            continue

        log.info('-' * 20)
        log.info(str(node) + 'th node check')
        log.info('*** all nodes are away beyond max_dist **')
        exemplar_idx.append(node)
        max_cluster_idx.append(node)

    return exemplar_idx, max_cluster_idx
def signle_let_cluster_idx(dist_mat, max_dist):
    """Return the indices of nodes isolated from all other nodes.

    A node is isolated when its distance to each of the other
    ``num_nodes - 1`` nodes is strictly greater than ``max_dist``. The
    node's own column is left out of the comparison.

    Args:
        dist_mat: 2-D distance matrix, one row per node.
        max_dist: distance threshold.

    Returns:
        ``(exemplar_idx, max_cluster_idx)``: two lists with the same
        isolated indices.
    """
    log.info(str(max_dist))

    num_nodes = dist_mat.shape[0]
    all_nodes = np.arange(num_nodes)
    exemplar_idx, max_cluster_idx = list(), list()

    for i, dist_vals in enumerate(dist_mat):
        # exclude its own distance
        idx_set = all_nodes[all_nodes != i]
        exceeds = dist_vals[idx_set] > max_dist

        if int(exceeds.sum()) == num_nodes - 1:
            log.info('-' * 20)
            log.info(str(i) +'th node check')
            log.info('*** all nodes are away beyond max_dist **')
            exemplar_idx.append(i)
            max_cluster_idx.append(i)

    return exemplar_idx, max_cluster_idx
def pp_verify_sensor_data_format(tup):
    """Report badly formatted samples of one sensor through a queue.

    A sample is reported when it equals ``[]`` or is neither an ``int``
    nor a ``float``. For each such sample ``[key, i, j]`` is put on the
    queue, where ``i`` is the index into ``data_list`` and ``j`` the
    index inside that entry. Any exception is logged and swallowed.

    Args:
        tup: 4-tuple ``(key, data_list, time_slots, q)``. ``time_slots[i]``
            is only used in the log message.
    """
    (key, data_list, time_slots, q) = tup
    log.info(' checking ' + key + '...')

    try:
        for i, samples in enumerate(data_list):
            for j, each_sample in enumerate(samples):
                if each_sample == [] or not isinstance(each_sample, (int, float)):
                    q.put([key, i, j])
                    log.info(str(each_sample) + ' at ' + str(time_slots[i]) + ' in ' + str(key))
    except Exception as e:
        log.error(str(e))
def pp_verify_sensor_data_format(tup):
    """Report badly formatted samples of one sensor through a queue.

    A sample is reported when it equals ``[]`` or is neither an ``int``
    nor a ``float``. For each such sample ``[key, i, j]`` is put on the
    queue, where ``i`` is the index into ``data_list`` and ``j`` the
    index inside that entry. Any exception is logged together with its
    traceback and then swallowed, so one bad sensor does not abort the
    caller.

    Args:
        tup: 4-tuple ``(key, data_list, time_slots, q)``. ``time_slots[i]``
            is only used in the log message.
    """
    (key, data_list, time_slots, q) = tup
    log.info(' checking ' + key + '...')

    try:
        for i, samples in enumerate(data_list):
            for j, each_sample in enumerate(samples):
                if each_sample == [] or not isinstance(each_sample, (int, float)):
                    q.put([key, i, j])
                    log.info(str(each_sample) + ' at ' + str(time_slots[i]) + ' in ' + str(key))
    except Exception as e:
        # format_exc() returns the traceback text. print_exc() writes to
        # stderr and returns None, so the log used to contain "None".
        log.error(traceback.format_exc())
        log.error(str(e))
def create_bldg_object(blgd_data, bldg_tag, pname_key, PARALLEL=False):
    """Assemble the building object for one building.

    ``blgd_data`` must hold ``'data_dict'``. When ``'avgdata_dict'`` or
    ``'diffdata_dict'`` is present, the matching signal tag ('avg' /
    'diff') is created. The weather conversion and likelihood analysis
    for a tag run only when its dictionary is non-empty.

    Args:
        blgd_data: summarised data bundle (mapping).
        bldg_tag: tag handed to ``BuildingObject``.
        pname_key: forwarded to ``_sigtag_property``.
        PARALLEL: forwarded to ``bldg_obj_weather_convert``.

    Returns:
        The populated building object.
    """
    rule = '-' * 40
    log.info(rule)
    log.info('create object for '+ bldg_tag)
    log.info(rule)

    bldg_object = BuildingObject(bldg_tag)
    data_dict = blgd_data['data_dict']

    def _load(tag, key):
        # Returns the source dict (or None) and registers the signal tag.
        if key not in blgd_data:
            return None
        source = blgd_data[key]
        bldg_object.sigtags[tag] = _sigtag_property(source, pname_key, tag)
        return source

    def _analyse(tag):
        bldg_obj_weather_convert(bldg_object, tag, PARALLEL=PARALLEL)
        bldg_object.analysis[tag] = [
            BuildingAnalysis(remove_dot(p_name))
            for p_name in bldg_object.sigtags[tag].p_names]
        _compute_lh_value(bldg_object.sigtags[tag],
                          bldg_object.analysis[tag], tag)

    # average data
    avgdata_dict = _load('avg', 'avgdata_dict')
    # variance data
    diffdata_dict = _load('diff', 'diffdata_dict')

    #TODO: Name correction for exemplar
    bldg_object.Conditions_dict = data_dict['Conditions_dict']
    bldg_object.Events_dict = data_dict['Events_dict']

    # Create class structure for data analysis
    if avgdata_dict:
        _analyse('avg')
    if diffdata_dict:
        _analyse('diff')

    return bldg_object
def data_summerization(bldg_key, data_dict, proc_avg=True, proc_diff=True, PARALLEL=False):
    """Summarise raw building data into discrete-state matrices.

    Two optional stages fill the returned dictionary:

    * ``proc_avg`` builds the average-feature matrix, clusters the sensor
      columns into exemplars, maps float columns to states and stores the
      result under ``'avgdata_dict'``.
    * ``proc_diff`` does the same for the diff features of the sensors
      only, restricted to the time slots shared with the avg stage, and
      stores the result under ``'diffdata_dict'``.

    Args:
        bldg_key: building identifier; stored (dot-stripped) as ``'bldg_key'``.
        data_dict: raw data with the keys ``'time_slots'``,
            ``'Conditions_dict'``, ``'Events_dict'``, ``'sensor_list'`` and
            ``'weather_list'`` plus one record per measurement name.
        proc_avg: run the average-feature stage.
        proc_diff: run the diff-feature stage.
        PARALLEL: forwarded to the helpers that accept it.

    Returns:
        Dictionary with ``'data_dict'``, ``'bldg_key'`` and, when produced,
        ``'avgdata_dict'`` / ``'diffdata_dict'``.

    Raises:
        NameError: when the names returned by ``build_feature_matrix`` do
            not cover every measurement in use.
    """
    time_slots = data_dict['time_slots'][:]
    conditions_dict = data_dict['Conditions_dict'].copy()
    events_dict = data_dict['Events_dict'].copy()
    sensor_list = data_dict['sensor_list'][:]
    weather_list = data_dict['weather_list'][:]
    weather_list_used = ['TemperatureC', 'Dew PointC', 'Humidity', 'Events', 'Conditions']

    # data_used is the list of reference names for all measurements from now on.
    data_used = sensor_list + weather_list_used
    # This is a global ID for data_used measurement
    data_used_idx = range(len(data_used))
    sensor_idx = range(len(sensor_list))
    weather_idx = range(len(sensor_list), len(data_used))

    dsout = {'data_dict': data_dict}

    if proc_avg:
        log.info('-' * 40)
        log.info('processing avg.feature..')
        log.info('-' * 40)

        X_Feature, X_Time, X_names, X_zero_var_list, X_zero_var_val, X_int_type_list,\
            X_int_type_idx, X_float_type_list, X_float_type_idx, X_weather_type_idx, X_sensor_type_idx = \
            build_feature_matrix(data_dict, sensor_list, weather_list_used, time_slots,
                                 interpolation=1,
                                 max_num_succ_idx_for_itpl=int(len(time_slots)*0.05))

        build_feature_matrix_out = \
            {'X_Feature': X_Feature,
             'X_Time': X_Time,
             'X_names': X_names,
             'X_zero_var_list': X_zero_var_list,
             'X_zero_var_val': X_zero_var_val,
             'X_int_type_list': X_int_type_list,
             'X_int_type_idx': X_int_type_idx,
             'X_float_type_list': X_float_type_list,
             'X_float_type_idx': X_float_type_idx,
             'X_weather_type_idx': X_weather_type_idx,
             'X_sensor_type_idx': X_sensor_type_idx}
        build_feature_matrix_out = obj(build_feature_matrix_out)

        if len(X_names+X_zero_var_list) != len(data_used):
            log.error('Missing name is found in X_names or X_zero_var_list')
            raise NameError('Missing name is found in X_names or X_zero_var_list')
        else:
            zero_var_idx = [data_used.index(name_str) for name_str in X_zero_var_list]
            nzero_var_idx = list(set(data_used_idx)-set(zero_var_idx))

        if X_Feature.shape[0] > 0:
            # From below all index are reference to X_Feature
            sf_idx = list(set(X_sensor_type_idx)&set(X_float_type_idx))
            # Equivalent to np.array(data_used)[np.array(nzero_var_idx)[sf_idx]]
            sf_name = list(np.array(X_names)[sf_idx])
            si_idx = list(set(X_sensor_type_idx)&set(X_int_type_idx))
            si_name = list(np.array(X_names)[si_idx])
            wf_idx = list(set(X_weather_type_idx)&set(X_float_type_idx))
            wf_name = list(np.array(X_names)[wf_idx])
            wi_idx = list(set(X_weather_type_idx)&set(X_int_type_idx))
            wi_name = list(np.array(X_names)[wi_idx])

            # Euclidean distance matrix of floating type of data only wf+o
            float_idx = list(set(sf_idx)| set(wf_idx))
            int_idx = list(set(si_idx)| set(wi_idx))

            # Float Type Measurement Clustering
            X_Feature_sfe, sf_exemplars_dict, exemplars_, labels_ = \
                cluster_measurement_points(X_Feature[:, sf_idx], sf_name,
                                           corr_bnd=[0.1, 0.9], alg='aff')
            sfe_idx = list(np.array(sf_idx)[exemplars_])

            # Int Type Measurement Clustering
            X_Feature_sie, si_exemplars_dict, exemplars_, labels_ = \
                cluster_measurement_points(X_Feature[:, si_idx], si_name,
                                           corr_bnd=[0.0, 0.9], alg='aff')
            sie_idx = list(np.array(si_idx)[exemplars_])

            # sensor -float type
            sfe_state, sfe_corr_val = x_input_to_states(X_Feature_sfe, CORR_VAL_OUT=1)
            # sensor -integer type
            sie_state = X_Feature_sie
            # weather -float type
            wf_state, wf_corr_val = x_input_to_states(X_Feature[:, wf_idx], CORR_VAL_OUT=1)
            # weather -integer type
            wi_state = X_Feature[:, wi_idx]

            # Placeholder with one empty row per time slot, so the
            # column-wise appends below still work for a missing group.
            empty_states = np.array([[] for i in range(len(X_Time))])
            if len(sfe_state) == 0:
                sfe_state = empty_states
            if len(sie_state) == 0:
                sie_state = empty_states
            if len(wf_state) ==0:
                wf_state = empty_states
            if len(wi_state) == 0:
                wi_state = empty_states

            # Exemplar sensor only
            X_Sensor_STATE = np.append(sfe_state,sie_state, axis=1)
            X_Sensor_STATE = X_Sensor_STATE.astype(int)
            X_Sensor_NAMES = list(np.array(X_names)[sfe_idx]) + list(np.array(X_names)[sie_idx])

            X_Weather_STATE = np.append(wf_state,wi_state, axis=1)
            X_Weather_STATE = X_Weather_STATE.astype(int)
            X_Weather_NAMES = list(np.array(X_names)[wf_idx])+list(np.array(X_names)[wi_idx])

            # months of a year, days of a week, and hours of a day
            X_Time_STATE_temp = build_time_states(X_Time)
            X_Time_NAMES_temp = ['MTH', 'WD', 'HR']
            X_Time_STATE = list()
            X_Time_NAMES = list()
            # Keep only the time columns that take more than one value.
            for xt_col, xt_name in zip(X_Time_STATE_temp.T,X_Time_NAMES_temp):
                if len(set(xt_col)) > 1:
                    X_Time_STATE.append(xt_col)
                    X_Time_NAMES.append(xt_name)
            X_Time_STATE = np.array(X_Time_STATE).T

            #################################################
            # FORMATTED DATA FOR REGULAR EVENT
            #################################################
            # Merge the two exemplar dicts (integer entries win on a key
            # clash). Written without ``items() + items()`` so it also
            # works where ``items()`` returns a view.
            avgdata_exemplar = dict(sf_exemplars_dict)
            avgdata_exemplar.update(si_exemplars_dict)
            avgdata_zvar = X_zero_var_list

            avgdata_dict = dict()
            avgdata_dict.update({'build_feature_matrix_out': build_feature_matrix_out})
            avgdata_dict.update({'avgdata_state_mat': X_Sensor_STATE})
            avgdata_dict.update({'avgdata_weather_mat': X_Weather_STATE})
            avgdata_dict.update({'avgdata_time_mat': X_Time_STATE})
            avgdata_dict.update({'avg_time_slot': X_Time})
            avgdata_dict.update({'avgdata_exemplar': avgdata_exemplar})
            avgdata_dict.update({'avgdata_zvar': avgdata_zvar})
            avgdata_dict.update({'sensor_names': X_Sensor_NAMES})
            avgdata_dict.update({'weather_names': X_Weather_NAMES})
            avgdata_dict.update({'time_names': X_Time_NAMES})
            dsout.update({'avgdata_dict': avgdata_dict})

    if proc_diff:
        log.info('-' * 40)
        log.info('processing diff.feature..')
        log.info('-' * 40)
        ####################################
        # Irregular Event Extraction
        ####################################
        # Interpolation with outlier removal. Weather data is excluded from
        # irregular event analysis since it normally changes slowly in time,
        # so no meaningful diff values are expected.
        measurement_point_set,num_type_set = interpolation_measurement(
            data_dict, sensor_list, err_rate=1, sgm_bnd=20)

        # Irregular matrix
        Xdiff_Mat,\
            Xdiff_Time,\
            Xdiff_Names,\
            Xdiff_zero_var_list,\
            Xdiff_zero_var_val,\
            Xdiff_int_type_list,\
            Xdiff_int_type_idx,\
            Xdiff_float_type_list,\
            Xdiff_float_type_idx =\
            build_diff_matrix(measurement_point_set, time_slots, num_type_set,
                              sensor_list, PARALLEL=PARALLEL)

        build_diff_matrix_out = \
            {'Xdiff_Mat':Xdiff_Mat,
             'Xdiff_Time':Xdiff_Time,
             'Xdiff_Names':Xdiff_Names,
             'Xdiff_zero_var_list':Xdiff_zero_var_list,
             'Xdiff_zero_var_val':Xdiff_zero_var_val,
             'Xdiff_int_type_list':Xdiff_int_type_list,
             'Xdiff_int_type_idx':Xdiff_int_type_idx,
             'Xdiff_float_type_list':Xdiff_float_type_list,
             'Xdiff_float_type_idx':Xdiff_float_type_idx}
        build_diff_matrix_out = obj(build_diff_matrix_out)

        if Xdiff_Mat.shape[0] > 0:
            # NOTE(review): everything below reads X_Time, X_Weather_STATE,
            # X_Weather_NAMES, wf_state and wi_state, which only exist after
            # a successful avg stage. Calling with proc_avg=False raises
            # NameError here -- confirm whether that combination is meant
            # to be supported.
            #==============================================================================
            # Restructure diff matrix and weather matrix for the same common time slot
            #==============================================================================
            time_slots_array = np.sort(np.array(list(set(Xdiff_Time) & set(X_Time))))

            # Extract subset of X_Weather_STATE
            removed_idx_list = list()
            for ridx, slot in enumerate(X_Time):
                slot_idx = np.where(time_slots_array==slot)[0]
                # slot not in common time slots
                if len(slot_idx) == 0:
                    removed_idx_list.append(ridx)
            XDIFF_Weather_STATE = np.delete(X_Weather_STATE, removed_idx_list,axis=0)

            # Extract subset of Xdiff_Mat
            removed_idx_list = list()
            for ridx,slot in enumerate(Xdiff_Time):
                slot_idx = np.where(time_slots_array == slot)[0]
                # slot not in common time slots
                if len(slot_idx) == 0:
                    removed_idx_list.append(ridx)
            Xdiff_Mat = np.delete(Xdiff_Mat, removed_idx_list, axis=0)

            # Update Xdiff_Time
            Xdiff_Time = time_slots_array
            XDIFF_Weather_STATE = np.array(XDIFF_Weather_STATE)

            # From below all index are reference to Xdiff_Mat
            xdiff_sf_idx = Xdiff_float_type_idx
            xdiff_sf_name = Xdiff_float_type_list
            xdiff_si_idx = Xdiff_int_type_idx
            xdiff_si_name = Xdiff_int_type_list

            # Float Type Measurement Clustering
            X_Diff_sfe, sf_diff_exemplars_dict, exemplars_, labels_ = \
                cluster_measurement_points(Xdiff_Mat[:, xdiff_sf_idx], xdiff_sf_name,
                                           corr_bnd=[0.1, 0.9])
            xdiff_sfe_idx = list(np.array(xdiff_sf_idx)[exemplars_])

            # Int Type Measurement Clustering
            X_Diff_sie, si_diff_exemplars_dict, exemplars_, labels_ = \
                cluster_measurement_points(Xdiff_Mat[:, xdiff_si_idx], xdiff_si_name,
                                           corr_bnd=[0.1, 0.9])
            xdiff_sie_idx = list(np.array(xdiff_si_idx)[exemplars_])

            # sensor -float type
            xdiff_sfe_state, xdiff_sfe_corr_val =\
                x_input_to_states(X_Diff_sfe, CORR_VAL_OUT=1, PARALLEL=PARALLEL)
            # sensor -integer type
            xdiff_sie_state = X_Diff_sie

            empty_states = np.array([[] for i in range(len(Xdiff_Time))])
            if len(xdiff_sfe_state) == 0:
                xdiff_sfe_state = empty_states
            if len(xdiff_sie_state) == 0:
                xdiff_sie_state = empty_states
            if len(wf_state) == 0:
                wf_state = empty_states
            if len(wi_state) == 0:
                wi_state = empty_states

            # Exemplar sensor only
            XDIFF_Sensor_STATE = np.append(xdiff_sfe_state,xdiff_sie_state, axis=1)
            XDIFF_Sensor_STATE = XDIFF_Sensor_STATE.astype(int)
            XDIFF_Sensor_NAMES = list(np.array(Xdiff_Names)[xdiff_sfe_idx])+list(np.array(Xdiff_Names)[xdiff_sie_idx])

            # months of a year, days of a week, and hours of a day
            XDIFF_Time_STATE_temp = build_time_states(Xdiff_Time)
            XDIFF_Time_NAMES_temp = ['MTH', 'WD', 'HR']
            XDIFF_Time_STATE = list()
            XDIFF_Time_NAMES = list()
            # Keep only the time columns that take more than one value.
            for xt_col, xt_name in zip(XDIFF_Time_STATE_temp.T, XDIFF_Time_NAMES_temp):
                if len(set(xt_col)) > 1:
                    XDIFF_Time_STATE.append(xt_col)
                    XDIFF_Time_NAMES.append(xt_name)
            XDIFF_Time_STATE = np.array(XDIFF_Time_STATE).T

            #################################################
            # FORMATTED DATA FOR IRREGULAR EVENT
            #################################################
            diffdata_exemplar = dict(sf_diff_exemplars_dict)
            diffdata_exemplar.update(si_diff_exemplars_dict)
            diffdata_zvar = Xdiff_zero_var_list

            diffdata_dict = dict()
            diffdata_dict.update({'build_diff_matrix_out': build_diff_matrix_out})
            diffdata_dict.update({'diffdata_state_mat': XDIFF_Sensor_STATE})
            diffdata_dict.update({'diffdata_weather_mat': XDIFF_Weather_STATE})
            diffdata_dict.update({'diffdata_time_mat': XDIFF_Time_STATE})
            diffdata_dict.update({'diff_time_slot': Xdiff_Time})
            diffdata_dict.update({'diffdata_exemplar': diffdata_exemplar})
            diffdata_dict.update({'diffdata_zvar': diffdata_zvar})
            diffdata_dict.update({'sensor_names': XDIFF_Sensor_NAMES})
            # The weather matrix is a row subset of the avg one, so the
            # avg weather column names still apply.
            diffdata_dict.update({'weather_names': X_Weather_NAMES})
            # The names must describe the columns of XDIFF_Time_STATE.
            # Previously the avg names (X_Time_NAMES) were stored here,
            # which mislabels the columns whenever the two stages keep
            # different time columns.
            diffdata_dict.update({'time_names': XDIFF_Time_NAMES})
            dsout.update({'diffdata_dict': diffdata_dict})

    dsout.update({'bldg_key': remove_dot(bldg_key)})
    return dsout
def build_feature_matrix(data_dict, sensor_list, weather_list, time_slots,
                         interpolation=1, max_num_succ_idx_for_itpl=4):
    """Assemble the (time slot x measurement) feature matrix.

    For every key in ``sensor_list + weather_list`` the per-slot samples in
    ``data_dict[key][1]`` are reduced to one value per slot by
    ``get_feature``, detected outliers are replaced by ``np.inf``, gaps are
    optionally interpolated and the column is normalised.  Keys whose
    processing fails, or which ``normalize_data`` flags with status ``-1``,
    are recorded in the zero-variance list instead of the matrix.  Finally
    every time slot (row) that still contains ``inf`` in any column is
    removed.

    Args:
        data_dict: mapping key -> [slot times, slot values, [utc_t, val]].
        sensor_list: sensor keys; they occupy the first positions.
        weather_list: weather keys; they follow the sensors.
        time_slots: sequence of time slots, one per matrix row.
        interpolation: ``1`` to interpolate missing samples, anything else
            to leave them as ``inf`` (their rows are dropped at the end).
        max_num_succ_idx_for_itpl: passed through to ``interploate_data``.

    Returns:
        tuple ``(X, new_time_slot, input_names, zero_var_list, zero_var_val,
        INT_type_list, INT_type_idx, FLOAT_type_list, FLOAT_type_idx,
        weather_type_idx, sensor_type_idx)``.

    Raises:
        NameError: on an unknown sample type, a key that is neither sensor
            nor weather, or inconsistent matrix dimensions.
    """
    data_used = sensor_list + weather_list
    log.info('Build data feature matrix now.....')
    if interpolation == 1:
        log.info('Missing samples will be interpolated upto ' + str(max_num_succ_idx_for_itpl) + 'successive time slots')
    else:
        log.info('All time slots with any missing sample will be removed without interpolatoin ')

    num_of_data = len(data_used)
    num_of_samples = len(time_slots)
    num_of_sensors = len(sensor_list)

    # Columns are collected as a list of 1-d vectors and transposed later.
    X = list()
    INT_type_list = list()
    FLOAT_type_list = list()
    input_names = list()
    weather_type_idx = list()
    sensor_type_idx = list()
    INT_type_idx = list()
    FLOAT_type_idx = list()
    # Measurements excluded from the matrix (flagged by normalize_data or
    # failed during processing) and the value recorded for each of them.
    zero_var_list = list()
    zero_var_val = list()

    for j, key in enumerate(data_used):
        log.info('-' * 40)
        log.info('building for ' + str(key))
        # data_used is sensors followed by weather, so the position decides.
        is_weather_data = num_of_sensors <= j
        try:
            num_type = check_data_type(data_dict[key][2][1])
            # One feature value per time slot.
            x_temp = get_feature(data_dict[key][1], num_type)
            non_inf_idx = np.nonzero(x_temp < np.inf)[0]

            # Outlier removal uses different parameters for weather data.
            if is_weather_data:
                outlier_idx = outlier_detect(x_temp[non_inf_idx], 5, 10)
            else:
                outlier_idx = outlier_detect(x_temp[non_inf_idx], 1, 20)

            if len(outlier_idx) > 0:
                log.info('outlier samples are detected: outlier_idx:' + str(outlier_idx))
                # Mark outliers as missing so they are interpolated/dropped.
                x_temp[non_inf_idx[outlier_idx]] = np.inf

            if interpolation == 1:
                x_temp = interploate_data(x_temp, num_type, max_num_succ_idx_for_itpl)

            norm_data_vec, output_status = normalize_data(x_temp[:, 0])
            num_missing = len(np.nonzero(norm_data_vec == np.inf)[0])
            if num_missing > num_of_samples / 5:
                # The original used a bare ``raise`` here (no active
                # exception); raise explicitly so the log says why.
                raise ValueError('too many missing samples: ' + str(num_missing)
                                 + ' of ' + str(num_of_samples))
        except Exception as e:
            # Best effort: a failing measurement is excluded, not fatal.
            log.error(traceback.format_exc())
            log.error(' Error in processing data feature, excluded from analysis ' + str(e))
            output_status = -1
            norm_data_vec = None

        if output_status == -1:
            zero_var_list.append(key)
            zero_var_val.append(norm_data_vec)
            log.info('too small variance for float type, added to zero var list')
            continue

        input_names.append(key)
        log.info(str(j)+'th sensor update')

        if (num_type == FLOAT_TYPE) and not is_weather_data:
            X.append(norm_data_vec)
            FLOAT_type_idx.append(len(X)-1)
            FLOAT_type_list.append(key)
        elif (num_type == INT_TYPE) or is_weather_data:
            # Integer and weather columns are stored un-normalised.
            X.append(x_temp[:, 0])
            INT_type_idx.append(len(X)-1)
            INT_type_list.append(key)
        else:
            log.error('Sample type must either INT or FLOAT type')
            raise NameError('Sample type must either INT or FLOAT type')

        if key in weather_list:
            weather_type_idx.append(len(X)-1)
        elif key in sensor_list:
            sensor_type_idx.append(len(X)-1)
        else:
            log.error('Sample type must either Weather or Sensor type')
            raise NameError('Sample type must either Weather or Sensor type')

    # Rows are time slots, columns are measurements.
    X = np.array(X).T
    if X.shape[0] != num_of_samples:
        log.error('The numeber of rows in feature matrix and the number of the time slots are different ')
        raise NameError('The numeber of rows in feature matrix and the number of the time slots are different ')

    if X.shape[1]+len(zero_var_list) != num_of_data:
        log.error('The sume of the numeber of column in feature matrix and the number of zero var column are different from the number of input measurements ')
        raise NameError('The sume of the numeber of column in feature matrix and the number of zero var column are different from the number of input measurements ')

    log.info('-' * 20)
    log.info('removing time slots having no sample...')
    # Integer row indices of every slot holding +inf in any column.  Using
    # np.nonzero keeps the index array integer-typed even when it is empty.
    deleted_timeslot_idx = np.nonzero((X == np.inf).any(axis=1))[0]
    log.info('time slots ' + str(deleted_timeslot_idx) + ' removed...')
    log.info('-' * 20)

    X = np.delete(X, deleted_timeslot_idx, axis=0)
    new_time_slot = np.delete(time_slots, deleted_timeslot_idx)

    # Checking whether it has any ill entry value
    verify_data_mat(X)
    return X, new_time_slot, input_names, zero_var_list, zero_var_val, INT_type_list, INT_type_idx, FLOAT_type_list, FLOAT_type_idx, weather_type_idx, sensor_type_idx
def plotting_bldg_lh(bldg, bldg_key=[], attr='sensor', num_picks=30):
    """Plot the highest-likelihood attributes for each power sensor.

    For every signal tag (``'avg'`` and ``'diff'``) and every name in
    ``bldg.sigtags[tag].p_names`` the matching analysis entry is looked up,
    its ``optprob_set`` is sorted in descending order and the top
    ``num_picks`` values are plotted and saved as a PNG under ``FIG_DIR``.

    Args:
        bldg: building object exposing ``bldg_tag``, ``sigtags`` and
            ``analysis``.
        bldg_key: building tag to plot; when empty, ``bldg.bldg_tag`` is used.
        attr: attribute group to plot (key into ``anal.attrs`` and
            ``sigtags[tag].names``).
        num_picks: number of top attributes to show.

    Errors while processing one signal tag are logged and the next tag is
    tried; nothing is raised to the caller for them.
    """
    log.info('-' * 40)
    log.info('plotting lh for ' + attr)
    log.info('-' * 40)
    sig_tag_set = ['avg', 'diff']
    plt.ioff()

    if not len(bldg_key):
        bldg_tag_set = [bldg.bldg_tag]
    else:
        bldg_tag_set = [bldg_key]

    for bldg_tag in bldg_tag_set:
        if bldg_tag != bldg.bldg_tag:
            continue

        log.info('-' * 40)
        log.info(bldg_tag + " is to be plotted...")
        log.info('-' * 40)

        for sig_tag in sig_tag_set:
            try:
                p_names = bldg.sigtags[sig_tag].p_names
                for pname in p_names:
                    try:
                        # Analysis entries use '_' where sensor names use '.'.
                        pname = pname.replace('.', '_')
                    except (AttributeError, TypeError):
                        # Not a plain string; use the name unchanged.
                        pass

                    optprob_set = None
                    for anal in bldg.analysis[sig_tag]:
                        if anal.sensor_tag == pname:
                            optprob_set = anal.attrs[attr].optprob_set
                            break

                    if optprob_set is None:
                        # No analysis result for this sensor: skip it rather
                        # than abort the remaining sensors of this tag.
                        log.error('no analysis entry found for ' + str(pname))
                        continue

                    s_names = bldg.sigtags[sig_tag].names[attr]
                    # Honour the caller's num_picks (it used to be reset to 30).
                    top_idx = np.argsort(optprob_set)[::-1][:num_picks]
                    sort_lh = optprob_set[top_idx].T
                    x_label = list(np.array(s_names)[top_idx])
                    x_ticks = range(len(x_label))

                    fig = plt.figure(figsize=(20.0, 15.0))
                    try:
                        plt.subplot(2, 1, 1)
                        plt.plot(sort_lh, '-*')
                        plt.xticks(x_ticks, x_label, rotation=270, fontsize="small")
                        if sig_tag == 'avg':
                            plt.title('Most relavant ' + attr + ' attributes to the peak (demand) of ' + pname, fontsize=20)
                        else:
                            plt.title('Most relavant ' + attr + ' attributes to the peak variations of ' + pname, fontsize=20)
                        plt.tick_params(labelsize='large')
                        plt.ylim([-0.05, 1.05])
                        plt.ylabel('Likelihood (From 0 to 1)', fontsize=18)
                        plt.savefig(FIG_DIR + bldg_tag + '_' + pname + '_' + attr + '_' + sig_tag + '_lh_sensors.png', bbox_inches='tight')
                    finally:
                        # Always release the figure, even if plotting failed.
                        plt.close(fig)
            except Exception as e:
                # Best effort: log the failure and move on to the next tag.
                log.error(traceback.format_exc())
                log.error(str(e))

    plt.close()
    plt.ion()
def check_bounded_distance_constraint_condition(dist_mat, labels, min_dist, max_dist):
    """Count violations of the bounded-distance clustering constraints.

    Two conditions are checked against ``dist_mat``:

    * intra-cluster: every pair inside one cluster must be at most
      ``max_dist`` apart;
    * inter-cluster: every pair drawn from two different clusters must be at
      least ``min_dist`` apart.

    Each violation is logged.

    Args:
        dist_mat: pairwise distance matrix indexed ``[i, j]``.
        labels: array of cluster labels; clusters are ``0 .. labels.max()``.
        min_dist: lower bound for inter-cluster distances.
        max_dist: upper bound for intra-cluster distances.

    Returns:
        ``(intra_err_cnt, inter_err_cnt)``: the number of violating pairs
        for each condition.
    """
    num_clusters = int(labels.max() + 1)
    # Member indices of every cluster, computed once for both checks.
    members = [np.nonzero(labels == i)[0] for i in range(num_clusters)]

    log.info('-' * 80)
    log.info('Intra-Cluster distance check.....')
    log.info('Condition: intra-cluster distance is upper-bounded by ' + str(round(max_dist, 2)))
    log.info('-' * 80)
    intra_err_cnt = 0
    for i, idx_set in enumerate(members):
        for idx_pair in pair_in_idx(idx_set):
            dist_val_ = dist_mat[idx_pair[0], idx_pair[1]]
            # Rule violation
            if dist_val_ > max_dist:
                log.info('*** the distance of pairs :' + str(idx_pair) + ' in ' + str(i) + 'th cluster ~' + str(np.round(dist_val_, 2)) + ' > max_dist=' + str(np.round(max_dist, 2)) + '***')
                intra_err_cnt += 1

    log.info('-' * 80)
    log.info('Inter-Cluster distance check.....')
    log.info('Condition: inter-cluster distance is lower-bounded by ' + str(round(min_dist, 2)))
    log.info('-' * 80)
    inter_err_cnt = 0
    for c_pair in pair_in_idx(range(num_clusters)):
        idx_set_0 = members[c_pair[0]]
        idx_set_1 = members[c_pair[1]]
        for idx_pair in pair_in_idx(idx_set_0, idx_set_1):
            dist_val_ = dist_mat[idx_pair[0], idx_pair[1]]
            # Rule violation
            if dist_val_ < min_dist:
                log.info('*** the distance of pairs :' + str(idx_pair[0]) + ' in ' + str(c_pair[0]) + ' and ' + str(idx_pair[1]) + ' in ' + str(c_pair[1]) + ' ~ ' + str(round(dist_val_, 2)) + ' < min_dist=' + str(round(min_dist, 2)) + '***')
                # One violation counts once (was ``+= inter_err_cnt+1``).
                inter_err_cnt += 1

    return intra_err_cnt, inter_err_cnt
def get_val_timelet(reading, t_slots, ans_start_t, ans_end_t, timelet_inv):
    """Bin raw sensor readings into the analysis time slots.

    NOTE(review): this function is defined twice in this file; the later
    definition is the one Python keeps.

    Args:
        reading: iterable of records where ``r[0]`` is a unix timestamp and
            ``r[1]`` the measured value.
        t_slots: sequence of slot start datetimes.
        ans_start_t: inclusive start of the analysis period.
        ans_end_t: exclusive end of the analysis period.
        timelet_inv: slot length as a ``datetime.timedelta``.

    Returns:
        ``(sensor_read, stime_read, utc_t, val)`` where ``sensor_read`` and
        ``stime_read`` hold, per slot, the values and their offsets in
        seconds from the slot start, and ``utc_t`` / ``val`` are the flat
        lists of in-range timestamps and values.  All four are ``-1`` when
        the reading is empty or has fewer than ``MIN_NUM_VAL_FOR_FLOAT``
        samples, which callers treat as "purge this sensor".

    Raises:
        NameError: if a sample lies a full slot length or more after the
            start of the slot it was assigned to.
    """
    data = dict()
    data['value'] = np.array([r[1] for r in reading], dtype=float)

    ts_list = list()
    for r in reading:
        local_dt = dt.datetime.fromtimestamp(r[0])
        time_tup = local_dt.timetuple()
        # Row layout: datetime, sec, min, hour, weekday, day, month.
        ts_list.append([local_dt, time_tup[5], time_tup[4], time_tup[3],
                        time_tup[6], time_tup[2], time_tup[1]])
    data['ts'] = np.array(ts_list)

    # ``len(data)`` was always 2 (the dict's keys); test the samples instead.
    if len(data['value']) == 0:
        log.critical('Error in file reading: empty data. Skip and need to be purged from sensor list')
        return -1, -1, -1, -1

    if (len(data["ts"]) < MIN_NUM_VAL_FOR_FLOAT) or (len(data["value"]) < MIN_NUM_VAL_FOR_FLOAT):
        log.critical('No data included ' + str(data) + '... Skip and need to be purged from sensor list')
        return -1, -1, -1, -1

    # Drop samples whose value is NaN, keeping both arrays aligned.
    nan_idx_list = np.nonzero(np.isnan(data["value"]))[0]
    sensor_val = np.delete(data["value"], nan_idx_list, axis=0)
    time_val = np.delete(data["ts"], nan_idx_list, axis=0)

    # One list of values and one list of second-offsets per time slot.
    sensor_read = [[] for _ in range(len(t_slots))]
    stime_read = [[] for _ in range(len(t_slots))]
    utc_t = []
    val = []

    # total_seconds() includes the days component; ``.seconds`` alone is 0
    # for a one-day slot and would divide by zero below.
    slot_secs = timelet_inv.total_seconds()

    for t_sample, v_sample in zip(time_val, sensor_val):
        temp_dt = t_sample[DT_IDX]
        if temp_dt < ans_start_t or temp_dt >= ans_end_t:
            continue

        try:
            idx = int((temp_dt - ans_start_t).total_seconds() / slot_secs)
            sensor_read[idx].append(v_sample)
            secs = (temp_dt - t_slots[idx]).total_seconds()
            if secs >= slot_secs:
                log.info('sec: ' + str(secs))
                raise NameError('Seconds from an hour idx cannot be greater than ' + str(int(slot_secs)) + 'secs')
            stime_read[idx].append(secs)
        except ValueError:
            # Kept from the original; the sample is still recorded below.
            idx = -1

        utc_temp = dtime_to_unix([t_sample[DT_IDX]])
        utc_t.append(utc_temp)
        val.append(v_sample)

    return sensor_read, stime_read, utc_t, val
def cluster_measurement_points(m_matrix, m_name, corr_bnd=(0.1, 0.9), alg='aff'):
    """Cluster measurement columns and pick one exemplar per cluster.

    Args:
        m_matrix: 2-d array with one column per measurement.
        m_name: names of the columns of ``m_matrix``.
        corr_bnd: ``(low, high)`` correlation bounds; converted to the
            distance bounds ``sqrt(2 * (1 - corr))`` for the ``'pack'``
            algorithm.
        alg: ``'pack'`` for ``max_pack_cluster``; any other value uses
            affinity propagation on the similarity ``2 - distance``.

    Returns:
        ``(exemplar_columns, exemplars_dict, exemplars_, labels_)`` where
        ``exemplars_dict`` maps each exemplar name to the names of the other
        members of its cluster.  For a matrix without columns the result is
        ``([], {}, [], [])``.
    """
    exemplars_dict = dict()

    if m_matrix.shape[1] == 0:
        return [], exemplars_dict, [], []

    if m_matrix.shape[1] == 1:
        # A single measurement is trivially its own exemplar.
        exemplars_ = [0]
        labels_ = [0]
        exemplars_name = m_name
    else:
        distmat_input = find_norm_dist_matrix(m_matrix)
        # Find representative set of sensor measurements
        min_dist_ = np.sqrt(2 * (1 - (corr_bnd[1])))
        max_dist_ = np.sqrt(2 * (1 - (corr_bnd[0])))

        if alg == 'pack':
            log.info('use pack clustering algoirthm')
            exemplars_, labels_ = max_pack_cluster(distmat_input, min_dist=min_dist_, max_dist=max_dist_)
        else:
            log.info('use affinity clustering algoirthm')
            SIMM_MAT = 2 - distmat_input
            exemplars_, labels_ = cluster.affinity_propagation(SIMM_MAT, damping=0.5)

        num_clusters = int(labels_.max() + 1)
        log.info('-' * 40)
        log.info(str(num_clusters) + 'clusters out of ' + str(len(labels_)) + 'measurements')
        log.info('-' * 40)

        validity, intra_dist, inter_dist = compute_cluster_err(distmat_input, labels_)
        log.info('validity: ' + str(round(validity, 2)) + ', intra_dist: ' + str(np.round(intra_dist, 2)) + ', inter_dist: ' + str(np.round(inter_dist, 2)))
        log.info('-' * 40)
        exemplars_name = list(np.array(m_name)[exemplars_])

    # Compare against an array view so the single-column case (where labels_
    # is a plain list) is compared element-wise rather than as list == int.
    labels_arr = np.asarray(labels_)
    names_arr = np.array(m_name)
    for label_id, (m_idx, exemplar_label) in enumerate(zip(exemplars_, exemplars_name)):
        log.info(str(exemplar_label))
        # Cluster members other than the exemplar itself.
        children_set = list(set(np.nonzero(labels_arr == label_id)[0]) - set([m_idx]))
        log.info('Label ' + str(label_id) + ' : ' + str(m_idx) + '<--' + str(children_set))
        exemplars_dict.update({exemplar_label: list(names_arr[children_set])})

    return m_matrix[:, exemplars_], exemplars_dict, exemplars_, labels_
def show_clusters(exemplars, labels, input_names):
    """Log the member names of every cluster, one line per cluster.

    Args:
        exemplars: unused; kept for interface compatibility.
        labels: cluster label of every input; clusters are
            ``0 .. max(labels)``.
        input_names: name of every input, aligned with ``labels``.  Plain
            lists are accepted as well as arrays.
    """
    labels = np.asarray(labels)
    # Boolean-mask indexing below needs an array, not a Python list.
    names = np.asarray(input_names)
    if labels.size == 0:
        # Nothing to show (``max`` of an empty array would raise).
        return

    for i in range(int(labels.max()) + 1):
        log.info('Cluster %i: %s' % ((i + 1), ', '.join(names[labels == i])))
def _sigtag_property(data_dict, pname_key, sig_tag):
    """Build a ``BuildingSigtagProperty`` for one signal tag.

    NOTE(review): this function is defined twice in this file; the later
    definition is the one Python keeps.

    Args:
        data_dict: mapping holding the ``sig_tag``-prefixed matrices
            (``<tag>data_state_mat`` etc.) plus ``sensor_names``,
            ``weather_names`` and ``time_names``.
        pname_key: selects the power sensors: a list of exact sensor names,
            a pattern handed to ``grep``, or empty/None for all sensors.
        sig_tag: key prefix, e.g. the signal tag of the data set.

    Returns:
        The ``BuildingSigtagProperty`` built from the selected data.

    Raises:
        KeyError: if an expected key is missing from ``data_dict``.
        ValueError: if a name in a ``pname_key`` list is not a sensor name.
    """
    data_state_mat = data_dict[sig_tag + 'data_state_mat']
    data_weather_mat = data_dict[sig_tag + 'data_weather_mat']
    data_time_mat = data_dict[sig_tag + 'data_time_mat']
    time_slot = data_dict[sig_tag + '_time_slot']
    data_exemplar = data_dict[sig_tag + 'data_exemplar']
    data_zvar = remove_dot(data_dict[sig_tag + 'data_zvar'])
    sensor_names = remove_dot(data_dict['sensor_names'])
    weather_names = remove_dot(data_dict['weather_names'])
    time_names = remove_dot(data_dict['time_names'])

    if pname_key:
        # TODO: Name correction for exemplar
        if isinstance(pname_key, list):
            # The original added ``sig_tag`` (a str) to the integer index,
            # which always raised TypeError; the index alone is what the
            # lookups below need.
            # NOTE(review): confirm no tag-prefixed name lookup was intended.
            p_idx = [sensor_names.index(p_name) for p_name in pname_key]
            p_names = remove_dot(list(np.array(sensor_names)[list(set(p_idx))]))
        else:
            p_idx = grep(pname_key, sensor_names)
            p_names = remove_dot(list(np.array(sensor_names)[p_idx]))
    else:
        # No selection given: every sensor is a power sensor candidate.
        p_idx = list(range(len(sensor_names)))
        p_names = remove_dot(sensor_names)

    log.info('-' * 40)
    log.info('Power sensor selected -' + sig_tag)
    log.info('-' * 40)
    log.info("p_idx : " + str(p_idx))
    log.info("sensor_names : " + str(sensor_names))
    log.info("p_names : " + str(p_names))

    return BuildingSigtagProperty(sig_tag, data_state_mat, data_weather_mat,
                                  data_time_mat, time_slot, data_exemplar,
                                  data_zvar, sensor_names, weather_names,
                                  time_names, p_idx, p_names)
def _sigtag_property(data_dict, pname_key, sig_tag):
    """Build a ``BuildingSigtagProperty`` for one signal tag.

    NOTE(review): this function is defined twice in this file; this later
    definition is the one Python keeps.

    Args:
        data_dict: mapping holding the ``sig_tag``-prefixed matrices
            (``<tag>data_state_mat`` etc.) plus ``sensor_names``,
            ``weather_names`` and ``time_names``.
        pname_key: selects the power sensors: a list of exact sensor names,
            a pattern handed to ``grep``, or empty/None for all sensors.
        sig_tag: key prefix, e.g. the signal tag of the data set.

    Returns:
        The ``BuildingSigtagProperty`` built from the selected data.

    Raises:
        KeyError: if an expected key is missing from ``data_dict``.
        ValueError: if a name in a ``pname_key`` list is not a sensor name.
    """
    data_state_mat = data_dict[sig_tag + 'data_state_mat']
    data_weather_mat = data_dict[sig_tag + 'data_weather_mat']
    data_time_mat = data_dict[sig_tag + 'data_time_mat']
    time_slot = data_dict[sig_tag + '_time_slot']
    data_exemplar = data_dict[sig_tag + 'data_exemplar']
    data_zvar = remove_dot(data_dict[sig_tag + 'data_zvar'])
    sensor_names = remove_dot(data_dict['sensor_names'])
    weather_names = remove_dot(data_dict['weather_names'])
    time_names = remove_dot(data_dict['time_names'])

    if pname_key:
        # TODO: Name correction for exemplar
        if isinstance(pname_key, list):
            # The original added ``sig_tag`` (a str) to the integer index,
            # which always raised TypeError; the index alone is what the
            # lookups below need.
            # NOTE(review): confirm no tag-prefixed name lookup was intended.
            p_idx = [sensor_names.index(p_name) for p_name in pname_key]
            p_names = remove_dot(list(np.array(sensor_names)[list(set(p_idx))]))
        else:
            p_idx = grep(pname_key, sensor_names)
            p_names = remove_dot(list(np.array(sensor_names)[p_idx]))
    else:
        # No selection given: every sensor is a power sensor candidate.
        p_idx = list(range(len(sensor_names)))
        p_names = remove_dot(sensor_names)

    log.info('-' * 40)
    log.info('Power sensor selected -' + sig_tag)
    log.info('-' * 40)
    log.info("p_idx : " + str(p_idx))
    log.info("sensor_names : " + str(sensor_names))
    log.info("p_names : " + str(p_names))

    return BuildingSigtagProperty(sig_tag, data_state_mat, data_weather_mat,
                                  data_time_mat, time_slot, data_exemplar,
                                  data_zvar, sensor_names, weather_names,
                                  time_names, p_idx, p_names)
def state_retrieval(obs, max_num_cluster=6, off_set=0, est_method='kmean', PARALLEL = False):
    """Map observations to discrete states with an automatically chosen k.

    NOTE(review): this function is defined twice in this file; the later
    definition is the one Python keeps.

    Models with ``1 .. max_num_cluster`` clusters are fitted and scored.
    The number of clusters is then chosen at the knee of the score curve:
    for every split point the curve is approximated by two straight lines
    and the split with the smallest total squared error wins.

    Args:
        obs: observations, one row per sample.
        max_num_cluster: largest number of clusters to try.
        off_set: unused; kept for interface compatibility.
        est_method: ``'kmean'`` or ``'gmm'``.
        PARALLEL: fit the candidate models in a process pool via
            ``pp_cluster_state_retrieval`` instead of sequentially.

    Returns:
        ``(label, opt_num_cluster, model, score, score_err_sum)``.

    Raises:
        NameError: if ``est_method`` is not supported.
    """
    log.info('-' * 40)
    log.info('Retrieving discrete states from data using ' + est_method + ' model...')
    log.info('-' * 40)
    log.info('try '+ str(max_num_cluster) + ' clusters..... ')

    # Always an ndarray, so both execution paths return the same type.
    score = np.zeros(max_num_cluster)
    model_set = list()

    if not PARALLEL:
        for num_cluster in range(max_num_cluster):
            log.info('Try ' + str(num_cluster+1) + ' clusters ')
            log.info('-----------------------------------')
            if est_method == 'kmean':
                kmean = KMeans(n_clusters=num_cluster+1).fit(obs)
                model_set.append(kmean)
                score[num_cluster] = compute_log_ll(kmean.labels_, obs)
            elif est_method == 'gmm':
                # NOTE(review): ``mixture.GMM`` is the legacy scikit-learn
                # class; confirm it exists in the deployed version.
                gmm = mixture.GMM(n_components=num_cluster+1).fit(obs)
                model_set.append(gmm)
                score[num_cluster] = np.sum(gmm.score(obs))
            else:
                log.error('not supported est_method')
                raise NameError('not supported est_method')
    else:
        log.info('Parallel enabled...')
        model_set = [0] * max_num_cluster
        params = [(obs, i+1, est_method) for i in range(max_num_cluster)]
        p = Pool(max_num_cluster)
        try:
            models = p.map(pp_cluster_state_retrieval, params)
        finally:
            # Release the worker processes even when a worker failed.
            p.close()
            p.join()
        # Each result is presumably (index, (model, score)) - verify against
        # pp_cluster_state_retrieval.
        for k, v in dict(models).items():
            model_set[k] = v[0]
            score[k] = v[1]

    score_err_sum = np.zeros(max_num_cluster)
    log.info('Finding knee points of log likelihood...')
    for i in range(max_num_cluster):
        # Fit one line to each side of the split; a side with fewer than
        # two points contributes no error.
        a_0 = score[:i]
        if len(a_0) > 1:
            slope, intercept, r_value, p_value, std_err = stats.linregress(range(len(a_0)), a_0)
            sqr_sum_err0 = sum(((slope*np.arange(len(a_0)) + intercept)-a_0)**2)
        else:
            sqr_sum_err0 = 0

        a_1 = score[i:]
        if len(a_1) > 1:
            slope, intercept, r_value, p_value, std_err = stats.linregress(range(len(a_1)), a_1)
            sqr_sum_err1 = sum(((slope*np.arange(len(a_1)) + intercept)-a_1)**2)
        else:
            sqr_sum_err1 = 0

        score_err_sum[i] = sqr_sum_err0 + sqr_sum_err1

    # Optimum number of clusters.
    min_idx = np.argmin(score_err_sum)
    opt_num_cluster = min_idx+1
    log.info('opt_num_cluster: ' + str(opt_num_cluster))

    if est_method == 'kmean':
        label = model_set[min_idx].labels_
    elif est_method == 'gmm':
        label = model_set[min_idx].predict(obs)
    else:
        raise NameError('not supported est_method')

    return label, opt_num_cluster, model_set[min_idx], score, score_err_sum
def get_weather_timelet(data_dict,t_slots, timelet_inv, use_weather_data_bin=True):
    """Load daily weather records and bin them into the time slots.

    For every day between ``t_slots[0]`` and ``t_slots[-1]`` the day's
    comma-separated weather text is loaded (from a ``.bin`` file under
    ``WEATHER_DIR`` or via ``rw.retrieve_data``).  The first row of the
    first day defines the weather field names; each gets an entry
    ``[slot_times, slot_values, [utc_t, val]]`` in ``data_dict``.  Every
    later data row is converted from its ``DateUTC`` field to local time and
    appended to the slot it falls in.

    Args:
        data_dict: dictionary updated in place with one entry per field.
        t_slots: sequence of slot start datetimes.
        timelet_inv: slot length as a ``datetime.timedelta``.
        use_weather_data_bin: read pre-stored ``.bin`` files when true,
            otherwise retrieve the data with ``rw.retrieve_data``.

    Returns:
        The list of weather field names (``'/'`` replaced by ``'-'``); empty
        when the date range yields no day.

    Raises:
        NameError: if a sample lies a full slot length or more after its
            slot start, or a row contains a field missing from
            ``data_dict``.
    """
    log.info('------------------------------------')
    log.info('Retrieving weather data... ')
    log.info('------------------------------------')
    t_start = t_slots[0]
    t_end = t_slots[-1]
    log.info('start time: ' + str(t_start) + ' ~ end time: ' + str(t_end))

    # total_seconds() includes the days component; ``.seconds`` alone is 0
    # for a one-day slot and would divide by zero below.
    slot_secs = timelet_inv.total_seconds()
    num_slots = len(t_slots)
    # Defined up front so an empty date range returns [] instead of failing.
    sensor_name_list = []

    # Iterate for each day for all weather data types
    for date_idx, date in enumerate(daterange(t_start, t_end, inclusive=True)):
        log.info("weather date : " + date.strftime("%Y-%m-%d"))
        temp = date.strftime("%Y,%m,%d").rsplit(',')

        if use_weather_data_bin:
            filename = WEATHER_DIR + "%04d_%02d_%02d.bin"%(int(temp[0]), int(temp[1]), int(temp[2]))
            data_day = mt.loadObjectBinaryFast(filename)
        else:
            data_day = rw.retrieve_data('SDH', int(temp[0]), int(temp[1]), int(temp[2]), view='d')

        # One row per line; the first row of a day is the header.
        data_day = data_day.split('\n')

        for h_idx, hour_sample in enumerate(data_day):
            hour_samples = hour_sample.split(',')

            if (h_idx == 0) and (date_idx == 0):
                # Header of the first day: create an empty entry per field.
                sensor_name_list = [sensor_name.replace('/', '-') for sensor_name in hour_samples]
                for sensor_name in sensor_name_list:
                    sensor_read = [[] for _ in range(num_slots)]
                    stime_read = [[] for _ in range(num_slots)]
                    utc_t = []
                    val = []
                    data_dict.update({sensor_name: [stime_read, sensor_read, [utc_t, val]]})

            elif h_idx > 0:
                sample_DateUTC = hour_samples[sensor_name_list.index('DateUTC')]

                # Convert the UTC time stamp to naive local time.
                utc_dt = dt.datetime.strptime(sample_DateUTC, "%Y-%m-%d %H:%M:%S")
                vtt_dt_aware = utc_dt.replace(tzinfo=from_zone).astimezone(to_zone)
                vtt_dt = dt.datetime(*(vtt_dt_aware.timetuple()[:6]))

                ### WARNING: vtt_utc is not utc
                vtt_utc = dtime_to_unix([vtt_dt])

                # Skip samples before the first slot or past the last one.
                if int((vtt_dt - t_slots[0]).total_seconds()) < 0 or int((vtt_dt - t_slots[-1]).total_seconds()) >= slot_secs:
                    log.debug('skipping weather data out of analysis range...')
                    continue

                slot_idx = int((vtt_dt - t_slots[0]).total_seconds() / slot_secs)
                cur_sec_val = (vtt_dt - t_slots[slot_idx]).total_seconds()
                if cur_sec_val >= slot_secs:
                    log.critical('sec: ' + str(cur_sec_val))
                    raise NameError('Seconds from an hour idx cannot be greater than '+str(int(slot_secs)) +'secs')

                try:
                    for sample_idx, each_sample in enumerate(hour_samples):
                        # Use the numeric value when the field is numeric,
                        # otherwise keep the raw string (e.g. conditions).
                        try:
                            each_sample = float(each_sample)
                        except ValueError:
                            pass

                        sensor_name = sensor_name_list[sample_idx]
                        if sensor_name not in data_dict:
                            raise NameError('Inconsistency in the list of weather data')

                        if each_sample != 'N/A' and each_sample !=[]:
                            data_dict[sensor_name][0][slot_idx].append(cur_sec_val)
                            data_dict[sensor_name][1][slot_idx].append(each_sample)
                            data_dict[sensor_name][2][0].append(vtt_utc)
                            data_dict[sensor_name][2][1].append(each_sample)
                except ValueError:
                    # Kept from the original: the row is abandoned silently.
                    slot_idx = -1

            # Otherwise this is the header row of a later day: discard it.

    return sensor_name_list
def get_val_timelet(reading, t_slots, ans_start_t, ans_end_t, timelet_inv):
    """Bin raw sensor readings into the analysis time slots.

    NOTE(review): this function is defined twice in this file; this later
    definition is the one Python keeps.

    Args:
        reading: iterable of records where ``r[0]`` is a unix timestamp and
            ``r[1]`` the measured value.
        t_slots: sequence of slot start datetimes.
        ans_start_t: inclusive start of the analysis period.
        ans_end_t: exclusive end of the analysis period.
        timelet_inv: slot length as a ``datetime.timedelta``.

    Returns:
        ``(sensor_read, stime_read, utc_t, val)`` where ``sensor_read`` and
        ``stime_read`` hold, per slot, the values and their offsets in
        seconds from the slot start, and ``utc_t`` / ``val`` are the flat
        lists of in-range timestamps and values.  All four are ``-1`` when
        the reading is empty or has fewer than ``MIN_NUM_VAL_FOR_FLOAT``
        samples, which callers treat as "purge this sensor".

    Raises:
        NameError: if a sample lies a full slot length or more after the
            start of the slot it was assigned to.
    """
    data = dict()
    data['value'] = np.array([r[1] for r in reading], dtype=float)

    ts_list = list()
    for r in reading:
        local_dt = dt.datetime.fromtimestamp(r[0])
        time_tup = local_dt.timetuple()
        # Row layout: datetime, sec, min, hour, weekday, day, month.
        ts_list.append([local_dt, time_tup[5], time_tup[4], time_tup[3],
                        time_tup[6], time_tup[2], time_tup[1]])
    data['ts'] = np.array(ts_list)

    # ``len(data)`` was always 2 (the dict's keys); test the samples instead.
    if len(data['value']) == 0:
        log.critical('Error in file reading: empty data. Skip and need to be purged from sensor list')
        return -1, -1, -1, -1

    if (len(data["ts"]) < MIN_NUM_VAL_FOR_FLOAT) or (len(data["value"]) < MIN_NUM_VAL_FOR_FLOAT):
        log.critical('No data included ' + str(data) + '... Skip and need to be purged from sensor list')
        return -1, -1, -1, -1

    # Drop samples whose value is NaN, keeping both arrays aligned.
    nan_idx_list = np.nonzero(np.isnan(data["value"]))[0]
    sensor_val = np.delete(data["value"], nan_idx_list, axis=0)
    time_val = np.delete(data["ts"], nan_idx_list, axis=0)

    # One list of values and one list of second-offsets per time slot.
    sensor_read = [[] for _ in range(len(t_slots))]
    stime_read = [[] for _ in range(len(t_slots))]
    utc_t = []
    val = []

    # total_seconds() includes the days component; ``.seconds`` alone is 0
    # for a one-day slot and would divide by zero below.
    slot_secs = timelet_inv.total_seconds()

    for t_sample, v_sample in zip(time_val, sensor_val):
        temp_dt = t_sample[DT_IDX]
        if temp_dt < ans_start_t or temp_dt >= ans_end_t:
            continue

        try:
            idx = int((temp_dt - ans_start_t).total_seconds() / slot_secs)
            sensor_read[idx].append(v_sample)
            secs = (temp_dt - t_slots[idx]).total_seconds()
            if secs >= slot_secs:
                log.info('sec: ' + str(secs))
                raise NameError('Seconds from an hour idx cannot be greater than ' + str(int(slot_secs)) + 'secs')
            stime_read[idx].append(secs)
        except ValueError:
            # Kept from the original; the sample is still recorded below.
            idx = -1

        utc_temp = dtime_to_unix([t_sample[DT_IDX]])
        utc_t.append(utc_temp)
        val.append(v_sample)

    return sensor_read, stime_read, utc_t, val
def construct_data_dict(sensor_data, ans_start_t, ans_end_t, timelet_inv, include_weather=1, PARALLEL=False):
    """Align all sensor (and weather) readings on one time-slot grid.

    The analysis period ``[ans_start_t, ans_end_t)`` is cut into slots of
    length ``timelet_inv``.  Every sensor's readings are binned with
    ``get_val_timelet``; sensors with no usable data or fewer samples than
    slots are put on the purge list instead.

    Args:
        sensor_data: mapping sensor uuid -> raw readings.
        ans_start_t: inclusive start of the analysis period.
        ans_end_t: exclusive end of the analysis period.
        timelet_inv: slot length (added to datetimes to step the grid).
        include_weather: ``1`` loads weather data through
            ``get_weather_timelet``; ``2`` is a placeholder for a stored
            weather file; anything else skips weather.
        PARALLEL: bin the sensors in a process pool.

    Returns:
        ``(data_dict, purge_list)``.  ``data_dict`` maps each kept uuid to
        ``[slot_times, slot_values, [utc_t, val]]`` and holds
        ``'time_slots'``; with ``include_weather == 1`` it also holds the
        weather fields, ``'sensor_list'``, ``'weather_list'``,
        ``'Conditions_dict'`` and ``'Events_dict'``.
    """
    log.info('-' * 80)
    log.info('mapping sensor list into hasing table using dictionary')
    log.info('Align sensor data into a single time_slots referece... from ' + str(ans_start_t) + ' to ' + str(ans_end_t))
    log.info('-' * 80)

    # Slot start times covering [ans_start_t, ans_end_t).
    time_slots = list()
    start = ans_start_t
    while start < ans_end_t:
        time_slots.append(start)
        start = start + timelet_inv
    len_time_slots = len(time_slots)

    # All sensor and weather data is structured into one dictionary.
    data_dict = dict()
    sensor_list = list()
    purge_list = list()

    if PARALLEL:
        log.info("construct_data_dict >>> Parallel enabled")
        # ``items()`` works on both Python 2 and 3 (``iteritems`` does not).
        args = [(sensor_uuid, sensor_reading, time_slots, ans_start_t, ans_end_t, timelet_inv)
                for sensor_uuid, sensor_reading in sensor_data.items()]

        p = Pool(CPU_CORE_NUM)
        try:
            timed_vlist = p.map(pp_construct_data_dict, args)
        finally:
            # Release the worker processes even when a worker failed.
            p.close()
            p.join()

        for sensor_uuid, timed_value in timed_vlist:
            # An empty result marks a sensor that has to be purged.
            if len(timed_value):
                sensor_list.append(sensor_uuid)
                data_dict.update({sensor_uuid: timed_value})
            else:
                purge_list.append(sensor_uuid)
    else:
        for sensor_uuid, sensor_reading in sensor_data.items():
            log.info('sampling sensor uuid ' + sensor_uuid)

            # sensor value is read by time
            dict_sensor_val, dict_stime, utc_t, val = \
                get_val_timelet(sensor_reading, time_slots, ans_start_t, ans_end_t, timelet_inv)

            if dict_sensor_val == -1:
                log.debug('append purge list: dict_sensor_val=-1 ' + sensor_uuid)
                purge_list.append(sensor_uuid)
            elif len(utc_t) < len_time_slots:
                log.debug('append purge list:len(utc_t)<len_time_slots' + sensor_uuid)
                purge_list.append(sensor_uuid)
            elif len(val) < len_time_slots:
                log.debug('append purge list:len(val)<len_time_slots' + sensor_uuid)
                purge_list.append(sensor_uuid)
            else:
                sensor_list.append(sensor_uuid)
                # Convert list to array type for bin file size and loading time.
                dict_sensor_val_temp = np.array([np.asarray(val_) for val_ in dict_sensor_val])
                dict_stime_temp = np.array([np.asarray(t_) for t_ in dict_stime])
                utc_t_val_temp = np.asarray([utc_t, val])
                data_dict.update({sensor_uuid: [dict_stime_temp, dict_sensor_val_temp, utc_t_val_temp]})
            log.info('-' * 20)

    data_dict.update({'time_slots': time_slots})
    log.info('-' * 40)

    if include_weather == 1:
        log.info("Construction weather dict")
        # Weather fields (fixed by the database); (*) marks fields used in
        # the analysis:
        #   0 TimeEEST, 1 TemperatureC (*), 2 Dew PointC (*), 3 Humidity (*),
        #   4 Sea Level PressurehPa, 5 VisibilityKm, 6 Wind Direction,
        #   7 Wind SpeedKm/h, 8 Gust SpeedKm/h, 9 Precipitationmm,
        #   10 Events (*), 11 Conditions (*), 12 WindDirDegrees, 13 DateUTC
        weather_list = get_weather_timelet(data_dict, time_slots, timelet_inv)

        # Convert symbols to Integer representaion
        data_dict['Conditions'][1], Conditions_dict = symbol_to_state(data_dict['Conditions'][1])
        data_dict['Events'][1], Events_dict = symbol_to_state(data_dict['Events'][1])

        data_dict.update({'sensor_list': sensor_list})
        data_dict.update({'weather_list' : weather_list})
        data_dict.update({'Conditions_dict': Conditions_dict})
        data_dict.update({'Events_dict' : Events_dict})

        # Change List to Array type
        for key_id in weather_list:
            data_dict[key_id] = [np.asanyarray(list_val_) for list_val_ in data_dict[key_id]]

    elif include_weather == 2:
        # use stored bin file
        log.info('use weather_dict.bin')
        # This part to be filled with Khiem......
    else:
        log.info('skip weather database...')

    return data_dict, purge_list
def state_retrieval(obs, max_num_cluster=6, off_set=0, est_method='kmean', PARALLEL=False):
    """Map observations to discrete states with an automatically chosen k.

    NOTE(review): this function is defined twice in this file; this later
    definition is the one Python keeps.

    Models with ``1 .. max_num_cluster`` clusters are fitted and scored.
    The number of clusters is then chosen at the knee of the score curve:
    for every split point the curve is approximated by two straight lines
    and the split with the smallest total squared error wins.

    Args:
        obs: observations, one row per sample.
        max_num_cluster: largest number of clusters to try.
        off_set: unused; kept for interface compatibility.
        est_method: ``'kmean'`` or ``'gmm'``.
        PARALLEL: fit the candidate models in a process pool via
            ``pp_cluster_state_retrieval`` instead of sequentially.

    Returns:
        ``(label, opt_num_cluster, model, score, score_err_sum)``.

    Raises:
        NameError: if ``est_method`` is not supported.
    """
    log.info('-' * 40)
    log.info('Retrieving discrete states from data using ' + est_method + ' model...')
    log.info('-' * 40)
    log.info('try ' + str(max_num_cluster) + ' clusters..... ')

    # Always an ndarray, so both execution paths return the same type.
    score = np.zeros(max_num_cluster)
    model_set = list()

    if not PARALLEL:
        for num_cluster in range(max_num_cluster):
            log.info('Try ' + str(num_cluster + 1) + ' clusters ')
            log.info('-----------------------------------')
            if est_method == 'kmean':
                kmean = KMeans(n_clusters=num_cluster + 1).fit(obs)
                model_set.append(kmean)
                score[num_cluster] = compute_log_ll(kmean.labels_, obs)
            elif est_method == 'gmm':
                # NOTE(review): ``mixture.GMM`` is the legacy scikit-learn
                # class; confirm it exists in the deployed version.
                gmm = mixture.GMM(n_components=num_cluster + 1).fit(obs)
                model_set.append(gmm)
                score[num_cluster] = np.sum(gmm.score(obs))
            else:
                log.error('not supported est_method')
                raise NameError('not supported est_method')
    else:
        log.info('Parallel enabled...')
        model_set = [0] * max_num_cluster
        params = [(obs, i + 1, est_method) for i in range(max_num_cluster)]
        p = Pool(max_num_cluster)
        try:
            models = p.map(pp_cluster_state_retrieval, params)
        finally:
            # Release the worker processes even when a worker failed.
            p.close()
            p.join()
        # Each result is presumably (index, (model, score)) - verify against
        # pp_cluster_state_retrieval.
        for k, v in dict(models).items():
            model_set[k] = v[0]
            score[k] = v[1]

    score_err_sum = np.zeros(max_num_cluster)
    log.info('Finding knee points of log likelihood...')
    for i in range(max_num_cluster):
        # Fit one line to each side of the split; a side with fewer than
        # two points contributes no error.
        a_0 = score[:i]
        if len(a_0) > 1:
            slope, intercept, r_value, p_value, std_err = stats.linregress(
                range(len(a_0)), a_0)
            sqr_sum_err0 = sum(
                ((slope * np.arange(len(a_0)) + intercept) - a_0)**2)
        else:
            sqr_sum_err0 = 0

        a_1 = score[i:]
        if len(a_1) > 1:
            slope, intercept, r_value, p_value, std_err = stats.linregress(
                range(len(a_1)), a_1)
            sqr_sum_err1 = sum(
                ((slope * np.arange(len(a_1)) + intercept) - a_1)**2)
        else:
            sqr_sum_err1 = 0

        score_err_sum[i] = sqr_sum_err0 + sqr_sum_err1

    # Optimum number of clusters.
    min_idx = np.argmin(score_err_sum)
    opt_num_cluster = min_idx + 1
    log.info('opt_num_cluster: ' + str(opt_num_cluster))

    if est_method == 'kmean':
        label = model_set[min_idx].labels_
    elif est_method == 'gmm':
        label = model_set[min_idx].predict(obs)
    else:
        raise NameError('not supported est_method')

    return label, opt_num_cluster, model_set[min_idx], score, score_err_sum
def _float_columns_to_states(x_float, num_slots, PARALLEL=False):
    """Discretize float-type columns; return a (num_slots, 0) array when there are none."""
    if len(x_float) == 0:
        # cluster_measurement_points returns [] when it is given no columns,
        # which x_input_to_states cannot handle (it reads ``.shape``).
        return np.array([[] for _ in range(num_slots)])
    states, _ = x_input_to_states(x_float, CORR_VAL_OUT=1, PARALLEL=PARALLEL)
    return states


def _varying_time_states(slot_times):
    """Return (state matrix, names) for the MTH/WD/HR columns that take more than one value."""
    columns = list()
    names = list()
    for xt_col, xt_name in zip(build_time_states(slot_times).T, ['MTH', 'WD', 'HR']):
        if len(set(xt_col)) > 1:
            columns.append(xt_col)
            names.append(xt_name)
    return np.array(columns).T, names


def data_summerization(bldg_key, data_dict, proc_avg=True, proc_diff=True, PARALLEL=False):
    """Build the discretized "avg" and "diff" data sets for one building.

    Args:
        bldg_key: building identifier; stored in the result after ``remove_dot``.
        data_dict: dictionary holding at least ``'time_slots'`` and
            ``'sensor_list'`` plus the per-measurement entries read by
            ``build_feature_matrix`` / ``interpolation_measurement``.
        proc_avg: build the average-feature data set (``'avgdata_dict'``).
        proc_diff: build the difference-feature data set (``'diffdata_dict'``).
            This stage re-uses the weather states and time slots produced by
            the avg stage.
        PARALLEL: forwarded to ``build_diff_matrix`` and to the state
            discretization of the diff stage.

    Returns:
        dict with ``'data_dict'``, ``'bldg_key'`` and, when the respective
        stage produced a non-empty matrix, ``'avgdata_dict'`` and
        ``'diffdata_dict'``.

    Raises:
        NameError: if the feature names are inconsistent with the measurement
            list, or if the diff stage has data but the avg stage output it
            depends on is not available.
    """
    time_slots = data_dict['time_slots'][:]
    sensor_list = data_dict['sensor_list'][:]
    weather_list_used = ['TemperatureC', 'Dew PointC', 'Humidity', 'Events', 'Conditions']
    # data_used is the list of reference names for all measurements from now on.
    data_used = sensor_list + weather_list_used

    dsout = {'data_dict': data_dict}

    # Outputs of the avg stage that the diff stage depends on.
    X_Time = None
    X_Weather_STATE = None
    X_Weather_NAMES = None

    if proc_avg:
        log.info('-' * 40)
        log.info('processing avg.feature..')
        log.info('-' * 40)

        X_Feature, X_Time, X_names, X_zero_var_list, X_zero_var_val, X_int_type_list, \
            X_int_type_idx, X_float_type_list, X_float_type_idx, X_weather_type_idx, \
            X_sensor_type_idx = build_feature_matrix(
                data_dict, sensor_list, weather_list_used, time_slots,
                interpolation=1,
                max_num_succ_idx_for_itpl=int(len(time_slots) * 0.05))

        build_feature_matrix_out = obj({
            'X_Feature': X_Feature,
            'X_Time': X_Time,
            'X_names': X_names,
            'X_zero_var_list': X_zero_var_list,
            'X_zero_var_val': X_zero_var_val,
            'X_int_type_list': X_int_type_list,
            'X_int_type_idx': X_int_type_idx,
            'X_float_type_list': X_float_type_list,
            'X_float_type_idx': X_float_type_idx,
            'X_weather_type_idx': X_weather_type_idx,
            'X_sensor_type_idx': X_sensor_type_idx})

        if len(X_names + X_zero_var_list) != len(data_used):
            log.error('Missing name is found in X_names or X_zero_var_list')
            raise NameError('Missing name is found in X_names or X_zero_var_list')

        if X_Feature.shape[0] > 0:
            # From below all indices refer to columns of X_Feature.
            names_arr = np.array(X_names)
            sf_idx = list(set(X_sensor_type_idx) & set(X_float_type_idx))
            si_idx = list(set(X_sensor_type_idx) & set(X_int_type_idx))
            wf_idx = list(set(X_weather_type_idx) & set(X_float_type_idx))
            wi_idx = list(set(X_weather_type_idx) & set(X_int_type_idx))
            sf_name = list(names_arr[sf_idx])
            si_name = list(names_arr[si_idx])

            # Float type measurement clustering
            X_Feature_sfe, sf_exemplars_dict, exemplars_, _ = \
                cluster_measurement_points(X_Feature[:, sf_idx], sf_name,
                                           corr_bnd=[0.1, 0.9], alg='aff')
            sfe_idx = list(np.array(sf_idx)[exemplars_])

            # Integer type measurement clustering
            X_Feature_sie, si_exemplars_dict, exemplars_, _ = \
                cluster_measurement_points(X_Feature[:, si_idx], si_name,
                                           corr_bnd=[0.0, 0.9], alg='aff')
            sie_idx = list(np.array(si_idx)[exemplars_])

            num_slots = len(X_Time)
            empty_states = np.array([[] for _ in range(num_slots)])
            # sensor/weather x float/integer states; integer columns are used as-is.
            sfe_state = _float_columns_to_states(X_Feature_sfe, num_slots)
            sie_state = X_Feature_sie
            wf_state = _float_columns_to_states(X_Feature[:, wf_idx], num_slots)
            wi_state = X_Feature[:, wi_idx]
            sfe_state, sie_state, wf_state, wi_state = [
                state if len(state) else empty_states
                for state in (sfe_state, sie_state, wf_state, wi_state)]

            # Exemplar sensors only
            X_Sensor_STATE = np.append(sfe_state, sie_state, axis=1).astype(int)
            X_Sensor_NAMES = list(names_arr[sfe_idx]) + list(names_arr[sie_idx])
            X_Weather_STATE = np.append(wf_state, wi_state, axis=1).astype(int)
            X_Weather_NAMES = list(names_arr[wf_idx]) + list(names_arr[wi_idx])

            # months of a year, days of a week, and hours of a day
            X_Time_STATE, X_Time_NAMES = _varying_time_states(X_Time)

            #################################################
            # FORMATTED DATA FOR REGULAR EVENT
            #################################################
            # Later entries win on duplicate keys, as with the former
            # dict(a.items() + b.items()).
            avgdata_exemplar = dict(sf_exemplars_dict)
            avgdata_exemplar.update(si_exemplars_dict)

            dsout.update({'avgdata_dict': {
                'build_feature_matrix_out': build_feature_matrix_out,
                'avgdata_state_mat': X_Sensor_STATE,
                'avgdata_weather_mat': X_Weather_STATE,
                'avgdata_time_mat': X_Time_STATE,
                'avg_time_slot': X_Time,
                'avgdata_exemplar': avgdata_exemplar,
                'avgdata_zvar': X_zero_var_list,
                'sensor_names': X_Sensor_NAMES,
                'weather_names': X_Weather_NAMES,
                'time_names': X_Time_NAMES}})

    if proc_diff:
        log.info('-' * 40)
        log.info('processing diff.feature..')
        log.info('-' * 40)
        ####################################
        # Irregular Event Extraction
        ####################################
        # Interpolation with outlier removal. Weather data is excluded from the
        # irregular event analysis: it normally changes slowly in time, so no
        # meaningful diff values are expected from it.
        measurement_point_set, num_type_set = \
            interpolation_measurement(data_dict, sensor_list, err_rate=1, sgm_bnd=20)

        # Irregular matrix
        Xdiff_Mat, Xdiff_Time, Xdiff_Names, Xdiff_zero_var_list, Xdiff_zero_var_val, \
            Xdiff_int_type_list, Xdiff_int_type_idx, Xdiff_float_type_list, \
            Xdiff_float_type_idx = build_diff_matrix(
                measurement_point_set, time_slots, num_type_set, sensor_list,
                PARALLEL=PARALLEL)

        build_diff_matrix_out = obj({
            'Xdiff_Mat': Xdiff_Mat,
            'Xdiff_Time': Xdiff_Time,
            'Xdiff_Names': Xdiff_Names,
            'Xdiff_zero_var_list': Xdiff_zero_var_list,
            'Xdiff_zero_var_val': Xdiff_zero_var_val,
            'Xdiff_int_type_list': Xdiff_int_type_list,
            'Xdiff_int_type_idx': Xdiff_int_type_idx,
            'Xdiff_float_type_list': Xdiff_float_type_list,
            'Xdiff_float_type_idx': Xdiff_float_type_idx})

        if Xdiff_Mat.shape[0] > 0:
            if X_Weather_STATE is None:
                # Used to surface as an unbound-variable NameError further down.
                log.error('diff feature requires the weather states of the avg feature')
                raise NameError('diff feature requires the weather states of the avg feature')

            # ==================================================================
            # Restrict the diff matrix and the weather matrix to the time slots
            # they have in common.
            # ==================================================================
            common_slots = set(Xdiff_Time) & set(X_Time)
            time_slots_array = np.sort(np.array(list(common_slots)))

            # Extract subset of X_Weather_STATE
            removed_idx_list = [ridx for ridx, slot in enumerate(X_Time)
                                if slot not in common_slots]
            XDIFF_Weather_STATE = np.array(
                np.delete(X_Weather_STATE, removed_idx_list, axis=0))

            # Extract subset of Xdiff_Mat
            removed_idx_list = [ridx for ridx, slot in enumerate(Xdiff_Time)
                                if slot not in common_slots]
            Xdiff_Mat = np.delete(Xdiff_Mat, removed_idx_list, axis=0)
            # Update Xdiff_Time
            Xdiff_Time = time_slots_array

            # From below all indices refer to columns of Xdiff_Mat.
            xdiff_sf_idx = Xdiff_float_type_idx
            xdiff_si_idx = Xdiff_int_type_idx

            # Float type measurement clustering
            X_Diff_sfe, sf_diff_exemplars_dict, exemplars_, _ = \
                cluster_measurement_points(Xdiff_Mat[:, xdiff_sf_idx],
                                           Xdiff_float_type_list, corr_bnd=[0.1, 0.9])
            xdiff_sfe_idx = list(np.array(xdiff_sf_idx)[exemplars_])

            # Integer type measurement clustering
            X_Diff_sie, si_diff_exemplars_dict, exemplars_, _ = \
                cluster_measurement_points(Xdiff_Mat[:, xdiff_si_idx],
                                           Xdiff_int_type_list, corr_bnd=[0.1, 0.9])
            xdiff_sie_idx = list(np.array(xdiff_si_idx)[exemplars_])

            num_slots = len(Xdiff_Time)
            empty_states = np.array([[] for _ in range(num_slots)])
            xdiff_sfe_state = _float_columns_to_states(X_Diff_sfe, num_slots,
                                                       PARALLEL=PARALLEL)
            xdiff_sie_state = X_Diff_sie
            if len(xdiff_sfe_state) == 0:
                xdiff_sfe_state = empty_states
            if len(xdiff_sie_state) == 0:
                xdiff_sie_state = empty_states

            # Exemplar sensors only
            XDIFF_Sensor_STATE = np.append(xdiff_sfe_state, xdiff_sie_state,
                                           axis=1).astype(int)
            diff_names_arr = np.array(Xdiff_Names)
            XDIFF_Sensor_NAMES = list(diff_names_arr[xdiff_sfe_idx]) + \
                list(diff_names_arr[xdiff_sie_idx])

            # months of a year, days of a week, and hours of a day
            XDIFF_Time_STATE, XDIFF_Time_NAMES = _varying_time_states(Xdiff_Time)

            #################################################
            # FORMATTED DATA FOR IRREGULAR EVENT
            #################################################
            log.info("FORMATTED DATA FOR IRREGUALR EVENT")
            diffdata_exemplar = dict(sf_diff_exemplars_dict)
            diffdata_exemplar.update(si_diff_exemplars_dict)

            dsout.update({'diffdata_dict': {
                'build_diff_matrix_out': build_diff_matrix_out,
                'diffdata_state_mat': XDIFF_Sensor_STATE,
                'diffdata_weather_mat': XDIFF_Weather_STATE,
                'diffdata_time_mat': XDIFF_Time_STATE,
                'diff_time_slot': Xdiff_Time,
                'diffdata_exemplar': diffdata_exemplar,
                'diffdata_zvar': Xdiff_zero_var_list,
                'sensor_names': XDIFF_Sensor_NAMES,
                'weather_names': X_Weather_NAMES,
                # Names must match the columns kept in diffdata_time_mat,
                # not the ones kept for the avg time matrix.
                'time_names': XDIFF_Time_NAMES}})

    dsout.update({'bldg_key': remove_dot(bldg_key)})
    return dsout
def get_weather_timelet(data_dict, t_slots, timelet_inv, use_weather_data_bin=True):
    """Load daily weather records and bin them into the analysis time slots.

    For every weather field named in the header row of the first day, an
    entry ``[stime_read, sensor_read, [utc_t, val]]`` is created in
    *data_dict* (mutated in place); samples of the following rows are
    appended to the slot they fall into.

    Args:
        data_dict: dictionary that receives the weather entries.
        t_slots: start times of the analysis time slots.
        timelet_inv: ``timedelta`` length of one time slot.
        use_weather_data_bin: read the per-day ``.bin`` files in
            ``WEATHER_DIR`` when True, otherwise call ``rw.retrieve_data``.

    Returns:
        The list of weather field names (``'/'`` replaced by ``'-'``).

    Raises:
        NameError: if a row carries a field that is not in *data_dict*, or a
            sample lies outside the slot it was assigned to.
    """
    log.info('------------------------------------')
    log.info('Retrieving weather data... ')
    log.info('------------------------------------')
    t_start = t_slots[0]
    t_end = t_slots[-1]
    log.info('start time: ' + str(t_start) + ' ~ end time: ' + str(t_end))

    # timedelta.seconds is only the sub-day remainder (0 for a one-day slot);
    # total_seconds() is the real slot length.
    inv_sec = timelet_inv.total_seconds()
    sensor_name_list = []

    # Iterate for each day for all weather data types
    for date_idx, date in enumerate(daterange(t_start, t_end, inclusive=True)):
        log.info("weather date : " + date.strftime("%Y-%m-%d"))
        if use_weather_data_bin:
            filename = WEATHER_DIR + "%04d_%02d_%02d.bin" % (date.year, date.month, date.day)
            data_day = mt.loadObjectBinaryFast(filename)
        else:
            data_day = rw.retrieve_data('SDH', date.year, date.month, date.day, view='d')

        # One record per line
        data_day = data_day.split('\n')

        for h_idx, hour_sample in enumerate(data_day):
            hour_samples = hour_sample.split(',')

            if h_idx == 0 and date_idx == 0:
                # The first row of the first day is the list of weather data
                # types: initialize one entry per type.
                sensor_name_list = [name.replace('/', '-') for name in hour_samples]
                for sensor_name in sensor_name_list:
                    sensor_read = [[] for _ in range(len(t_slots))]
                    stime_read = [[] for _ in range(len(t_slots))]
                    data_dict.update(
                        {sensor_name: [stime_read, sensor_read, [[], []]]})
                continue

            if h_idx == 0:
                # Header row of a later day: discard.
                continue

            sample_DateUTC = hour_samples[sensor_name_list.index('DateUTC')]
            # convert UTC time to local time
            utc_dt = dt.datetime.strptime(sample_DateUTC, "%Y-%m-%d %H:%M:%S")
            vtt_dt_aware = utc_dt.replace(tzinfo=from_zone).astimezone(to_zone)
            # convert to offset-naive from offset-aware datetimes
            vtt_dt = dt.datetime(*(vtt_dt_aware.timetuple()[:6]))
            # WARNING: vtt_utc is derived from the local time, not from UTC
            vtt_utc = dtime_to_unix([vtt_dt])

            # Check boundary condition
            sec_from_first = (vtt_dt - t_slots[0]).total_seconds()
            sec_from_last = (vtt_dt - t_slots[-1]).total_seconds()
            if int(sec_from_first) < 0 or int(sec_from_last) >= inv_sec:
                log.debug('skipping weather data out of analysis range...')
                continue

            slot_idx = int(sec_from_first / inv_sec)
            cur_sec_val = (vtt_dt - t_slots[slot_idx]).total_seconds()
            if cur_sec_val >= inv_sec:
                log.critical('sec: ' + str(cur_sec_val))
                raise NameError(
                    'Seconds from an hour idx cannot be greater than ' +
                    str(int(inv_sec)) + 'secs')

            try:
                for sample_idx, each_sample in enumerate(hour_samples):
                    # convert string type to float type if possible
                    try:
                        each_sample = float(each_sample)
                    except ValueError:
                        pass

                    sensor_name = sensor_name_list[sample_idx]
                    if sensor_name not in data_dict:
                        raise NameError('Inconsistency in the list of weather data')

                    if each_sample != 'N/A' and each_sample != []:
                        entry = data_dict[sensor_name]
                        entry[0][slot_idx].append(cur_sec_val)
                        entry[1][slot_idx].append(each_sample)
                        entry[2][0].append(vtt_utc)
                        entry[2][1].append(each_sample)
            except ValueError:
                slot_idx = -1

    return sensor_name_list
def run(self):
    """Worker loop: take JSON commands from ``self.cmd_q`` and run DDEA for each.

    While a command is processed a ``wip.json`` marker exists in ``META_DIR``;
    the marker is removed and ``cmd_lock`` cleared when the command finishes,
    whether it succeeded or failed.  The method always ends the process with
    ``sys.exit(0)``.
    """
    from log_util import log

    wip_path = META_DIR + "wip.json"

    def release():
        # The marker may be missing when the failure happened before (or
        # while) it was written; that must not mask the original error or
        # stop the worker.
        try:
            os.remove(wip_path)
        except OSError:
            pass
        cmd_lock.clear()

    try:
        while True:
            try:
                cmd = self.cmd_q.get(block=True, timeout=0.1)
            except Exception:
                # Nothing queued within the timeout: poll again.
                continue
            if cmd:
                self.cmd_q.task_done()

            try:
                with open(wip_path, 'w') as f:
                    f.write(simplejson.dumps({"wip": 1}))

                cmdset = simplejson.loads(cmd)
                sensor_hash = cmdset['selected-nodes']
                s_date = datetime.strptime(cmdset['start-date'], '%Y-%m-%d')
                e_date = datetime.strptime(cmdset['end-date'], '%Y-%m-%d')

                if not len(sensor_hash):
                    log.critical("No sensor is selected!")
                else:
                    log.info('****************************** Begining of DDEA *******************************')
                    bldg_key = 'SODA'
                    # exemplar by user
                    pname_key = 'POWER'

                    s_epoch = int(time.mktime(s_date.timetuple()))
                    e_epoch = int(time.mktime(e_date.timetuple()))
                    time_inv = dt.timedelta(seconds=cmdset['time-interval'])

                    log.info("Cleaning up old output...")
                    mt.remove_all_files(FIG_DIR)
                    mt.remove_all_files(JSON_DIR)
                    mt.remove_all_files(PROC_OUT_DIR)

                    log.info("start epoch : " + str(s_epoch) + " end epoch : " + str(e_epoch))
                    log.info(str(time_inv) + ' time slot interval is set for this data set !!!')
                    log.info("BLDG_KEY : " + bldg_key + " PNAME_KEY : " + pname_key)
                    log.info('*' * 80)

                    log.info("Retrieve sensor data from quasar TSDB")
                    sensor_names_hash = mt.sensor_name_uid_dict(bldg_key, sensor_hash)
                    sensor_data = read_sensor_data(sensor_names_hash, s_epoch, e_epoch)

                    if sensor_data and len(sensor_data):
                        ddea_process(sensor_names_hash, sensor_data, s_epoch, e_epoch,
                                     time_inv, bldg_key, pname_key)
                    else:
                        log.critical("No sensor data available for time period and sensor selected!")
                    log.info('******************************** End of DDEA **********************************')

                release()
                log.info("execution-lock cleared")
                log.info('~' * 80)
            except Exception as e:
                release()
                print(e)
                log.error(str(e))
    except Exception as e:
        release()
        print(e)
        log.error(str(e))
    finally:
        sys.exit(0)
def check_bounded_distance_constraint_condition(dist_mat, labels, min_dist, max_dist):
    """Count clustering violations of the bounded-distance constraints.

    Two members of the same cluster must not be farther apart than
    *max_dist*; two members of different clusters must not be closer than
    *min_dist*.  Every violation is logged.

    Args:
        dist_mat: pairwise distance matrix, indexed ``dist_mat[i, j]``.
        labels: array of cluster labels, one per measurement; clusters are
            numbered ``0 .. labels.max()``.
        min_dist: lower bound for distances between different clusters.
        max_dist: upper bound for distances inside one cluster.

    Returns:
        ``(intra_err_cnt, inter_err_cnt)``: the number of violating pairs
        inside clusters and between clusters.
    """
    num_clusters = int(labels.max() + 1)

    log.info('-' * 80)
    log.info('Intra-Cluster distance check.....')
    log.info('Condition: intra-cluster distance is upper-bounded by ' + str(round(max_dist, 2)))
    log.info('-' * 80)
    intra_err_cnt = 0
    for i in range(num_clusters):
        idx_set = np.nonzero(labels == i)[0]
        for idx_pair in pair_in_idx(idx_set):
            dist_val_ = dist_mat[idx_pair[0], idx_pair[1]]
            # Rule violation
            if dist_val_ > max_dist:
                log.info('*** the distance of pairs :' + str(idx_pair) + ' in ' + str(i) +
                         'th cluster ~' + str(np.round(dist_val_, 2)) +
                         ' > max_dist=' + str(np.round(max_dist, 2)) + '***')
                intra_err_cnt += 1

    log.info('-' * 80)
    log.info('Inter-Cluster distance check.....')
    log.info('Condition: inter-cluster distance is lower-bounded by ' + str(round(min_dist, 2)))
    log.info('-' * 80)
    inter_err_cnt = 0
    for c_pair in pair_in_idx(range(num_clusters)):
        idx_set_0 = np.nonzero(labels == c_pair[0])[0]
        idx_set_1 = np.nonzero(labels == c_pair[1])[0]
        for idx_pair in pair_in_idx(idx_set_0, idx_set_1):
            dist_val_ = dist_mat[idx_pair[0], idx_pair[1]]
            # Rule violation
            if dist_val_ < min_dist:
                # One concatenated message: a second positional argument
                # would be taken as a %-format argument by the logger.
                log.info('*** the distance of pairs :' + str(idx_pair[0]) + ' in ' +
                         str(c_pair[0]) + ' and ' + str(idx_pair[1]) + ' in ' +
                         str(c_pair[1]) + ' ~ ' + str(round(dist_val_, 2)) +
                         ' < min_dist=' + str(round(min_dist, 2)) + '***')
                # Count each violation once (was: cnt += cnt + 1).
                inter_err_cnt += 1

    return intra_err_cnt, inter_err_cnt
def construct_data_dict(sensor_data, ans_start_t, ans_end_t, timelet_inv, include_weather=1, PARALLEL=False):
    """Align all sensor (and optionally weather) readings on one time-slot grid.

    Args:
        sensor_data: mapping of sensor uuid to its raw reading.
        ans_start_t: start of the analysis period (first slot).
        ans_end_t: end of the analysis period (exclusive).
        timelet_inv: length of one time slot.
        include_weather: 1 loads the weather data and stores the sensor and
            weather lists; 2 and any other value only log a message.
        PARALLEL: sample the sensors in a process pool when True.

    Returns:
        ``(data_dict, purge_list)``; ``data_dict[uuid]`` is
        ``[slot times, slot values, [utc_t, val]]`` and ``purge_list`` holds
        the uuids that were rejected.
    """
    log.info('-' * 80)
    log.info('mapping sensor list into hasing table using dictionary')
    log.info('Align sensor data into a single time_slots referece... from ' +
             str(ans_start_t) + ' to ' + str(ans_end_t))
    log.info('-' * 80)

    # Slot start times covering [ans_start_t, ans_end_t)
    time_slots = list()
    slot_start = ans_start_t
    while slot_start < ans_end_t:
        time_slots.append(slot_start)
        slot_start = slot_start + timelet_inv

    # All sensor and weather data ends up in one consistent dictionary:
    # data_dict[key][time_slot_idx][(min_idx=0 or values=1)]
    data_dict = dict()
    sensor_list = list()
    purge_list = list()

    if not PARALLEL:
        num_slots = len(time_slots)
        for sensor_uuid, sensor_reading in sensor_data.iteritems():
            log.info('sampling sensor uuid ' + sensor_uuid)

            # sensor value is read by time
            dict_sensor_val, dict_stime, utc_t, val = get_val_timelet(
                sensor_reading, time_slots, ans_start_t, ans_end_t, timelet_inv)

            if dict_sensor_val == -1:
                purge_msg = 'append purge list: dict_sensor_val=-1 '
            elif len(utc_t) < num_slots:
                purge_msg = 'append purge list:len(utc_t)<len_time_slots'
            elif len(val) < num_slots:
                purge_msg = 'append purge list:len(val)<len_time_slots'
            else:
                purge_msg = None

            if purge_msg is not None:
                log.debug(purge_msg + sensor_uuid)
                purge_list.append(sensor_uuid)
            else:
                sensor_list.append(sensor_uuid)
                # Arrays instead of lists: smaller bin files, faster loading
                slot_values = np.array([np.asarray(val_) for val_ in dict_sensor_val])
                slot_times = np.array([np.asarray(t_) for t_ in dict_stime])
                data_dict.update({
                    sensor_uuid: [slot_times, slot_values, np.asarray([utc_t, val])]
                })
            log.info('-' * 20)
    else:
        log.info("construct_data_dict >>> Parallel enabled")
        args = [(sensor_uuid, sensor_reading, time_slots, ans_start_t, ans_end_t, timelet_inv)
                for sensor_uuid, sensor_reading in sensor_data.iteritems()]
        p = Pool(CPU_CORE_NUM)
        timed_vlist = p.map(pp_construct_data_dict, args)
        p.close()
        p.join()

        # An empty value marks a sensor the worker decided to purge.
        for sensor_uuid, timed_value in timed_vlist:
            if not len(timed_value):
                purge_list.append(sensor_uuid)
                continue
            sensor_list.append(sensor_uuid)
            data_dict.update({sensor_uuid: timed_value})

    data_dict.update({'time_slots': time_slots})
    log.info('-' * 40)

    if include_weather == 1:
        log.info("Construction weather dict")
        # weather_list is pretty much fixed by the database;
        # (*) marks the data used for the analysis:
        #  0 TimeEEST          1 TemperatureC (*)   2 Dew PointC (*)
        #  3 Humidity (*)      4 Sea Level PressurehPa
        #  5 VisibilityKm      6 Wind Direction     7 Wind SpeedKm/h
        #  8 Gust SpeedKm/h    9 Precipitationmm   10 Events (*)
        # 11 Conditions (*)   12 WindDirDegrees    13 DateUTC
        weather_list = get_weather_timelet(data_dict, time_slots, timelet_inv)

        # Convert symbols to integer representation
        data_dict['Conditions'][1], Conditions_dict = symbol_to_state(
            data_dict['Conditions'][1])
        data_dict['Events'][1], Events_dict = symbol_to_state(
            data_dict['Events'][1])

        data_dict.update({'sensor_list': sensor_list})
        data_dict.update({'weather_list': weather_list})
        data_dict.update({'Conditions_dict': Conditions_dict})
        data_dict.update({'Events_dict': Events_dict})

        # Change list to array type
        for key_id in weather_list:
            data_dict[key_id] = [np.asanyarray(part) for part in data_dict[key_id]]
    elif include_weather == 2:
        # use stored bin file -- not implemented yet
        log.info('use weather_dict.bin')
    else:
        log.info('skip weather database...')

    return data_dict, purge_list
def _save_bn_graph(bldg, sig_tag, bn_prob, title, hc_key, labels_key, file_tag):
    """Plot one Bayesian-network graph of *bn_prob* and save it as a PNG in FIG_DIR."""
    p_name = bn_prob['p_name']
    fig_name = title + p_name
    try:
        plt.figure(fig_name, figsize=(30.0, 30.0))
        rbn.nx_plot(bn_prob[hc_key], bn_prob[labels_key],
                    graph_layout='spring', node_text_size=30)
        plt.savefig(FIG_DIR + bldg.bldg_tag + '_' + p_name + '_' + sig_tag +
                    file_tag + get_pngid() + '.png', bbox_inches='tight')
    except Exception as e:
        # format_exc() returns the traceback text; print_exc() returns None.
        log.error(traceback.format_exc())
        log.error('error in ' + fig_name + ' ' + str(e))
    finally:
        # Close the figure on failure too, so broken plots do not pile up.
        plt.close()


def _save_peak_likelihood(bldg, sig_tag, bn_prob):
    """Plot the high/low peak likelihood curves of *bn_prob* and save them as a PNG."""
    p_name = bn_prob['p_name']
    fig_name = 'BN PEAK LH Analysis for Sensor-Time-Weather ' + p_name
    try:
        plt.figure(fig_name, figsize=(30.0, 30.0))
        plt.subplot(2, 1, 1)
        plt.plot(bn_prob['all_cause_symbol_xtick'], bn_prob['high_peak_prob'], '-^')
        plt.plot(bn_prob['all_cause_symbol_xtick'], bn_prob['low_peak_prob'], '-.v')
        plt.ylabel('Likelihood', fontsize=20)
        plt.xticks(bn_prob['all_cause_symbol_xtick'], bn_prob['all_cause_symbol_xlabel'],
                   rotation=270, fontsize=20)
        plt.tick_params(labelsize=20)
        plt.legend(('High Peak', 'Low Peak'), loc='center right', prop={'size': 25})
        plt.tick_params(labelsize=20)
        plt.grid()
        plt.ylim([-0.05, 1.05])
        plt.title('Likelihood of ' + str(remove_dot(p_name)) + ' given ' + '\n' +
                  str(remove_dot(bn_prob['all_cause_label'])), fontsize=20)
        plt.savefig(FIG_DIR + bldg.bldg_tag + '_' + p_name + '_' + sig_tag +
                    '_LH_sensor_time_weather' + get_pngid() + '.png', bbox_inches='tight')
    except Exception as e:
        log.error(traceback.format_exc())
        log.error('error in ' + fig_name + ' ' + str(e))
    finally:
        plt.close()


def plotting_bldg_bn(bldg):
    """Save the Bayesian-network figures for every analysis result of *bldg*.

    For each entry of ``bldg.anal_out`` four network graphs (sensors, time,
    weather, all combined) and one peak-likelihood plot are written to
    ``FIG_DIR``.  A failing figure is logged and skipped; the remaining
    figures are still produced.
    """
    # (figure title prefix, network key, label key, file name tag)
    graph_specs = (
        ('BN for Sensors ', 's_hc', 's_labels', '_bn_sensors'),
        ('BN for Time ', 't_hc', 't_labels', '_bn_time'),
        ('BN for Weather ', 'w_hc', 'w_labels', '_bn_weather'),
        ('BN for Sensor-Time-Weather ', 'all_hc', 'all_labels', '_bn_sensor_time_weather'),
    )

    plt.ioff()
    log.info('Getting anal_out from ' + bldg.bldg_tag)
    try:
        for sig_tag, anal_out in bldg.anal_out.items():
            for bn_prob in anal_out:
                for title, hc_key, labels_key, file_tag in graph_specs:
                    _save_bn_graph(bldg, sig_tag, bn_prob, title, hc_key,
                                   labels_key, file_tag)
                _save_peak_likelihood(bldg, sig_tag, bn_prob)
    except Exception as e:
        log.error(traceback.format_exc())
        log.error(str(e))
    plt.ion()
def verify_data_format(data_dict, PARALLEL=False):
    """Check that every sample of the used measurements is an int or a float.

    The checked keys are the weather fields at positions 1, 2, 3, 10 and 11
    of ``data_dict['weather_list']`` followed by ``data_dict['sensor_list']``.

    Args:
        data_dict: data dictionary; ``data_dict[key][1]`` holds the samples
            of *key* grouped by time slot.
        PARALLEL: run the check per key in a process pool when True.

    Returns:
        The list of offending entries, which is empty whenever the function
        returns.

    Raises:
        NameError: if any sample is ``[]`` or is neither int nor float.
    """
    log.info('Checking any inconsisent data format...')
    log.info('-' * 40)

    time_slots = data_dict['time_slots']
    all_weather = data_dict['weather_list']
    key_list = [all_weather[pos] for pos in [1, 2, 3, 10, 11]] + data_dict['sensor_list']
    list_of_wrong_data_format = list()

    if PARALLEL:
        manager = mp.Manager()
        q = manager.Queue()
        p = mp.Pool(CPU_CORE_NUM)
        param_list = [(key, data_dict[key][1], time_slots, q) for key in key_list]
        p.map(pp_verify_sensor_data_format, param_list)
        p.close()
        p.join()

        # Workers report offending entries through the shared queue.
        while not q.empty():
            item = q.get()
            log.warn('queue item: ' + str(item))
            list_of_wrong_data_format.append(item)
    else:
        for key in key_list:
            log.info('checking ' + str(key) + '...')
            for slot_no, samples in enumerate(data_dict[key][1]):
                for sample_no, each_sample in enumerate(samples):
                    # Only float or int is allowed: no [] and no 'N/A' strings
                    if each_sample == [] or not isinstance(each_sample, (int, float)):
                        list_of_wrong_data_format.append([key, slot_no, sample_no])
                        log.info(str(each_sample) + ' at ' + str(time_slots[slot_no]) +
                                 ' in ' + str(key))
        log.info('-' * 40)

    if len(list_of_wrong_data_format) > 0:
        log.critical('Inconsistent data format in the list of data_used')
        raise NameError('Inconsistent data format in the list of data_used')

    return list_of_wrong_data_format
def show_clusters(exemplars, labels, input_names):
    """Log the member names of every cluster, one line per cluster.

    Args:
        exemplars: unused.
        labels: array of cluster labels; clusters ``0 .. labels.max()`` are listed.
        input_names: array of names, indexed with the boolean mask ``labels == i``.
    """
    for cluster_id in range(labels.max() + 1):
        members = input_names[labels == cluster_id]
        # Clusters are reported 1-based.
        log.info('Cluster %i: %s' % ((cluster_id + 1), ', '.join(members)))
def _bn_anaylsis_all(bldg_obj, p_name, sig_tag='avg', num_picks_bn=15, learning_alg='hc'):
    """Learn one Bayesian network over the sensor, weather and time causes of *p_name*.

    The causes are first selected per attribute with ``_bn_anaylsis``; the
    combined network is then learned with a blacklist built from those
    labels via ``pair_in_idx``.

    Args:
        bldg_obj: building object providing ``sigtags[sig_tag]``.
        p_name: name of the target measurement; must be one of the sensor names.
        sig_tag: signal tag to analyse.
        num_picks_bn: forwarded to ``_bn_anaylsis``.
        learning_alg: ``'tabu'``, ``'mmhc'``, or anything else for hill climbing.

    Returns:
        ``(cause_label, cols, hc_b, amat, bndata_mat)``.
    """
    sig = bldg_obj.sigtags[sig_tag]
    p_idx = sig.names['sensor'].index(p_name)

    log.info('power - sensors + weather + time ...')
    per_attr = dict()
    for attr in ('sensor', 'time', 'weather'):
        per_attr[attr] = _bn_anaylsis(bldg_obj, p_name, attr=attr, sig_tag=sig_tag,
                                      num_picks_bn=num_picks_bn,
                                      learning_alg=learning_alg)
    # First element of each result is the list of cause labels.
    s_cause_label = per_attr['sensor'][0]
    t_cause_label = per_attr['time'][0]
    w_cause_label = per_attr['weather'][0]

    s_cause_idx = [sig.names['sensor'].index(name) for name in s_cause_label]
    t_cause_idx = [sig.names['time'].index(name) for name in t_cause_label]
    w_cause_idx = [sig.names['weather'].index(name) for name in w_cause_label]

    # Column order: target, sensor causes, weather causes, time causes
    stacked_rows = (sig.data_state_mat[:, p_idx].T,
                    sig.data_state_mat[:, s_cause_idx].T,
                    sig.data_weather_mat_[:, w_cause_idx].T,
                    sig.data_time_mat[:, t_cause_idx].T)
    bndata_mat = np.vstack(stacked_rows).T
    cols = [p_name] + s_cause_label + w_cause_label + t_cause_label

    # Arcs that the structure learning is not allowed to create
    b_arc_list = pair_in_idx([p_name], s_cause_label + w_cause_label + t_cause_label)
    b_arc_list = b_arc_list + pair_in_idx(s_cause_label, w_cause_label + t_cause_label)
    b_arc_list = b_arc_list + pair_in_idx(w_cause_label, t_cause_label)
    b_arc_list = b_arc_list + pair_in_idx(t_cause_label, t_cause_label)

    black_arc_frame = rbn.construct_arcs_frame(b_arc_list)
    factor_data_mat = rbn.convert_pymat_to_rfactor(bndata_mat)
    data_frame = rbn.construct_data_frame(factor_data_mat, cols)

    if learning_alg == 'tabu':
        learner = rbn.bnlearn.tabu
    elif learning_alg == 'mmhc':
        learner = rbn.bnlearn.mmhc
    else:
        learner = rbn.bnlearn.hc
    hc_b = learner(data_frame, blacklist=black_arc_frame, score='bic')

    amat = rbn.py_get_amat(hc_b)
    # Causes are the columns whose entry in the first column of amat is 1.
    cause_label = list(np.array(cols)[np.nonzero(amat[:, 0] == 1)[0]])
    return cause_label, cols, hc_b, amat, bndata_mat
def cluster_measurement_points(m_matrix, m_name, corr_bnd=(0.1, 0.9), alg='aff'):
    """Cluster the columns of *m_matrix* and keep one exemplar per cluster.

    Args:
        m_matrix: 2-D array with one measurement point per column.
        m_name: names of the columns of *m_matrix*.
        corr_bnd: ``(lower, upper)`` correlation bounds; converted to the
            ``max_dist``/``min_dist`` bounds handed to the ``'pack'`` algorithm.
        alg: ``'pack'`` for ``max_pack_cluster``; anything else uses
            affinity propagation.

    Returns:
        ``(exemplar_columns, exemplars_dict, exemplars_, labels_)`` where
        ``exemplars_dict`` maps each exemplar name to the names of the other
        members of its cluster.  When *m_matrix* has no columns the result is
        ``([], {}, [], [])``.
    """
    exemplars_dict = dict()
    num_points = m_matrix.shape[1]

    if num_points == 0:
        return [], exemplars_dict, [], []

    if num_points == 1:
        # Arrays, not lists: ``labels_ == label_id`` below must be an
        # element-wise comparison.
        exemplars_ = np.array([0])
        labels_ = np.array([0])
        exemplars_name = m_name
    else:
        distmat_input = find_norm_dist_matrix(m_matrix)
        # Find representative set of sensor measurements
        min_dist_ = np.sqrt(2 * (1 - corr_bnd[1]))
        max_dist_ = np.sqrt(2 * (1 - corr_bnd[0]))

        if alg == 'pack':
            log.info('use pack clustering algoirthm')
            exemplars_, labels_ = max_pack_cluster(distmat_input,
                                                   min_dist=min_dist_,
                                                   max_dist=max_dist_)
        else:
            log.info('use affinity clustering algoirthm')
            SIMM_MAT = 2 - distmat_input
            exemplars_, labels_ = cluster.affinity_propagation(SIMM_MAT, damping=0.5)

        num_clusters = int(labels_.max() + 1)
        log.info('-' * 40)
        log.info(str(num_clusters) + 'clusters out of ' + str(len(labels_)) + 'measurements')
        log.info('-' * 40)

        validity, intra_dist, inter_dist = compute_cluster_err(distmat_input, labels_)
        log.info('validity: ' + str(round(validity, 2)) + ', intra_dist: ' +
                 str(np.round(intra_dist, 2)) + ', inter_dist: ' +
                 str(np.round(inter_dist, 2)))
        log.info('-' * 40)
        exemplars_name = list(np.array(m_name)[exemplars_])

    for label_id, (m_idx, exemplar_label) in enumerate(zip(exemplars_, exemplars_name)):
        log.info(str(exemplar_label))
        # Members of this cluster other than the exemplar itself
        children_set = list(set(np.nonzero(labels_ == label_id)[0]) - set([m_idx]))
        log.info('Label ' + str(label_id) + ' : ' + str(m_idx) + '<--' + str(children_set))
        exemplars_dict.update({exemplar_label: list(np.array(m_name)[children_set])})

    return m_matrix[:, exemplars_], exemplars_dict, exemplars_, labels_
def plotting_bldg_lh(bldg, bldg_key=None, attr='sensor', num_picks=30):
    """Plot the most relevant attributes for every analysed point of *bldg*.

    For each signal tag (``'avg'``, ``'diff'``) and each point name, the
    ``num_picks`` attributes with the highest likelihood are plotted and the
    figure is saved in ``FIG_DIR``.

    Args:
        bldg: building object with ``bldg_tag``, ``sigtags`` and ``analysis``.
        bldg_key: building tag to plot; when empty, ``bldg.bldg_tag`` is used.
            Nothing is plotted when the key does not match ``bldg.bldg_tag``.
        attr: attribute group to plot (key of ``names`` / ``attrs``).
        num_picks: number of top attributes to show.
    """
    log.info('-' * 40)
    log.info('plotting lh for ' + attr)
    log.info('-' * 40)

    plt.ioff()
    bldg_tag_set = [bldg_key] if bldg_key else [bldg.bldg_tag]

    for bldg_tag in bldg_tag_set:
        if bldg_tag != bldg.bldg_tag:
            continue
        log.info('-' * 40)
        log.info(bldg_tag + " is to be plotted...")
        log.info('-' * 40)

        for sig_tag in ['avg', 'diff']:
            try:
                for pname in bldg.sigtags[sig_tag].p_names:
                    try:
                        pname = pname.replace('.', '_')
                    except AttributeError:
                        pass

                    optprob_set = None
                    for anal in bldg.analysis[sig_tag]:
                        if anal.sensor_tag == pname:
                            optprob_set = anal.attrs[attr].optprob_set
                            break
                    if optprob_set is None:
                        # One missing analysis must not abort the other points.
                        log.warn('no analysis found for ' + str(pname) + ', skipped')
                        continue

                    s_names = bldg.sigtags[sig_tag].names[attr]
                    # Highest likelihood first; honour the num_picks argument
                    # (it used to be overwritten with 30 here).
                    top_idx = np.argsort(optprob_set)[::-1][:num_picks]
                    sort_lh = optprob_set[top_idx].T
                    x_label = list(np.array(s_names)[top_idx])
                    x_ticks = range(len(x_label))

                    plt.figure(figsize=(20.0, 15.0))
                    plt.subplot(2, 1, 1)
                    plt.plot(sort_lh, '-*')
                    plt.xticks(x_ticks, x_label, rotation=270, fontsize="small")
                    if sig_tag == 'avg':
                        plt.title('Most relavant ' + attr +
                                  ' attributes to the peak (demand) of ' + pname,
                                  fontsize=20)
                    else:
                        plt.title('Most relavant ' + attr +
                                  ' attributes to the peak variations of ' + pname,
                                  fontsize=20)
                    plt.tick_params(labelsize='large')
                    plt.ylim([-0.05, 1.05])
                    plt.ylabel('Likelihood (From 0 to 1)', fontsize=18)
                    plt.savefig(FIG_DIR + bldg_tag + '_' + pname + '_' + attr + '_' +
                                sig_tag + '_lh_sensors.png', bbox_inches='tight')
                    plt.close()
            except Exception as e:
                # format_exc() returns the traceback text; print_exc() returns None.
                log.error(traceback.format_exc())
                log.error(str(e))

    plt.close()
    plt.ion()
def __init__(self, config, session):
    """Prepare output folders, build the model and the training ops.

    Args:
        config: configuration object; the attributes read here are method,
            alpha, dim, seed, scale, clean, p_target, learning_rate,
            lr_weight_decay and checkpoint.
        session: stored as ``self.session`` and used to restore a checkpoint.
    """
    self.config = config
    self.session = session

    # Run identifier of the form "<method>_<alpha>-dim_<dim>".
    prefix = '%s_%.2f' % (config.method, config.alpha)
    self.filepath = '%s-dim_%d' % (
        prefix,
        config.dim,
    )

    self.train_dir = './train_dir/seed_%d/scale_%d/%s' % (
        self.config.seed, self.config.scale, self.filepath)
    # NOTE(review): absolute, user-specific path is hard-coded here.
    self.fig_dir = '/home/dilin/Dropbox/tmp/figures/seed_%d/scale_%d/%s' % (
        self.config.seed, self.config.scale, self.filepath)

    for folder in [self.train_dir, self.fig_dir]:
        if not os.path.exists(folder):
            os.makedirs(folder)
        # clean train folder
        # (applies to both folders in the loop, not only train_dir)
        if self.config.clean:
            files = glob.glob(folder + '/*')
            for f in files:
                os.remove(f)

    log.infov("Train Dir: %s, Figure Dir: %s", self.train_dir, self.fig_dir)

    # --- create model ---
    self.p_target = config.p_target
    self.model = Model(config, self.p_target)

    # --- optimizer ---
    self.global_step = tf.Variable(0, name="global_step")

    self.learning_rate = config.learning_rate
    if config.lr_weight_decay:
        # Replace the constant rate with a staircase decay: x0.1 every
        # 10000 steps of global_step.
        self.learning_rate = tf.train.exponential_decay(
            self.learning_rate,
            global_step=self.global_step,
            decay_steps=10000,
            decay_rate=0.1,
            staircase=True,
            name='decaying_learning_rate')

    self.summary_op = tf.summary.merge_all()

    # NOTE(review): the Saver is constructed before optimize_adagrad() is
    # called; any variables that call creates are presumably not covered by
    # this Saver - confirm this is intended.
    self.saver = tf.train.Saver(max_to_keep=1)
    self.summary_writer = tf.summary.FileWriter(self.train_dir)

    self.checkpoint_secs = 300  # 5 min

    self.train_op = self.optimize_adagrad(
        self.model.loss,
        train_vars=self.model.q_train_vars,
        lr=self.learning_rate)

    # NOTE(review): .run() is called without a session argument, so this
    # presumably relies on a default session being active - confirm.
    tf.global_variables_initializer().run()
    if config.checkpoint is not None:
        self.ckpt_path = tf.train.latest_checkpoint(self.config.checkpoint)
        # latest_checkpoint result is checked for None before restoring.
        if self.ckpt_path is not None:
            log.info("Checkpoint path: %s", self.ckpt_path)
            self.saver.restore(self.session, self.ckpt_path)
            log.info(
                "Loaded the pretrain parameters from the provided checkpoint path"
            )
def _save_bn_graph_figure(bldg_tag, sig_tag, p_name, bn_prob,
                          title_prefix, hc_key, labels_key, file_tag):
    """Draw one network from ``bn_prob`` with ``rbn.nx_plot`` and save it as PNG.

    Failures are logged and swallowed so the remaining figures are still
    produced; the figure is closed in every case.
    """
    fig_name = title_prefix + p_name
    fig = None
    try:
        fig = plt.figure(fig_name, figsize=(30.0, 30.0))
        rbn.nx_plot(bn_prob[hc_key], bn_prob[labels_key],
                    graph_layout='spring', node_text_size=30)
        plt.savefig(FIG_DIR + bldg_tag + '_' + p_name + '_' + sig_tag
                    + file_tag + get_pngid() + '.png', bbox_inches='tight')
    except Exception as e:
        log.error(traceback.format_exc())
        log.error('error in ' + fig_name + ' ' + str(e))
    finally:
        # Close even on failure so 30x30 inch figures do not pile up.
        if fig is not None:
            plt.close(fig)


def _save_peak_lh_figure(bldg_tag, sig_tag, p_name, bn_prob):
    """Plot high/low peak likelihood per cause symbol and save it as PNG.

    Failures are logged and swallowed; the figure is closed in every case.
    """
    fig_name = 'BN PEAK LH Analysis for Sensor-Time-Weather ' + p_name
    fig = None
    try:
        fig = plt.figure(fig_name, figsize=(30.0, 30.0))
        plt.subplot(2, 1, 1)
        plt.plot(bn_prob['all_cause_symbol_xtick'], bn_prob['high_peak_prob'], '-^')
        plt.plot(bn_prob['all_cause_symbol_xtick'], bn_prob['low_peak_prob'], '-.v')
        plt.ylabel('Likelihood', fontsize=20)
        plt.xticks(bn_prob['all_cause_symbol_xtick'],
                   bn_prob['all_cause_symbol_xlabel'],
                   rotation=270, fontsize=20)
        plt.tick_params(labelsize=20)
        plt.legend(('High Peak', 'Low Peak'), loc='center right', prop={'size': 25})
        plt.grid()
        plt.ylim([-0.05, 1.05])
        plt.title('Likelihood of ' + str(remove_dot(p_name)) + ' given ' + '\n'
                  + str(remove_dot(bn_prob['all_cause_label'])), fontsize=20)
        plt.savefig(FIG_DIR + bldg_tag + '_' + p_name + '_' + sig_tag
                    + '_LH_sensor_time_weather' + get_pngid() + '.png',
                    bbox_inches='tight')
    except Exception as e:
        log.error(traceback.format_exc())
        log.error('error in ' + fig_name + ' ' + str(e))
    finally:
        if fig is not None:
            plt.close(fig)


def plotting_bldg_bn(bldg):
    """Save the Bayesian-network and peak-likelihood figures of a building.

    For every ``(sig_tag, anal_out)`` pair in ``bldg.anal_out`` and every
    entry of ``anal_out``, five PNG files are written under ``FIG_DIR``: the
    sensor, time, weather and combined networks, then the peak likelihood
    plot.  Each figure is attempted independently; errors are logged and do
    not propagate.

    Args:
        bldg: building object; ``bldg_tag`` and ``anal_out`` are read.
    """
    # (figure title prefix, network key, labels key, file name tag)
    graph_specs = (
        ('BN for Sensors ', 's_hc', 's_labels', '_bn_sensors'),
        ('BN for Time ', 't_hc', 't_labels', '_bn_time'),
        ('BN for Weather ', 'w_hc', 'w_labels', '_bn_weather'),
        ('BN for Sensor-Time-Weather ', 'all_hc', 'all_labels',
         '_bn_sensor_time_weather'),
    )

    plt.ioff()
    log.info('Getting anal_out from ' + bldg.bldg_tag)
    try:
        # items() works on both Python 2 and 3, unlike iteritems().
        for sig_tag, anal_out in bldg.anal_out.items():
            for bn_prob in anal_out:
                p_name = bn_prob['p_name']
                for title_prefix, hc_key, labels_key, file_tag in graph_specs:
                    _save_bn_graph_figure(bldg.bldg_tag, sig_tag, p_name, bn_prob,
                                          title_prefix, hc_key, labels_key, file_tag)
                _save_peak_lh_figure(bldg.bldg_tag, sig_tag, p_name, bn_prob)
    except Exception as e:
        log.error(traceback.format_exc())
        log.error(str(e))

    plt.ion()
def ddea_process(sensor_names_hash, sensor_data, start_time, end_time,
                 timelet_inv, bldg_key, pname_key, plot_analysis=False):
    """Run the whole analysis pipeline for one building.

    Stages, in order: build and verify the per-sensor data dictionary,
    summarize it, export the summaries as JSON, create the building object,
    run the Bayesian-network probability analysis for each available feature
    set, export the resulting graph as JSON and optionally plot the results.
    Binary dumps are written along the way when ``SAVE_PROC_BIN`` is set.

    Args:
        sensor_names_hash: passed through to the JSON export helpers.
        sensor_data: raw sensor readings handed to ``construct_data_dict``.
        start_time, end_time: timestamps converted with
            ``datetime.fromtimestamp``.
        timelet_inv: passed through to ``construct_data_dict``.
        bldg_key: building identifier; its lower-cased form prefixes the
            binary dump file names.
        pname_key: passed to ``pbp.create_bldg_object``.
        plot_analysis: when true, the plotting routines are run at the end.

    Raises:
        NameError: if ``verify_data_format`` reports any entry.
    """
    from log_util import log

    def _stage(title):
        # Frame a stage title between two full-width '#' rules.
        log.info('#' * 80)
        log.info(title)
        log.info('#' * 80)

    # ---------------------------------------------------------------- stage 1
    _stage('# Data Pre-Processing')

    window_start = dt.datetime.fromtimestamp(start_time)
    window_end = dt.datetime.fromtimestamp(end_time)

    sensor_dict, _purged = construct_data_dict(
        sensor_data, window_start, window_end, timelet_inv,
        PARALLEL=IS_USING_PARALLEL_OPT)

    log.info('-' * 40)
    log.info('VERIFY DATA FORMAT...')
    log.info('-' * 40)

    # Sanity check on the output of construct_data_dict.
    malformed = verify_data_format(sensor_dict, PARALLEL=IS_USING_PARALLEL_OPT)
    num_malformed = len(malformed)
    if num_malformed > 0:
        log.critical('Measurement list below')
        log.critical('-' * 40)
        log.critical(str(malformed))
        raise NameError('Errors in data format')

    if SAVE_PROC_BIN:
        log.info("Saving data_dict in bin format...")
        data_dict_path = PROC_OUT_DIR + '/' + bldg_key.lower() + '_data_dict.bin'
        mt.saveObjectBinaryFast(sensor_dict, data_dict_path)

    # ---------------------------------------------------------------- stage 2
    _stage('DATA SUMMARIZATION...')

    summary = data_summerization(bldg_key, sensor_dict,
                                 PARALLEL=IS_USING_PARALLEL_OPT)

    if SAVE_PROC_BIN:
        log.info("Saving summarized building data in bin format...")
        summary_path = PROC_OUT_DIR + bldg_key.lower() + '_ds_out.bin'
        mt.saveObjectBinaryFast(summary, summary_path)

    # Which feature sets did the summarization produce?
    summary_keys = summary.keys()
    has_avg = 'avgdata_dict' in summary_keys
    has_diff = 'diffdata_dict' in summary_keys

    if has_avg and has_diff:
        log.info("Saving summarized building data in JSON format...")
        save_processed_json(sensor_names_hash, summary)

    if has_avg:
        save_avg_data_summary_json(bldg_key, sensor_names_hash,
                                   summary['avgdata_dict'])

    if has_diff:
        save_diff_data_summary_json(bldg_key, sensor_names_hash,
                                    summary['diffdata_dict'])

    # ---------------------------------------------------------------- stage 3
    _stage('MODEL DISCOVERY...')
    log.info('Building for ' + bldg_key + '....')

    bldg = pbp.create_bldg_object(summary, bldg_key, pname_key,
                                  PARALLEL=IS_USING_PARALLEL_OPT)

    # One probability analysis per available feature set, 'avg' first.
    for available, tag in ((has_avg, 'avg'), (has_diff, 'diff')):
        if available:
            result = pbp.bn_probability_analysis(bldg, sig_tag=tag)
            bldg.anal_out.update({tag: result})

    if SAVE_PROC_BIN:
        log.info("Saving building graph in bin format...")
        bldg_path = PROC_OUT_DIR + bldg_key.lower() + '_bldg_out.bin'
        mt.saveObjectBinaryFast(bldg, bldg_path)

    log.info("Saving building graph in JSON format...")
    all_labels, all_edges = conv_bn_graph_json(bldg)
    save_bn_graph_json(bldg_key, sensor_names_hash, all_labels, all_edges)

    # ---------------------------------------------------------------- stage 4
    if not plot_analysis:
        return

    _stage('ANALYTICS PLOTTING...')
    for family in ('sensor', 'time', 'weather'):
        pbp.plotting_bldg_lh(bldg, attr=family, num_picks=30)
    pbp.plotting_bldg_bn(bldg)
def weather_convert(wdata_mat, wdata_name, Conditions_dict, Events_dict, PARALLEL=False):
    """Map the weather columns of ``wdata_mat`` to discrete states, in place.

    Temperature, dew point and humidity columns are clustered with
    ``state_retrieval`` and every sample is replaced by its rounded cluster
    centre.  Condition and event codes are regrouped into a small set of
    classes.

    Args:
        wdata_mat: 2-D weather matrix; modified in place and also returned.
        wdata_name: column names of ``wdata_mat``.
        Conditions_dict: condition string -> code currently stored in the
            'Conditions' column.
        Events_dict: event string -> code currently stored in the 'Events'
            column.
        PARALLEL: forwarded to ``state_retrieval``.

    Returns:
        ``(wdata_mat, weather_dict)`` where ``weather_dict`` holds the cluster
        centres under 'Temp', 'Dewp', 'Humd' and the class tables under
        'Cond' and 'Event'.
    """
    def _column_of(field):
        # Index of `field` in wdata_name; an empty list when the lookup fails.
        try:
            return wdata_name.index(field)
        except:
            return list()

    temp_idx = _column_of('TemperatureC')
    dewp_idx = _column_of('Dew_PointC')
    humd_idx = _column_of('Humidity')
    evnt_idx = _column_of('Events')
    cond_idx = _column_of('Conditions')

    weather_dict = dict()

    # ------------------------------------------------------------------
    # Continuous columns: cluster, then snap each sample to its centre.
    # ------------------------------------------------------------------
    tagged_columns = ((temp_idx, 'Temp'), (dewp_idx, 'Dewp'), (humd_idx, 'Humd'))
    for class_idx in [temp_idx, dewp_idx, humd_idx]:
        obs = wdata_mat[:, class_idx][:, np.newaxis]
        label, opt_num_cluster, model, score, score_err_sum = state_retrieval(
            obs, max_num_cluster=30, off_set=1, est_method='kmean',
            PARALLEL=PARALLEL)

        # First tag whose column index equals the current one wins.
        for known_idx, tag in tagged_columns:
            if class_idx == known_idx:
                log.info(tag + ' state classification...')
                weather_dict.update({tag: model.cluster_centers_})
                break
        else:
            log.info('not found')

        label_id = 0
        num_labels = label.max() + 1
        while label_id < num_labels:
            members = np.nonzero(label == label_id)[0]
            wdata_mat[members, class_idx] = np.round(
                model.cluster_centers_[label_id][0])
            label_id += 1

    # ------------------------------------------------------------------
    # Conditions: position in this list is the new class code.
    # ------------------------------------------------------------------
    cond_state = [
        ['Unknown'],
        ['Snow', 'Snow Showers', 'Light Snow', 'Light Snow Grains',
         'Light Snow Showers'],
        ['Ice Pellets', 'Ice Crystals', 'Light Ice Crystals',
         'Light Ice Pellets'],
        ['Rain', 'Rain Showers', 'Thunderstorms and Rain', 'Heavy Rain',
         'Heavy Rain Showers', 'Freezing Rain', 'Light Freezing Rain',
         'Light Rain Showers', 'Light Rain', 'Light Thunderstorms and Rain'],
        ['Drizzle', 'Heavy Drizzle', 'Light Drizzle', 'Light Freezing Drizzle'],
        ['Fog', 'Mist', 'Shallow Fog', 'Patches of Fog', 'Light Freezing Fog'],
        ['Mostly Cloudy', 'Overcast'],
        ['Partly Cloudy', 'Scattered Clouds'],
        ['Clear'],
    ]

    # Matching is done against a snapshot so new codes are never re-matched.
    cond_snapshot = wdata_mat[:, cond_idx].copy()
    log.info('Condition state classification...')
    for new_code, members in enumerate(cond_state):
        for cond_str in members:
            if cond_str not in Conditions_dict.keys():
                continue
            old_code = Conditions_dict[cond_str]
            rows = np.nonzero(cond_snapshot == old_code)[0]
            if len(rows) > 0:
                wdata_mat[rows, cond_idx] = new_code

    weather_dict.update({'Cond': {
        'Clear': 8,
        'Cloudy': 7,
        'Overcast': 6,
        'Fog': 5,
        'Drizzle': 4,
        'Rain': 3,
        'Ice': 2,
        'Snow': 1,
        'Unknown': 0,
    }})

    # ------------------------------------------------------------------
    # Events: position in this list is the new class code.
    # ------------------------------------------------------------------
    event_state = [
        [''],
        ['Rain-Snow', 'Snow', 'Fog-Snow'],
        ['Rain', 'Thunderstorm', 'Rain-Thunderstorm'],
        ['Fog', 'Fog-Rain'],
    ]

    log.info('Event state classification...')
    event_snapshot = wdata_mat[:, evnt_idx].copy()
    for new_code, members in enumerate(event_state):
        for event_str in members:
            if event_str not in Events_dict.keys():
                continue
            old_code = Events_dict[event_str]
            rows = np.nonzero(event_snapshot == old_code)[0]
            if len(rows) > 0:
                wdata_mat[rows, evnt_idx] = new_code

    weather_dict.update({'Event': {
        'NoEvent': 0,
        'Snow': 1,
        'Rain': 2,
        'Fog': 3,
    }})

    return wdata_mat, weather_dict
def build_diff_matrix(measurement_point_set, time_slots, num_type_set,
                      irr_data_name, conf_lev=0.5, PARALLEL=False):
    """Build the differential feature matrix, one column per usable sensor.

    Each measurement set is turned into a per-time-slot feature with
    ``get_diff`` (serial path) or ``build_diff`` (parallel path).  Features
    flagged with status -1 are collected in the zero-variance lists; the rest
    become columns of the result.  Time slots holding ``inf`` in any column
    are then removed from both the matrix and the time axis.

    Args:
        measurement_point_set: iterable of per-sensor measurement sets.
        time_slots: time axis; its length is the number of samples.
        num_type_set: per-sensor numeric type (``FLOAT_TYPE`` / ``INT_TYPE``).
        irr_data_name: per-sensor names, aligned with the other inputs.
        conf_lev: forwarded to ``get_diff`` / ``build_diff``.
        PARALLEL: when true, features are computed in a process pool.

    Returns:
        Tuple ``(Xdiff, new_time_slot, input_names, zero_var_list,
        zero_var_val, INT_type_list, INT_type_idx, FLOAT_type_list,
        FLOAT_type_idx)``.
    """
    diff_columns = list()
    input_names = list()
    INT_type_list = list()
    FLOAT_type_list = list()
    INT_type_idx = list()
    FLOAT_type_idx = list()
    zero_var_list = list()  # whose variance is zero, hence carry no information
    zero_var_val = list()
    num_of_samples = len(time_slots)

    def _record(set_name, num_type, output_status, norm_diff_mean):
        # File one feature under the zero-variance or the usable lists.
        if output_status == -1:
            zero_var_list.append(set_name)
            zero_var_val.append(norm_diff_mean)
            log.warn('too small variance for float type or a single value '
                     'for int type, added to zero var list')
            return
        input_names.append(set_name)
        diff_columns.append(norm_diff_mean)
        if num_type == FLOAT_TYPE:
            FLOAT_type_list.append(set_name)
            FLOAT_type_idx.append(len(diff_columns) - 1)
        elif num_type == INT_TYPE:
            INT_type_list.append(set_name)
            INT_type_idx.append(len(diff_columns) - 1)

    log.info('=' * 40)
    if not PARALLEL:
        for k, (set_val, set_name) in enumerate(zip(measurement_point_set, irr_data_name)):
            log.info(str(irr_data_name[k]))
            num_type = None
            try:
                num_type = num_type_set[k]
                diff_mean = get_diff(set_val, time_slots, num_type, conf_lev)

                if num_type == FLOAT_TYPE:
                    norm_diff_mean, output_status = normalize_data(diff_mean)
                elif num_type == INT_TYPE:
                    num_discrete_vals = len(set(list(diff_mean)))
                    log.info('num_discrete_vals : ' + str(num_discrete_vals))
                    if num_discrete_vals > 1:
                        output_status = 0
                        norm_diff_mean = diff_mean
                    else:
                        output_status = -1
                        norm_diff_mean = list(set(diff_mean))
                else:
                    # Without this, the values of the previous iteration
                    # would silently be reused for this sensor.
                    raise ValueError('unsupported num_type: ' + str(num_type))

                # asarray: the single-value int case yields a plain list,
                # which would compare to inf as one scalar.
                num_inf = np.count_nonzero(np.asarray(norm_diff_mean) == np.inf)
                if num_inf > num_of_samples / 5:
                    raise ValueError('too many time slots without samples: '
                                     + str(num_inf))

            except Exception as e:
                log.error(traceback.format_exc())
                log.error('Error in processing data feature, excluded from analysis ' + str(e))
                output_status = -1
                norm_diff_mean = None

            _record(set_name, num_type, output_status, norm_diff_mean)
            log.info('-' * 20)

        log.info('-' * 40)

    # PARALLEL ENABLED
    else:
        log.info('Build diff matrix: Parallel enabled...')

        # Construct param list for workers
        param_list = list()
        for k, (set_val, set_name) in enumerate(zip(measurement_point_set, irr_data_name)):
            param_list.append((k, time_slots, conf_lev, set_val, set_name, num_type_set[k]))

        p = mp.Pool(CPU_CORE_NUM)
        try:
            ret_dict = dict(p.map(build_diff, param_list))
        finally:
            # Release the workers even when a task fails.
            p.close()
            p.join()

        for k in sorted(ret_dict.keys()):
            output_status, norm_diff_mean = ret_dict[k]
            _record(irr_data_name[k], num_type_set[k], output_status, norm_diff_mean)
            log.info('-' * 20)

    Xdiff = np.array(diff_columns).T

    log.info('-' * 20)
    log.info('removing time slots having no sample...')

    # Rows (time slots) that hold +inf in at least one column.
    inf_rows = np.nonzero(Xdiff == np.inf)[0]
    deleted_timeslot_idx = np.unique(inf_rows).astype(int)

    log.info('time slots ' + str(deleted_timeslot_idx) + ' removed...')
    log.info('-' * 20)

    Xdiff = np.delete(Xdiff, deleted_timeslot_idx, axis=0)
    new_time_slot = np.delete(time_slots, deleted_timeslot_idx)

    # Checking whether it has any ill entry value
    verify_data_mat(Xdiff)

    log.info('*-' * 20)
    log.info("* deleted_timeslot_idx : " + str(deleted_timeslot_idx))
    log.info('*-' * 20)

    return (Xdiff,
            new_time_slot,
            input_names,
            zero_var_list,
            zero_var_val,
            INT_type_list,
            INT_type_idx,
            FLOAT_type_list,
            FLOAT_type_idx)
def _bn_anaylsis(bldg_obj, p_name, attr='sensor', sig_tag='avg',
                 num_picks_bn=15, learning_alg='hc'):
    """Learn a Bayesian network linking one power meter to a set of factors.

    Args:
        bldg_obj: building object; ``sigtags`` and ``analysis`` are read.
        p_name: name of the power meter; must be listed in the sensor names
            and have a matching entry in ``bldg_obj.analysis[sig_tag]``.
        attr: 'sensor', 'weather' or 'time' - which factors to include.
        sig_tag: key into ``sigtags`` / ``analysis``.
        num_picks_bn: number of top-ranked sensors used when attr='sensor'.
        learning_alg: 'tabu' or 'mmhc'; any other value uses ``hc``.

    Returns:
        ``(cause_label, cols, hc_b, amat, bndata_mat)``, or ``0`` when
        ``attr`` is not one of the supported values.

    Raises:
        Exception: if no analysis entry provides data for ``p_name``.
    """
    sig = bldg_obj.sigtags[sig_tag]
    s_names = sig.names['sensor']
    p_idx = s_names.index(p_name)
    state_mat = sig.data_state_mat

    # First analysis entry that belongs to this power meter, if any.
    match = next((entry for entry in bldg_obj.analysis[sig_tag]
                  if entry.sensor_tag == p_name), None)
    if match is None:
        optprob_set = optstate_set = None
    else:
        optprob_set = match.attrs[attr].optprob_set
        optstate_set = match.attrs[attr].optstate_set

    if optprob_set is None or optstate_set is None:
        raise Exception("Invalid p_name", p_name)

    # Indices ordered by decreasing probability.
    sort_idx = np.argsort(optprob_set)[::-1]

    if attr == 'sensor':
        log.info('power - sensors...')
        idx_select = [p_idx] + list(sort_idx[:num_picks_bn])
        cols = [s_names[k] for k in idx_select]
        bndata_mat = state_mat[:, idx_select]
        b_arc_list = pair_in_idx([cols[0]], cols[1:])

    elif attr == 'weather':
        log.info('power - weather...')
        cols = [p_name] + list(sig.names['weather'])
        bndata_mat = np.vstack((state_mat[:, p_idx].T,
                                sig.data_weather_mat.T)).T
        b_arc_list = pair_in_idx([cols[0]], cols[1:])

    elif attr == 'time':
        log.info('power - time...')
        cols = [p_name] + list(sig.names['time'])
        bndata_mat = np.vstack((state_mat[:, p_idx].T,
                                sig.data_time_mat.T)).T
        b_arc_list = (pair_in_idx([cols[0]], cols[1:])
                      + pair_in_idx(cols[1:], cols[1:]))

    else:
        log.info('error')
        return 0

    # Structure learning through the R bridge, with the arcs above blacklisted.
    black_arc_frame = rbn.construct_arcs_frame(b_arc_list)
    factor_data_mat = rbn.convert_pymat_to_rfactor(bndata_mat)
    data_frame = rbn.construct_data_frame(factor_data_mat, cols)

    learner_name = learning_alg if learning_alg in ('tabu', 'mmhc') else 'hc'
    learner = getattr(rbn.bnlearn, learner_name)
    hc_b = learner(data_frame, blacklist=black_arc_frame, score='bic')

    amat = rbn.py_get_amat(hc_b)
    # Columns whose entry in the first column of amat equals 1.
    cause_rows = np.nonzero(amat[:, 0] == 1)[0]
    cause_label = list(np.array(cols)[cause_rows])
    # Result is not used below; the lookup is retained so behaviour is unchanged.
    cause_idx = [cols.index(name) for name in cause_label]

    return cause_label, cols, hc_b, amat, bndata_mat
def weather_convert(wdata_mat, wdata_name, Conditions_dict, Events_dict, PARALLEL=False):
    """Map the weather columns of ``wdata_mat`` to discrete states, in place.

    Temperature, dew point and humidity columns are clustered with
    ``state_retrieval`` and every sample is replaced by its rounded cluster
    centre.  Condition and event codes are regrouped into coarse classes.
    A column that is absent from ``wdata_name`` is skipped with a warning
    instead of being used as an (empty) column index.

    Args:
        wdata_mat: 2-D weather matrix; modified in place and also returned.
        wdata_name: column names of ``wdata_mat``.
        Conditions_dict: condition string -> code currently stored in the
            'Conditions' column.
        Events_dict: event string -> code currently stored in the 'Events'
            column.
        PARALLEL: forwarded to ``state_retrieval``.

    Returns:
        ``(wdata_mat, weather_dict)``.  ``weather_dict`` always contains the
        'Cond' and 'Event' class tables, plus the cluster centres under
        'Temp', 'Dewp' and 'Humd' for the columns that were present.
    """
    names = list(wdata_name)

    def _column_of(field):
        # Index of `field` among the column names, or None when absent.
        try:
            return names.index(field)
        except ValueError:
            return None

    def _regroup(col_idx, groups, code_of):
        # Rewrite column `col_idx` so that every old code listed in
        # groups[k] becomes k.  Matching uses a snapshot of the column so a
        # freshly written code is never matched again.
        if col_idx is None:
            return
        snapshot = wdata_mat[:, col_idx].copy()
        for new_code, members in enumerate(groups):
            for member in members:
                if member not in code_of:
                    continue
                rows = np.nonzero(snapshot == code_of[member])[0]
                if len(rows) > 0:
                    wdata_mat[rows, col_idx] = new_code

    weather_dict = dict()

    ############################################################################
    # Weather state classification
    ############################################################################
    continuous = (('Temp', _column_of('TemperatureC')),
                  ('Dewp', _column_of('Dew_PointC')),
                  ('Humd', _column_of('Humidity')))
    evnt_idx = _column_of('Events')
    cond_idx = _column_of('Conditions')

    for tag, class_idx in continuous:
        if class_idx is None:
            log.warn(tag + ' column not found in weather data, skipped')
            continue

        obs = wdata_mat[:, class_idx][:, np.newaxis]
        label, opt_num_cluster, model, score, score_err_sum = \
            state_retrieval(obs, max_num_cluster=30, off_set=1,
                            est_method='kmean', PARALLEL=PARALLEL)

        log.info(tag + ' state classification...')
        weather_dict.update({tag: model.cluster_centers_})

        # Replace every sample by the rounded centre of its cluster.
        for label_id in range(label.max() + 1):
            label_idx = np.nonzero(label == label_id)[0]
            wdata_mat[label_idx, class_idx] = np.round(
                model.cluster_centers_[label_id][0])

    ##################################################
    # Reclassify the Condition states into clarity of the sky
    ##################################################
    # List position is the new class code (see the 'Cond' table below).
    cond_state = [
        # 0: Unknown
        ['Unknown'],
        # 1: Snow
        ['Snow', 'Snow Showers', 'Light Snow', 'Light Snow Grains',
         'Light Snow Showers'],
        # 2: Ice
        ['Ice Pellets', 'Ice Crystals', 'Light Ice Crystals',
         'Light Ice Pellets'],
        # 3: Rain
        ['Rain', 'Rain Showers', 'Thunderstorms and Rain', 'Heavy Rain',
         'Heavy Rain Showers', 'Freezing Rain', 'Light Freezing Rain',
         'Light Rain Showers', 'Light Rain', 'Light Thunderstorms and Rain'],
        # 4: Drizzle
        ['Drizzle', 'Heavy Drizzle', 'Light Drizzle', 'Light Freezing Drizzle'],
        # 5: Fog
        ['Fog', 'Mist', 'Shallow Fog', 'Patches of Fog', 'Light Freezing Fog'],
        # 6: Overcast
        ['Mostly Cloudy', 'Overcast'],
        # 7: Cloudy
        ['Partly Cloudy', 'Scattered Clouds'],
        # 8: Clear
        ['Clear'],
    ]

    log.info('Condition state classification...')
    _regroup(cond_idx, cond_state, Conditions_dict)

    weather_dict.update({'Cond': {
        'Clear': 8,
        'Cloudy': 7,
        'Overcast': 6,
        'Fog': 5,
        'Drizzle': 4,
        'Rain': 3,
        'Ice': 2,
        'Snow': 1,
        'Unknown': 0,
    }})

    ####################################################################
    # Reclassify the Event states into rain/snow/fog weather conditons
    ####################################################################
    # List position is the new class code (see the 'Event' table below).
    event_state = [
        # 0: No event
        [''],
        # 1: Snow
        ['Rain-Snow', 'Snow', 'Fog-Snow'],
        # 2: Rain
        ['Rain', 'Thunderstorm', 'Rain-Thunderstorm'],
        # 3: Fog
        ['Fog', 'Fog-Rain'],
    ]

    log.info('Event state classification...')
    _regroup(evnt_idx, event_state, Events_dict)

    weather_dict.update({'Event': {
        'NoEvent': 0,
        'Snow': 1,
        'Rain': 2,
        'Fog': 3,
    }})

    return wdata_mat, weather_dict
def _bn_anaylsis(bldg_obj, p_name, attr='sensor', sig_tag='avg',
                 num_picks_bn=15, learning_alg='hc'):
    """Fit a Bayesian network between a power meter and candidate causes.

    The power meter column is always placed first; the remaining columns
    depend on ``attr``: the ``num_picks_bn`` top-ranked sensors ('sensor'),
    all weather columns ('weather') or all time columns ('time').  A
    blacklist built with ``pair_in_idx`` is handed to the learner.

    Args:
        bldg_obj: building object; ``sigtags`` and ``analysis`` are read.
        p_name: power meter name, looked up in the sensor names.
        attr: 'sensor', 'weather' or 'time'.
        sig_tag: key into ``sigtags`` / ``analysis``.
        num_picks_bn: how many ranked sensors to keep when attr='sensor'.
        learning_alg: 'tabu' or 'mmhc'; anything else selects ``hc``.

    Returns:
        ``(cause_label, cols, hc_b, amat, bndata_mat)``; ``0`` for an
        unsupported ``attr``.

    Raises:
        Exception: when the analysis list has no usable entry for ``p_name``.
    """
    sensor_names = bldg_obj.sigtags[sig_tag].names['sensor']
    power_col = sensor_names.index(p_name)
    data_state_mat = bldg_obj.sigtags[sig_tag].data_state_mat

    probs = None
    states = None
    for entry in bldg_obj.analysis[sig_tag]:
        if entry.sensor_tag != p_name:
            continue
        probs = entry.attrs[attr].optprob_set
        states = entry.attrs[attr].optstate_set
        break

    if probs is None or states is None:
        raise Exception("Invalid p_name", p_name)

    ranked = np.argsort(probs)[::-1]

    if attr not in ('sensor', 'weather', 'time'):
        log.info('error')
        return 0

    def _power_plus(extra_mat):
        # Power meter column followed by the columns of `extra_mat`.
        power = bldg_obj.sigtags[sig_tag].data_state_mat[:, power_col].T
        return np.vstack((power, extra_mat.T)).T

    if attr == 'sensor':
        log.info('power - sensors...')
        picked = [power_col] + list(ranked[:num_picks_bn])
        cols = [sensor_names[i] for i in picked]
        bndata_mat = bldg_obj.sigtags[sig_tag].data_state_mat[:, picked]
        b_arc_list = pair_in_idx([cols[0]], cols[1:])
    elif attr == 'weather':
        log.info('power - weather...')
        weather_names = bldg_obj.sigtags[sig_tag].names['weather']
        cols = [p_name] + [name for name in weather_names]
        bndata_mat = _power_plus(bldg_obj.sigtags[sig_tag].data_weather_mat)
        b_arc_list = pair_in_idx([cols[0]], cols[1:])
    else:
        log.info('power - time...')
        time_names = bldg_obj.sigtags[sig_tag].names['time']
        cols = [p_name] + [name for name in time_names]
        bndata_mat = _power_plus(bldg_obj.sigtags[sig_tag].data_time_mat)
        head_arcs = pair_in_idx([cols[0]], cols[1:])
        tail_arcs = pair_in_idx(cols[1:], cols[1:])
        b_arc_list = head_arcs + tail_arcs

    # Hand the data and the blacklist over to the R structure learner.
    blacklist = rbn.construct_arcs_frame(b_arc_list)
    r_factors = rbn.convert_pymat_to_rfactor(bndata_mat)
    r_frame = rbn.construct_data_frame(r_factors, cols)

    if learning_alg == 'tabu':
        hc_b = rbn.bnlearn.tabu(r_frame, blacklist=blacklist, score='bic')
    elif learning_alg == 'mmhc':
        hc_b = rbn.bnlearn.mmhc(r_frame, blacklist=blacklist, score='bic')
    else:
        hc_b = rbn.bnlearn.hc(r_frame, blacklist=blacklist, score='bic')

    amat = rbn.py_get_amat(hc_b)
    first_col_hits = np.nonzero(amat[:, 0] == 1)[0]
    cause_label = list(np.array(cols)[first_col_hits])
    # Unused afterwards; kept so the rewrite does exactly what the original did.
    cause_idx = [cols.index(lbl) for lbl in cause_label]

    return cause_label, cols, hc_b, amat, bndata_mat