Example #1
def pp_construct_data_dict(args):
    (sensor_uuid, sensor_reading, time_slots, ans_start_t, ans_end_t, timelet_inv) = args
    log.info('sampling sensor uuid ' + sensor_uuid)
    log.info('-' * 20)

    len_time_slots = len(time_slots)

    # sensor value is read by time
    dict_sensor_val, dict_stime, utc_t, val =\
        get_val_timelet(sensor_reading, time_slots, ans_start_t, ans_end_t, timelet_inv)

    if dict_sensor_val == -1:
        log.debug('append purge list: dict_sensor_val=-1 ' + sensor_uuid)
        # return an empty list to indicate that this uuid has to be purged
        return (sensor_uuid, [])

    if len(utc_t) < len_time_slots:
        log.debug('append purge list: len(utc_t) < len_time_slots ' + sensor_uuid)
        return (sensor_uuid, [])

    if len(val) < len_time_slots:
        log.debug('append purge list: len(val) < len_time_slots ' + sensor_uuid)
        return (sensor_uuid, [])

    # convert to per-slot numpy arrays only after the purge checks:
    # get_val_timelet returns the scalar -1 on failure, which is not iterable
    dict_sensor_val_temp = np.array([np.asarray(v) for v in dict_sensor_val])
    dict_stime_temp = np.array([np.asarray(t) for t in dict_stime])
    utc_t_val_temp = np.asarray([utc_t, val])

    return (sensor_uuid, [dict_stime_temp, dict_sensor_val_temp, utc_t_val_temp])
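The `pp_` prefix suggests this function runs as a multiprocessing worker over one `(uuid, reading)` pair per sensor. A minimal driver sketch, assuming `readings` maps each uuid to its reading list and that the remaining arguments are prepared as in the source project (the driver itself is illustrative, not part of the project):

# Illustrative driver sketch (assumption: not part of the source project).
from multiprocessing import Pool

def run_construct_data_dict(readings, time_slots, ans_start_t, ans_end_t, timelet_inv):
    args_list = [(uuid, reading, time_slots, ans_start_t, ans_end_t, timelet_inv)
                 for uuid, reading in readings.items()]
    pool = Pool(processes=4)
    results = pool.map(pp_construct_data_dict, args_list)
    pool.close()
    pool.join()
    # an empty payload marks a uuid that has to be purged (see above)
    data_dict = dict((uuid, payload) for uuid, payload in results if len(payload) > 0)
    purge_list = [uuid for uuid, payload in results if len(payload) == 0]
    return data_dict, purge_list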
Example #2
def x_input_to_states(xinput, CORR_VAL_OUT=0, PARALLEL=False):
    sinput = np.zeros(xinput.shape)
    num_samples = xinput.shape[0]
    num_sensors = xinput.shape[1]
    if num_samples < num_sensors:
        log.warn('number of samples is smaller than number of sensors')

    log.info('Mapping ' + str(xinput.shape) + ' matrix to discrete states')

    for k, samples in enumerate(xinput.T):
        obs = samples[:, np.newaxis]
        label, opt_num_cluster, model, score, score_err_sum = state_retrieval(obs, max_num_cluster=6, est_method='kmean', PARALLEL=PARALLEL)
        high_peak_label_idx = np.argmax(model.cluster_centers_)
        low_peak_label_idx = np.argmin(model.cluster_centers_)
        high_peak_idx = np.nonzero(label == high_peak_label_idx)[0]
        sinput[high_peak_idx,k] = 1
        low_peak_idx = np.nonzero(label == low_peak_label_idx)[0]
        sinput[low_peak_idx, k] = -1

    corr_state_val = list()
    if CORR_VAL_OUT == 1:
        log.info('Compute Correlation Score....')
        for k, (row1, row2) in enumerate(zip(sinput.T, xinput.T)):
            corr_state_val.append(round(stats.pearsonr(row1, row2)[0],3))
    corr_state_val = np.array(corr_state_val)
    return sinput, corr_state_val
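For readers without the surrounding module, here is a self-contained sketch of the same mapping idea, with scikit-learn's KMeans standing in for the project's `state_retrieval` helper (that substitution is an assumption; `state_retrieval` also selects the number of clusters, which is fixed here):

# Standalone sketch: map each sensor column to {-1, 0, +1} by clustering
# and marking the highest- and lowest-centered clusters.
import numpy as np
from sklearn.cluster import KMeans

def columns_to_states(xinput, num_clusters=3):
    sinput = np.zeros(xinput.shape)
    for k, samples in enumerate(xinput.T):
        obs = samples[:, np.newaxis]
        model = KMeans(n_clusters=num_clusters, n_init=10).fit(obs)
        high = np.argmax(model.cluster_centers_)   # highest-valued cluster -> +1
        low = np.argmin(model.cluster_centers_)    # lowest-valued cluster -> -1
        sinput[model.labels_ == high, k] = 1
        sinput[model.labels_ == low, k] = -1
    return sinput

states = columns_to_states(np.random.rand(100, 3))
print(np.unique(states))   # subset of [-1., 0., 1.]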
Example #4
def build_diff(args):
    (k, time_slots, conf_lev, set_val, set_name, num_type) = args

    log.info(set_name)
    try:

        diff_mean = get_diff(set_val, time_slots, num_type, conf_lev)
        if num_type == FLOAT_TYPE:
            #norm_diff_mean,output_status=normalize_data(diff_mean[:,0])
            norm_diff_mean, output_status = normalize_data(diff_mean)

        elif num_type == INT_TYPE:

            #num_discrete_vals=len(set(list(diff_mean[:,0])))
            num_discrete_vals = len(set(list(diff_mean)))
            log.info('num_discrete_vals :' + str(num_discrete_vals))
            if num_discrete_vals > 1:
                output_status = 0
                norm_diff_mean = diff_mean
            else:
                output_status = -1
                norm_diff_mean = list(set(diff_mean))
                #norm_diff_mean=list(set(diff_mean[:,0]))
        else:
            # unknown numeric type: mark as excluded so both names are bound
            # before the final return (the original 'pass' left them undefined)
            output_status = -1
            norm_diff_mean = None

    except Exception as e:
        log.error(traceback.format_exc())
        log.error('Error in processing data feature, excluded from analysis ' + str(e))
        output_status = -1
        norm_diff_mean = None

    return (k, [output_status, norm_diff_mean])
def interpolation_measurement(data_dict, input_names, err_rate=1, sgm_bnd=20):
    log.info('-' * 40)
    log.info('interpolation starts....')
    log.info('-' * 40)
    measurement_point_set = list()
    num_of_discrete_val = list()
    sampling_interval_set = list()
    num_type_set = list()

    for i, key_name in enumerate(input_names):
        log.info(key_name + '...')
        t_ = np.array(data_dict[key_name][2][0])

        if len(t_) == 0:
            continue

        intpl_intv = np.ceil((t_[-1]-t_[0]) /len(t_))
        sampling_interval_set.append(intpl_intv)
        val_ = np.array(data_dict[key_name][2][1])
        num_of_discrete_val_temp = len(set(val_))
        num_of_discrete_val.append(num_of_discrete_val_temp)

        # filtering outliers
        # assuming an err_rate% error rate and a sgm_bnd x standard-deviation bound
        outlier_idx = outlier_detect(val_, err_rate, sgm_bnd)

        if len(outlier_idx) > 0:
            log.info('outlier samples are detected: outlier_idx: ' + str(outlier_idx))
            t_ = np.delete(t_,outlier_idx)
            val_ = np.delete(val_,outlier_idx)
        t_new = np.r_[t_[0]:t_[-1]:intpl_intv]

        """
        if num_of_discrete_val_temp<MIN_NUM_VAL_FOR_FLOAT:
            num_type=INT_TYPE
            val_new=fast_nearest_interp(t_new, t_,val_)
        else:
            num_type=FLOAT_TYPE
            val_new = np.interp(t_new, t_,val_)
        """

        num_type = check_data_type(data_dict[key_name][2][1])
        if num_type == INT_TYPE:
            val_new = fast_nearest_interp(t_new, t_, val_)
        else:
            #num_type=FLOAT_TYPE
            val_new = np.interp(t_new, t_, val_)

        c = np.vstack([t_new,val_new])
        measurement_point_set.append(c)
        num_type_set.append(num_type)

    #return measurement_point_set,num_type_set,num_of_discrete_val,sampling_interval_set
    return measurement_point_set, np.array(num_type_set)
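`fast_nearest_interp` is a project helper not shown on this page; as a rough stand-in, nearest-neighbor interpolation can be sketched with numpy alone, next to the `np.interp` call used for float-typed readings (made-up values):

# Numpy-only sketch of the two interpolation modes used above
# (nearest_interp is an illustrative stand-in for fast_nearest_interp).
import numpy as np

def nearest_interp(t_new, t, val):
    # index of the nearest original sample for each new time point
    idx = np.abs(t_new[:, np.newaxis] - t[np.newaxis, :]).argmin(axis=1)
    return val[idx]

t = np.array([0.0, 10.0, 20.0, 30.0])
val = np.array([1.0, 1.0, 3.0, 2.0])
t_new = np.r_[t[0]:t[-1]:5.0]          # same np.r_ slicing as above

print(nearest_interp(t_new, t, val))   # stepwise, as for INT_TYPE readings
print(np.interp(t_new, t, val))        # linear, as for FLOAT_TYPE readings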
Example #7
def max_pack_cluster(DIST_MAT, min_dist=0.3, max_dist=1.0):
    # minimum distance for clusters set by max_dist=1.0, min_dist=0.3
    # Initialize
    num_nodes = DIST_MAT.shape[0]
    label = np.inf * np.ones(num_nodes)
    label_num = 0
    remain_index = np.arange(num_nodes)
    dist_mat = DIST_MAT.copy()
    exemplar_list = list()

    while dist_mat.shape[0] > 2:

        if udiag_min(dist_mat) > max_dist:
            log.info('all samples are separated further than max_dist')
            log.info('remaining samples will be individual clusters')
            # Assign different labels to all remaining samples
            inf_idx = np.nonzero(label == np.inf)[0]
            for r in inf_idx:
                exemplar_list.append(int(r))
            #label[inf_idx]=label_num+np.arange(len(inf_idx))
            label[inf_idx] = np.int_(label_num + np.arange(len(inf_idx)))
            break

        elif udiag_max(dist_mat) < min_dist:
            # Assign the same label to all remaining samples
            log.info('all samples are separated within min_dist')
            log.info('remaining samples will be the same')
            inf_idx = np.nonzero(label == np.inf)[0]
            exemplar_list.append(int(inf_idx[0]))
            label[inf_idx] = int(label_num)
            break

        else:
            exemplar_idx, max_cluster_idx = max_diff_dist_idx(
                dist_mat, min_dist, max_dist)
            dcluster_idx = remain_index[max_cluster_idx]
            exemplar_list.append(np.int_(remain_index[exemplar_idx]))
            # Update dist_mat and remain_idx
            dist_mat = np.delete(dist_mat, max_cluster_idx, axis=0)
            dist_mat = np.delete(dist_mat, max_cluster_idx, axis=1)
            remain_index = np.delete(remain_index, max_cluster_idx, axis=0)
            # Adding label info
            label[dcluster_idx] = label_num
            label_num += 1
            log.info('dist_mat.max()=' + str(dist_mat.max()))

    unassigned_idx = np.nonzero(label == np.inf)[0]
    if len(unassigned_idx) > 0:
        label[unassigned_idx] = label_num + np.arange(len(unassigned_idx))
        exemplar_list = exemplar_list + list(unassigned_idx)

        #raise NameError('There exist the unassigned: '+str(unassigned_idx))
    intra_err_cnt, inter_err_cnt = check_bounded_distance_constraint_condition(
        DIST_MAT, label, min_dist, max_dist)
    return np.int_(exemplar_list), np.int_(label)
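A hedged usage sketch: `max_pack_cluster` consumes a square distance matrix, for example one built from pairwise correlation distances. The data below is made up, and the module helpers it calls (`udiag_min`, `udiag_max`, `max_diff_dist_idx`, `check_bounded_distance_constraint_condition`) must be importable:

# Illustrative call (assumption: the module's helper functions are available).
import numpy as np
from scipy.spatial.distance import pdist, squareform

X = np.random.rand(50, 8)                        # 50 samples from 8 sensors
DIST_MAT = squareform(pdist(X.T, metric='correlation'))
exemplars, labels = max_pack_cluster(DIST_MAT, min_dist=0.3, max_dist=1.0)
print(str(len(set(labels))) + ' clusters; exemplar rows: ' + str(exemplars))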
Example #8
    def __init__(self, config, dataset, session):
        self.config = config
        self.session = session
        self.dataset = dataset
        self.filepath = '%s-%.1f' % (
            config.method,
            config.alpha,
        )
        self.train_dir = './train_dir/%s' % self.filepath

        for folder in [self.train_dir]:
            if not os.path.exists(folder):
                os.makedirs(folder)
            # clean train folder
            if self.config.clean:
                files = glob.glob(folder + '/*')
                for f in files:
                    os.remove(f)

        # --- create model ---
        self.model = Model(config)

        # --- optimizer ---
        #self.global_step = tf.contrib.framework.get_or_create_global_step(graph=None)
        self.global_step = tf.Variable(0, name="global_step")

        self.learning_rate = config.learning_rate
        if config.lr_weight_decay:
            self.learning_rate = tf.train.exponential_decay(
                self.learning_rate,
                global_step=self.global_step,
                decay_steps=10000,
                decay_rate=0.5,
                staircase=True,
                name='decaying_learning_rate')

        self.summary_op = tf.summary.merge_all()
        self.saver = tf.train.Saver(max_to_keep=1)
        self.summary_writer = tf.summary.FileWriter(self.train_dir)
        self.checkpoint_secs = 300  # 5 min

        self.train_op = self.optimize_adam(self.model.kl_loss,
                                           lr=self.learning_rate)

        tf.global_variables_initializer().run()
        if config.checkpoint is not None:
            self.ckpt_path = tf.train.latest_checkpoint(self.config.checkpoint)
            if self.ckpt_path is not None:
                log.info("Checkpoint path: %s", self.ckpt_path)
                self.saver.restore(self.session, self.ckpt_path)
                log.info(
                    "Loaded the pretrain parameters from the provided checkpoint path"
                )
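As a quick check on the schedule configured above, `tf.train.exponential_decay` with `staircase=True` computes `lr * decay_rate ** floor(global_step / decay_steps)`; a numpy-only sketch of that formula:

# Numpy sketch of the staircase decay schedule configured above.
import numpy as np

def staircase_lr(lr, step, decay_steps=10000, decay_rate=0.5):
    # lr * decay_rate ** floor(step / decay_steps)
    return lr * decay_rate ** np.floor(step / float(decay_steps))

for step in [0, 9999, 10000, 25000]:
    print(str(step) + ' -> ' + str(staircase_lr(1e-3, step)))
# the base rate holds until step 10000, halves there, and quarters at 20000+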
Example #10
def verify_data_mat(X):
    num_err_temp = np.array([[len(np.nonzero(np.isnan(sample))[0]),
                              len(np.nonzero(sample == np.inf)[0]),
                              len(np.nonzero(np.var(sample) == 0)[0])] for sample in X])
    num_err = np.sum(num_err_temp, axis=0)

    for err_idx in np.argwhere(num_err > 0):
        if err_idx == 0:
            raise NameError('nan entry found')

        if err_idx == 1:
            raise NameError('inf entry found')

        if err_idx == 2:
            raise NameError('zero var found')

    log.info('all entry values of data matrix are verified ok')
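A minimal illustrative check (not part of the source project): a clean matrix passes silently, while a NaN entry now raises.

# Illustrative check of verify_data_mat (made-up data).
import numpy as np

X_ok = np.array([[1.0, 2.0], [3.0, 4.0]])
verify_data_mat(X_ok)            # logs 'verified ok'

X_bad = np.array([[1.0, np.nan], [3.0, 4.0]])
try:
    verify_data_mat(X_bad)
except NameError as e:
    print('rejected: ' + str(e))  # rejected: nan entry found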
Example #11
def _bn_anaylsis_all(bldg_obj, p_name, sig_tag='avg', num_picks_bn=15, learning_alg='hc'):
    s_names = bldg_obj.sigtags[sig_tag].names['sensor']
    p_idx = s_names.index(p_name)
    data_state_mat = bldg_obj.sigtags[sig_tag].data_state_mat

    log.info('power - sensors + weather + time ...')
    s_cause_label, s_labels, s_hc, s_cp_mat, s_bndata_mat = \
        _bn_anaylsis(bldg_obj, p_name, attr='sensor', sig_tag=sig_tag, num_picks_bn=num_picks_bn, learning_alg=learning_alg)

    t_cause_label, t_labels, t_hc, t_cp_mat, t_bndata_mat = \
        _bn_anaylsis(bldg_obj, p_name, attr='time', sig_tag=sig_tag, num_picks_bn=num_picks_bn, learning_alg=learning_alg)

    w_cause_label, w_labels, w_hc, w_cp_mat, w_bndata_mat = \
        _bn_anaylsis(bldg_obj, p_name, attr='weather', sig_tag=sig_tag, num_picks_bn=num_picks_bn, learning_alg=learning_alg)
    #s_cause_label=s_labels; w_cause_label=w_labels;t_cause_label=t_labels

    s_cause_idx = [bldg_obj.sigtags[sig_tag].names['sensor'].index(name) for name in s_cause_label]
    t_cause_idx = [bldg_obj.sigtags[sig_tag].names['time'].index(name) for name in t_cause_label]
    w_cause_idx = [bldg_obj.sigtags[sig_tag].names['weather'].index(name) for name in w_cause_label]

    bndata_mat = np.vstack( (bldg_obj.sigtags[sig_tag].data_state_mat[:, p_idx].T,\
        bldg_obj.sigtags[sig_tag].data_state_mat[:, s_cause_idx].T, \
        bldg_obj.sigtags[sig_tag].data_weather_mat_[:, w_cause_idx].T, \
        bldg_obj.sigtags[sig_tag].data_time_mat[:, t_cause_idx].T)).T

    cols = [name for name in [p_name] + s_cause_label + w_cause_label + t_cause_label]

    b_arc_list = \
        pair_in_idx([p_name], s_cause_label + w_cause_label + t_cause_label) + \
        pair_in_idx(s_cause_label, w_cause_label+t_cause_label) + \
        pair_in_idx(w_cause_label, t_cause_label) + \
        pair_in_idx(t_cause_label, t_cause_label)

    # this is the heart and soul of ddea
    black_arc_frame = rbn.construct_arcs_frame(b_arc_list)
    factor_data_mat = rbn.convert_pymat_to_rfactor(bndata_mat)
    data_frame = rbn.construct_data_frame(factor_data_mat, cols)
    if learning_alg == 'tabu':
        hc_b = rbn.bnlearn.tabu(data_frame, blacklist=black_arc_frame, score='bic')
    elif learning_alg == 'mmhc':
        hc_b = rbn.bnlearn.mmhc(data_frame, blacklist=black_arc_frame, score='bic')
    else:
        hc_b = rbn.bnlearn.hc(data_frame, blacklist=black_arc_frame, score='bic')
    amat = rbn.py_get_amat(hc_b)
    cause_label = list(np.array(cols)[np.nonzero(amat[:, 0] == 1)[0]])
    cause_idx = [cols.index(label_) for label_ in cause_label]
    return cause_label, cols, hc_b, amat, bndata_mat
Example #12
def create_bldg_object(data_dict, avgdata_dict, diffdata_dict, bldg_tag, pname_key, PARALLEL=False):
    log.info('-' * 40)
    log.info('create object for ' + bldg_tag)
    log.info('-' * 40)

    bldg_object = BuildingObject(bldg_tag)

    # average data
    bldg_object.sigtags['avg'] = _sigtag_property(avgdata_dict, pname_key, 'avg')

    # variance data
    bldg_object.sigtags['diff'] = _sigtag_property(diffdata_dict, pname_key, 'diff')

    #TODO: Name correction for exemplar
    bldg_object.Conditions_dict = data_dict['Conditions_dict']
    bldg_object.Events_dict = data_dict['Events_dict']

    bldg_obj_weather_convert(bldg_object, 'avg', PARALLEL=PARALLEL)
    bldg_obj_weather_convert(bldg_object, 'diff', PARALLEL=PARALLEL)

    # Create class structure for data analysis
    avg_p_name = [BuildingAnalysis(remove_dot(p_name)) for p_name in bldg_object.sigtags['avg'].p_names]
    diff_p_name = [BuildingAnalysis(remove_dot(p_name)) for p_name in bldg_object.sigtags['diff'].p_names]
    bldg_object.analysis = {'avg': avg_p_name, 'diff': diff_p_name}

    _compute_lh_value(bldg_object.sigtags['avg'], bldg_object.analysis['avg'], 'avg')
    _compute_lh_value(bldg_object.sigtags['diff'], bldg_object.analysis['diff'], 'diff')

    return bldg_object
Example #13
def _compute_lh_value(blgd_property, bldg_analysis, sig_tag):
    log.info('-' * 40)
    log.info('Compute LH values for ' + sig_tag)
    log.info('-' * 40)

    all_data_state_mat = np.vstack(
        (blgd_property.data_state_mat.T, blgd_property.data_time_mat.T,
         blgd_property.data_weather_mat_.T)).T
    p_idx = blgd_property.p_idx
    p_names = blgd_property.p_names
    len_sensor = blgd_property.data_state_mat.shape[1]
    len_time = blgd_property.data_time_mat.shape[1]
    len_weather = blgd_property.data_weather_mat.shape[1]
    sensor_cause_idx_set = range(len_sensor)
    time_cause_idx_set = range(len_sensor, len_sensor + len_time)
    weather_cause_idx_set = range(len_sensor + len_time,
                                  len_sensor + len_time + len_weather)

    for k, effect_idx in enumerate(p_idx):
        p_name = remove_dot(p_names[k])
        log.info('compute cond. prob of ' + p_name)

        for i in xrange(len(bldg_analysis)):
            bldg_anal_obj = bldg_analysis[i]

            if bldg_anal_obj.sensor_tag == p_name:
                # check whether it is in the set
                effect_state_set = np.array(
                    list(set(all_data_state_mat[:, effect_idx])))
                eff_state = effect_state_set.max()
                bldg_anal_obj.peak_eff_state = eff_state

                s_optstate_set_temp, s_optprob_set_temp = \
                    find_cond_lh_set(all_data_state_mat, sensor_cause_idx_set, effect_idx, eff_state)
                bldg_anal_obj.attrs['sensor'].optprob_set = s_optprob_set_temp
                bldg_anal_obj.attrs['sensor'].optstate_set = s_optstate_set_temp

                w_optstate_set_temp, w_optprob_set_temp = \
                    find_cond_lh_set(all_data_state_mat, weather_cause_idx_set, effect_idx, eff_state)
                bldg_anal_obj.attrs['weather'].optprob_set = w_optprob_set_temp
                bldg_anal_obj.attrs['weather'].optstate_set = w_optstate_set_temp

                t_optstate_set_temp, t_optprob_set_temp = \
                    find_cond_lh_set(all_data_state_mat, time_cause_idx_set, effect_idx, eff_state)
                bldg_anal_obj.attrs['time'].optprob_set = t_optprob_set_temp
                bldg_anal_obj.attrs['time'].optstate_set = t_optstate_set_temp
Example #17
def signle_let_cluster_idx(dist_mat, max_dist):
    log.info(str(max_dist))
    num_nodes = dist_mat.shape[0]
    nodes_all_alone = list()
    exemplar_idx = list()
    max_cluster_idx = list()
    for i, dist_vals in enumerate(dist_mat):
        # exclude its own distance
        idx_set = np.r_[np.r_[0:i:1], np.r_[i + 1:num_nodes:1]]
        temp = dist_vals[idx_set]
        num_nodes_away_more_than_max_dist = len(np.nonzero(temp > max_dist)[0])
        #print temp
        if num_nodes_away_more_than_max_dist == num_nodes - 1:
            log.info('-' * 20)
            log.info(str(i) + 'th node check')
            log.info('*** all nodes are away beyond max_dist **')
            nodes_all_alone.append(i)
            #exemplar_idx.append([i])
            exemplar_idx.append(i)
            #max_cluster_idx.append([i])
            max_cluster_idx.append(i)
    return exemplar_idx, max_cluster_idx
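Since this helper needs only numpy and a logger, a tiny illustrative run (the distance matrix is made up) shows how an isolated node is reported:

# Illustrative run: node 2 sits farther than max_dist from every other
# node, so it is returned as a singleton cluster.
import numpy as np

dist_mat = np.array([[0.0, 0.2, 2.0],
                     [0.2, 0.0, 2.5],
                     [2.0, 2.5, 0.0]])
print(signle_let_cluster_idx(dist_mat, max_dist=1.0))   # ([2], [2])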
Example #19
def pp_verify_sensor_data_format(tup):
    (key, data_list, time_slots, q) = tup

    log.info(' checking ' + key + '...')

    try:
        for i, samples in enumerate(data_list):
            for j, each_sample in enumerate(samples):

                if each_sample == []:
                    q.put([key, i, j])
                    log.info(str(each_sample) + ' at ' + str(time_slots[i]) + ' in ' + str(key))

                elif not isinstance(each_sample, int) and not isinstance(each_sample, float):
                    q.put([key, i, j])
                    log.info(str(each_sample) + ' at ' + str(time_slots[i]) + ' in ' + str(key))

    except Exception as e:
        log.error(traceback.format_exc())
        log.error(str(e))
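A hedged sketch of driving this worker: the queue must come from a `multiprocessing.Manager` so it can be shared across pool workers; the data layout is inferred from the checks above, and the driver is not part of the source project.

# Illustrative driver sketch (assumption: not part of the source project).
from multiprocessing import Manager, Pool

def find_bad_samples(sensor_data, time_slots):
    # sensor_data: {key: list-of-lists of per-slot samples}
    mgr = Manager()
    q = mgr.Queue()
    tups = [(key, data_list, time_slots, q) for key, data_list in sensor_data.items()]
    pool = Pool(processes=4)
    pool.map(pp_verify_sensor_data_format, tups)
    pool.close()
    pool.join()
    bad = []
    while not q.empty():
        bad.append(q.get())    # [key, slot index, sample index]
    return bad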
Example #21
def create_bldg_object(blgd_data, bldg_tag, pname_key, PARALLEL=False):
    log.info('-' * 40)
    log.info('create object for ' + bldg_tag)
    log.info('-' * 40)

    bldg_object = BuildingObject(bldg_tag)

    data_dict = blgd_data['data_dict']
    avgdata_dict = None
    diffdata_dict = None

    # average data
    if 'avgdata_dict' in blgd_data.keys():
        avgdata_dict = blgd_data['avgdata_dict']
        bldg_object.sigtags['avg'] = _sigtag_property(avgdata_dict, pname_key, 'avg')

    # variance data
    if 'diffdata_dict' in blgd_data.keys():
        diffdata_dict = blgd_data['diffdata_dict']
        bldg_object.sigtags['diff'] = _sigtag_property(diffdata_dict, pname_key, 'diff')

    #TODO: Name correction for exemplar
    bldg_object.Conditions_dict = data_dict['Conditions_dict']
    bldg_object.Events_dict = data_dict['Events_dict']

    # Create class structure for data analysis
    if avgdata_dict:
        bldg_obj_weather_convert(bldg_object, 'avg', PARALLEL=PARALLEL)
        avg_p_name = [BuildingAnalysis(remove_dot(p_name)) for p_name in bldg_object.sigtags['avg'].p_names]
        bldg_object.analysis['avg'] = avg_p_name
        _compute_lh_value(bldg_object.sigtags['avg'], bldg_object.analysis['avg'], 'avg')

    if diffdata_dict:
        bldg_obj_weather_convert(bldg_object, 'diff', PARALLEL=PARALLEL)
        diff_p_name = [BuildingAnalysis(remove_dot(p_name)) for p_name in bldg_object.sigtags['diff'].p_names]
        bldg_object.analysis['diff'] = diff_p_name
        _compute_lh_value(bldg_object.sigtags['diff'], bldg_object.analysis['diff'], 'diff')

    return bldg_object
def data_summerization(bldg_key, data_dict, proc_avg=True, proc_diff=True, PARALLEL=False):

    time_slots = data_dict['time_slots'][:]
    conditions_dict = data_dict['Conditions_dict'].copy()
    events_dict = data_dict['Events_dict'].copy()
    sensor_list = data_dict['sensor_list'][:]
    weather_list = data_dict['weather_list'][:]
    weather_list_used = ['TemperatureC', 'Dew PointC', 'Humidity', 'Events', 'Conditions']

    # data_used is the list of reference names for all measurements from now on.
    data_used = sensor_list + weather_list_used
    # This is a global ID for data_used measurement
    data_used_idx = range(len(data_used))
    sensor_idx = range(len(sensor_list))
    weather_idx = range(len(sensor_list), len(data_used))
    dsout = {'data_dict': data_dict}

    if proc_avg:
        log.info('-' * 40)
        log.info('processing avg.feature..')
        log.info('-' * 40)

        X_Feature, X_Time, X_names, X_zero_var_list, X_zero_var_val, X_int_type_list,\
        X_int_type_idx, X_float_type_list, X_float_type_idx, X_weather_type_idx, X_sensor_type_idx = \
            build_feature_matrix(data_dict, sensor_list, weather_list_used, time_slots, interpolation=1, max_num_succ_idx_for_itpl=int(len(time_slots)*0.05))

        build_feature_matrix_out = \
            {'X_Feature': X_Feature,
             'X_Time': X_Time,
             'X_names': X_names,
             'X_zero_var_list': X_zero_var_list,
             'X_zero_var_val': X_zero_var_val,
             'X_int_type_list': X_int_type_list,
             'X_int_type_idx': X_int_type_idx,
             'X_float_type_list': X_float_type_list,
             'X_float_type_idx': X_float_type_idx,
             'X_weather_type_idx': X_weather_type_idx,
             'X_sensor_type_idx': X_sensor_type_idx}

        build_feature_matrix_out = obj(build_feature_matrix_out)

        if len(X_names+X_zero_var_list) != len(data_used):
            log.error('Missing name is found in X_names or X_zero_var_list')
            raise NameError('Missing name is found in X_names or X_zero_var_list')

        else:
            zero_var_idx = [data_used.index(name_str) for name_str in X_zero_var_list]
            nzero_var_idx = list(set(data_used_idx)-set(zero_var_idx))
        
        if X_Feature.shape[0] > 0:
            # From below all index are reference to X_Feature
            sf_idx = list(set(X_sensor_type_idx)&set(X_float_type_idx))
            # Equivalent to np.array(data_used)[np.array(nzero_var_idx)[sf_idx]]
            sf_name = list(np.array(X_names)[sf_idx])
            si_idx = list(set(X_sensor_type_idx)&set(X_int_type_idx))
            si_name = list(np.array(X_names)[si_idx])
            wf_idx = list(set(X_weather_type_idx)&set(X_float_type_idx))
            wf_name = list(np.array(X_names)[wf_idx])
            wi_idx = list(set(X_weather_type_idx)&set(X_int_type_idx))
            wi_name = list(np.array(X_names)[wi_idx])

            # Euclidean distance matrix of float-type data only
            float_idx = list(set(sf_idx)| set(wf_idx))
            int_idx = list(set(si_idx)| set(wi_idx))

            # Float Type Measurement Clustering
            X_Feature_sfe, sf_exemplars_dict, exemplars_, labels_ = \
                cluster_measurement_points(X_Feature[:, sf_idx], sf_name, corr_bnd=[0.1, 0.9], alg='aff')

            sfe_idx = list(np.array(sf_idx)[exemplars_])
            #plot_label(X_Feature,X_names,labels_,exemplars_,[4,5,6,7])

            # Int Type Measurement Clustering
            X_Feature_sie, si_exemplars_dict, exemplars_, labels_ = \
                cluster_measurement_points(X_Feature[:, si_idx], si_name, corr_bnd=[0.0, 0.9], alg='aff')
            sie_idx = list(np.array(si_idx)[exemplars_])

            # sensor -float type
            sfe_state, sfe_corr_val = x_input_to_states(X_Feature_sfe, CORR_VAL_OUT=1)

            # sensor -integer type
            sie_state = X_Feature_sie

            # weather -float type
            wf_state, wf_corr_val = x_input_to_states(X_Feature[:, wf_idx], CORR_VAL_OUT=1)

            # weather -integer type
            wi_state = X_Feature[:, wi_idx]

            empty_states = np.array([[] for i in range(len(X_Time))])
            if len(sfe_state) == 0:
                sfe_state = empty_states

            if len(sie_state) == 0:
                sie_state = empty_states

            if len(wf_state) ==0:
                wf_state = empty_states

            if len(wi_state) == 0:
                wi_state = empty_states

            # Exemplar sensor only    
            X_Sensor_STATE = np.append(sfe_state,sie_state, axis=1)
            X_Sensor_STATE = X_Sensor_STATE.astype(int)
            X_Sensor_NAMES = list(np.array(X_names)[sfe_idx]) + list(np.array(X_names)[sie_idx])

            X_Weather_STATE = np.append(wf_state,wi_state, axis=1)
            X_Weather_STATE = X_Weather_STATE.astype(int)
            X_Weather_NAMES = list(np.array(X_names)[wf_idx])+list(np.array(X_names)[wi_idx])

            # months of a year, days of a week, and hours of a day
            # (Monday, Tuesday, Wednesday, Thursday, Friday, Saturday, Sunday) = (0, 1, 2, 3, 4, 5, 6)
            X_Time_STATE_temp = build_time_states(X_Time)
            X_Time_NAMES_temp = ['MTH', 'WD', 'HR']
            X_Time_STATE = list()
            X_Time_NAMES = list()

            for xt_col, xt_name in zip(X_Time_STATE_temp.T,X_Time_NAMES_temp):
                if len(set(xt_col)) > 1:
                    X_Time_STATE.append(xt_col)
                    X_Time_NAMES.append(xt_name)
            
            X_Time_STATE = np.array(X_Time_STATE).T

            #################################################
            # FORMATTED DATA FOR REGULAR EVENT
            #################################################
            #DO_PROB_EST=1  ** Save these variables ***
            #avgdata_mat = np.hstack([X_Sensor_STATE,X_Weather_STATE,X_Time_STATE])
            #avgdata_names = X_Sensor_NAMES+X_Weather_NAMES+X_Time_NAMES
            avgdata_exemplar = dict(sf_exemplars_dict.items()+si_exemplars_dict.items())
            avgdata_zvar = X_zero_var_list
            
            avgdata_dict = dict()
            avgdata_dict.update({'build_feature_matrix_out': build_feature_matrix_out})

            avgdata_dict.update({'avgdata_state_mat': X_Sensor_STATE})
            avgdata_dict.update({'avgdata_weather_mat': X_Weather_STATE})
            avgdata_dict.update({'avgdata_time_mat': X_Time_STATE})

            avgdata_dict.update({'avg_time_slot': X_Time})
            avgdata_dict.update({'avgdata_exemplar': avgdata_exemplar})
            avgdata_dict.update({'avgdata_zvar': avgdata_zvar})

            avgdata_dict.update({'sensor_names': X_Sensor_NAMES})
            avgdata_dict.update({'weather_names': X_Weather_NAMES})
            avgdata_dict.update({'time_names': X_Time_NAMES})
            dsout.update({'avgdata_dict': avgdata_dict})

    if proc_diff:
        log.info('-' * 40)
        log.info('processing diff.feature..')
        log.info('-' * 40)
        ####################################
        # Irregular Event Extraction
        ####################################
        # Interpolation with outlier removal. Weather data is excluded from irregular
        # event analysis since it normally changes slowly in time, so no meaningful
        # diff values are expected from it.
        measurement_point_set, num_type_set = interpolation_measurement(data_dict, sensor_list, err_rate=1, sgm_bnd=20)

        # Irregular matrix
        Xdiff_Mat,\
        Xdiff_Time,\
        Xdiff_Names,\
        Xdiff_zero_var_list,\
        Xdiff_zero_var_val,\
        Xdiff_int_type_list,\
        Xdiff_int_type_idx,\
        Xdiff_float_type_list,\
        Xdiff_float_type_idx =\
            build_diff_matrix(measurement_point_set, time_slots, num_type_set, sensor_list, PARALLEL=PARALLEL)

        build_diff_matrix_out = \
            {'Xdiff_Mat':Xdiff_Mat,
             'Xdiff_Time':Xdiff_Time,
             'Xdiff_Names':Xdiff_Names,
             'Xdiff_zero_var_list':Xdiff_zero_var_list,
             'Xdiff_zero_var_val':Xdiff_zero_var_val,
             'Xdiff_int_type_list':Xdiff_int_type_list,
             'Xdiff_int_type_idx':Xdiff_int_type_idx,
             'Xdiff_float_type_list':Xdiff_float_type_list,
             'Xdiff_float_type_idx':Xdiff_float_type_idx}

        build_diff_matrix_out = obj(build_diff_matrix_out)

        if Xdiff_Mat.shape[0] > 0:
            #==============================================================================
            # Restructure the diff matrix and weather matrix onto the same common time slots
            #==============================================================================
            # note: X_Time and X_Weather_STATE come from the proc_avg pass above,
            # so this branch assumes proc_avg was run first
            time_slots_array = np.sort(np.array(list(set(Xdiff_Time) & set(X_Time))))

            # Extract subset of X_Weather_STATE
            removed_idx_list = list()
            for ridx, slot in enumerate(X_Time):
                slot_idx = np.where(time_slots_array==slot)[0]

                # slot not in common time slots
                if len(slot_idx) == 0:
                    removed_idx_list.append(ridx)

            XDIFF_Weather_STATE = np.delete(X_Weather_STATE, removed_idx_list,axis=0)

            # Extract subset of Xdiff_Mat
            removed_idx_list = list()
            for ridx,slot in enumerate(Xdiff_Time):
                slot_idx = np.where(time_slots_array == slot)[0]

                # slot not in common time slots
                if len(slot_idx) == 0:
                    removed_idx_list.append(ridx)

            Xdiff_Mat = np.delete(Xdiff_Mat, removed_idx_list, axis=0)

            # Update Xdiff_Time
            Xdiff_Time = time_slots_array
            XDIFF_Weather_STATE = np.array(XDIFF_Weather_STATE)    

            # From below all index are reference to X_Feature
            xdiff_sf_idx = Xdiff_float_type_idx
            xdiff_sf_name = Xdiff_float_type_list
            xdiff_si_idx = Xdiff_int_type_idx
            xdiff_si_name = Xdiff_int_type_list

            # Float Type Measurement Clustering
            X_Diff_sfe, sf_diff_exemplars_dict, exemplars_, labels_ = \
                cluster_measurement_points(Xdiff_Mat[:, xdiff_sf_idx], xdiff_sf_name, corr_bnd=[0.1, 0.9])
            xdiff_sfe_idx = list(np.array(xdiff_sf_idx)[exemplars_])

            # Int Type Measurement Clustering
            X_Diff_sie, si_diff_exemplars_dict, exemplars_, labels_ = \
                cluster_measurement_points(Xdiff_Mat[:, xdiff_si_idx], xdiff_si_name, corr_bnd=[0.1, 0.9])
            xdiff_sie_idx = list(np.array(xdiff_si_idx)[exemplars_])

            # sensor -float type
            xdiff_sfe_state, xdiff_sfe_corr_val =\
                x_input_to_states(X_Diff_sfe, CORR_VAL_OUT=1, PARALLEL=PARALLEL)

            # sensor -integer type
            xdiff_sie_state = X_Diff_sie
            empty_states = np.array([[] for i in range(len(Xdiff_Time))])

            if len(xdiff_sfe_state) == 0:
                xdiff_sfe_state = empty_states

            if len(xdiff_sie_state) == 0:
                xdiff_sie_state = empty_states

            if len(wf_state) == 0:
                wf_state = empty_states

            if len(wi_state) == 0:
                wi_state = empty_states

            # Exemplar sensor only    
            XDIFF_Sensor_STATE = np.append(xdiff_sfe_state,xdiff_sie_state, axis=1)
            XDIFF_Sensor_STATE = XDIFF_Sensor_STATE.astype(int)
            XDIFF_Sensor_NAMES = list(np.array(Xdiff_Names)[xdiff_sfe_idx])+list(np.array(Xdiff_Names)[xdiff_sie_idx])

            # months of a year, days of a week, and hours of a day
            # (Monday, Tuesday, Wednesday, Thursday, Friday, Saturday, Sunday) = (0, 1, 2, 3, 4, 5, 6)
            XDIFF_Time_STATE_temp = build_time_states(Xdiff_Time)
            XDIFF_Time_NAMES_temp = ['MTH', 'WD', 'HR']
            XDIFF_Time_STATE = list()
            XDIFF_Time_NAMES = list()
            for xt_col, xt_name in zip(XDIFF_Time_STATE_temp.T, XDIFF_Time_NAMES_temp):
                if len(set(xt_col)) > 1:
                    XDIFF_Time_STATE.append(xt_col)
                    XDIFF_Time_NAMES.append(xt_name)
            XDIFF_Time_STATE = np.array(XDIFF_Time_STATE).T

            #################################################
            # FORMATTED DATA FOR IRREGULAR EVENT
            #################################################
            #** Save these variables ***
            #diffdata_mat = np.hstack([XDIFF_Sensor_STATE,X_Weather_STATE,XDIFF_Time_STATE])
            #diffdata_names = XDIFF_Sensor_NAMES+X_Weather_NAMES+XDIFF_Time_NAMES
            diffdata_exemplar = dict(sf_diff_exemplars_dict.items() + si_diff_exemplars_dict.items())
            diffdata_zvar = Xdiff_zero_var_list

            diffdata_dict = dict()
            diffdata_dict.update({'build_diff_matrix_out': build_diff_matrix_out})

            diffdata_dict.update({'diffdata_state_mat': XDIFF_Sensor_STATE})
            diffdata_dict.update({'diffdata_weather_mat': XDIFF_Weather_STATE})
            diffdata_dict.update({'diffdata_time_mat': XDIFF_Time_STATE})

            diffdata_dict.update({'diff_time_slot': Xdiff_Time})
            diffdata_dict.update({'diffdata_exemplar': diffdata_exemplar})
            diffdata_dict.update({'diffdata_zvar': diffdata_zvar})

            diffdata_dict.update({'sensor_names': XDIFF_Sensor_NAMES})
            diffdata_dict.update({'weather_names': X_Weather_NAMES})
            diffdata_dict.update({'time_names': X_Time_NAMES})

            dsout.update({'diffdata_dict': diffdata_dict})

    dsout.update({'bldg_key': remove_dot(bldg_key)})

    return dsout
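A hedged sketch of consuming the returned dict; the keys are the ones populated above, but the building key and the origin of `data_dict` are assumptions:

# Illustrative consumption of the output dict ('GW1' and data_dict are assumed).
dsout = data_summerization('GW1', data_dict, proc_avg=True, proc_diff=True)

if 'avgdata_dict' in dsout:
    avgdata = dsout['avgdata_dict']
    print(avgdata['avgdata_state_mat'].shape)   # discrete sensor states
    print(avgdata['sensor_names'])
    print(avgdata['time_names'])

if 'diffdata_dict' in dsout:
    diffdata = dsout['diffdata_dict']
    print(diffdata['diffdata_state_mat'].shape)
    print(diffdata['diff_time_slot'][:5])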
Example #23
def build_feature_matrix(data_dict, sensor_list, weather_list, time_slots, interpolation=1, max_num_succ_idx_for_itpl=4):

    data_used = sensor_list + weather_list
    log.info('Build data feature matrix now.....')

    if interpolation == 1:
        log.info('Missing samples will be interpolated up to ' + str(max_num_succ_idx_for_itpl) + ' successive time slots')
    else:
        log.info('All time slots with any missing sample will be removed without interpolation')

    num_of_data = len(data_used)
    num_of_samples = len(time_slots)

    # Declare as 2-d list for exception.
    X = list()
    INT_type_list = list()
    FLOAT_type_list = list()
    input_names = list()
    weather_type_idx = list()
    sensor_type_idx = list()
    INT_type_idx = list()
    FLOAT_type_idx = list()
    zero_var_list = list()
    zero_var_val = list()


    # zero_var_list collects measurements whose variance is zero and hence carry no information.
    # Construct the X matrix by summarizing hourly samples.
    for j, key in enumerate(data_used):
        log.info('-' * 40)
        log.info('building for ' + str(key))

        try:
            num_type = check_data_type(data_dict[key][2][1])

            # Avg. value feature
            x_temp = get_feature(data_dict[key][1], num_type)

            non_inf_idx = np.nonzero(x_temp < np.inf)[0]

            # Outlier removal, different parameters for sensors and weather data
            if len(sensor_list) <= j:
                # weather data
                is_weather_data = True
                outlier_idx = outlier_detect(x_temp[non_inf_idx], 5, 10)
            else:
                is_weather_data = False
                outlier_idx = outlier_detect(x_temp[non_inf_idx], 1, 20)

            if len(outlier_idx) > 0:
                log.info('outlier samples are detected: outlier_idx:' + str(outlier_idx))
                x_temp[non_inf_idx[outlier_idx]] = np.inf
            
            # interpolate data: use nearest for int type, linear for float type
            if interpolation == 1:
                x_temp = interploate_data(x_temp, num_type, max_num_succ_idx_for_itpl)

            norm_data_vec, output_status = normalize_data(x_temp[:, 0])
            if len(np.nonzero(norm_data_vec == np.inf)[0]) > num_of_samples/5:
                # more than 20% of slots still missing: exclude this measurement
                raise NameError('too many missing samples for ' + str(key))

        except Exception as e:
            log.error(traceback.format_exc())
            log.error(' Error in processing data feature, excluded from analysis ' + str(e))
            output_status = -1
            norm_data_vec = None

        if output_status == -1:
            zero_var_list.append(key)
            zero_var_val.append(norm_data_vec)
            log.info('too small variance for float type, added to zero var list')

        else:
            input_names.append(key)
            log.info(str(j)+'th sensor update')

            if (num_type == FLOAT_TYPE) and (is_weather_data == False):
                X.append(norm_data_vec)
                FLOAT_type_idx.append(len(X)-1)
                FLOAT_type_list.append(key)

            elif (num_type == INT_TYPE) or (is_weather_data == True):
                X.append(x_temp[:, 0])
                INT_type_idx.append(len(X)-1)
                INT_type_list.append(key)

            else:
                log.error('Sample type must either INT or FLOAT type')
                raise NameError('Sample type must either INT or FLOAT type')

            if key in weather_list:
                weather_type_idx.append(len(X)-1)

            elif key in sensor_list:
                sensor_type_idx.append(len(X)-1)
            else:
                log.error('Sample type must either Weather or Sensor type')
                raise NameError('Sample type must either Weather or Sensor type')

    # Assemble the feature matrix: one column per measurement, one row per time slot
    X = np.array(X).T
    if X.shape[0] != num_of_samples:
        log.error('The number of rows in the feature matrix differs from the number of time slots')
        raise NameError('The number of rows in the feature matrix differs from the number of time slots')

    if X.shape[1]+len(zero_var_list) != num_of_data:
        log.error('The number of columns in the feature matrix plus the number of zero-variance columns differs from the number of input measurements')
        raise NameError('The number of columns in the feature matrix plus the number of zero-variance columns differs from the number of input measurements')

    deleted_timeslot_idx=[]
    log.info('-' * 20)
    log.info('removing time slots having no sample...')
    inf_idx_set = []
    for col_vec in X.T:
        inf_idx = np.nonzero(col_vec ==np.infty)[0]
        inf_idx_set = np.r_[inf_idx_set, inf_idx]
    inf_col_idx = list(set(list(inf_idx_set)))
    deleted_timeslot_idx = np.array([int(x) for x in inf_col_idx])

    log.info('time slots ' + str(deleted_timeslot_idx) + ' removed...')
    log.info('-' * 20)
    X = np.delete(X, deleted_timeslot_idx, axis=0)
    new_time_slot = np.delete(time_slots, deleted_timeslot_idx)

    # Checking whether it has any ill entry value
    verify_data_mat(X)

    return X, new_time_slot, input_names, zero_var_list, zero_var_val, INT_type_list, INT_type_idx, FLOAT_type_list, FLOAT_type_idx, weather_type_idx, sensor_type_idx
Example #24
def plotting_bldg_lh(bldg, bldg_key=[], attr='sensor', num_picks=30):
    log.info('-' * 40)
    log.info('plotting lh for ' + attr)
    log.info('-' * 40)
    sig_tag_set = ['avg', 'diff']
    plt.ioff()

    if not len(bldg_key):
        bldg_tag_set = [bldg.bldg_tag]
    else:
        bldg_tag_set = [bldg_key]

    for bldg_tag in bldg_tag_set:
        if bldg_tag == bldg.bldg_tag:
            log.info('-' * 40)
            log.info(bldg_tag + " is to be plotted...")
            log.info('-' * 40)

            for sig_tag in sig_tag_set:
                try:
                    p_names = bldg.sigtags[sig_tag].p_names

                    for pname in p_names:
                        # dots in the point name are replaced for use in file names
                        pname = pname.replace('.', '_')

                        optprob_set = None
                        optstate_set = None
                        for anal in bldg.analysis[sig_tag]:
                            if anal.sensor_tag == pname:
                                optprob_set = anal.attrs[attr].optprob_set
                                optstate_set = anal.attrs[attr].optstate_set
                                break

                        s_names = bldg.sigtags[sig_tag].names[attr]

                        sort_idx = np.argsort(optprob_set)[::-1]
                        sort_lh = optprob_set[sort_idx[:num_picks]].T
                        sort_state = optstate_set[sort_idx[:num_picks]].T
                        x_label = list(np.array(s_names)[sort_idx[:num_picks]])
                        x_ticks = range(len(x_label))

                        plt.figure(figsize=(20.0, 15.0))
                        plt.subplot(2, 1, 1)
                        plt.plot(sort_lh, '-*')
                        plt.xticks(x_ticks,
                                   x_label,
                                   rotation=270,
                                   fontsize="small")
                        if sig_tag == 'avg':
                            plt.title('Most relevant ' + attr +
                                      ' attributes to the peak (demand) of ' +
                                      pname,
                                      fontsize=20)
                        else:
                            plt.title(
                                'Most relevant ' + attr +
                                ' attributes to the peak variations of ' +
                                pname,
                                fontsize=20)
                        plt.tick_params(labelsize='large')
                        plt.ylim([-0.05, 1.05])
                        plt.ylabel('Likelihood (From 0 to 1)', fontsize=18)
                        plt.savefig(FIG_DIR + bldg_tag + '_' + pname + '_' +
                                    attr + '_' + sig_tag + '_lh_sensors.png',
                                    bbox_inches='tight')
                        plt.close()

                except Exception as e:
                    log.error(traceback.format_exc())
                    log.error(str(e))
                    pass
    plt.close()
    plt.ion()
Example #25
def check_bounded_distance_constraint_condition(dist_mat,labels,min_dist,max_dist):
    intra_err_cnt=0
    num_clusters=int(labels.max()+1)
    log.info('-' * 80)
    log.info('Intra-Cluster distance check.....')
    log.info('Condition: inter-cluster distance is upper-bounded by' + str(round(max_dist,2)))
    log.info('-' * 80)

    for i in range(num_clusters):
        idx_set = np.nonzero(labels==(i))[0]
        #print '----------------------------------------------------------'
        #print i,'th cluster: ',idx_set
        for idx_pair in pair_in_idx(idx_set):
            #print idx_pair, 'dist-',round(dist_mat[idx_pair[0],idx_pair[1]],2)
            dist_val_=dist_mat[idx_pair[0],idx_pair[1]]
            # Rule violation
            if dist_val_ > max_dist:
                log.info('*** the distance of pairs :' + str(idx_pair) + ' in ' + str(i) + 'th cluster ~' +
                         str(np.round(dist_val_,2)) + ' > max_dist=' + str(np.round(max_dist,2)) +'***')

                intra_err_cnt=intra_err_cnt+1
    log.info('-' * 80)
    log.info('Inter-Cluster distance check.....')
    log.info('Condition: intra-cluster distance is lower-bounded by ' + str(round(min_dist,2)))
    log.info('-' * 80)

    cluster_pairs=pair_in_idx(range(num_clusters))
    inter_err_cnt=0
    for c_pair in cluster_pairs:
        idx_set_0=np.nonzero(labels==(c_pair[0]))[0]
        idx_set_1=np.nonzero(labels==(c_pair[1]))[0]
        #print '----------------------------------------------------------'
        #print 'The pairwise distance between ',c_pair[0],'th cluster and',c_pair[1],'th cluster'
        for idx_pair in pair_in_idx(idx_set_0,idx_set_1):
            #print idx_pair, 'dist-',round(dist_mat[idx_pair[0],idx_pair[1]],2)
            dist_val_=dist_mat[idx_pair[0],idx_pair[1]]
            # Rule violation
            if dist_val_<min_dist:
                log.info('*** the distance of pairs :' + str(idx_pair[0]) +
                         ' in ' + str(c_pair[0]) + ' and ' + str(idx_pair[1]) +
                         ' in ' + str(c_pair[1]) + ' ~ ' + str(round(dist_val_,2)) +
                         ' < min_dist=' + str(round(min_dist,2)) + '***')
                inter_err_cnt += 1
    return intra_err_cnt, inter_err_cnt
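
A minimal usage sketch (not from the original source), assuming this function and its module helpers pair_in_idx and log are importable; the toy distance matrix, labels, and bounds are made up for illustration:

import numpy as np

# hypothetical toy input: four points in two clusters
dist_mat = np.array([[0.0, 0.2, 1.5, 1.4],
                     [0.2, 0.0, 1.6, 1.3],
                     [1.5, 1.6, 0.0, 0.3],
                     [1.4, 1.3, 0.3, 0.0]])
labels = np.array([0, 0, 1, 1])

# intra-cluster pairs must stay within max_dist,
# inter-cluster pairs must stay beyond min_dist
intra_err_cnt, inter_err_cnt = check_bounded_distance_constraint_condition(
    dist_mat, labels, min_dist=0.5, max_dist=1.0)
log.info('violations: intra=' + str(intra_err_cnt) + ', inter=' + str(inter_err_cnt))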
Example #26
0
def get_val_timelet(reading, t_slots, ans_start_t, ans_end_t, timelet_inv):

    data = dict()
    data['value'] = np.array([r[1] for r in reading], dtype=float)

    ts_list = list()
    for r in reading:
        local_dt = dt.datetime.fromtimestamp(r[0])
        time_tup = local_dt.timetuple()
        ts_list.append([
            local_dt, time_tup[5], time_tup[4], time_tup[3], time_tup[6],
            time_tup[2], time_tup[1]
        ])

    data['ts'] = np.array(ts_list)

    # data always contains the 'value' and 'ts' keys, so test the readings
    if not len(data['value']):
        log.critical(
            'Error in file reading: empty data. Skip and need to be purged from sensor list'
        )

        sensor_read = -1
        stime_read = -1
        utc_t = -1
        val = -1
        return sensor_read, stime_read, utc_t, val

    if (len(data["ts"]) < MIN_NUM_VAL_FOR_FLOAT) or (len(data["value"]) <
                                                     MIN_NUM_VAL_FOR_FLOAT):
        log.critical('No data included ' + str(data) +
                     '... Skip and need to be purged from sensor list')

        sensor_read = -1
        stime_read = -1
        utc_t = -1
        val = -1
        return sensor_read, stime_read, utc_t, val

    nan_idx_list = np.nonzero(np.isnan(data["value"]))[0]
    sensor_val = np.delete(data["value"], nan_idx_list, axis=0)
    time_val = np.delete(data["ts"], nan_idx_list, axis=0)

    # Create the list of lists for value
    sensor_read = [[] for i in range(len(t_slots))]

    # Create the list of lists for seconds index
    stime_read = [[] for i in range(len(t_slots))]

    utc_t = []
    val = []

    for t_sample, v_sample in zip(time_val, sensor_val):
        temp_dt = t_sample[DT_IDX]

        if temp_dt < ans_start_t or temp_dt >= ans_end_t:
            continue

        try:
            idx = int(
                (temp_dt - ans_start_t).total_seconds() / timelet_inv.seconds)
            sensor_read[idx].append(v_sample)
            #secs=t_sample[MIN_IDX]*MIN+t_sample[SEC_IDX]
            secs = (temp_dt - t_slots[idx]).total_seconds()
            if secs >= timelet_inv.seconds:
                log.info('sec: ' + str(secs))
                raise NameError(
                    'Seconds from an hour idx cannot be greater than ' +
                    str(timelet_inv.seconds) + 'secs')

            stime_read[idx].append(secs)

        except ValueError:
            idx = -1

        utc_temp = dtime_to_unix([t_sample[DT_IDX]])
        utc_t.append(utc_temp)
        val.append(v_sample)

    return sensor_read, stime_read, utc_t, val
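
A minimal usage sketch with made-up readings, assuming the module constants (MIN_NUM_VAL_FOR_FLOAT, DT_IDX) and dtime_to_unix are in scope:

import time
import datetime as dt

# hypothetical 15-minute slot grid covering one hour
timelet_inv = dt.timedelta(minutes=15)
ans_start_t = dt.datetime(2014, 1, 1, 0, 0)
ans_end_t = dt.datetime(2014, 1, 1, 1, 0)
t_slots = [ans_start_t + i * timelet_inv for i in range(4)]

# reading holds (unix_timestamp, value) pairs; enough samples are needed
# to pass the MIN_NUM_VAL_FOR_FLOAT guard above
reading = [(int(time.mktime((ans_start_t + dt.timedelta(minutes=m)).timetuple())),
            20.0 + m) for m in range(0, 60, 5)]

sensor_read, stime_read, utc_t, val = get_val_timelet(
    reading, t_slots, ans_start_t, ans_end_t, timelet_inv)
# sensor_read[i] / stime_read[i] hold the values that fell into slot i
# and their offsets in seconds from the slot start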
Example #27
0
def cluster_measurement_points(m_matrix,
                               m_name,
                               corr_bnd=[0.1, 0.9],
                               alg='aff'):
    exemplars_dict = dict()

    if m_matrix.shape[1] == 0:
        return [], exemplars_dict, [], []

    elif m_matrix.shape[1] == 1:
        exemplars_ = [0]
        labels_ = [0]
        exemplars_name = m_name

    else:
        distmat_input = find_norm_dist_matrix(m_matrix)

        # Find representative set of sensor measurements
        min_dist_ = np.sqrt(2 * (1 - (corr_bnd[1])))
        max_dist_ = np.sqrt(2 * (1 - (corr_bnd[0])))

        if alg == 'pack':
            log.info('use pack clustering algorithm')
            exemplars_, labels_ = max_pack_cluster(distmat_input,
                                                   min_dist=min_dist_,
                                                   max_dist=max_dist_)
        else:
            log.info('use affinity clustering algorithm')
            SIMM_MAT = 2 - distmat_input
            exemplars_, labels_ = cluster.affinity_propagation(SIMM_MAT,
                                                               damping=0.5)

        num_clusters = int(labels_.max() + 1)
        log.info('-' * 40)
        log.info(
            str(num_clusters) + ' clusters out of ' + str(len(labels_)) +
            ' measurements')
        log.info('-' * 40)

        validity, intra_dist, inter_dist = compute_cluster_err(
            distmat_input, labels_)

        log.info('validity: ' + str(round(validity, 2)) + ', intra_dist: ' +
                 str(np.round(intra_dist, 2)) + ', inter_dist: ' +
                 str(np.round(inter_dist, 2)))
        log.info('-' * 40)
        exemplars_name = list(np.array(m_name)[exemplars_])

    for label_id, (m_idx,
                   exemplar_label) in enumerate(zip(exemplars_,
                                                    exemplars_name)):
        log.info(str(exemplar_label))
        children_set = list(
            set(np.nonzero(labels_ == label_id)[0]) - set([m_idx]))
        log.info('Label ' + str(label_id) + ' : ' + str(m_idx) + '<--' +
                 str(children_set))
        exemplars_dict.update(
            {exemplar_label: list(np.array(m_name)[children_set])})

    return m_matrix[:, exemplars_], exemplars_dict, exemplars_, labels_
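
A usage sketch on synthetic data (names and sizes made up); the function only needs a samples-by-measurements matrix and matching column names:

import numpy as np

# hypothetical input: 100 samples of five measurement points, four of them
# strongly correlated with each other
np.random.seed(0)
base = np.random.randn(100, 1)
m_matrix = np.hstack([base + 0.1 * np.random.randn(100, 1) for _ in range(4)]
                     + [np.random.randn(100, 1)])
m_name = ['s0', 's1', 's2', 's3', 's4']

# alg='aff' runs affinity propagation on 2 - dist; alg='pack' would use
# max_pack_cluster with corr_bnd mapped to distances via sqrt(2 * (1 - corr))
m_exemplars, exemplars_dict, exemplars_, labels_ = \
    cluster_measurement_points(m_matrix, m_name, corr_bnd=[0.1, 0.9], alg='aff')
# m_exemplars keeps only the exemplar columns of m_matrix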
Example #28
0
def show_clusters(exemplars, labels, input_names):
    n_labels = labels.max()
    for i in range(n_labels + 1):
        log.info('Cluster %i: %s' %
                 ((i + 1), ', '.join(input_names[labels == i])))
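
A toy call (made-up names) showing the expected logging; input_names must be an ndarray so boolean indexing by labels works:

import numpy as np

# hypothetical labeling of six named inputs into two clusters
input_names = np.array(['fan1', 'fan2', 'pump1', 'pump2', 'fan3', 'pump3'])
labels = np.array([0, 0, 1, 1, 0, 1])
exemplars = np.array([0, 2])  # unused by this helper, one index per cluster
show_clusters(exemplars, labels, input_names)
# expected log lines: 'Cluster 1: fan1, fan2, fan3' and 'Cluster 2: pump1, pump2, pump3'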
Example #29
0
def _sigtag_property(data_dict, pname_key, sig_tag):

    data_state_mat = data_dict[sig_tag + 'data_state_mat']
    data_weather_mat = data_dict[sig_tag + 'data_weather_mat']
    data_time_mat = data_dict[sig_tag + 'data_time_mat']
    time_slot = data_dict[sig_tag + '_time_slot']
    data_exemplar = data_dict[sig_tag + 'data_exemplar']
    data_zvar = remove_dot(data_dict[sig_tag + 'data_zvar'])
    sensor_names = remove_dot(data_dict['sensor_names'])
    weather_names = remove_dot(data_dict['weather_names'])
    time_names = remove_dot(data_dict['time_names'])

    if pname_key and len(pname_key):

        #TODO: Name correction for exemplar
        if isinstance(pname_key, list):
            # look up each requested name directly; concatenating sig_tag
            # (a str) with an int index would raise a TypeError
            p_idx = [sensor_names.index(p_name) for p_name in pname_key]
            p_names = remove_dot(list(np.array(sensor_names)[list(set(p_idx))]))

        else:

            p_idx = grep(pname_key, sensor_names)
            p_names = remove_dot(list(np.array(sensor_names)[p_idx]))
    else:
        p_idx = [i for i in xrange(0, len(sensor_names))]
        p_names = remove_dot(sensor_names)

    log.info('-' * 40)
    log.info('Power sensor selected -' + sig_tag)
    log.info('-' * 40)

    log.info("p_idx : " + str(p_idx))
    log.info("sensor_names : " + str(sensor_names))
    log.info("p_names : " + str(p_names))

    return BuildingSigtagProperty(sig_tag,  data_state_mat, data_weather_mat,
                                  data_time_mat, time_slot, data_exemplar,
                                  data_zvar, sensor_names, weather_names,
                                  time_names, p_idx, p_names)
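
A minimal, hypothetical call, assuming remove_dot, grep and BuildingSigtagProperty from the module; the dictionary below fabricates just the keys this helper reads:

import numpy as np

# a minimal, hypothetical data_dict following the sig_tag key convention
sig_tag = 'avg'
data_dict = {
    sig_tag + 'data_state_mat': np.zeros((4, 2), dtype=int),
    sig_tag + 'data_weather_mat': np.zeros((4, 1), dtype=int),
    sig_tag + 'data_time_mat': np.zeros((4, 1), dtype=int),
    sig_tag + '_time_slot': range(4),
    sig_tag + 'data_exemplar': dict(),
    sig_tag + 'data_zvar': [],
    'sensor_names': ['GW1.HA1_POWER_KW', 'GW1.HA1_TEMP_C'],
    'weather_names': ['TemperatureC'],
    'time_names': ['MTH', 'WD', 'HR'],
}

# pname_key is grep'ed against sensor_names, so this selects the power sensor
prop = _sigtag_property(data_dict, 'POWER', sig_tag)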
Example #30
0
def _sigtag_property(data_dict, pname_key, sig_tag):

    data_state_mat = data_dict[sig_tag + 'data_state_mat']
    data_weather_mat = data_dict[sig_tag + 'data_weather_mat']
    data_time_mat = data_dict[sig_tag + 'data_time_mat']
    time_slot = data_dict[sig_tag + '_time_slot']
    data_exemplar = data_dict[sig_tag + 'data_exemplar']
    data_zvar = remove_dot(data_dict[sig_tag + 'data_zvar'])
    sensor_names = remove_dot(data_dict['sensor_names'])
    weather_names = remove_dot(data_dict['weather_names'])
    time_names = remove_dot(data_dict['time_names'])

    if pname_key and len(pname_key):

        #TODO: Name correction for exemplar
        if isinstance(pname_key, list):
            # look up each requested name directly; concatenating sig_tag
            # (a str) with an int index would raise a TypeError
            p_idx = [
                sensor_names.index(p_name) for p_name in pname_key
            ]
            p_names = remove_dot(list(
                np.array(sensor_names)[list(set(p_idx))]))

        else:

            p_idx = grep(pname_key, sensor_names)
            p_names = remove_dot(list(np.array(sensor_names)[p_idx]))
    else:
        p_idx = [i for i in xrange(0, len(sensor_names))]
        p_names = remove_dot(sensor_names)

    log.info('-' * 40)
    log.info('Power sensor selected -' + sig_tag)
    log.info('-' * 40)

    log.info("p_idx : " + str(p_idx))
    log.info("sensor_names : " + str(sensor_names))
    log.info("p_names : " + str(p_names))

    return BuildingSigtagProperty(sig_tag, data_state_mat, data_weather_mat,
                                  data_time_mat, time_slot, data_exemplar,
                                  data_zvar, sensor_names, weather_names,
                                  time_names, p_idx, p_names)
Example #31
0
def state_retrieval(obs, max_num_cluster=6, off_set=0, est_method='kmean', PARALLEL = False):
    log.info('-' * 40)
    log.info('Retrieving discrete states from data using ' + est_method + ' model...')
    log.info('-' * 40)
    log.info('try '+ str(max_num_cluster) + ' clusters..... ')
    score = np.zeros(max_num_cluster)
    model_set = list()

    if not PARALLEL:
        for num_cluster in range(max_num_cluster):
            log.info('Try ' + str(num_cluster+1) + ' clusters ')
            log.info('-----------------------------------')
            if est_method == 'kmean':
                kmean = KMeans(n_clusters=num_cluster+1).fit(obs)
                model_set.append(kmean)
                #score[num_cluster]=-1*np.log(-1*np.sum(kmean.score(obs)))
                #score[num_cluster]=kmean.score(obs)
                #score[num_cluster]=kmean.score(obs)-.5*(num_cluster+1)*1*log10(len(obs))
                #log_ll_val=compute_log_ll(kmean.labels_,obs)
                score[num_cluster] = compute_log_ll(kmean.labels_, obs)

            elif est_method == 'gmm':
                gmm = mixture.GMM(n_components=num_cluster+1).fit(obs)
                model_set.append(gmm)
                score[num_cluster] = np.sum(gmm.score(obs))

            else:
                log.error('not supported est_method')
                raise NameError('not supported est_method')
    else:
        log.info('Parallel enabled...')
        model_set = [0] * max_num_cluster
        score = [0] * max_num_cluster
        params = [(obs, i+1, est_method) for i in range(max_num_cluster)]

        p = Pool(max_num_cluster)
        models = p.map(pp_cluster_state_retrieval, params)
        p.close()
        p.join()

        model_dict = dict(models)
        for k, v in model_dict.iteritems():
            model_set[k] = v[0]
            score[k] = v[1]



    score_err_sum = np.zeros(max_num_cluster)
    log.info('Finding knee points of log likelihood...')

    for i in range(max_num_cluster):
        a_0 = score[:(i)]
        if len(a_0) > 1:
            slope, intercept, r_value, p_value, std_err = stats.linregress(range(len(a_0)),a_0)
            sqr_sum_err0 = sum(((slope*np.arange(len(a_0)) + intercept)-a_0)**2)
        else:
            sqr_sum_err0=0
        a_1 = score[(i):]
        if len(a_1) > 1:
            slope, intercept, r_value, p_value, std_err = stats.linregress(range(len(a_1)),a_1)
            sqr_sum_err1 = sum(((slope*np.arange(len(a_1)) + intercept)-a_1)**2)
        else:
            sqr_sum_err1 = 0
        score_err_sum[i] = sqr_sum_err0 + sqr_sum_err1
    # Optimum number of clusters.
    min_idx = np.argmin(score_err_sum)
    opt_num_cluster = min_idx+1
    log.info('opt_num_cluster: ' + str(opt_num_cluster))

    if est_method == 'kmean':
        label = model_set[min_idx].labels_
    elif est_method == 'gmm':
        label = model_set[min_idx].predict(obs)
    else:
        raise NameError('not supported est_method')
    return label, opt_num_cluster, model_set[min_idx], score, score_err_sum
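
The knee search above splits the score curve at every index, fits a line to each side, and keeps the split minimizing the total squared residual. A standalone sketch of that computation on a made-up likelihood curve (toy numbers, not from the source):

import numpy as np
from scipy import stats

# toy log-likelihood curve that flattens after the first few clusters
score = np.array([-120.0, -45.0, -40.0, -38.0, -37.0, -36.5])

score_err_sum = np.zeros(len(score))
for i in range(len(score)):
    total = 0.0
    for seg in (score[:i], score[i:]):
        if len(seg) > 1:
            x = np.arange(len(seg))
            slope, intercept = stats.linregress(x, seg)[:2]
            total += np.sum((slope * x + intercept - seg) ** 2)
    score_err_sum[i] = total

print(np.argmin(score_err_sum) + 1)  # estimated optimum number of clusters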
Example #32
0
def get_weather_timelet(data_dict,t_slots, timelet_inv, use_weather_data_bin=True):

    log.info('------------------------------------')
    log.info('Retrieving weather data... ')
    log.info('------------------------------------')
    t_start = t_slots[0]
    t_end = t_slots[-1]
    log.info('start time: ' + str(t_start) + ' ~ end time: ' + str(t_end))

    # Date iteration given start time and end-time
    # Iterate for each day for all weather data types
    for date_idx, date in enumerate(daterange(t_start, t_end, inclusive=True)):
        log.info("weather date : " + date.strftime("%Y-%m-%d"))

        temp = date.strftime("%Y,%m,%d").rsplit(',')

        if use_weather_data_bin:
            filename = WEATHER_DIR + "%04d_%02d_%02d.bin"%(int(temp[0]), int(temp[1]), int(temp[2]))
            data_day = mt.loadObjectBinaryFast(filename)
        else:
            data_day = rw.retrieve_data('SDH', int(temp[0]), int(temp[1]), int(temp[2]), view='d')

        # split the day's data into lines
        data_day = data_day.split('\n')

        # Iterate for each time index(h_idx) of a day  for all weather data types
        for h_idx, hour_sample in enumerate(data_day):

            hour_samples = hour_sample.split(',')

            # Initialize weather data lists of dictionary
            # The first row is always the list of weather data types
            if (h_idx == 0) and (date_idx == 0):

                sensor_name_list = hour_sample.split(',')
                sensor_name_list = [sensor_name.replace('/', '-') for sensor_name in sensor_name_list]

                for sample_idx, each_sample in enumerate(hour_samples):
                    sensor_name = sensor_name_list[sample_idx]
                    sensor_read = [[] for i in range(len(t_slots))]
                    stime_read = [[] for i in range(len(t_slots))]  # Create the list of lists for minute index
                    utc_t = []
                    val = []
                    #data_dict.update({sensor_name:sensor_read})
                    #data_dict.update({sensor_name:zip(mtime_read,sensor_read)})
                    data_dict.update({sensor_name: [stime_read, sensor_read, [utc_t, val]]})

            elif h_idx > 0:
                ################################################################
                # 'DateUTC' is the one
                sample_DateUTC = hour_samples[sensor_name_list.index('DateUTC')]

                # convert UTC time to VTT local time.
                utc_dt = dt.datetime.strptime(sample_DateUTC, "%Y-%m-%d %H:%M:%S")
                vtt_dt_aware = utc_dt.replace(tzinfo=from_zone).astimezone(to_zone)

                # convert to offset-naive from offset-aware datetimes
                vtt_dt = dt.datetime(*(vtt_dt_aware.timetuple()[:6]))

                ### WARNING: vtt_utc is not utc
                #log.warn("vtt_utc is not utc")
                vtt_utc = dtime_to_unix([vtt_dt])

                # Check boundary condition
                if int((vtt_dt - t_slots[0]).total_seconds()) < 0 or int((vtt_dt - t_slots[-1]).total_seconds()) >= timelet_inv.seconds:
                    log.debug('skipping weather data out of analysis range...')
                    continue

                slot_idx = int((vtt_dt - t_slots[0]).total_seconds() / timelet_inv.seconds)
                cur_sec_val = (vtt_dt - t_slots[slot_idx]).total_seconds()

                if cur_sec_val >= timelet_inv.seconds:
                    log.critical('sec: ' + str(cur_sec_val))
                    raise NameError('Seconds from an hour idx cannot be greater than '+str(timelet_inv.seconds) +'secs')

                # time slot index for a given weather sample time
                try:

                    for sample_idx, each_sample in enumerate(hour_samples):

                        # convert string type to float if possible
                        try:
                            each_sample = float(each_sample)
                        except ValueError:
                            each_sample = each_sample

                        sensor_name = sensor_name_list[sample_idx]

                        if sensor_name in data_dict:
                            if each_sample != 'N/A' and each_sample !=[]:
                                #data_dict[sensor_name][vtt_dt_idx].append(each_sample)
                                data_dict[sensor_name][0][slot_idx].append(cur_sec_val)
                                data_dict[sensor_name][1][slot_idx].append(each_sample)
                                data_dict[sensor_name][2][0].append(vtt_utc)
                                data_dict[sensor_name][2][1].append(each_sample)

                        else:
                            raise NameError('Inconsistency in the list of weather data')

                except ValueError:
                    slot_idx = -1

            # hour_sample is the list of weather field names, discard
            else:

                hour_sample = list()

    return sensor_name_list
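
A usage sketch (assumed, not from the source): this needs either daily weather bin files under WEATHER_DIR or rw.retrieve_data access, plus the module-level from_zone/to_zone and helpers:

import datetime as dt

# hypothetical hourly slot grid spanning two days; WEATHER_DIR must hold a
# <year>_<month>_<day>.bin dump for each date in the range
timelet_inv = dt.timedelta(hours=1)
t_slots = [dt.datetime(2014, 1, 1) + i * timelet_inv for i in range(48)]

data_dict = dict()
# fills data_dict in place (one [stime, values, [utc, val]] entry per weather
# field) and returns the field names parsed from the first header row
weather_list = get_weather_timelet(data_dict, t_slots, timelet_inv,
                                   use_weather_data_bin=True)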
Example #33
0
def get_val_timelet(reading, t_slots, ans_start_t, ans_end_t, timelet_inv):

    data = dict()
    data['value'] = np.array([r[1] for r in reading], dtype=float)

    ts_list = list()
    for r in reading:
        local_dt = dt.datetime.fromtimestamp(r[0])
        time_tup = local_dt.timetuple()
        ts_list.append([local_dt, time_tup[5], time_tup[4], time_tup[3], time_tup[6], time_tup[2], time_tup[1]])

    data['ts'] = np.array(ts_list)

    # data always contains the 'value' and 'ts' keys, so test the readings
    if not len(data['value']):
        log.critical('Error in file reading: empty data. Skip and need to be purged from sensor list')

        sensor_read = -1
        stime_read = -1
        utc_t = -1
        val = -1
        return sensor_read, stime_read, utc_t, val

    if (len(data["ts"]) < MIN_NUM_VAL_FOR_FLOAT) or (len(data["value"]) < MIN_NUM_VAL_FOR_FLOAT):
        log.critical('No data included ' + str(data) + '... Skip and need to be purged from sensor list')

        sensor_read = -1
        stime_read = -1
        utc_t = -1
        val = -1
        return sensor_read, stime_read, utc_t, val

    nan_idx_list = np.nonzero(np.isnan(data["value"]))[0]
    sensor_val = np.delete(data["value"], nan_idx_list, axis=0)
    time_val = np.delete(data["ts"], nan_idx_list, axis=0)

    # Create the list of lists for value
    sensor_read = [[] for i in range(len(t_slots))]

    # Create the list of lists for seconds index
    stime_read = [[] for i in range(len(t_slots))]

    utc_t = []
    val = []

    for t_sample, v_sample in zip(time_val, sensor_val):
        temp_dt = t_sample[DT_IDX]

        if temp_dt < ans_start_t or temp_dt >= ans_end_t:
            continue

        try:
            idx = int((temp_dt - ans_start_t).total_seconds() / timelet_inv.seconds)
            sensor_read[idx].append(v_sample)
            #secs=t_sample[MIN_IDX]*MIN+t_sample[SEC_IDX]
            secs = (temp_dt - t_slots[idx]).total_seconds()
            if secs >= timelet_inv.seconds:
                log.info('sec: ' + str(secs))
                raise NameError('Seconds from an hour idx cannot be greater than ' + str(timelet_inv.seconds) + 'secs')

            stime_read[idx].append(secs)

        except ValueError:
            idx = -1

        utc_temp = dtime_to_unix([t_sample[DT_IDX]])
        utc_t.append(utc_temp)
        val.append(v_sample)

    return sensor_read, stime_read, utc_t, val
Example #34
0
def construct_data_dict(sensor_data, ans_start_t, ans_end_t, timelet_inv, include_weather=1, PARALLEL=False):

    log.info('-' * 80)
    log.info('mapping sensor list into hashing table using dictionary')
    log.info('Align sensor data into a single time_slots reference... from ' + str(ans_start_t) + ' to ' + str(ans_end_t))
    log.info('-' * 80)

    # Variable Declare and initialization
    time_slots = list()
    start = ans_start_t
    while start < ans_end_t:
        time_slots.append(start)
        start = start + timelet_inv

    # Data dictionary
    # All sensor and weather data is processed and structured into
    # a consistent single data format -- Dictionary
    data_dict = dict()
    sensor_list = list()
    purge_list = list()

    # Data access pattern:
    # data_dict[key][time_slot_idx][(min_idx=0 or values=1)]

    if PARALLEL:

        log.info("construct_data_dict >>> Parallel enabled")
        args = [(sensor_uuid, sensor_reading, time_slots, ans_start_t, ans_end_t, timelet_inv) for sensor_uuid, sensor_reading in sensor_data.iteritems() ]

        p = Pool(CPU_CORE_NUM)
        timed_vlist = p.map(pp_construct_data_dict, args)
        p.close()
        p.join()

        for v in timed_vlist:
            sensor_uuid, timed_value = v

            if len(timed_value):
                sensor_list.append(sensor_uuid)
                data_dict.update({sensor_uuid: timed_value})

            else:
                purge_list.append(sensor_uuid)

    else:

        for sensor_uuid, sensor_reading in sensor_data.iteritems():

            log.info('sampling sensor uuid ' + sensor_uuid)
            len_time_slots = len(time_slots)

            # sensor value is read by time
            dict_sensor_val, dict_stime, utc_t, val =\
                get_val_timelet(sensor_reading, time_slots, ans_start_t, ans_end_t, timelet_inv)

            if dict_sensor_val == -1:
                log.debug('append purge list: dict_sensor_val=-1 ' + sensor_uuid)
                purge_list.append(sensor_uuid)

            elif len(utc_t) < len_time_slots:
                log.debug('append purge list:len(utc_t)<len_time_slots' + sensor_uuid)
                purge_list.append(sensor_uuid)

            elif len(val) < len_time_slots:
                log.debug('append purge list:len(val)<len_time_slots' + sensor_uuid)
                purge_list.append(sensor_uuid)

            else:
                sensor_list.append(sensor_uuid)

                # Convert list to array type to reduce bin file size and loading time
                dict_sensor_val_temp = np.array([np.asarray(val_) for val_ in dict_sensor_val])
                dict_stime_temp = np.array([np.asarray(t_) for t_ in dict_stime])
                utc_t_val_temp = np.asarray([utc_t, val])

                data_dict.update({sensor_uuid: [dict_stime_temp, dict_sensor_val_temp, utc_t_val_temp]})

            log.info('-' * 20)

    data_dict.update({'time_slots': time_slots})
    log.info('-' * 40)

    # directly access internet
    if include_weather == 1:
        log.info("Construction weather dict")
        #weather_list -that is pretty much fixed from database
        #(*) is the data to be used for our analysis
        #0 TimeEEST
        #1 TemperatureC (*)
        #2 Dew PointC (*)
        #3 Humidity (*)
        #4 Sea Level PressurehPa
        #5 VisibilityKm
        #6 Wind Direction
        #7 Wind SpeedKm/h
        #8 Gust SpeedKm/h
        #9 Precipitationmm
        #10 Events (*)
        #11 Conditions (*)
        #12 WindDirDegrees
        #13 DateUTC

        weather_list = get_weather_timelet(data_dict, time_slots, timelet_inv)
        # Convert symbols to integer representation

        data_dict['Conditions'][1], Conditions_dict = symbol_to_state(data_dict['Conditions'][1])
        data_dict['Events'][1], Events_dict = symbol_to_state(data_dict['Events'][1])
        data_dict.update({'sensor_list': sensor_list})
        data_dict.update({'weather_list' : weather_list})
        data_dict.update({'Conditions_dict': Conditions_dict})
        data_dict.update({'Events_dict' : Events_dict})

        # Change List to Array type
        for key_id in weather_list:
            temp_list = list()
            for k, list_val_ in enumerate(data_dict[key_id]):
                temp_list.append(np.asanyarray(list_val_))

            data_dict[key_id] = temp_list

    # use stored bin file
    elif include_weather == 2:
        log.info('use weather_dict.bin')
        # This part to be filled with Khiem......

    else:
        log.info('skip weather database...')

    return data_dict, purge_list
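
A minimal usage sketch with made-up readings; the sampling is dense enough that none of the purge conditions should trigger:

import datetime as dt
import time

# hypothetical input: one sensor uuid mapped to (unix_ts, value) pairs
start = dt.datetime(2014, 1, 1)
readings = [(int(time.mktime((start + dt.timedelta(minutes=m)).timetuple())), 20.0)
            for m in range(0, 24 * 60, 10)]
sensor_data = {'uuid-0001': readings}

timelet_inv = dt.timedelta(minutes=15)
data_dict, purge_list = construct_data_dict(
    sensor_data, start, start + dt.timedelta(days=1), timelet_inv,
    include_weather=0, PARALLEL=False)
# data_dict['uuid-0001'] -> [stime array, value array, [utc_t, val] array]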
Example #35
0
def state_retrieval(obs,
                    max_num_cluster=6,
                    off_set=0,
                    est_method='kmean',
                    PARALLEL=False):
    log.info('-' * 40)
    log.info('Retrieving discrete states from data using ' + est_method +
             ' model...')
    log.info('-' * 40)
    log.info('try ' + str(max_num_cluster) + ' clusters..... ')
    score = np.zeros(max_num_cluster)
    model_set = list()

    if not PARALLEL:
        for num_cluster in range(max_num_cluster):
            log.info('Try ' + str(num_cluster + 1) + ' clusters ')
            log.info('-----------------------------------')
            if est_method == 'kmean':
                kmean = KMeans(n_clusters=num_cluster + 1).fit(obs)
                model_set.append(kmean)
                #score[num_cluster]=-1*np.log(-1*np.sum(kmean.score(obs)))
                #score[num_cluster]=kmean.score(obs)
                #score[num_cluster]=kmean.score(obs)-.5*(num_cluster+1)*1*log10(len(obs))
                #log_ll_val=compute_log_ll(kmean.labels_,obs)
                score[num_cluster] = compute_log_ll(kmean.labels_, obs)

            elif est_method == 'gmm':
                gmm = mixture.GMM(n_components=num_cluster + 1).fit(obs)
                model_set.append(gmm)
                score[num_cluster] = np.sum(gmm.score(obs))

            else:
                log.error('not supported est_method')
                raise NameError('not supported est_method')
    else:
        log.info('Parallel enabled...')
        model_set = [0] * max_num_cluster
        score = [0] * max_num_cluster
        params = [(obs, i + 1, est_method) for i in range(max_num_cluster)]

        p = Pool(max_num_cluster)
        models = p.map(pp_cluster_state_retrieval, params)
        p.close()
        p.join()

        model_dict = dict(models)
        for k, v in model_dict.iteritems():
            model_set[k] = v[0]
            score[k] = v[1]

    score_err_sum = np.zeros(max_num_cluster)
    log.info('Finding knee points of log likelihood...')

    for i in range(max_num_cluster):
        a_0 = score[:(i)]
        if len(a_0) > 1:
            slope, intercept, r_value, p_value, std_err = stats.linregress(
                range(len(a_0)), a_0)
            sqr_sum_err0 = sum(
                ((slope * np.arange(len(a_0)) + intercept) - a_0)**2)
        else:
            sqr_sum_err0 = 0
        a_1 = score[(i):]
        if len(a_1) > 1:
            slope, intercept, r_value, p_value, std_err = stats.linregress(
                range(len(a_1)), a_1)
            sqr_sum_err1 = sum(
                ((slope * np.arange(len(a_1)) + intercept) - a_1)**2)
        else:
            sqr_sum_err1 = 0
        score_err_sum[i] = sqr_sum_err0 + sqr_sum_err1
    # Optimum number of clusters.
    min_idx = np.argmin(score_err_sum)
    opt_num_cluster = min_idx + 1
    log.info('opt_num_cluster: ' + str(opt_num_cluster))

    if est_method == 'kmean':
        label = model_set[min_idx].labels_
    elif est_method == 'gmm':
        label = model_set[min_idx].predict(obs)
    else:
        raise NameError('not supported est_method')
    return label, opt_num_cluster, model_set[min_idx], score, score_err_sum
Example #36
0
def data_summerization(bldg_key, data_dict, proc_avg=True, proc_diff=True, PARALLEL=False):

    time_slots = data_dict['time_slots'][:]
    conditions_dict = data_dict['Conditions_dict'].copy()
    events_dict = data_dict['Events_dict'].copy()
    sensor_list = data_dict['sensor_list'][:]
    weather_list = data_dict['weather_list'][:]
    weather_list_used = ['TemperatureC', 'Dew PointC', 'Humidity', 'Events', 'Conditions']

    # data_used is the list of reference names for all measurements from now on.
    data_used = sensor_list + weather_list_used
    # This is a global ID for data_used measurement
    data_used_idx = range(len(data_used))
    sensor_idx = range(len(sensor_list))
    weather_idx = range(len(sensor_list), len(data_used))
    dsout = {'data_dict': data_dict}

    if proc_avg:
        log.info('-' * 40)
        log.info('processing avg.feature..')
        log.info('-' * 40)

        X_Feature, X_Time, X_names, X_zero_var_list, X_zero_var_val, X_int_type_list,\
        X_int_type_idx, X_float_type_list, X_float_type_idx, X_weather_type_idx, X_sensor_type_idx = \
            build_feature_matrix(data_dict, sensor_list, weather_list_used, time_slots, interpolation=1, max_num_succ_idx_for_itpl=int(len(time_slots)*0.05))

        build_feature_matrix_out = \
            {'X_Feature': X_Feature,
             'X_Time': X_Time,
             'X_names': X_names,
             'X_zero_var_list': X_zero_var_list,
             'X_zero_var_val': X_zero_var_val,
             'X_int_type_list': X_int_type_list,
             'X_int_type_idx': X_int_type_idx,
             'X_float_type_list': X_float_type_list,
             'X_float_type_idx': X_float_type_idx,
             'X_weather_type_idx': X_weather_type_idx,
             'X_sensor_type_idx': X_sensor_type_idx}

        build_feature_matrix_out = obj(build_feature_matrix_out)

        if len(X_names+X_zero_var_list) != len(data_used):
            log.error('Missing name is found in X_names or X_zero_var_list')
            raise NameError('Missing name is found in X_names or X_zero_var_list')

        else:
            zero_var_idx = [data_used.index(name_str) for name_str in X_zero_var_list]
            nzero_var_idx = list(set(data_used_idx)-set(zero_var_idx))
        
        if X_Feature.shape[0] > 0:
            # From below all index are reference to X_Feature
            sf_idx = list(set(X_sensor_type_idx)&set(X_float_type_idx))
            # Equivalent to np.array(data_used)[np.array(nzero_var_idx)[sf_idx]]
            sf_name = list(np.array(X_names)[sf_idx])
            si_idx = list(set(X_sensor_type_idx)&set(X_int_type_idx))
            si_name = list(np.array(X_names)[si_idx])
            wf_idx = list(set(X_weather_type_idx)&set(X_float_type_idx))
            wf_name = list(np.array(X_names)[wf_idx])
            wi_idx = list(set(X_weather_type_idx)&set(X_int_type_idx))
            wi_name = list(np.array(X_names)[wi_idx])

            # Euclidean distance matrix of floating-type data only
            float_idx = list(set(sf_idx)| set(wf_idx))
            int_idx = list(set(si_idx)| set(wi_idx))

            # Float Type Measurement Clustering
            X_Feature_sfe, sf_exemplars_dict, exemplars_, labels_ = \
                cluster_measurement_points(X_Feature[:, sf_idx], sf_name, corr_bnd=[0.1, 0.9], alg='aff')

            sfe_idx = list(np.array(sf_idx)[exemplars_])
            #plot_label(X_Feature,X_names,labels_,exemplars_,[4,5,6,7])

            # Int Type Measurement Clustering
            X_Feature_sie, si_exemplars_dict, exemplars_, labels_ = \
                cluster_measurement_points(X_Feature[:, si_idx], si_name, corr_bnd=[0.0, 0.9], alg='aff')
            sie_idx = list(np.array(si_idx)[exemplars_])

            # sensor -float type
            sfe_state, sfe_corr_val = x_input_to_states(X_Feature_sfe, CORR_VAL_OUT=1)

            # sensor -integer type
            sie_state = X_Feature_sie

            # weather -float type
            wf_state, wf_corr_val = x_input_to_states(X_Feature[:, wf_idx], CORR_VAL_OUT=1)

            # weather -integer type
            wi_state = X_Feature[:, wi_idx]

            empty_states = np.array([[] for i in range(len(X_Time))])
            if len(sfe_state) == 0:
                sfe_state = empty_states

            if len(sie_state) == 0:
                sie_state = empty_states

            if len(wf_state) ==0:
                wf_state = empty_states

            if len(wi_state) == 0:
                wi_state = empty_states

            # Exemplar sensor only    
            X_Sensor_STATE = np.append(sfe_state,sie_state, axis=1)
            X_Sensor_STATE = X_Sensor_STATE.astype(int)
            X_Sensor_NAMES = list(np.array(X_names)[sfe_idx]) + list(np.array(X_names)[sie_idx])

            X_Weather_STATE = np.append(wf_state,wi_state, axis=1)
            X_Weather_STATE = X_Weather_STATE.astype(int)
            X_Weather_NAMES = list(np.array(X_names)[wf_idx])+list(np.array(X_names)[wi_idx])

            # months of a year, days of a week, and hours of a day
            # (Monday, Tuesday, Wednesday, Thursday, Friday, Saturday, Sunday) = (0, 1, 2, 3, 4, 5, 6)
            X_Time_STATE_temp = build_time_states(X_Time)
            X_Time_NAMES_temp = ['MTH', 'WD', 'HR']
            X_Time_STATE = list()
            X_Time_NAMES = list()

            for xt_col, xt_name in zip(X_Time_STATE_temp.T,X_Time_NAMES_temp):
                if len(set(xt_col)) > 1:
                    X_Time_STATE.append(xt_col)
                    X_Time_NAMES.append(xt_name)
            
            X_Time_STATE = np.array(X_Time_STATE).T

            #################################################
            # FORMATTED DATA FOR REGULAR EVENT
            #################################################
            # DO_PROB_EST=1  *** Save these variables ***
            #avgdata_mat = np.hstack([X_Sensor_STATE,X_Weather_STATE,X_Time_STATE])
            #avgdata_names = X_Sensor_NAMES+X_Weather_NAMES+X_Time_NAMES
            avgdata_exemplar = dict(sf_exemplars_dict.items()+si_exemplars_dict.items())
            avgdata_zvar = X_zero_var_list
            
            avgdata_dict = dict()
            avgdata_dict.update({'build_feature_matrix_out': build_feature_matrix_out})

            avgdata_dict.update({'avgdata_state_mat': X_Sensor_STATE})
            avgdata_dict.update({'avgdata_weather_mat': X_Weather_STATE})
            avgdata_dict.update({'avgdata_time_mat': X_Time_STATE})

            avgdata_dict.update({'avg_time_slot': X_Time})
            avgdata_dict.update({'avgdata_exemplar': avgdata_exemplar})
            avgdata_dict.update({'avgdata_zvar': avgdata_zvar})

            avgdata_dict.update({'sensor_names': X_Sensor_NAMES})
            avgdata_dict.update({'weather_names': X_Weather_NAMES})
            avgdata_dict.update({'time_names': X_Time_NAMES})
            dsout.update({'avgdata_dict': avgdata_dict})

    if proc_diff:
        # NOTE: this pass reuses X_Time, wf_state, wi_state, X_Weather_STATE,
        # X_Weather_NAMES and X_Time_NAMES from the avg pass above, so
        # proc_diff=True requires proc_avg=True.
        log.info('-' * 40)
        log.info('processing diff.feature..')
        log.info('-' * 40)
        ####################################
        # Irregular Event Extraction
        ####################################
        # Interpolation with outlier removal. Weather data is excluded from the
        # irregular-event analysis since it normally changes slowly in time, so
        # no meaningful diff values are expected from it.
        measurement_point_set, num_type_set = interpolation_measurement(data_dict, sensor_list, err_rate=1, sgm_bnd=20)

        # Irregular matrix
        Xdiff_Mat,\
        Xdiff_Time,\
        Xdiff_Names,\
        Xdiff_zero_var_list,\
        Xdiff_zero_var_val,\
        Xdiff_int_type_list,\
        Xdiff_int_type_idx,\
        Xdiff_float_type_list,\
        Xdiff_float_type_idx =\
            build_diff_matrix(measurement_point_set, time_slots, num_type_set, sensor_list, PARALLEL=PARALLEL)

        build_diff_matrix_out = \
            {'Xdiff_Mat':Xdiff_Mat,
             'Xdiff_Time':Xdiff_Time,
             'Xdiff_Names':Xdiff_Names,
             'Xdiff_zero_var_list':Xdiff_zero_var_list,
             'Xdiff_zero_var_val':Xdiff_zero_var_val,
             'Xdiff_int_type_list':Xdiff_int_type_list,
             'Xdiff_int_type_idx':Xdiff_int_type_idx,
             'Xdiff_float_type_list':Xdiff_float_type_list,
             'Xdiff_float_type_idx':Xdiff_float_type_idx}

        build_diff_matrix_out = obj(build_diff_matrix_out)

        if Xdiff_Mat.shape[0] > 0:
            #==============================================================================
            # Restructure the diff matrix and weather matrix onto the same common time slots
            #==============================================================================
            time_slots_array = np.sort(np.array(list(set(Xdiff_Time) & set(X_Time))))

            # Extract subset of X_Weather_STATE
            removed_idx_list = list()
            for ridx, slot in enumerate(X_Time):
                slot_idx = np.where(time_slots_array==slot)[0]

                # slot not in common time slots
                if len(slot_idx) == 0:
                    removed_idx_list.append(ridx)

            XDIFF_Weather_STATE = np.delete(X_Weather_STATE, removed_idx_list,axis=0)

            # Extract subset of Xdiff_Mat
            removed_idx_list = list()
            for ridx,slot in enumerate(Xdiff_Time):
                slot_idx = np.where(time_slots_array == slot)[0]

                # slot not in common time slots
                if len(slot_idx) == 0:
                    removed_idx_list.append(ridx)

            Xdiff_Mat = np.delete(Xdiff_Mat, removed_idx_list, axis=0)

            # Update Xdiff_Time
            Xdiff_Time = time_slots_array
            XDIFF_Weather_STATE = np.array(XDIFF_Weather_STATE)    

            # From below all index are reference to X_Feature
            xdiff_sf_idx = Xdiff_float_type_idx
            xdiff_sf_name = Xdiff_float_type_list
            xdiff_si_idx = Xdiff_int_type_idx
            xdiff_si_name = Xdiff_int_type_list

            # Float Type Measurement Clustering
            X_Diff_sfe, sf_diff_exemplars_dict, exemplars_, labels_ = \
                cluster_measurement_points(Xdiff_Mat[:, xdiff_sf_idx], xdiff_sf_name, corr_bnd=[0.1, 0.9])
            xdiff_sfe_idx = list(np.array(xdiff_sf_idx)[exemplars_])

            # Int Type Measurement Clustering
            X_Diff_sie, si_diff_exemplars_dict, exemplars_, labels_ = \
                cluster_measurement_points(Xdiff_Mat[:, xdiff_si_idx], xdiff_si_name, corr_bnd=[0.1, 0.9])
            xdiff_sie_idx = list(np.array(xdiff_si_idx)[exemplars_])

            # sensor -float type
            xdiff_sfe_state, xdiff_sfe_corr_val =\
                x_input_to_states(X_Diff_sfe, CORR_VAL_OUT=1, PARALLEL=PARALLEL)

            # sensor -integer type
            xdiff_sie_state = X_Diff_sie
            empty_states = np.array([[] for i in range(len(Xdiff_Time))])

            if len(xdiff_sfe_state) == 0:
                xdiff_sfe_state = empty_states

            if len(xdiff_sie_state) == 0:
                xdiff_sie_state = empty_states

            if len(wf_state) == 0:
                wf_state = empty_states

            if len(wi_state) == 0:
                wi_state = empty_states

            # Exemplar sensor only    
            XDIFF_Sensor_STATE = np.append(xdiff_sfe_state,xdiff_sie_state, axis=1)
            XDIFF_Sensor_STATE = XDIFF_Sensor_STATE.astype(int)
            XDIFF_Sensor_NAMES = list(np.array(Xdiff_Names)[xdiff_sfe_idx])+list(np.array(Xdiff_Names)[xdiff_sie_idx])

            # months of a year, days of a week, and hours of a day
            # (Monday, Tuesday, Wednesday, Thursday, Friday, Saturday, Sunday) = (0, 1, 2, 3, 4, 5, 6)
            XDIFF_Time_STATE_temp = build_time_states(Xdiff_Time)
            XDIFF_Time_NAMES_temp = ['MTH', 'WD', 'HR']
            XDIFF_Time_STATE = list()
            XDIFF_Time_NAMES = list()
            for xt_col, xt_name in zip(XDIFF_Time_STATE_temp.T, XDIFF_Time_NAMES_temp):
                if len(set(xt_col)) > 1:
                    XDIFF_Time_STATE.append(xt_col)
                    XDIFF_Time_NAMES.append(xt_name)
            XDIFF_Time_STATE = np.array(XDIFF_Time_STATE).T

            #################################################
            # FORMATTED DATA FOR IRREGULAR EVENT
            #################################################
            log.info("FORMATTED DATA FOR IRREGULAR EVENT")
            #** Save this variables***
            #diffdata_mat = np.hstack([XDIFF_Sensor_STATE,X_Weather_STATE,XDIFF_Time_STATE])
            #diffdata_names = XDIFF_Sensor_NAMES+X_Weather_NAMES+XDIFF_Time_NAMES
            diffdata_exemplar = dict(sf_diff_exemplars_dict.items() + si_diff_exemplars_dict.items())
            diffdata_zvar = Xdiff_zero_var_list

            diffdata_dict = dict()
            diffdata_dict.update({'build_diff_matrix_out': build_diff_matrix_out})

            diffdata_dict.update({'diffdata_state_mat': XDIFF_Sensor_STATE})
            diffdata_dict.update({'diffdata_weather_mat': XDIFF_Weather_STATE})
            diffdata_dict.update({'diffdata_time_mat': XDIFF_Time_STATE})

            diffdata_dict.update({'diff_time_slot': Xdiff_Time})
            diffdata_dict.update({'diffdata_exemplar': diffdata_exemplar})
            diffdata_dict.update({'diffdata_zvar': diffdata_zvar})

            diffdata_dict.update({'sensor_names': XDIFF_Sensor_NAMES})
            diffdata_dict.update({'weather_names': X_Weather_NAMES})
            diffdata_dict.update({'time_names': X_Time_NAMES})

            dsout.update({'diffdata_dict': diffdata_dict})

    dsout.update({'bldg_key': remove_dot(bldg_key)})

    return dsout
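
A sketch of calling the summarization end to end (hypothetical key and flags), reusing a data_dict produced by construct_data_dict with include_weather=1; note that proc_diff additionally reuses intermediate results from the avg pass:

# hypothetical end-to-end call on a weather-enabled data_dict
dsout = data_summerization('GW1_', data_dict,
                           proc_avg=True, proc_diff=False, PARALLEL=False)

avgdata = dsout['avgdata_dict']
log.info('sensor state matrix: ' + str(avgdata['avgdata_state_mat'].shape))
log.info('exemplar map: ' + str(avgdata['avgdata_exemplar']))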
Example #37
0
def get_weather_timelet(data_dict,
                        t_slots,
                        timelet_inv,
                        use_weather_data_bin=True):

    log.info('------------------------------------')
    log.info('Retrieving weather data... ')
    log.info('------------------------------------')
    t_start = t_slots[0]
    t_end = t_slots[-1]
    log.info('start time: ' + str(t_start) + ' ~ end time: ' + str(t_end))

    # Date iteration given start time and end-time
    # Iterate for each day for all weather data types
    for date_idx, date in enumerate(daterange(t_start, t_end, inclusive=True)):
        log.info("weather date : " + date.strftime("%Y-%m-%d"))

        temp = date.strftime("%Y,%m,%d").rsplit(',')

        if use_weather_data_bin:
            filename = WEATHER_DIR + "%04d_%02d_%02d.bin" % (int(
                temp[0]), int(temp[1]), int(temp[2]))
            data_day = mt.loadObjectBinaryFast(filename)
        else:
            data_day = rw.retrieve_data('SDH',
                                        int(temp[0]),
                                        int(temp[1]),
                                        int(temp[2]),
                                        view='d')

        # split the day's data into lines
        data_day = data_day.split('\n')

        # Iterate for each time index(h_idx) of a day  for all weather data types
        for h_idx, hour_sample in enumerate(data_day):

            hour_samples = hour_sample.split(',')

            # Initialize weather data lists of dictionary
            # The first row is always the list of weather data types
            if (h_idx == 0) and (date_idx == 0):

                sensor_name_list = hour_sample.split(',')
                sensor_name_list = [
                    sensor_name.replace('/', '-')
                    for sensor_name in sensor_name_list
                ]

                for sample_idx, each_sample in enumerate(hour_samples):
                    sensor_name = sensor_name_list[sample_idx]
                    sensor_read = [[] for i in range(len(t_slots))]
                    # Create the list of lists for minute index
                    stime_read = [[] for i in range(len(t_slots))]
                    utc_t = []
                    val = []
                    #data_dict.update({sensor_name:sensor_read})
                    #data_dict.update({sensor_name:zip(mtime_read,sensor_read)})
                    data_dict.update(
                        {sensor_name: [stime_read, sensor_read, [utc_t, val]]})

            elif h_idx > 0:
                ################################################################
                # 'DateUTC' is the one
                sample_DateUTC = hour_samples[sensor_name_list.index(
                    'DateUTC')]

                # convert UTC time to VTT local time.
                utc_dt = dt.datetime.strptime(sample_DateUTC,
                                              "%Y-%m-%d %H:%M:%S")
                vtt_dt_aware = utc_dt.replace(
                    tzinfo=from_zone).astimezone(to_zone)

                # convert to offset-naive from offset-aware datetimes
                vtt_dt = dt.datetime(*(vtt_dt_aware.timetuple()[:6]))

                ### WARNING: vtt_utc is not utc
                #log.warn("vtt_utc is not utc")
                vtt_utc = dtime_to_unix([vtt_dt])

                # Check boundary condition
                if int((vtt_dt - t_slots[0]).total_seconds()) < 0 or int(
                    (vtt_dt -
                     t_slots[-1]).total_seconds()) >= timelet_inv.seconds:
                    log.debug('skipping weather data out of analysis range...')
                    continue

                slot_idx = int((vtt_dt - t_slots[0]).total_seconds() /
                               timelet_inv.seconds)
                cur_sec_val = (vtt_dt - t_slots[slot_idx]).total_seconds()

                if cur_sec_val >= timelet_inv.seconds:
                    log.critical('sec: ' + str(cur_sec_val))
                    raise NameError(
                        'Seconds from an hour idx cannot be greater than ' +
                        str(timelet_inv.seconds) + 'secs')

                # time slot index for a given weather sample time
                try:

                    for sample_idx, each_sample in enumerate(hour_samples):

                        # convert string type to float if possible
                        try:
                            each_sample = float(each_sample)
                        except ValueError:
                            each_sample = each_sample

                        sensor_name = sensor_name_list[sample_idx]

                        if sensor_name in data_dict:
                            if each_sample != 'N/A' and each_sample != []:
                                #data_dict[sensor_name][vtt_dt_idx].append(each_sample)
                                data_dict[sensor_name][0][slot_idx].append(
                                    cur_sec_val)
                                data_dict[sensor_name][1][slot_idx].append(
                                    each_sample)
                                data_dict[sensor_name][2][0].append(vtt_utc)
                                data_dict[sensor_name][2][1].append(
                                    each_sample)

                        else:
                            raise NameError(
                                'Inconsistency in the list of weather data')

                except ValueError:
                    slot_idx = -1

            # hour_sample is the list of weather field names, discard
            else:

                hour_sample = list()

    return sensor_name_list
Example #38
0
    def run(self):

        from log_util import log

        try:
            while True:
                cmd = None
                try:
                    cmd = self.cmd_q.get(block=True, timeout=0.1)
                except Exception as e:
                    continue

                finally:
                    if cmd:
                        self.cmd_q.task_done()

                        try:

                            with open(META_DIR + "wip.json", 'w') as f:
                                f.write(simplejson.dumps({"wip": 1}))

                            cmdset = simplejson.loads(cmd)
                            sensor_hash = cmdset['selected-nodes']
                            s_date = datetime.strptime(cmdset['start-date'], '%Y-%m-%d')
                            e_date = datetime.strptime(cmdset['end-date'], '%Y-%m-%d')

                            if not len(sensor_hash):
                                log.critical("No sensor is selected!")
                            else:

                                log.info('****************************** Beginning of DDEA *******************************')

                                bldg_key = 'SODA'
                                #exemplar by user
                                #pname_key = '_POWER_'
                                pname_key = 'POWER'

                                s_epoch = int(time.mktime(s_date.timetuple()))
                                e_epoch = int(time.mktime(e_date.timetuple()))
                                time_inv = dt.timedelta(seconds=cmdset['time-interval'])

                                log.info("Cleaning up old output...")

                                mt.remove_all_files(FIG_DIR)
                                mt.remove_all_files(JSON_DIR)
                                mt.remove_all_files(PROC_OUT_DIR)

                                log.info("start epoch : " + str(s_epoch) + " end epoch : " + str(e_epoch))
                                log.info(str(time_inv) + ' time slot interval is set for this data set !!!')
                                log.info("BLDG_KEY : " + bldg_key + " PNAME_KEY : " + pname_key)
                                log.info('*' * 80)

                                log.info("Retrieve sensor data from quasar TSDB")

                                sensor_names_hash = mt.sensor_name_uid_dict(bldg_key, sensor_hash)

                                sensor_data = read_sensor_data(sensor_names_hash, s_epoch, e_epoch)

                                if sensor_data and len(sensor_data):
                                    ddea_process(sensor_names_hash, sensor_data, s_epoch, e_epoch, time_inv, bldg_key, pname_key)
                                else:
                                    log.critical("No sensor data available for time period and sensor selected!")

                                log.info('******************************** End of DDEA **********************************')

                            os.remove(META_DIR + "wip.json")
                            cmd_lock.clear()

                            log.info("execution-lock cleared")
                            log.info('~' * 80)

                        except Exception as e:
                            os.remove(META_DIR + "wip.json")
                            cmd_lock.clear()
                            print e
                            log.error(str(e))

        except Exception as e:
            os.remove(META_DIR + "wip.json")
            cmd_lock.clear()
            print e
            log.error(str(e))

        finally:
            sys.exit(0)
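
A hedged sketch of driving this worker, assuming run() belongs to a threading.Thread subclass (named DDEAWorker here purely for illustration) that stores the queue as self.cmd_q, and that the module-level cmd_lock, META_DIR and helper imports from the original file are in place:

import simplejson
from Queue import Queue

# hypothetical wiring of the worker thread
cmd_q = Queue()
worker = DDEAWorker(cmd_q)
worker.daemon = True
worker.start()

# commands arrive on the queue as JSON strings
cmd_q.put(simplejson.dumps({
    'selected-nodes': ['sensor-hash-1', 'sensor-hash-2'],
    'start-date': '2014-01-01',
    'end-date': '2014-01-07',
    'time-interval': 900,  # seconds per time slot
}))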
Example #39
0
def check_bounded_distance_constraint_condition(dist_mat, labels, min_dist,
                                                max_dist):
    intra_err_cnt = 0
    num_clusters = int(labels.max() + 1)
    log.info('-' * 80)
    log.info('Intra-Cluster distance check.....')
    log.info('Condition: intra-cluster distance is upper-bounded by ' +
             str(round(max_dist, 2)))
    log.info('-' * 80)

    for i in range(num_clusters):
        idx_set = np.nonzero(labels == (i))[0]
        #print '----------------------------------------------------------'
        #print i,'th cluster: ',idx_set
        for idx_pair in pair_in_idx(idx_set):
            #print idx_pair, 'dist-',round(dist_mat[idx_pair[0],idx_pair[1]],2)
            dist_val_ = dist_mat[idx_pair[0], idx_pair[1]]
            # Rule violation
            if dist_val_ > max_dist:
                log.info('*** the distance of pairs :' + str(idx_pair) +
                         ' in ' + str(i) + 'th cluster ~' +
                         str(np.round(dist_val_, 2)) + ' > max_dist=' +
                         str(np.round(max_dist, 2)) + '***')

                intra_err_cnt = intra_err_cnt + 1
    log.info('-' * 80)
    log.info('Inter-Cluster distance check.....')
    log.info('Condition: inter-cluster distance is lower-bounded by ' +
             str(round(min_dist, 2)))
    log.info('-' * 80)

    cluster_pairs = pair_in_idx(range(num_clusters))
    inter_err_cnt = 0
    for c_pair in cluster_pairs:
        idx_set_0 = np.nonzero(labels == (c_pair[0]))[0]
        idx_set_1 = np.nonzero(labels == (c_pair[1]))[0]
        #print '----------------------------------------------------------'
        #print 'The pairwise distance between ',c_pair[0],'th cluster and',c_pair[1],'th cluster'
        for idx_pair in pair_in_idx(idx_set_0, idx_set_1):
            #print idx_pair, 'dist-',round(dist_mat[idx_pair[0],idx_pair[1]],2)
            dist_val_ = dist_mat[idx_pair[0], idx_pair[1]]
            # Rule violation
            if dist_val_ < min_dist:
                log.info(
                    '*** the distance of pairs :' + str(idx_pair[0]) + ' in ' +
                    str(c_pair[0]) + ' and ' + str(idx_pair[1]) + ' in ' +
                    str(c_pair[1]) + ' ~ ' + str(round(dist_val_, 2)) +
                    ' < min_dist=' + str(round(min_dist, 2)) + '***')
                inter_err_cnt += 1
    return intra_err_cnt, inter_err_cnt
Example #40
0
def construct_data_dict(sensor_data,
                        ans_start_t,
                        ans_end_t,
                        timelet_inv,
                        include_weather=1,
                        PARALLEL=False):

    log.info('-' * 80)
    log.info('mapping sensor list into hashing table using dictionary')
    log.info('Align sensor data into a single time_slots reference... from ' +
             str(ans_start_t) + ' to ' + str(ans_end_t))
    log.info('-' * 80)

    # Variable Declare and initialization
    time_slots = list()
    start = ans_start_t
    while start < ans_end_t:
        time_slots.append(start)
        start = start + timelet_inv

    # Data dictionary
    # All sensor and weather data is processed and structured into
    # a consistent single data format -- Dictionary
    data_dict = dict()
    sensor_list = list()
    purge_list = list()

    # Data access pattern:
    # data_dict[key][time_slot_idx][(min_idx=0 or values=1)]

    if PARALLEL:

        log.info("construct_data_dict >>> Parallel enabled")
        args = [(sensor_uuid, sensor_reading, time_slots, ans_start_t,
                 ans_end_t, timelet_inv)
                for sensor_uuid, sensor_reading in sensor_data.iteritems()]

        p = Pool(CPU_CORE_NUM)
        timed_vlist = p.map(pp_construct_data_dict, args)
        p.close()
        p.join()

        for v in timed_vlist:
            sensor_uuid, timed_value = v

            if len(timed_value):
                sensor_list.append(sensor_uuid)
                data_dict.update({sensor_uuid: timed_value})

            else:
                purge_list.append(sensor_uuid)

    else:

        for sensor_uuid, sensor_reading in sensor_data.iteritems():

            log.info('sampling sensor uuid ' + sensor_uuid)
            len_time_slots = len(time_slots)

            # sensor value is read by time
            dict_sensor_val, dict_stime, utc_t, val =\
                get_val_timelet(sensor_reading, time_slots, ans_start_t, ans_end_t, timelet_inv)

            if dict_sensor_val == -1:
                log.debug('append purge list: dict_sensor_val=-1 ' +
                          sensor_uuid)
                purge_list.append(sensor_uuid)

            elif len(utc_t) < len_time_slots:
                log.debug('append purge list:len(utc_t)<len_time_slots' +
                          sensor_uuid)
                purge_list.append(sensor_uuid)

            elif len(val) < len_time_slots:
                log.debug('append purge list:len(val)<len_time_slots' +
                          sensor_uuid)
                purge_list.append(sensor_uuid)

            else:
                sensor_list.append(sensor_uuid)

                # Convert list to array type to reduce bin file size and loading time
                dict_sensor_val_temp = np.array(
                    [np.asarray(val_) for val_ in dict_sensor_val])
                dict_stime_temp = np.array(
                    [np.asarray(t_) for t_ in dict_stime])
                utc_t_val_temp = np.asarray([utc_t, val])

                data_dict.update({
                    sensor_uuid:
                    [dict_stime_temp, dict_sensor_val_temp, utc_t_val_temp]
                })

            log.info('-' * 20)

    data_dict.update({'time_slots': time_slots})
    log.info('-' * 40)

    # directly access internet
    if include_weather == 1:
        log.info("Construction weather dict")
        #weather_list -that is pretty much fixed from database
        #(*) is the data to be used for our analysis
        #0 TimeEEST
        #1 TemperatureC (*)
        #2 Dew PointC (*)
        #3 Humidity (*)
        #4 Sea Level PressurehPa
        #5 VisibilityKm
        #6 Wind Direction
        #7 Wind SpeedKm/h
        #8 Gust SpeedKm/h
        #9 Precipitationmm
        #10 Events (*)
        #11 Conditions (*)
        #12 WindDirDegrees
        #13 DateUTC

        weather_list = get_weather_timelet(data_dict, time_slots, timelet_inv)
        # Convert symbols to integer representation

        data_dict['Conditions'][1], Conditions_dict = symbol_to_state(
            data_dict['Conditions'][1])
        data_dict['Events'][1], Events_dict = symbol_to_state(
            data_dict['Events'][1])
        data_dict.update({'sensor_list': sensor_list})
        data_dict.update({'weather_list': weather_list})
        data_dict.update({'Conditions_dict': Conditions_dict})
        data_dict.update({'Events_dict': Events_dict})

        # Change List to Array type
        for key_id in weather_list:
            temp_list = list()
            for k, list_val_ in enumerate(data_dict[key_id]):
                temp_list.append(np.asanyarray(list_val_))

            data_dict[key_id] = temp_list

    # use stored bin file
    elif include_weather == 2:
        log.info('use weather_dict.bin')
        # This part is to be filled in by Khiem......

    else:
        log.info('skip weather database...')

    return data_dict, purge_list
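A minimal usage sketch for construct_data_dict (not from the source): sensor_data is assumed to be a dict mapping sensor uuid to raw readings, and the one-week window and 30-minute slot interval are illustrative values only.

import datetime as dt

# Hedged sketch: sensor_data, the window, and the slot width are assumptions.
ans_start_t = dt.datetime(2014, 1, 1)
ans_end_t = dt.datetime(2014, 1, 8)
timelet_inv = dt.timedelta(minutes=30)

data_dict, purge_list = construct_data_dict(
    sensor_data, ans_start_t, ans_end_t, timelet_inv,
    include_weather=1, PARALLEL=False)

# data_dict[uuid] -> [per-slot sample times, per-slot values, [utc_t, val]]
# data_dict['time_slots'] -> the common time axis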
Example #41
0
def plotting_bldg_bn(bldg):
    plt.ioff()

    log.info('Getting anal_out from ' + bldg.bldg_tag)

    try:
        for sig_tag, anal_out in bldg.anal_out.iteritems():

            for bn_prob in anal_out:

                p_name = bn_prob['p_name']

                try:
                    fig_name = 'BN for Sensors ' + p_name
                    plt.figure(fig_name, figsize=(30.0, 30.0))
                    col_name = bn_prob['s_labels']
                    rbn.nx_plot(bn_prob['s_hc'],
                                col_name,
                                graph_layout='spring',
                                node_text_size=30)
                    plt.savefig(FIG_DIR + bldg.bldg_tag + '_' + p_name + '_' +
                                sig_tag + '_bn_sensors' + get_pngid() + '.png',
                                bbox_inches='tight')
                    plt.close()

                except Exception as e:
                    log.error(traceback.format_exc())
                    log.error('error in ' + fig_name + ' ' + str(e))
                    pass

                try:
                    fig_name = 'BN for Time ' + p_name
                    plt.figure(fig_name, figsize=(30.0, 30.0))
                    rbn.nx_plot(bn_prob['t_hc'],
                                bn_prob['t_labels'],
                                graph_layout='spring',
                                node_text_size=30)
                    plt.savefig(FIG_DIR + bldg.bldg_tag + '_' + p_name + '_' +
                                sig_tag + '_bn_time' + get_pngid() + '.png',
                                bbox_inches='tight')
                    plt.close()

                except Exception as e:
                    log.error(traceback.format_exc())
                    log.error('error in ' + fig_name + ' ' + str(e))
                    pass

                try:
                    fig_name = 'BN for Weather ' + p_name
                    plt.figure(fig_name, figsize=(30.0, 30.0))
                    rbn.nx_plot(bn_prob['w_hc'],
                                bn_prob['w_labels'],
                                graph_layout='spring',
                                node_text_size=30)
                    plt.savefig(FIG_DIR + bldg.bldg_tag + '_' + p_name + '_' +
                                sig_tag + '_bn_weather' + get_pngid() + '.png',
                                bbox_inches='tight')
                    plt.close()

                except Exception as e:
                    log.error(traceback.format_exc())
                    log.error('error in ' + fig_name + ' ' + str(e))
                    pass

                try:
                    fig_name = 'BN for Sensor-Time-Weather ' + p_name
                    plt.figure(fig_name, figsize=(30.0, 30.0))
                    rbn.nx_plot(bn_prob['all_hc'],
                                bn_prob['all_labels'],
                                graph_layout='spring',
                                node_text_size=30)
                    plt.savefig(FIG_DIR + bldg.bldg_tag + '_' + p_name + '_' +
                                sig_tag + '_bn_sensor_time_weather' +
                                get_pngid() + '.png',
                                bbox_inches='tight')
                    plt.close()
                except Exception as e:
                    log.error(traceback.format_exc())
                    log.error('error in ' + fig_name + ' ' + str(e))
                    pass

                try:
                    fig_name = 'BN PEAK LH Analysis for Sensor-Time-Weather ' + p_name
                    plt.figure(fig_name, figsize=(30.0, 30.0))
                    plt.subplot(2, 1, 1)
                    plt.plot(bn_prob['all_cause_symbol_xtick'],
                             bn_prob['high_peak_prob'], '-^')
                    plt.plot(bn_prob['all_cause_symbol_xtick'],
                             bn_prob['low_peak_prob'], '-.v')
                    plt.ylabel('Likelihood', fontsize=20)

                    plt.xticks(bn_prob['all_cause_symbol_xtick'],
                               bn_prob['all_cause_symbol_xlabel'],
                               rotation=270,
                               fontsize=20)
                    plt.tick_params(labelsize=20)
                    plt.legend(('High Peak', 'Low Peak'),
                               loc='center right',
                               prop={'size': 25})
                    plt.tick_params(labelsize=20)

                    plt.grid()
                    plt.ylim([-0.05, 1.05])
                    plt.title('Likelihood of ' + str(remove_dot(p_name)) +
                              ' given ' + '\n' +
                              str(remove_dot(bn_prob['all_cause_label'])),
                              fontsize=20)
                    plt.savefig(FIG_DIR + bldg.bldg_tag + '_' + p_name + '_' +
                                sig_tag + '_LH_sensor_time_weather' +
                                get_pngid() + '.png',
                                bbox_inches='tight')
                    plt.close()
                except Exception as e:
                    log.error(traceback.format_exc())
                    log.error('error in ' + fig_name + ' ' + str(e))
                    pass

    except Exception as e:
        log.error(traceback.format_exc())
        log.error(str(e))
        pass

    plt.ion()
Example #42
0
def verify_data_format(data_dict, PARALLEL=False):
    # Verify there is no [] or N/A in the list;
    # only float or int values are allowed
    log.info('Checking for any inconsistent data format...')
    log.info('-' * 40)

    list_of_wrong_data_format = list()
    time_slots = data_dict['time_slots']
    weather_list_used = [data_dict['weather_list'][i] for i in [1, 2, 3, 10, 11]]
    key_list = weather_list_used + data_dict['sensor_list']

    if not PARALLEL:
        for key in key_list:
            log.info('checking ' + str(key) + '...')
            for i, samples in enumerate(data_dict[key][1]):
                for j, each_sample in enumerate(samples):

                    if each_sample == []:
                        list_of_wrong_data_format.append([key, i, j])
                        log.info(str(each_sample) + ' at ' + str(time_slots[i]) + ' in ' + str(key))

                    elif not isinstance(each_sample, (int, float)):
                        list_of_wrong_data_format.append([key, i, j])
                        log.info(str(each_sample) + ' at ' + str(time_slots[i]) + ' in ' + str(key))

        log.info('-' * 40)

    # PARALLEL
    else:
        manager = mp.Manager()
        q = manager.Queue()

        p = mp.Pool(CPU_CORE_NUM)
        param_list = [(key, data_dict[key][1], time_slots, q) for key in key_list]

        p.map(pp_verify_sensor_data_format, param_list)

        p.close()
        p.join()

        while not q.empty():
            item = q.get()
            log.warn('queue item: ' + str(item))
            list_of_wrong_data_format.append(item)
    
    if len(list_of_wrong_data_format) > 0:
        log.critical('Inconsistent data format in the list of data_used')
        raise NameError('Inconsistent data format in the list of data_used')

    return list_of_wrong_data_format
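The parallel branch dispatches pp_verify_sensor_data_format, which is not shown in these examples. A minimal sketch, assuming the worker mirrors the serial checks and reports offending samples through the shared queue:

def pp_verify_sensor_data_format(args):
    # Hypothetical worker (assumption): same [] / non-numeric checks as the
    # serial branch, pushing [key, i, j] for each bad sample onto the queue.
    (key, samples_set, time_slots, q) = args
    for i, samples in enumerate(samples_set):
        for j, each_sample in enumerate(samples):
            if each_sample == [] or not isinstance(each_sample, (int, float)):
                q.put([key, i, j])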
Example #43
0
def show_clusters(exemplars, labels, input_names):
    n_labels = labels.max()
    for i in range(n_labels + 1):
        log.info('Cluster %i: %s' % ((i + 1), ', '.join(input_names[labels == i])))
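A quick usage example for show_clusters; input_names must be a numpy array so the boolean indexing input_names[labels == i] works:

import numpy as np

labels = np.array([0, 0, 1, 1, 0])
exemplars = np.array([0, 2])  # not used by the function body
input_names = np.array(['s1', 's2', 's3', 's4', 's5'])
show_clusters(exemplars, labels, input_names)
# Cluster 1: s1, s2, s5
# Cluster 2: s3, s4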
Example #44
0
def _bn_anaylsis_all(bldg_obj,
                     p_name,
                     sig_tag='avg',
                     num_picks_bn=15,
                     learning_alg='hc'):
    s_names = bldg_obj.sigtags[sig_tag].names['sensor']
    p_idx = s_names.index(p_name)
    data_state_mat = bldg_obj.sigtags[sig_tag].data_state_mat

    log.info('power - sensors + weather + time ...')
    s_cause_label, s_labels, s_hc, s_cp_mat, s_bndata_mat = \
        _bn_anaylsis(bldg_obj, p_name, attr='sensor', sig_tag=sig_tag, num_picks_bn=num_picks_bn, learning_alg=learning_alg)

    t_cause_label, t_labels, t_hc, t_cp_mat, t_bndata_mat = \
        _bn_anaylsis(bldg_obj, p_name, attr='time', sig_tag=sig_tag, num_picks_bn=num_picks_bn, learning_alg=learning_alg)

    w_cause_label, w_labels, w_hc, w_cp_mat, w_bndata_mat = \
        _bn_anaylsis(bldg_obj, p_name, attr='weather', sig_tag=sig_tag, num_picks_bn=num_picks_bn, learning_alg=learning_alg)
    #s_cause_label=s_labels; w_cause_label=w_labels;t_cause_label=t_labels

    s_cause_idx = [
        bldg_obj.sigtags[sig_tag].names['sensor'].index(name)
        for name in s_cause_label
    ]
    t_cause_idx = [
        bldg_obj.sigtags[sig_tag].names['time'].index(name)
        for name in t_cause_label
    ]
    w_cause_idx = [
        bldg_obj.sigtags[sig_tag].names['weather'].index(name)
        for name in w_cause_label
    ]

    bndata_mat = np.vstack( (bldg_obj.sigtags[sig_tag].data_state_mat[:, p_idx].T,\
        bldg_obj.sigtags[sig_tag].data_state_mat[:, s_cause_idx].T, \
        bldg_obj.sigtags[sig_tag].data_weather_mat_[:, w_cause_idx].T, \
        bldg_obj.sigtags[sig_tag].data_time_mat[:, t_cause_idx].T)).T

    cols = [p_name] + s_cause_label + w_cause_label + t_cause_label

    b_arc_list = \
        pair_in_idx([p_name], s_cause_label + w_cause_label + t_cause_label) + \
        pair_in_idx(s_cause_label, w_cause_label+t_cause_label) + \
        pair_in_idx(w_cause_label, t_cause_label) + \
        pair_in_idx(t_cause_label, t_cause_label)

    # this is the heart and soul of ddea
    black_arc_frame = rbn.construct_arcs_frame(b_arc_list)
    factor_data_mat = rbn.convert_pymat_to_rfactor(bndata_mat)
    data_frame = rbn.construct_data_frame(factor_data_mat, cols)
    if learning_alg == 'tabu':
        hc_b = rbn.bnlearn.tabu(data_frame,
                                blacklist=black_arc_frame,
                                score='bic')
    elif learning_alg == 'mmhc':
        hc_b = rbn.bnlearn.mmhc(data_frame,
                                blacklist=black_arc_frame,
                                score='bic')
    else:
        hc_b = rbn.bnlearn.hc(data_frame,
                              blacklist=black_arc_frame,
                              score='bic')
    amat = rbn.py_get_amat(hc_b)
    cause_label = list(np.array(cols)[np.nonzero(amat[:, 0] == 1)[0]])
    cause_idx = [cols.index(label_) for label_ in cause_label]
    return cause_label, cols, hc_b, amat, bndata_mat
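pair_in_idx builds the blacklist arcs here but is not defined in these examples. A minimal sketch under the assumption that it returns every directed (from, to) pair between two label lists:

def pair_in_idx(from_labels, to_labels):
    # Hypothetical sketch (assumption): Cartesian product of directed arcs
    # (from -> to) for the bnlearn blacklist; a real implementation may also
    # skip self-pairs where a == b.
    return [[a, b] for a in from_labels for b in to_labels]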
Example #45
0
def cluster_measurement_points(m_matrix, m_name, corr_bnd=[0.1, 0.9], alg='aff'):
    exemplars_dict = dict()

    if m_matrix.shape[1] == 0:
        return [], exemplars_dict, [], []

    elif m_matrix.shape[1] == 1:
        exemplars_ = [0]
        labels_= [0]
        exemplars_name = m_name

    else:
        distmat_input = find_norm_dist_matrix(m_matrix)

        # Find representative set of sensor measurements 
        min_dist_ = np.sqrt(2*(1-(corr_bnd[1])))
        max_dist_ = np.sqrt(2*(1-(corr_bnd[0])))

        if alg == 'pack':
            log.info('use pack clustering algorithm')
            exemplars_, labels_ = max_pack_cluster(distmat_input, min_dist=min_dist_, max_dist=max_dist_)
        else:
            log.info('use affinity clustering algorithm')
            SIMM_MAT = 2 - distmat_input
            exemplars_, labels_ = cluster.affinity_propagation(SIMM_MAT, damping=0.5)

        num_clusters = int(labels_.max()+1)
        log.info('-' * 40)
        log.info(str(num_clusters) + ' clusters out of ' + str(len(labels_)) + ' measurements')
        log.info('-' * 40)

        validity, intra_dist, inter_dist = compute_cluster_err(distmat_input, labels_)

        log.info('validity: ' + str(round(validity,2)) + ', intra_dist: ' +
                 str(np.round(intra_dist,2)) + ', inter_dist: ' +
                 str(np.round(inter_dist,2)))
        log.info('-' * 40)
        exemplars_name = list(np.array(m_name)[exemplars_])
    
    for label_id, (m_idx, exemplar_label) in enumerate(zip(exemplars_, exemplars_name)):
        log.info(str(exemplar_label))
        children_set = list(set(np.nonzero(labels_ == label_id)[0]) - set([m_idx]))
        log.info('Label ' + str(label_id) + ' : ' + str(m_idx) + ' <-- ' + str(children_set))
        exemplars_dict.update({exemplar_label: list(np.array(m_name)[children_set])})

    return m_matrix[:, exemplars_], exemplars_dict, exemplars_, labels_
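The corr_bnd bounds map to distance bounds through d = sqrt(2 * (1 - corr)), the usual relation between Pearson correlation and Euclidean distance of standardized signals. A quick numeric check of the defaults:

import numpy as np

corr_bnd = [0.1, 0.9]
min_dist_ = np.sqrt(2 * (1 - corr_bnd[1]))  # ~0.447: strongly correlated pairs
max_dist_ = np.sqrt(2 * (1 - corr_bnd[0]))  # ~1.342: weakly correlated pairs
print(min_dist_, max_dist_)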
Example #46
0
def plotting_bldg_lh(bldg, bldg_key=[], attr='sensor', num_picks=30):
    log.info('-' * 40)
    log.info('plotting lh for ' + attr)
    log.info('-' * 40)
    sig_tag_set = ['avg', 'diff']
    plt.ioff()

    if not len(bldg_key):
        bldg_tag_set = [bldg.bldg_tag]
    else:
        bldg_tag_set = [bldg_key]

    for bldg_tag in bldg_tag_set:
        if bldg_tag == bldg.bldg_tag:
            log.info('-' * 40)
            log.info(bldg_tag + " is to be plotted...")
            log.info('-' * 40)

            for sig_tag in sig_tag_set:
                try:
                    p_names = bldg.sigtags[sig_tag].p_names

                    for pname in p_names:
                        # replace() is a no-op when pname has no '.'
                        pname = pname.replace('.', '_')

                        optprob_set = None
                        optstate_set = None
                        for anal in bldg.analysis[sig_tag]:
                            if anal.sensor_tag == pname:
                                optprob_set = anal.attrs[attr].optprob_set
                                optstate_set = anal.attrs[attr].optstate_set
                                break

                        if optprob_set is None or optstate_set is None:
                            log.warn('no analysis result found for ' + pname)
                            continue

                        s_names = bldg.sigtags[sig_tag].names[attr]

                        sort_idx = np.argsort(optprob_set)[::-1]
                        sort_lh = optprob_set[sort_idx[:num_picks]].T
                        sort_state = optstate_set[sort_idx[:num_picks]].T
                        x_label = list(np.array(s_names)[sort_idx[:num_picks]])
                        x_ticks = range(len(x_label))

                        plt.figure(figsize=(20.0, 15.0))
                        plt.subplot(2, 1, 1)
                        plt.plot(sort_lh, '-*')
                        plt.xticks(x_ticks, x_label, rotation=270, fontsize="small")
                        if sig_tag == 'avg':
                            plt.title('Most relevant ' + attr + ' attributes to the peak (demand) of ' + pname, fontsize=20)
                        else:
                            plt.title('Most relevant ' + attr + ' attributes to the peak variations of ' + pname, fontsize=20)
                        plt.tick_params(labelsize='large')
                        plt.ylim([-0.05, 1.05])
                        plt.ylabel('Likelihood (From 0 to 1)', fontsize=18)
                        plt.savefig(FIG_DIR + bldg_tag + '_' + pname + '_' + attr + '_' + sig_tag + '_lh_sensors.png', bbox_inches='tight')
                        plt.close()

                except Exception as e:
                    log.error(traceback.format_exc())
                    log.error(str(e))
                    pass
    plt.close()
    plt.ion()
Example #47
0
    def __init__(self, config, session):
        self.config = config
        self.session = session

        prefix = '%s_%.2f' % (config.method, config.alpha)

        self.filepath = '%s-dim_%d' % (
            prefix,
            config.dim,
        )

        self.train_dir = './train_dir/seed_%d/scale_%d/%s' % (
            self.config.seed, self.config.scale, self.filepath)
        self.fig_dir = '/home/dilin/Dropbox/tmp/figures/seed_%d/scale_%d/%s' % (
            self.config.seed, self.config.scale, self.filepath)

        for folder in [self.train_dir, self.fig_dir]:
            if not os.path.exists(folder):
                os.makedirs(folder)
            # clean train folder
            if self.config.clean:
                files = glob.glob(folder + '/*')
                for f in files:
                    os.remove(f)

        log.infov("Train Dir: %s, Figure Dir: %s", self.train_dir,
                  self.fig_dir)

        # --- create model ---
        self.p_target = config.p_target
        self.model = Model(config, self.p_target)

        # --- optimizer ---
        self.global_step = tf.Variable(0, name="global_step")

        self.learning_rate = config.learning_rate
        if config.lr_weight_decay:
            self.learning_rate = tf.train.exponential_decay(
                self.learning_rate,
                global_step=self.global_step,
                decay_steps=10000,
                decay_rate=0.1,
                staircase=True,
                name='decaying_learning_rate')

        self.summary_op = tf.summary.merge_all()
        self.saver = tf.train.Saver(max_to_keep=1)
        self.summary_writer = tf.summary.FileWriter(self.train_dir)
        self.checkpoint_secs = 300  # 5 min

        self.train_op = self.optimize_adagrad(
            self.model.loss,
            train_vars=self.model.q_train_vars,
            lr=self.learning_rate)

        tf.global_variables_initializer().run()
        if config.checkpoint is not None:
            self.ckpt_path = tf.train.latest_checkpoint(self.config.checkpoint)
            if self.ckpt_path is not None:
                log.info("Checkpoint path: %s", self.ckpt_path)
                self.saver.restore(self.session, self.ckpt_path)
                log.info(
                    "Loaded the pretrain parameters from the provided checkpoint path"
                )
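With staircase=True, tf.train.exponential_decay multiplies the base rate by decay_rate once per decay_steps steps. The same schedule in plain Python, for intuition only:

def decayed_lr(base_lr, global_step, decay_steps=10000, decay_rate=0.1):
    # Staircase schedule sketch matching the config above.
    return base_lr * (decay_rate ** (global_step // decay_steps))

print(decayed_lr(0.01, 9999))   # 0.01  (still in the first bucket)
print(decayed_lr(0.01, 10000))  # 0.001 (one decay applied)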
Example #49
0
def ddea_process(sensor_names_hash, sensor_data, start_time, end_time, timelet_inv, bldg_key, pname_key, plot_analysis=False):
    #----------------------------- DATA PRE-PROCESSING -------------------------
    from log_util import log

    log.info('#' * 80)
    log.info('#  Data Pre-Processing')
    log.info('#' * 80)

    ans_start_t = dt.datetime.fromtimestamp(start_time)
    ans_end_t = dt.datetime.fromtimestamp(end_time)

    data_dict, purge_list = \
            construct_data_dict(sensor_data, ans_start_t, ans_end_t, timelet_inv, PARALLEL=IS_USING_PARALLEL_OPT)

    log.info('-' * 40)
    log.info('VERIFY DATA FORMAT...')
    log.info('-' * 40)

    # This is for data verification purposes.
    # You can skip it if you are sure there is no bug in the 'construct_data_dict' function.
    list_of_wrong_data_format = \
        verify_data_format(data_dict, PARALLEL=IS_USING_PARALLEL_OPT)

    if len(list_of_wrong_data_format) > 0:
        log.critical('Measurement list below')
        log.critical('-' * 40)
        log.critical(str(list_of_wrong_data_format))
        raise NameError('Errors in data format')

    if SAVE_PROC_BIN:
        # Save summarized Data in Bin Format
        log.info("Saving data_dict in bin format...")
        mt.saveObjectBinaryFast(data_dict, PROC_OUT_DIR + '/' + bldg_key.lower() + '_data_dict.bin')

    #----------------------------- DATA SUMMARIZATION --------------------------
    # This performs the data summarization process.
    log.info('#' * 80)
    log.info('DATA SUMMARIZATION...')
    log.info('#' * 80)

    # Compute Average Feature if PROC_AVG == True
    # Compute Differential Feature if PROC_DIFF == True

    bldg_load_out = data_summerization(bldg_key, data_dict, PARALLEL=IS_USING_PARALLEL_OPT)

    if SAVE_PROC_BIN:
        # Save summarized Data in Bin Format
        log.info("Saving summarized building data in bin format...")
        mt.saveObjectBinaryFast(bldg_load_out, PROC_OUT_DIR + bldg_key.lower() + '_ds_out.bin')

    # Export Summarized Data to JSON
    feat_avg_exist = 'avgdata_dict' in bldg_load_out
    feat_diff_exist = 'diffdata_dict' in bldg_load_out

    if feat_avg_exist and feat_diff_exist:
        log.info("Saving summarized building data in JSON format...")
        save_processed_json(sensor_names_hash, bldg_load_out)

    if feat_avg_exist:
        save_avg_data_summary_json(bldg_key, sensor_names_hash, bldg_load_out['avgdata_dict'])

    if feat_diff_exist:
        save_diff_data_summary_json(bldg_key, sensor_names_hash, bldg_load_out['diffdata_dict'])

    #------------------------------- MODEL DISCOVERY ---------------------------
    log.info('#' * 80)
    log.info('MODEL DISCOVERY...')
    log.info('#' * 80)

    log.info('Building object for ' + bldg_key + '...')

    ## CREATE BUILDING OBJECT ##
    bldg = pbp.create_bldg_object(bldg_load_out, bldg_key, pname_key, PARALLEL=IS_USING_PARALLEL_OPT)

    ## BAYESIAN NETWORK PROBABILITY ANALYSIS OBJECT ##
    if feat_avg_exist:
        avg = pbp.bn_probability_analysis(bldg, sig_tag='avg')
        bldg.anal_out.update({'avg': avg})

    if feat_diff_exist:
        diff = pbp.bn_probability_analysis(bldg, sig_tag='diff')
        bldg.anal_out.update({'diff': diff})

    if SAVE_PROC_BIN:
        # Save a building data in Bin format
        log.info("Saving building graph in bin format...")
        mt.saveObjectBinaryFast(bldg, PROC_OUT_DIR + bldg_key.lower() + '_bldg_out.bin')

    # Export a building graph in json format
    log.info("Saving building graph in JSON format...")
    all_labels, all_edges = conv_bn_graph_json(bldg)
    save_bn_graph_json(bldg_key, sensor_names_hash, all_labels, all_edges)

    if plot_analysis:
        log.info('#' * 80)
        log.info('ANALYTICS PLOTTING...')
        log.info('#' * 80)

        # Analysis of the BN network results - all results will be saved in FIG_DIR.
        pbp.plotting_bldg_lh(bldg, attr='sensor', num_picks=30)
        pbp.plotting_bldg_lh(bldg, attr='time', num_picks=30)
        pbp.plotting_bldg_lh(bldg, attr='weather', num_picks=30)

        pbp.plotting_bldg_bn(bldg)
Example #50
0
def weather_convert(wdata_mat,
                    wdata_name,
                    Conditions_dict,
                    Events_dict,
                    PARALLEL=False):
    """
    Build a new dictionary by state classification of weather data.
    """
    weather_dict = dict()
    # collect indices of weather data points in the previous data

    try:
        temp_idx = wdata_name.index('TemperatureC')
    except:
        temp_idx = list()

    try:
        dewp_idx = wdata_name.index('Dew_PointC')
    except:
        dewp_idx = list()

    try:
        humd_idx = wdata_name.index('Humidity')
    except:
        humd_idx = list()

    try:
        evnt_idx = wdata_name.index('Events')
    except:
        evnt_idx = list()

    try:
        cond_idx = wdata_name.index('Conditions')
    except:
        cond_idx = list()

    ############################################################################
    # Weather state classification
    ############################################################################
    for class_idx in [temp_idx, dewp_idx, humd_idx]:
        obs = wdata_mat[:, class_idx][:, np.newaxis]
        label, opt_num_cluster, model, score, score_err_sum=\
            state_retrieval(obs, max_num_cluster=30, off_set=1, est_method='kmean', PARALLEL=PARALLEL)
        if class_idx == temp_idx:
            log.info('Temp state classification...')
            weather_dict.update({'Temp': model.cluster_centers_})
        elif class_idx == dewp_idx:
            log.info('Dewp state classification...')
            weather_dict.update({'Dewp': model.cluster_centers_})
        elif class_idx == humd_idx:
            log.info('Humd state classification...')
            weather_dict.update({'Humd': model.cluster_centers_})
        else:
            log.info('not found')

        for label_id in range(label.max() + 1):
            label_idx = np.nonzero(label == label_id)[0]
            wdata_mat[label_idx, class_idx] = np.round(
                model.cluster_centers_[label_id][0])

    ##################################################
    # Reclassify the Condition states into clarity of the sky
    ##################################################
    cond_state = [[]] * 9
    # Clear
    cond_state[8] = ['Clear']
    # 'Partly Cloudy'
    cond_state[7] = ['Partly Cloudy', 'Scattered Clouds']
    # 'Overcast'
    cond_state[6] = ['Mostly Cloudy', 'Overcast']
    # Fog / Mist
    cond_state[5] = [
        'Fog', 'Mist', 'Shallow Fog', 'Patches of Fog', 'Light Freezing Fog'
    ]
    cond_state[4] = [
        'Drizzle', 'Heavy Drizzle', 'Light Drizzle', 'Light Freezing Drizzle'
    ]
    # Rain
    cond_state[3] = [
        'Rain', 'Rain Showers', 'Thunderstorms and Rain', 'Heavy Rain',
        'Heavy Rain Showers', 'Freezing Rain', 'Light Freezing Rain',
        'Light Rain Showers', 'Light Rain', 'Light Thunderstorms and Rain'
    ]
    cond_state[2] = [
        'Ice Pellets', 'Ice Crystals', 'Light Ice Crystals',
        'Light Ice Pellets'
    ]
    # 'Snow'
    cond_state[1] = [
        'Snow', 'Snow Showers', 'Light Snow', 'Light Snow Grains',
        'Light Snow Showers'
    ]
    cond_state[0] = ['Unknown']
    cond_data_array = wdata_mat[:, cond_idx].copy()

    log.info('Condition state classification...')
    for k in range(len(cond_state)):
        for cond_str in cond_state[k]:
            if cond_str in Conditions_dict.keys():
                cond_val_old = Conditions_dict[cond_str]
                idx_temp = np.nonzero(cond_data_array == cond_val_old)[0]
                if len(idx_temp) > 0:
                    wdata_mat[idx_temp, cond_idx] = k

    Conditions_dict_temp = dict()
    Conditions_dict_temp.update({'Clear': 8})
    Conditions_dict_temp.update({'Cloudy': 7})
    Conditions_dict_temp.update({'Overcast': 6})
    Conditions_dict_temp.update({'Fog': 5})
    Conditions_dict_temp.update({'Drizzle': 4})
    Conditions_dict_temp.update({'Rain': 3})
    Conditions_dict_temp.update({'Ice': 2})
    Conditions_dict_temp.update({'Snow': 1})
    Conditions_dict_temp.update({'Unknown': 0})
    # Abbreviated weather factor types
    weather_dict.update({'Cond': Conditions_dict_temp})
    ####################################################################
    # Reclassify the Event states into rain/snow/fog weather conditions
    ####################################################################
    event_state = [[]] * 4
    # No event
    event_state[0] = ['']
    # Snow
    event_state[1] = ['Rain-Snow', 'Snow', 'Fog-Snow']
    # Rain
    event_state[2] = ['Rain', 'Thunderstorm', 'Rain-Thunderstorm']
    # Fog
    event_state[3] = ['Fog', 'Fog-Rain']
    log.info('Event state classification...')

    event_data_array = wdata_mat[:, evnt_idx].copy()
    for k in range(len(event_state)):
        for event_str in event_state[k]:
            if event_str in Events_dict.keys():
                event_val_old = Events_dict[event_str]
                idx_temp = np.nonzero(event_data_array == event_val_old)[0]
                if len(idx_temp) > 0:
                    wdata_mat[idx_temp, evnt_idx] = k

    Events_dict_temp = dict()
    Events_dict_temp.update({'NoEvent': 0})
    Events_dict_temp.update({'Snow': 1})
    Events_dict_temp.update({'Rain': 2})
    Events_dict_temp.update({'Fog': 3})
    weather_dict.update({'Event': Events_dict_temp})
    return wdata_mat, weather_dict
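A toy check of the condition reclassification loop above, with an assumed symbol_to_state mapping; each raw condition symbol collapses to one of the nine sky-clarity states. Reading from a copy keeps the remap order-independent, as the source does with cond_data_array:

import numpy as np

old_codes = {'Clear': 0, 'Light Rain': 1, 'Snow': 2}  # assumed symbol_to_state output
new_state = {'Clear': 8, 'Light Rain': 3, 'Snow': 1}  # from the cond_state table
cond_old = np.array([0, 1, 1, 2])
cond_new = cond_old.copy()
for name, old_val in old_codes.items():
    cond_new[np.nonzero(cond_old == old_val)[0]] = new_state[name]
print(cond_new)  # [8 3 3 1]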
Example #51
0
def build_diff_matrix(measurement_point_set, time_slots, num_type_set, irr_data_name, conf_lev=0.5, PARALLEL=False):

    #time_slots_utc = dtime_to_unix(time_slots)
    Xdiff = list()
    input_names = list()
    INT_type_list = list()
    FLOAT_type_list = list()
    INT_type_idx = list()
    FLOAT_type_idx = list()
    zero_var_list = list()

    # measurements whose variance is zero carry no information
    zero_var_val = list()
    num_of_samples = len(time_slots)
    #TIMELET_INV_seconds = (time_slots[1]-time_slots[0]).seconds

    log.info('=' * 40)
    if not PARALLEL:
        for k, (set_val, set_name) in enumerate(zip(measurement_point_set, irr_data_name)):
            log.info(str(irr_data_name[k]))
            try:
                num_type = num_type_set[k]
                diff_mean = get_diff(set_val, time_slots, num_type, conf_lev)
                if num_type == FLOAT_TYPE:
                    #norm_diff_mean,output_status=normalize_data(diff_mean[:,0])
                    norm_diff_mean, output_status = normalize_data(diff_mean)
                elif num_type == INT_TYPE:
                    #num_discrete_vals=len(set(list(diff_mean[:,0])))
                    num_discrete_vals = len(set(list(diff_mean)))
                    log.info('num_discrete_vals : ' + str(num_discrete_vals))

                    if num_discrete_vals > 1:
                        output_status = 0
                        norm_diff_mean = diff_mean
                    else:
                        output_status = -1
                        #norm_diff_mean = list(set(diff_mean[:,0]))
                        norm_diff_mean = list(set(diff_mean))
                else:
                    pass
                if len(np.nonzero(norm_diff_mean == np.inf)[0]) > num_of_samples / 5:
                    raise ValueError('too many inf entries in the diff feature')
            except Exception as e:
                log.error(traceback.format_exc())
                log.error('Error in processing data feature, excluded from analysis ' + str(e))
                output_status = -1
                norm_diff_mean = None

            if output_status == -1:
                #zero_var_flag=1
                zero_var_list.append(set_name)
                zero_var_val.append(norm_diff_mean)
                log.warn('too small variance for float type or a single value for int type, added to zero var list')
            else:
                input_names.append(set_name)
                Xdiff.append(norm_diff_mean)

                if num_type == FLOAT_TYPE:
                    FLOAT_type_list.append(set_name)
                    FLOAT_type_idx.append(len(Xdiff)-1)

                elif num_type == INT_TYPE:
                    INT_type_list.append(set_name)
                    INT_type_idx.append(len(Xdiff)-1)

            log.info('-' * 20)
        log.info('-' * 40)

    # PARALLEL ENABLED
    else:
        log.info('Build diff matrix: Parallel enabled...')
        # Construct param list for workers
        param_list = list()
        for k, (set_val, set_name) in enumerate(zip(measurement_point_set, irr_data_name)):
            param_list.append((k, time_slots, conf_lev, set_val, set_name, num_type_set[k]))

        p = mp.Pool(CPU_CORE_NUM)
        ret_dict = dict(p.map(build_diff, param_list))
        p.close()
        p.join()

        for k in sorted(ret_dict.keys()):
            output_status, norm_diff_mean = ret_dict[k]

            set_name = irr_data_name[k]
            num_type = num_type_set[k]

            if output_status == -1:
                zero_var_list.append(set_name)
                #zero_var_flag=1
                zero_var_val.append(norm_diff_mean)
                log.warn("too small variance for float type or a single value for int type, added to zero var list")
            else:
                input_names.append(set_name)
                try:
                    Xdiff.append(norm_diff_mean)
                except Exception as e:
                    log.error(traceback.format_exc())
                    log.error(str(e))

                if num_type == FLOAT_TYPE:
                    FLOAT_type_list.append(set_name)
                    FLOAT_type_idx.append(len(Xdiff)-1)

                elif num_type == INT_TYPE:
                    INT_type_list.append(set_name)
                    INT_type_idx.append(len(Xdiff)-1)
            log.info('-' * 20)


    Xdiff = np.array(Xdiff).T
    deleted_timeslot_idx = list()
    log.info('-' * 20)
    log.info('removing time slots having no sample...')

    inf_idx_set = list()
    for col_vec in Xdiff.T:
        inf_idx = np.nonzero(col_vec == np.inf)[0]
        inf_idx_set = np.r_[inf_idx_set, inf_idx]
    inf_col_idx = list(set(list(inf_idx_set)))
    deleted_timeslot_idx = np.array([int(x) for x in inf_col_idx]).astype(int)
    log.info('time slots ' + str(deleted_timeslot_idx) + ' removed...')
    log.info('-' * 20)

    Xdiff = np.delete(Xdiff, deleted_timeslot_idx, axis=0)
    new_time_slot = np.delete(time_slots, deleted_timeslot_idx)

    # Checking whether it has any ill entry value
    verify_data_mat(Xdiff)

    log.info('*-' * 20)
    log.info("* deleted_timeslot_idx : " + str(deleted_timeslot_idx))
    log.info('*-' * 20)

    return Xdiff,\
           new_time_slot,\
           input_names,\
           zero_var_list,\
           zero_var_val, \
           INT_type_list,\
           INT_type_idx,\
           FLOAT_type_list,\
           FLOAT_type_idx
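The inf bookkeeping above deletes every time slot in which any column is inf; the same effect in two numpy lines, for intuition:

import numpy as np

Xdiff = np.array([[1.0, 2.0],
                  [np.inf, 3.0],
                  [4.0, 5.0]])
bad_rows = np.nonzero(np.any(np.isinf(Xdiff), axis=1))[0]
Xclean = np.delete(Xdiff, bad_rows, axis=0)
print(bad_rows)  # [1]
print(Xclean)    # [[1. 2.] [4. 5.]]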
Example #52
0
def _bn_anaylsis(bldg_obj, p_name, attr='sensor', sig_tag='avg', num_picks_bn=15, learning_alg='hc'):
    s_names = bldg_obj.sigtags[sig_tag].names['sensor']
    p_idx = s_names.index(p_name)
    data_state_mat = bldg_obj.sigtags[sig_tag].data_state_mat

    anlist = bldg_obj.analysis[sig_tag]

    optprob_set = None
    optstate_set = None
    for anal in anlist:
        if anal.sensor_tag == p_name:
            optprob_set = anal.attrs[attr].optprob_set
            optstate_set = anal.attrs[attr].optstate_set
            break

    if optprob_set is None or optstate_set is None:
        raise Exception("Invalid p_name", p_name)

    sort_idx = np.argsort(optprob_set)[::-1]

    if attr == 'sensor':
        log.info('power - sensors...')
        idx_select = [p_idx] + list(sort_idx[:num_picks_bn])
        cols = [s_names[k] for k in idx_select]
        bndata_mat = bldg_obj.sigtags[sig_tag].data_state_mat[:, idx_select]
        b_arc_list = pair_in_idx([cols[0]], cols[1:])

    elif attr == 'weather':
        log.info('power - weather...')
        w_names = bldg_obj.sigtags[sig_tag].names['weather']
        cols = [p_name] + [w_name for w_name in w_names]
        bndata_mat = np.vstack((bldg_obj.sigtags[sig_tag].data_state_mat[:, p_idx].T,
                                bldg_obj.sigtags[sig_tag].data_weather_mat.T)).T
        b_arc_list = pair_in_idx([cols[0]], cols[1:])

    elif attr == 'time':
        log.info('power - time...')
        t_names = bldg_obj.sigtags[sig_tag].names['time']
        cols = [p_name] + [t_name for t_name in t_names]
        bndata_mat = np.vstack((bldg_obj.sigtags[sig_tag].data_state_mat[:, p_idx].T,
                                bldg_obj.sigtags[sig_tag].data_time_mat.T)).T
        b_arc_list = pair_in_idx([cols[0]], cols[1:]) + pair_in_idx(cols[1:], cols[1:])

    else:
        log.error('unknown attr: ' + str(attr))
        return 0


    # this is the heart and soul of ddea
    black_arc_frame = rbn.construct_arcs_frame(b_arc_list)
    factor_data_mat = rbn.convert_pymat_to_rfactor(bndata_mat)
    data_frame = rbn.construct_data_frame(factor_data_mat, cols)

    if learning_alg == 'tabu':
        hc_b = rbn.bnlearn.tabu(data_frame, blacklist=black_arc_frame, score='bic')
    elif learning_alg == 'mmhc':
        hc_b = rbn.bnlearn.mmhc(data_frame, blacklist=black_arc_frame, score='bic')
    else:
        hc_b = rbn.bnlearn.hc(data_frame, blacklist=black_arc_frame, score='bic')

    amat = rbn.py_get_amat(hc_b)
    cause_label = list(np.array(cols)[np.nonzero(amat[:, 0] == 1)[0]])
    cause_idx = [cols.index(label_) for label_ in cause_label]
    return cause_label, cols, hc_b, amat, bndata_mat
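A hedged usage sketch for _bn_anaylsis; bldg is assumed to come from pbp.create_bldg_object as in ddea_process above, and 'POWER_Main' is a placeholder power-sensor name:

cause_label, cols, hc_b, amat, bndata_mat = _bn_anaylsis(
    bldg, 'POWER_Main', attr='weather', sig_tag='avg',
    num_picks_bn=15, learning_alg='hc')
log.info('causes of the peak demand: ' + str(cause_label))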