Example 1
def apply_NB_on_sensor_data_with_hour_of_day_feature(k, shuffle, file_to_read='', add_str_to_path=''):

    if file_to_read == '':
        file_to_read = r'E:\pgmpy\{path}\sensor_data_each_row_one_features_is_one_on_and_off+hour_of_day.csv'.format(path=add_str_to_path)
        print(add_str_to_path)
    data = read_data_from_CSV_file(dest_file=file_to_read,
                                   data_type=int,
                                   has_header=False,
                                   return_as_pandas_data_frame=False,
                                   remove_date_and_time=False,
                                   return_header_separately=False,
                                   convert_int_columns_to_int=False)
    
    data_target = data[:, -2]  # the last column is the hour of day; column -2 is the person ID

    _, cols = np.shape(data)
    data_features = np.delete(data, cols - 2, axis=1)  # drop the person column from the features
    
    print(data_target)
    print(data_features)
    names, avg_f_score = test_different_classifiers(data_features=data_features,
                                                    data_target=data_target,
                                                    k=k,
                                                    shuffle=shuffle,
                                                    selected_classifiers=[16])  # index 16 selects Naive Bayes (the 17th classifier)
    
    print(names, avg_f_score)
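
For context, a minimal sketch of what the test_different_classifiers call above presumably does for the Naive Bayes entry: k-fold cross-validation reporting an average F-score. The classifier class (GaussianNB) and the weighted F1 averaging are assumptions, not the project's confirmed configuration.

import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import f1_score

def cross_validated_f1(data_features, data_target, k=10, shuffle=True):
    # stratified k-fold keeps the class (person) ratios in every fold
    skf = StratifiedKFold(n_splits=k, shuffle=shuffle,
                          random_state=0 if shuffle else None)
    scores = []
    for train_idx, test_idx in skf.split(data_features, data_target):
        clf = GaussianNB().fit(data_features[train_idx], data_target[train_idx])
        pred = clf.predict(data_features[test_idx])
        scores.append(f1_score(data_target[test_idx], pred, average='weighted'))
    return np.mean(scores)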
Example 2
def digitize_dataset(data_address,
                     selected_bin,
                     address_to_save,
                     isSave,
                     has_header,
                     return_as_pandas_data_frame,
                     remove_activity_column=False):
    '''
    Digitize a dataset based on selected_bin.
    The source data file is an exported PCA file.

    Parameters:
    ===========
    data_address: path of the CSV file to read
    selected_bin: bin specification passed to digitize_Dr_Amirkhani for each column
    address_to_save: path to write the digitized data when isSave is True

    '''

    data = read_data_from_CSV_file(
        dest_file=data_address,
        data_type=float,  # np.float was removed in NumPy 1.24
        has_header=has_header,
        return_as_pandas_data_frame=return_as_pandas_data_frame)
    _, cols = np.shape(data)

    for i in range(0, cols - 1):  # digitize each column separately; the last column is skipped
        data[:, i] = digitize_Dr_Amirkhani(data[:, i], selected_bin)

    data = data.astype(int)

    if isSave:
        np.savetxt(address_to_save, data, delimiter=',', fmt='%s')

    return data
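
digitize_Dr_Amirkhani is project-local and not shown here; as a rough stand-in, an equal-width digitizer, assuming selected_bin is the number of bins, could look like this:

import numpy as np

def digitize_equal_width(column, selected_bin):
    # hypothetical replacement for digitize_Dr_Amirkhani:
    # map each value to one of `selected_bin` equal-width bins
    edges = np.linspace(column.min(), column.max(), selected_bin + 1)
    # digitizing against the interior edges yields indices 0..selected_bin-1
    return np.digitize(column, edges[1:-1])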
Example 3
def discretization_equal_width(file_path):
    '''
    As a first approximation, we assume the optimal number of bins for each
    column is 10.

    Parameters:
    ===========
    file_path: the path of the file to discretize; each column is digitized separately.

    Returns:
    ==========
    data

    '''
    data = read_data_from_CSV_file(dest_file=file_path, data_type=float)
    _, cols = np.shape(data)

    for i in range(cols - 1):  # the last column is Person and is not processed
        selected_col = data[:, i]
        # 10 evenly spaced edges between min and max (i.e. 9 intervals;
        # values equal to the max fall past the last edge, into index 10)
        bins = np.linspace(np.amin(selected_col), np.amax(selected_col), 10)
        digitized = np.digitize(selected_col, bins)
        data[:, i] = digitized

    return data.astype(int)
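
A quick check of what np.digitize returns with edges built this way (the printed values are from running the snippet):

import numpy as np

col = np.array([0.0, 2.5, 5.0, 7.5, 10.0])
edges = np.linspace(col.min(), col.max(), 10)  # 10 edges -> 9 intervals
print(np.digitize(col, edges))                 # [ 1  3  5  7 10]: the max lands in bin 10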
Example 4
def get_set_of_features_in_each_column(file_address, data,
                                       read_data_from_file):

    if read_data_from_file:
        data = read_data_from_CSV_file(dest_file=file_address,
                                       data_type=int,
                                       has_header=True,
                                       return_as_pandas_data_frame=False)
    else:
        if isinstance(data, pd.DataFrame):
            data = data.values
            print(data)

    _, number_of_columns = data.shape
    # report the indexes of columns that contain only a single distinct value
    for i in range(0, number_of_columns):
        #print("column number: ", i)
        if len(collections.Counter(data[:, i])) == 1:
            print(i)
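
The Counter-based check works; a vectorized NumPy equivalent (a suggestion, not the project's code) would be:

import numpy as np

def constant_column_indexes(data):
    # a column is constant when it holds exactly one unique value
    return [i for i in range(data.shape[1]) if np.unique(data[:, i]).size == 1]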
Example 5
def calculate_inner_and_outer_difference_for_two_residents_after_clustering(file_address, hasActivity):
    
    data = read_data_from_CSV_file(dest_file=file_address,
                                   data_type=int,
                                   has_header=True,
                                   return_as_pandas_data_frame=False,
                                   remove_date_and_time=False,
                                   return_header_separately=False,
                                   convert_int_columns_to_int=True)
    
    if hasActivity:
        data = np.delete(data, -1, 1)  # drop the activity column
    
    _, cols = np.shape(data)

    data, persons = separate_dataset_based_on_persons(list_of_data=data[:, 0:cols - 1],
                                                      list_of_persons=data[:, -1],
                                                      list_of_activities=0,
                                                      has_activity=False)  # the activity column, if any, was removed above
   
    number_of_persons = len(persons)
    inner_class_diff = 0
    sigma_denominator = 0  # C(n1,2) + C(n2,2), where n_i is the number of feature vectors of person i
    
    for per in range(number_of_persons):
        print("person:", per)
        print("len(data[per]):", len(data[per]))
       
        if len(data[per]) > 1000:
            data[per] = cluster_samples_using_KMeans(data[per], 1000)
        
        inner_class_diff = inner_class_diff + calculate_inner_class_diff(data[per])
        print("inner_class_diff:", inner_class_diff)
        person_number_of_samples = len(data[per])
        sigma_denominator = sigma_denominator + (person_number_of_samples * (person_number_of_samples-1) / 2) #c(ni,2)
        print("sigma_denominator:", sigma_denominator)
  
    inner_class_diff = inner_class_diff / sigma_denominator
    
    between_class_diff = calculate_between_class_diff(data[0], data[1])
    between_class_diff = between_class_diff / (len(data[0]) * len(data[1]))  # normalize by the n1 * n2 cross-person pairs

    #print(inner_class_diff, between_class_diff)
    return inner_class_diff, between_class_diff
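
The normalization mirrors an average over pairs: the within-class sum is divided by the number of within-class pairs, C(n1,2) + C(n2,2), and the between-class sum by the n1 * n2 cross pairs. A compact sketch with explicit pairwise distances (SciPy's pdist/cdist; the Euclidean metric is an assumption, since calculate_inner_class_diff is not shown):

import numpy as np
from scipy.spatial.distance import pdist, cdist

def average_inner_and_between_diff(samples_a, samples_b):
    n1, n2 = len(samples_a), len(samples_b)
    # within-class: sum of distances over C(n1,2) + C(n2,2) pairs
    inner = (pdist(samples_a).sum() + pdist(samples_b).sum()) / \
            (n1 * (n1 - 1) / 2 + n2 * (n2 - 1) / 2)
    # between-class: sum of distances over all n1 * n2 cross pairs
    between = cdist(samples_a, samples_b).sum() / (n1 * n2)
    return inner, between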
Example 6
def test_test_different_classifiers(address_to_read):
    # for applying the ICCKE paper's method on the CASAS dataset
    data = read_data_from_CSV_file(dest_file=address_to_read,
                                   data_type=int,
                                   has_header=False,
                                   return_as_pandas_data_frame=False,
                                   remove_date_and_time=False,
                                   return_header_separately=False,
                                   convert_int_columns_to_int=True)
    data_features = data[:, 0:-2]
    #print(type(data_features))
    data_target = data[:, -2]  # the person ID is in column -2; the last column is excluded

    names, avg_f_score = test_different_classifiers(
        data_features=data_features,
        data_target=data_target,
        k=10,
        shuffle=True,
        selected_classifiers=[14])  # alternatives: [0, 1, 2, 3, 4, 12, 14]

    for i in range(len(avg_f_score)):
        print(names[i], "avg_f_score:", avg_f_score[i])
Example 7
def create_fixed_size_seq_from_bag_of_events(address_to_read, len_of_each_seq,
                                             has_activity):

    data = read_data_from_CSV_file(dest_file=address_to_read,
                                   data_type=int,
                                   has_header=True)
    rows, cols = data.shape
    #print("all of data shape:" , rows, cols)

    if has_activity:
        list_of_persons = data[:, -2]
        list_of_activities = data[:, -1]
        data = data[:, 0:cols - 2]
    else:
        list_of_persons = data[:, -1]
        list_of_activities = []
        data = data[:, 0:cols - 1]

    # note: with "Seq of Bag of sensor events_based_on_activity_and_no_overlap_delta"
    # features, the length of the feature vectors is not constant
    #print(list_of_persons.shape)
    if has_activity:
        data, list_of_persons, _ = separate_dataset_based_on_persons(
            list_of_data=data,
            list_of_persons=list_of_persons,
            list_of_activities=list_of_activities,
            has_activity=has_activity)
    else:
        data, list_of_persons = separate_dataset_based_on_persons(
            list_of_data=data,
            list_of_persons=list_of_persons,
            list_of_activities=list_of_activities,
            has_activity=has_activity)

    number_of_persons = list_of_persons.shape[0]

    for each_person in range(number_of_persons):
        rows, cols = data[each_person].shape
        #print("rows of each person before reshape", rows)
        # rows - rows % len_of_each_seq: ignore the extra rows so that fixed-length sequences can be formed
        ignored_rows = rows % len_of_each_seq
        new_rows = int((rows - ignored_rows) / len_of_each_seq)

        #print("#####",list_of_persons[each_person].shape)

        #print("new_rows of each person:", new_rows)
        data[each_person] = data[each_person][:rows - ignored_rows].reshape(
            new_rows, len_of_each_seq, cols)
        list_of_persons[each_person] = list_of_persons[each_person][
            0:new_rows]  # keep one person ID per sequence sample
        #print("data[each_person].shape", data[each_person].shape)
        #print(data[each_person].shape)
        #print(list_of_persons[each_person])
        #print("************")

    for person in range(1, number_of_persons):
        #print(list_of_persons[0].shape , list_of_persons[person].shape)
        data[0] = np.concatenate((data[0], data[person]), axis=0)
        list_of_persons[0] = np.concatenate(
            (list_of_persons[0], list_of_persons[person]), axis=0)

    print("create_fixed_size_seq_from_bag_of_events completed!")
    return data[0], list_of_persons[0]
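
The core trick above is trimming the row count to a multiple of the sequence length and then reshaping; in isolation:

import numpy as np

rows, cols, seq_len = 10, 4, 3
flat = np.arange(rows * cols).reshape(rows, cols)
ignored = rows % seq_len  # one trailing row is dropped here
seqs = flat[:rows - ignored].reshape((rows - ignored) // seq_len, seq_len, cols)
print(seqs.shape)         # (3, 3, 4): 3 sequences of 3 rows each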
Example 8
'''
Created on Apr 13, 2018

@author: Adele
'''
from dataPreparation import casas7_to_csv_time_Ordered, casas7_to_csv_based_on_each_person_sensor_events_time_Ordered
from read_write import read_data_from_CSV_file

if __name__ == "__main__":

    #casas7_to_csv_time_Ordered()
    address_to_read = r"E:\Lessons_tutorials\Behavioural user profile articles\Datasets\7 twor.2009\twor.2009\annotated"
    address_to_save = r'E:\pgmpy\sensor_data_each_person_sensor_events+time_ordered.csv'
    #casas7_to_csv_based_on_each_person_sensor_events_time_Ordered(address_to_read, address_to_save , 2)
    a = read_data_from_CSV_file(dest_file=address_to_save,
                                data_type=int,
                                has_header=False,
                                return_as_pandas_data_frame=False,
                                remove_date_and_time=True,
                                return_header_separately=False,
                                convert_int_columns_to_int=True)
    print(a)
    print(a.shape)
Example 9
def featureSelection_based_on_Variance(
        dest_file, threshold, isSave, path_to_save,
        column_indexes_not_apply_feature_selection, has_header,
        is_Panda_dataFrame, remove_work_column):
    '''
    Suppose we have a dataset with boolean features and we want to remove all
    features that are either one or zero (on or off) in more than p% (e.g. 80%)
    of the samples. Boolean features are Bernoulli random variables, whose
    variance is Var[X] = p(1 - p).

    Parameters:
    ===========
    dest_file
    threshold
    isSave
    path_to_save
    column_indexes_not_apply_feature_selection
    has_header
    is_Panda_dataFrame
    remove_work_column: if True, remove the 'Work' column before applying
        feature selection (only works for pandas DataFrame datasets)
    '''
    data = read_data_from_CSV_file(
        dest_file=dest_file,
        data_type=int,
        has_header=has_header,
        return_as_pandas_data_frame=is_Panda_dataFrame)
    # remove person, work, date and time columns
    #sensor_data = np.delete(np.delete(np.delete(np.delete(data ,64 , 1), 63 , 1), 62 , 1), 61,1)
    if is_Panda_dataFrame:
        if remove_work_column:
            data = data.drop('Work', axis=1, inplace=False)

        columns = data.columns
        data = data.values

    rows, cols = data.shape

    #print("======================")
    #print("original data shape: " , rows , cols)

    column_indexes_to_apply_feature_selection = list(
        set(range(cols)) - set(column_indexes_not_apply_feature_selection))
    #print("column_indexes_to_apply_feature_selection:" , column_indexes_to_apply_feature_selection)
    #threshold=0.7 * (1 - 0.7)
    select_features = VarianceThreshold(threshold=threshold)  # drop features whose variance is at or below the threshold

    data_new = select_features.fit_transform(
        data[:, column_indexes_to_apply_feature_selection])

    data_columns_not_meet_feature_selection = data[:,
                                                   column_indexes_not_apply_feature_selection]

    data_new = np.concatenate(
        (data_new, data_columns_not_meet_feature_selection), axis=1)
    #print(select_features.variances_)

    if is_Panda_dataFrame:
        columns_are_kept = select_features.get_support(indices=True)
        selected_features_labels = [columns[x] for x in columns_are_kept]
        #print("selected_features_labels:", selected_features_labels)
        column_labels_not_apply_feature_selection = [
            columns[x] for x in column_indexes_not_apply_feature_selection
        ]
        final_labels = np.concatenate(
            (selected_features_labels,
             column_labels_not_apply_feature_selection),
            axis=0)
        #print(final_labels)
        data_new = pd.DataFrame(data_new, columns=final_labels)

    if (isSave):
        np.savetxt(path_to_save, data_new, delimiter=',', fmt='%s')

    return data_new
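
As the docstring says, for a Bernoulli feature Var[X] = p(1 - p), so filtering features that are near-constant in more than 80% of the samples means threshold = 0.8 * (1 - 0.8) = 0.16. A self-contained check:

import numpy as np
from sklearn.feature_selection import VarianceThreshold

X = np.array([[0, 0, 1],
              [0, 1, 0],
              [1, 0, 0],
              [0, 1, 1],
              [0, 1, 0],
              [0, 1, 1]])
selector = VarianceThreshold(threshold=0.8 * (1 - 0.8))
# the first column is 1 in only one of six samples, so its variance
# p(1 - p) = (1/6)(5/6) ~ 0.14 is below 0.16 and the column is removed
print(selector.fit_transform(X))
print(selector.get_support(indices=True))  # [1 2]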
Example 10
def PCA_data_generation_on_separated_train_and_test(
        file_address, base_address_to_save, remove_date_and_time,
        has_activity_column, remove_activity_column, test_file_address,
        base_address_of_test_file_to_save):
    '''
    Parameters:
    =========
    file_address: path of the training CSV file
    base_address_to_save: base path for the generated train PCA files
    remove_date_and_time: if True, the time and date columns are removed
    has_activity_column: if True, the dataset contains an activity/work column
    remove_activity_column: if True, the activity/work column is removed
    test_file_address: path of the test CSV file
    base_address_of_test_file_to_save: base path for the generated test PCA files
    '''

    sensor_data = read_data_from_CSV_file(file_address,
                                          int,
                                          has_header=True,
                                          return_as_pandas_data_frame=False)
    test_data = read_data_from_CSV_file(test_file_address,
                                        int,
                                        has_header=True,
                                        return_as_pandas_data_frame=False)
    print(sensor_data)

    if remove_activity_column:
        sensor_data = np.delete(sensor_data, -1, 1)
        test_data = np.delete(test_data, -1, 1)

    # determine which column holds the person number
    if (has_activity_column and remove_activity_column) or not has_activity_column:
        index_of_Person_number = -1
    elif has_activity_column and not remove_activity_column:
        index_of_Person_number = -2

    # the person number is taken as the class label
    train_target = sensor_data[:, index_of_Person_number].reshape(-1, 1)
    test_target = test_data[:, index_of_Person_number].reshape(-1, 1)

    sensor_data = np.delete(sensor_data, index_of_Person_number, 1)  # remove the Person column
    test_data = np.delete(test_data, index_of_Person_number, 1)  # remove the Person column

    for i in range(2, 41):  # sweep the number of PCA components from 2 to 40
        pca = PCA(n_components=i)

        # fit the model on the training data and apply the dimensionality reduction to it
        train_data_new = pca.fit_transform(sensor_data)
        print(train_data_new.shape)
        print(train_target.shape)
        dest = base_address_to_save + 'PCA_n=' + str(i) + '.csv'
        print(dest)

        np.savetxt(dest,
                   np.concatenate((train_data_new, train_target), axis=1),
                   delimiter=',',
                   fmt='%s')

        # transform the test data with the PCA fitted on the training data
        print(sensor_data.shape)
        print(test_data.shape)
        test_data_new = pca.transform(test_data)
        test_dest = base_address_of_test_file_to_save + 'PCA_n=' + str(
            i) + '.csv'
        print(test_dest)

        np.savetxt(test_dest,
                   np.concatenate((test_data_new, test_target), axis=1),
                   delimiter=',',
                   fmt='%s')
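
The essential pattern here is fitting PCA on the training split only and reusing the fitted projection for the test split. A condensed sketch (the explained-variance printout is an addition, not part of the original):

import numpy as np
from sklearn.decomposition import PCA

def pca_train_test(train_features, test_features, n_components):
    pca = PCA(n_components=n_components)
    # fit on the training data only, then apply the same projection to the test data
    train_new = pca.fit_transform(train_features)
    test_new = pca.transform(test_features)
    print("explained variance:", pca.explained_variance_ratio_.sum())
    return train_new, test_new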