def apply_NB_on_sensor_data_with_hour_of_day_feature(k, shuffle, file_to_read=' ', add_str_to_path=' '):
    '''
    Run the classifier with index 16 on the sensor dataset that carries an
    hour-of-day feature and print the resulting names and average F-scores.

    Parameters:
    ===========
    k: forwarded to test_different_classifiers as ``k``
    shuffle: forwarded to test_different_classifiers as ``shuffle``
    file_to_read: CSV file to load; the single-space default means
                  "build the default path from add_str_to_path"
    add_str_to_path: sub-folder inserted into the default path
    '''
    default_path_template = r'E:\pgmpy\{path}\sensor_data_each_row_one_features_is_one_on_and_off+hour_of_day.csv'
    if file_to_read == ' ':
        file_to_read = default_path_template.format(path=add_str_to_path)
    print(add_str_to_path)

    table = read_data_from_CSV_file(
        dest_file=file_to_read,
        data_type=int,
        has_header=False,
        return_as_pandas_data_frame=False,
        remove_date_and_time=False,
        return_header_separately=False,
        convert_int_columns_to_int=False,
    )

    # the last col is hour of day and the -2 col is person
    n_cols = np.shape(table)[1]
    person_column = n_cols - 2
    data_target = table[:, -2]
    data_features = np.delete(table, person_column, axis=1)
    print(data_target)
    print(data_features)

    # The original note reads "NB is 17" while index 16 is passed.
    # NOTE(review): confirm which index selects Naive Bayes.
    results = test_different_classifiers(
        data_features=data_features,
        data_target=data_target,
        k=k,
        shuffle=shuffle,
        selected_classifiers=[16],
    )
    names, avg_f_score = results
    print(names, avg_f_score)
def digitize_dataset(data_address,
                     selected_bin,
                     address_to_save,
                     isSave,
                     has_header,
                     return_as_pandas_data_frame,
                     remove_activity_column=False):
    '''
    digitize a dataset based on selected_bin
    the source data file is an exported PCA file

    Every column except the last one is passed through
    digitize_Dr_Amirkhani; the whole table is then cast to int.

    Parameters:
    ===========
    data_address: path of the CSV file to read
    selected_bin: bin specification forwarded to digitize_Dr_Amirkhani
    address_to_save: destination CSV path, used only when isSave is true
    isSave: if true, the digitized table is written to address_to_save
    has_header: forwarded to read_data_from_CSV_file
    return_as_pandas_data_frame: forwarded to read_data_from_CSV_file
        NOTE(review): the column slicing below uses ``data[:, i]``, which
        presumably only works for a NumPy array -- confirm before passing True
    remove_activity_column: currently unused; kept for caller compatibility

    Returns:
    ==========
    the digitized data cast to int
    '''
    # ``np.float`` was only an alias of the builtin ``float`` and no longer
    # exists in NumPy >= 1.24, so the builtin is passed instead.
    data = read_data_from_CSV_file(
        dest_file=data_address,
        data_type=float,
        has_header=has_header,
        return_as_pandas_data_frame=return_as_pandas_data_frame)

    _, cols = np.shape(data)
    # digitize each column separately; the last column is left untouched
    for i in range(cols - 1):
        data[:, i] = digitize_Dr_Amirkhani(data[:, i], selected_bin)

    data = data.astype(int)

    if isSave:
        np.savetxt(address_to_save, data, delimiter=',', fmt='%s')

    return data
def discretization_equal_width(file_path):
    '''
    Equal-width discretization of every column except the last one.

    we first imagine that the optimized bins for each column is 10.

    Parameters:
    ===========
    file_path: the path of file which should be discretized. each column is
               digitized separately.

    Returns:
    ==========
    data: the discretized table cast to int
    '''
    # ``np.float`` was only an alias of the builtin ``float`` and no longer
    # exists in NumPy >= 1.24, so the builtin is passed instead.
    data = read_data_from_CSV_file(dest_file=file_path, data_type=float)

    _, cols = np.shape(data)
    for i in range(cols - 1):  # the last column is Person and is not processed
        selected_col = data[:, i]
        # 10 equally spaced edges between the column minimum and maximum
        bins = np.linspace(np.amin(selected_col), np.amax(selected_col), 10)
        data[:, i] = np.digitize(selected_col, bins)

    return data.astype(int)
def get_set_of_features_in_each_column(file_address, data, read_data_from_file):
    '''
    Print the data, then print the index of every column that holds a single
    distinct value. Nothing is returned.

    Parameters:
    ===========
    file_address: CSV path, used only when read_data_from_file is true
    data: in-memory table (NumPy array or pandas DataFrame), used only when
          read_data_from_file is false
    read_data_from_file: if true, the table is loaded from file_address
                         (with a header) instead of taken from ``data``
    '''
    if read_data_from_file:
        # ``np.int`` was only an alias of the builtin ``int`` and no longer
        # exists in NumPy >= 1.24, so the builtin is passed instead.
        data = read_data_from_CSV_file(dest_file=file_address,
                                       data_type=int,
                                       has_header=True,
                                       return_as_pandas_data_frame=False)
    elif isinstance(data, pd.DataFrame):
        # work on the raw values so the positional slicing below is valid
        data = data.values

    print(data)
    _, number_of_columns = data.shape
    for i in range(number_of_columns):
        # a column with exactly one distinct value carries no information
        if len(collections.Counter(data[:, i])) == 1:
            print(i)
def calculate_inner_and_outer_differece_for_two_residetns_after_clustering(file_address, hasActivity):
    '''
    Compute the normalised inner-class and between-class differences for the
    persons found in a CSV file.

    Any person with more than 1000 samples is first reduced with
    cluster_samples_using_KMeans(..., 1000).

    Parameters:
    ===========
    file_address: CSV file (with header) to read
    hasActivity: if true, the last column is dropped before processing

    Returns:
    ==========
    inner_class_diff, between_class_diff
    '''
    max_samples_per_person = 1000

    table = read_data_from_CSV_file(dest_file=file_address,
                                    data_type=int,
                                    has_header=True,
                                    return_as_pandas_data_frame=False,
                                    remove_date_and_time=False,
                                    return_header_separately=False,
                                    convert_int_columns_to_int=True)
    if hasActivity:
        table = np.delete(table, -1, 1)

    _, cols = np.shape(table)
    feature_part = table[:, 0:cols - 1]
    person_part = table[:, -1]
    # because we remove the activity if does
    data, persons = separate_dataset_based_on_persons(
        list_of_data=feature_part,
        list_of_persons=person_part,
        list_of_activities=0,
        has_activity=False)

    inner_class_diff = 0
    # c(n1,2) + c(n2,2) in which ni is the number of feature vectors for person i
    sigma_denominator = 0
    for per in range(len(persons)):
        print("person:", per)
        print("len(data[per]):", len(data[per]))
        if len(data[per]) > max_samples_per_person:
            data[per] = cluster_samples_using_KMeans(data[per], max_samples_per_person)

        inner_class_diff = inner_class_diff + calculate_inner_class_diff(data[per])
        print("inner_class_diff:", inner_class_diff)

        n_samples = len(data[per])
        pair_count = n_samples * (n_samples - 1) / 2  # c(ni,2)
        sigma_denominator = sigma_denominator + pair_count
        print("sigma_denominator:", sigma_denominator)

    inner_class_diff = inner_class_diff / sigma_denominator

    # only the first two persons take part in the between-class term
    cross_pair_count = len(data[0]) * len(data[1])
    between_class_diff = calculate_between_class_diff(data[0], data[1]) / cross_pair_count

    return inner_class_diff, between_class_diff
def test_test_different_classifiers(address_to_read):
    '''
    Load a header-less CSV, run classifier 14 with 10 folds and shuffling,
    and print the score reported for each returned classifier name.

    for applying the iccke paper on CASAS dataset

    Parameters:
    ===========
    address_to_read: path of the CSV file to read
    '''
    table = read_data_from_CSV_file(dest_file=address_to_read,
                                    data_type=int,
                                    has_header=False,
                                    return_as_pandas_data_frame=False,
                                    remove_date_and_time=False,
                                    return_header_separately=False,
                                    convert_int_columns_to_int=True)

    # column -2 is the target; everything before it is a feature
    data_target = table[:, -2]
    data_features = table[:, 0:-2]

    names, avg_f_score = test_different_classifiers(
        data_features=data_features,
        data_target=data_target,
        k=10,
        shuffle=True,
        selected_classifiers=[14])

    for position, score in enumerate(avg_f_score):
        print(names[position], "best_f_score:", score)
def create_fixed_size_seq_from_bag_of_events(address_to_read, len_of_each_seq, has_activity):
    '''
    Cut each person's samples into fixed-length sequences and stack the
    sequences of all persons.

    For every person the trailing ``rows % len_of_each_seq`` samples are
    ignored so that the remaining rows reshape exactly into sequences.

    it is important to say that if I use Seq of Bag of sensor
    events_based_on_activity_and_no_overlap_delta features the length of
    feature vectors is not constant

    Parameters:
    ===========
    address_to_read: CSV file (with header) to read
    len_of_each_seq: number of consecutive samples in one sequence
    has_activity: if true, the last two columns are person and activity;
                  otherwise only the last column (person) is split off

    Returns:
    ==========
    sequences: array of shape (number_of_sequences, len_of_each_seq, cols)
    persons: for each person, the first ``number of sequences`` person labels,
             concatenated in the same order as ``sequences``
    '''
    data = read_data_from_CSV_file(dest_file=address_to_read,
                                   data_type=int,
                                   has_header=True)
    _, cols = data.shape

    if has_activity:
        list_of_persons = data[:, -2]
        list_of_activities = data[:, -1]
        data = data[:, 0:cols - 2]
    else:
        list_of_persons = data[:, -1]
        list_of_activities = []
        data = data[:, 0:cols - 1]

    # With has_activity the helper's result unpacks into three values
    # (the third is unused here), otherwise into two.
    separated = separate_dataset_based_on_persons(
        list_of_data=data,
        list_of_persons=list_of_persons,
        list_of_activities=list_of_activities,
        has_activity=has_activity)
    data, list_of_persons = separated[0], separated[1]

    sequences_per_person = []
    labels_per_person = []
    for each_person in range(len(list_of_persons)):
        person_rows, person_cols = data[each_person].shape
        # floor division drops the incomplete trailing sequence
        new_rows = person_rows // len_of_each_seq
        used_rows = new_rows * len_of_each_seq
        sequences_per_person.append(
            data[each_person][:used_rows].reshape(new_rows, len_of_each_seq,
                                                  person_cols))
        # keep personIDs as much as samples
        labels_per_person.append(list_of_persons[each_person][0:new_rows])

    # one concatenation per output instead of growing the arrays pairwise
    all_sequences = np.concatenate(sequences_per_person, axis=0)
    all_persons = np.concatenate(labels_per_person, axis=0)

    print("create_fixed_size_seq_from_bag_of_events completed!")
    return all_sequences, all_persons
'''
Created on Apr 13, 2018

@author: Adele
'''
from dataPreparation import casas7_to_csv_time_Ordered, casas7_to_csv_based_on_each_person_sensor_events_time_Ordered
from read_write import read_data_from_CSV_file

# Script entry point: loads the time-ordered per-person sensor-event CSV
# (dropping date and time columns) and prints it together with its shape.
if __name__ == "__main__":

    # Disabled conversion step, kept for reference.
    #casas7_to_csv_time_Ordered()

    # address_to_read is only referenced by the disabled conversion call below.
    address_to_read = r"E:\Lessons_tutorials\Behavioural user profile articles\Datasets\7 twor.2009\twor.2009\annotated"
    address_to_save = r'E:\pgmpy\sensor_data_each_person_sensor_events+time_ordered.csv'
    # Disabled conversion step, kept for reference.
    # NOTE(review): presumably this call produces the file read below -- confirm.
    #casas7_to_csv_based_on_each_person_sensor_events_time_Ordered(address_to_read, address_to_save , 2)

    a = read_data_from_CSV_file(dest_file=address_to_save,
                                data_type=int,
                                has_header=False,
                                return_as_pandas_data_frame=False,
                                remove_date_and_time=True,
                                return_header_separately=False,
                                convert_int_columns_to_int=True)
    print(a)
    print(a.shape)
def featureSelection_based_on_Variance(
        dest_file, threshold, isSave, path_to_save,
        column_indexes_not_apply_feature_selection, has_header,
        is_Panda_dataFrame, remove_work_column):
    '''
    suppose that we have a dataset with boolean features,
    and we want to remove all features that are either one or zero (on or off)
    in more than p%(e.g. 80%) of the samples.
    Boolean features are Bernoulli random variables, and the variance of such
    variables is given by var[x] = p(1-p)

    The columns that survive the variance threshold come first in the result,
    followed by the excluded columns in the order they were given.

    Parameters:
    ===========
    dest_file: CSV file to read
    threshold: variance threshold forwarded to VarianceThreshold,
               e.g. 0.8 * (1 - 0.8)
    isSave: if true, the result is written to path_to_save
    path_to_save: destination CSV path, used only when isSave is true
    column_indexes_not_apply_feature_selection: indexes of the columns that
        are excluded from feature selection and always kept; the indexes
        refer to the table after the optional 'Work' column removal
    has_header: forwarded to read_data_from_CSV_file
    is_Panda_dataFrame: if true, the data is read as a pandas DataFrame and
        a labelled DataFrame is returned
    remove_work_column: if True, remove the 'Work' column before applying
        feature selection (just work in Pandas Dataframe datasets)

    Returns:
    ==========
    data_new: NumPy array, or a pandas DataFrame when is_Panda_dataFrame
    '''
    # ``np.int`` was only an alias of the builtin ``int`` and no longer
    # exists in NumPy >= 1.24, so the builtin is passed instead.
    data = read_data_from_CSV_file(
        dest_file=dest_file,
        data_type=int,
        has_header=has_header,
        return_as_pandas_data_frame=is_Panda_dataFrame)

    if is_Panda_dataFrame:
        if remove_work_column:
            data = data.drop('Work', axis=1, inplace=False)
        columns = data.columns
        data = data.values

    _, cols = data.shape

    # Sorted so the column order is deterministic; set iteration order is
    # not a documented guarantee.
    column_indexes_to_apply_feature_selection = sorted(
        set(range(cols)) - set(column_indexes_not_apply_feature_selection))

    select_features = VarianceThreshold(threshold=threshold)
    data_new = select_features.fit_transform(
        data[:, column_indexes_to_apply_feature_selection])

    data_columns_not_meet_featrue_selection = data[:, column_indexes_not_apply_feature_selection]
    data_new = np.concatenate(
        (data_new, data_columns_not_meet_featrue_selection), axis=1)

    if is_Panda_dataFrame:
        # get_support returns positions inside the subset handed to
        # fit_transform, so they are mapped back to the original column
        # indexes before looking up the labels.
        columns_are_kept = select_features.get_support(indices=True)
        selected_fetures_labels = [
            columns[column_indexes_to_apply_feature_selection[x]]
            for x in columns_are_kept
        ]
        column_labels_not_apply_feature_selection = [
            columns[x] for x in column_indexes_not_apply_feature_selection
        ]
        final_labels = np.concatenate(
            (selected_fetures_labels,
             column_labels_not_apply_feature_selection),
            axis=0)
        data_new = pd.DataFrame(data_new, columns=final_labels)

    if isSave:
        np.savetxt(path_to_save, data_new, delimiter=',', fmt='%s')

    return data_new
def PCA_data_generation_on_separated_train_and_test(
        file_address, base_address_to_save, remove_date_and_time,
        has_activity_column, remove_activity_column, test_file_address,
        base_address_of_test_file_to_save):
    '''
    Fit PCA on the train file for every component count from 2 to 40, apply
    the same fitted transform to the test file, and save each projection
    (with the person column appended as the last column) as
    ``<base address>PCA_n=<i>.csv``.

    Parameter:
    =========
    file_address: train CSV file (with header)
    base_address_to_save: prefix of the train output files
    remove_date_and_time: if is true, the time and date columns are removed
        NOTE(review): this argument is not used by the function body
    has_activity_column: whether the files carry an activity/work column
        as their last column
    remove_activity_column: if is true, the activity/work column is removed
    test_file_address: test CSV file (with header)
    base_address_of_test_file_to_save: prefix of the test output files
    '''
    # ``np.int`` was only an alias of the builtin ``int`` and no longer
    # exists in NumPy >= 1.24, so the builtin is passed instead.
    sensor_data = read_data_from_CSV_file(file_address,
                                          int,
                                          has_header=True,
                                          return_as_pandas_data_frame=False)
    test_data = read_data_from_CSV_file(test_file_address,
                                        int,
                                        has_header=True,
                                        return_as_pandas_data_frame=False)
    print(sensor_data)

    if remove_activity_column:
        sensor_data = np.delete(sensor_data, -1, 1)
        test_data = np.delete(test_data, -1, 1)

    # The person number is the last column unless an activity column is
    # present and was kept, in which case it is the one before the last.
    if has_activity_column and not remove_activity_column:
        index_of_Person_number = -2
    else:
        index_of_Person_number = -1

    # person number is considered as class; kept as an (n, 1) int column so
    # it can be concatenated next to the PCA output
    train_target = sensor_data[:, index_of_Person_number].astype(int).reshape(-1, 1)
    test_target = test_data[:, index_of_Person_number].astype(int).reshape(-1, 1)

    # remove the Person column from the features
    sensor_data = np.delete(sensor_data, index_of_Person_number, 1)
    test_data = np.delete(test_data, index_of_Person_number, 1)

    for i in range(2, 41):
        pca = PCA(n_components=i)
        # Fit the model with X and apply the dimensionality reduction on X.
        train_data_new = pca.fit_transform(sensor_data)
        print(train_data_new.shape)
        print(train_target.shape)

        dest = base_address_to_save + 'PCA_n=' + str(i) + '.csv'
        print(dest)
        np.savetxt(dest,
                   np.concatenate((train_data_new, train_target), axis=1),
                   delimiter=',',
                   fmt='%s')

        # transform test data with the projection fitted on the train data
        print(sensor_data.shape)
        print(test_data.shape)
        test_data_new = pca.transform(test_data)

        test_dest = base_address_of_test_file_to_save + 'PCA_n=' + str(i) + '.csv'
        print(test_dest)
        np.savetxt(test_dest,
                   np.concatenate((test_data_new, test_target), axis=1),
                   delimiter=',',
                   fmt='%s')