def test_toplevel_impute(self):
    """impute() must replace, per column: NaN -> median, +inf -> max, -inf -> min.

    Verified on a default-dtype frame, a float64 frame and a float32 frame.
    Relies on ``dataframe_functions.impute`` (tsfresh) and ``assertEqual``
    from the enclosing TestCase.
    """
    # NOTE: np.NaN / np.PINF / np.NINF were removed in NumPy 2.0; the
    # lowercase spellings below work on every NumPy version.
    df = pd.DataFrame(np.transpose([[0, 1, 2, np.nan],
                                    [1, np.inf, 2, 3],
                                    [1, -3, -np.inf, 3]]),
                      columns=["value_a", "value_b", "value_c"])
    dataframe_functions.impute(df)
    self.assertEqual(list(df.value_a), [0, 1, 2, 1])
    self.assertEqual(list(df.value_b), [1, 3, 2, 3])
    self.assertEqual(list(df.value_c), [1, -3, -3, 3])

    df = pd.DataFrame(np.transpose([[0, 1, 2, np.nan],
                                    [1, np.inf, 2, np.nan],
                                    [np.nan, -3, -np.inf, 3]]),
                      columns=["value_a", "value_b", "value_c"])
    # BUG FIX: DataFrame.astype has no `inplace` parameter -- the old call
    # `df.astype(np.float64, inplace=True)` either raised a TypeError or
    # passed a bogus kwarg. astype always returns a new frame; rebind it.
    df = df.astype(np.float64)
    dataframe_functions.impute(df)
    self.assertEqual(list(df.value_a), [0, 1, 2, 1])
    self.assertEqual(list(df.value_b), [1, 2, 2, 1.5])
    self.assertEqual(list(df.value_c), [0, -3, -3, 3])

    df = pd.DataFrame(np.transpose([[0, 1, 2, np.nan],
                                    [1, np.inf, 2, 3],
                                    [np.inf, -3, -np.inf, 3]]),
                      columns=["value_a", "value_b", "value_c"])
    df = df.astype(np.float32)
    dataframe_functions.impute(df)
    self.assertEqual(list(df.value_a), [0, 1, 2, 1])
    self.assertEqual(list(df.value_b), [1, 3, 2, 3])
    self.assertEqual(list(df.value_c), [3, -3, -3, 3])
def main():
    """Extract tsfresh features for the case base, impute, filter and pickle them.

    Reads train features/labels from the configured case-base folder, converts
    the (examples, time, attributes) tensor into tsfresh's long format, runs
    feature extraction, imputation and relevance filtering, and pickles both
    the unfiltered and the filtered feature matrices.
    """
    config = Configuration()
    print("TS Fresh Feature Extraction Script started at: ", datetime.datetime.now())

    x_train = np.load(config.case_base_folder + 'train_features.npy')  # data training
    y_train_strings = np.expand_dims(np.load(config.case_base_folder + 'train_labels.npy'), axis=-1)
    feature_names = np.load(config.training_data_folder + 'feature_names.npy')

    # tsfresh long format needs an id column and a time column ahead of the attributes.
    columns = np.concatenate((['id', 'time'], feature_names))
    print(columns.shape)
    print(x_train.shape)

    examples = x_train.shape[0]
    time_series_length = x_train.shape[1]

    # Build one (time_series_length, attributes + 2) slab per example and stack
    # them once at the end. The previous np.concatenate-inside-the-loop grew the
    # array on every iteration, which is O(n^2) in total copying.
    slabs = []
    for example in range(examples):
        print("example: ", example)
        id_vec = np.ones(time_series_length) * example
        time_vec = np.arange(time_series_length)
        # stack id and time next to the example matrix
        id_time_matrix = np.dstack((id_vec, time_vec)).squeeze()  # (time_series_length, 2)
        curr_ex = np.concatenate((id_time_matrix, x_train[example, :, :]), axis=1)
        print(example, " shape: ", curr_ex.shape)
        slabs.append(curr_ex)
    tsfresh_input_x_test = np.concatenate(slabs, axis=0)

    df_timeSeries_container = pd.DataFrame(data=tsfresh_input_x_test, columns=columns)

    print("TS Fresh Feature Extraction started at: ", datetime.datetime.now())
    extracted_features = extract_features(df_timeSeries_container, column_id="id", column_sort="time")
    extracted_features.to_pickle(config.case_base_folder + 'extractedFeatures_X_caseBase_unfiltered_4ms4sec.pkl')
    print('extracted features size unfiltered: ', extracted_features.shape)

    from tsfresh.utilities.dataframe_functions import impute
    # Remove NaN / inf values (impute also returns the frame).
    extracted_features = impute(extracted_features)
    print('extracted features size after impute: ', extracted_features.shape)

    from tsfresh import select_features
    # NOTE(review): y_train_strings still has shape (n, 1) here; select_features
    # is usually given a 1-d target -- confirm this worked as intended.
    X_filtered = select_features(extracted_features, y_train_strings)
    print('filtered features size: ', X_filtered.shape)
    print('filtered features: ', X_filtered)
    X_filtered.to_pickle(config.case_base_folder + 'extractedFeatures_X_filtered_4ms4sec.pkl')

    y_train_strings = np.squeeze(y_train_strings)
    print("y_train_strings: ", y_train_strings.shape)

    # Reload the unfiltered features as a sanity check.
    X = pd.read_pickle(config.case_base_folder + 'extractedFeatures_X_caseBase_unfiltered_4ms4sec.pkl')
    print("X shape: ", X.shape)
    print(X.head())
    print('extracted features size after impute: ', X.shape)
'0.0': fc_parameters,  # NOTE(review): tail of a per-passband kind_to_fc_parameters
'1.0': fc_parameters,  # dict whose opening brace lies above this excerpt.
'2.0': fc_parameters,
'3.0': fc_parameters,
'4.0': fc_parameters,
'5.0': fc_parameters
}

# Baseline feature set: tsfresh minimal statistics (mean, max, min, std, ...)
# computed per object_id, split by passband.
train_features4 = extract_features(
    sub_pb,
    column_id='object_id',
    column_value='flux',
    column_sort='mjd',
    column_kind='passband',
    default_fc_parameters=MinimalFCParameters())
impute(train_features4)  # replace NaN/inf in the feature matrix in place

# Richer feature set: custom fc_parameters, overridden per passband via
# kind_to_fc_parameters (the dict completed above).
train_features5 = extract_features(
    sub_pb,
    column_id='object_id',
    column_value='flux',
    column_sort='mjd',
    column_kind='passband',
    default_fc_parameters=fc_parameters,
    kind_to_fc_parameters=kind_to_fc_parameters)
impute(train_features5)

# Task 5: model pruning -- train on the richer feature set.
training5_X = train_features5
training5_Y = sub_meta['target']  # presumably the class-label column -- TODO confirm
def main(argv):
    """Classify gesture sample files from a directory with a pre-trained
    random-forest model and write the results to ClassificationResults.csv.

    argv[0]: path to a directory of .txt/.csv samples; the true class label
    is encoded as the 8th character (index 7) of each file name.
    """
    # BUG FIX: the old `try: path = argv[0] / except: pass` left `path`
    # unbound when no argument was given, so the next line crashed with a
    # NameError instead of a usable message.
    if not argv:
        print("Usage: classify <samples_directory>")
        return
    path = argv[0]
    print(path)
    if not os.path.exists(path):
        print("Invalid Path")
        return

    # Load our ML model
    clf = joblib.load('bestrandomforest.pkl')

    # Accumulators for the per-file results
    class_paths = []
    classes = []
    truths = []

    # Bad TSFresh features to filter out (unstable / uninformative columns)
    bad_features = []
    for i in range(8):
        langevin = str(i) + "__max_langevin_fixed_point__m_3__r_30"
        bad_features.append(langevin)
        for j in range(9):
            quantile = (j + 1) * 0.1
            if quantile != 0.5:
                feature_name = str(i) + "__index_mass_quantile__q_" + str(quantile)
                bad_features.append(feature_name)

    # Gesture name -> numeric label (replaces the old chain of six `if`s,
    # which also left predicted_label unbound for an unknown gesture).
    gesture_labels = {"One": 1, "Two": 2, "Three": 3,
                      "Four": 4, "Five": 5, "Six": 6}

    total_predictions = 0
    true_predictions = 0
    for file in os.listdir(path):
        # os.path.join replaces the manual win32/posix separator switch.
        filepath = os.path.join(path, file)
        if file.endswith(".txt") or file.endswith(".csv"):
            # BUG FIX: the old bare `except: pass` around read_csv left
            # `sample` unbound and the code below crashed with a NameError;
            # skip unreadable files explicitly instead.
            try:
                sample = pd.read_csv(filepath, header=None)
            except Exception:
                continue
            true_label = int(file[7])
            # Preprocess: column 8 = time index, column 9 = constant id, so
            # the whole file forms a single tsfresh time series.
            sample[8] = sample.index.astype(float)
            sample[9] = 1.0
            sample = extract_features(sample, column_id=9, column_sort=8)
            impute(sample)
            sample = sample.fillna(0)
            sample.columns = sample.columns.map(lambda t: str(t))
            sample = sample.sort_index(axis=1)
            sample = sample.drop(bad_features, axis=1)
            # Predict the class
            gesture = clf.predict(sample.loc[0:1])[0]
            predicted_label = gesture_labels.get(gesture)
            if true_label == predicted_label:
                true_predictions += 1
            total_predictions += 1
            class_paths.append(file)
            classes.append(predicted_label)
            truths.append(true_label)

    # BUG FIX: guard against an empty directory (ZeroDivisionError).
    if total_predictions == 0:
        print("No sample files found")
        return
    accuracy = (float(true_predictions) / total_predictions) * 100
    print("Accuracy: " + str(accuracy) + " %")

    # Save output as a CSV file
    output = pd.DataFrame()
    output['Filename'] = class_paths
    output['Predicted'] = classes
    output['True'] = truths
    output = output.assign(Accuracy="")
    # BUG FIX: `output['Accuracy'][0] = ...` is chained assignment and may
    # silently write to a copy under modern pandas; use .loc.
    output.loc[0, 'Accuracy'] = str(accuracy) + "%"
    output.to_csv('ClassificationResults.csv', index=False)
def main_HS():
    """Hydraulic Systems dataset: train classifiers on the top-2 window/step
    representations (X1) and on two tsfresh representations (X2, X3).

    For each representation a results/<name>.txt file is created and, per
    condition column of y_true, a dict of fitted classifiers is collected.
    Returns None.
    """

    def _log_name(name):
        # Each experiment starts its own results file containing its name.
        with open(f'results/{name}.txt', 'w') as f:
            print(name, file=f)

    def _window_models(name, window, step):
        # One sliding-window experiment: load, plot correlations, PCA, train.
        # (Deduplicates the two previously copy-pasted 20-10 / 30-15 sections.)
        X, y_true = load_hydraulicsystems_data(True, window, step)
        _log_name(name)
        plot_corr(X, name)
        X_pca = create_pca(X, name)
        models = {}
        for condition in list(y_true):
            clf_lr, clf_dtc, clf_lsvm, clf_polsvm, clf_rbfsvm, clf_rfc, clf_gbtc = \
                training_models(X_pca, y_true[condition], name, cond=condition)
            # NOTE(review): as in the original code, only the first five
            # classifiers are kept here; clf_rfc / clf_gbtc are discarded
            # (unlike the tsfresh experiments below).
            models[condition] = {
                'clf_lr': clf_lr,
                'clf_dtc': clf_dtc,
                'clf_lsvm': clf_lsvm,
                'clf_polsvm': clf_polsvm,
                'clf_rbfsvm': clf_rbfsvm
            }
        return models

    # Sliding-window representations. The second assignment overwrites the
    # first, exactly as the original code did.
    X1_models = _window_models('HS_X1_20-10', 20, 10)
    X1_models = _window_models('HS_X1_30-15', 30, 15)

    # tsfresh representation of the raw cycles.
    df, y_true = load_hydraulicsystems_data(tsf=True)

    name = 'HS_X2_TSFRESH1'
    _log_name(name)
    X2_models = {}
    X2 = extract_features(df, column_id="cycle", column_sort="time")
    impute(X2)  # remove NaN/inf in place
    # plot_corr(X2, name)
    for condition in list(y_true):
        X2_pca = create_pca(X2, name, cond=condition)
        clf_lr, clf_dtc, clf_lsvm, clf_polsvm, clf_rbfsvm, clf_rfc, clf_gbtc = \
            training_models(X2_pca, y_true[condition], name, cond=condition)
        X2_models[condition] = {
            'clf_lr': clf_lr,
            'clf_dtc': clf_dtc,
            'clf_lsvm': clf_lsvm,
            'clf_polsvm': clf_polsvm,
            'clf_rbfsvm': clf_rbfsvm,
            'clf_rfc': clf_rfc,
            'clf_gbtc': clf_gbtc
        }

    name = 'HS_X3_TSFRESH2'
    _log_name(name)
    X3_models = {}
    for condition in list(y_true):
        # Per condition, extract only the statistically relevant features.
        X3 = extract_relevant_features(df, y_true[condition],
                                       column_id="cycle", column_sort="time")
        impute(X3)
        X3_pca = create_pca(X3, name, cond=condition)
        clf_lr, clf_dtc, clf_lsvm, clf_polsvm, clf_rbfsvm, clf_rfc, clf_gbtc = \
            training_models(X3_pca, y_true[condition], name, cond=condition)
        X3_models[condition] = {
            'clf_lr': clf_lr,
            'clf_dtc': clf_dtc,
            'clf_lsvm': clf_lsvm,
            'clf_polsvm': clf_polsvm,
            'clf_rbfsvm': clf_rbfsvm,
            'clf_rfc': clf_rfc,
            'clf_gbtc': clf_gbtc
        }
    return None
def create_representation(self, for_case_base=False):
    """Compute tsfresh features for the training set, impute, filter and pickle.

    Builds tsfresh's long-format frame (id, time, attributes) from
    ``self.dataset.x_train``, extracts features, pickles the unfiltered
    matrix, imputes NaN/inf values and pickles the filtered matrix.

    :param for_case_base: unused here; kept for interface compatibility.
    """
    print()
    print("TS Fresh Feature Extraction Script started at: ", datetime.datetime.now())
    print()

    x_train = self.dataset.x_train  # data training
    y_train_strings = self.dataset.y_train_strings
    feature_names = self.dataset.feature_names_all

    # tsfresh long format: id and time columns ahead of the attributes.
    columns = np.concatenate((['id', 'time'], feature_names))

    # FIXME (translated from German): why is the output array called "test"
    # although we iterate over the *training* examples?
    print('Training example preparations running ...')

    # Collect one slab per example and stack once at the end; the previous
    # np.concatenate-in-the-loop was O(n^2) in total copying.
    slabs = []
    for example in range(self.dataset.num_train_instances):
        id_vec = np.ones(x_train.shape[1]) * example
        time_vec = np.arange(x_train.shape[1])
        # stack id and time next to the example matrix
        id_time_matrix = np.dstack((id_vec, time_vec)).squeeze()  # (time, 2)
        slabs.append(np.concatenate((id_time_matrix, x_train[example, :, :]), axis=1))
    tsfresh_input_x_test = np.concatenate(slabs, axis=0)

    # noinspection PyTypeChecker
    df_timeSeries_container = pd.DataFrame(data=tsfresh_input_x_test, columns=columns)

    print("TS Fresh Feature Extraction started at: ", datetime.datetime.now())
    extracted_features = tsfresh.extract_features(df_timeSeries_container,
                                                  column_id="id", column_sort="time")
    print('Extraction finished at:', datetime.datetime.now())
    print('Extracted features (unfiltered): ', extracted_features.shape)

    # BUG FIX: this message previously printed ts_fresh_filtered_file although
    # the *unfiltered* file is written here.
    print('Saving unfiltered to:', self.dataset.dataset_folder + self.config.ts_fresh_unfiltered_file)
    extracted_features.to_pickle(self.dataset.dataset_folder + self.config.ts_fresh_unfiltered_file)

    # Remove NaN / inf values.
    extracted_features = impute(extracted_features)
    print('Extracted features (imputed): ', extracted_features.shape)

    filtered = tsfresh.select_features(extracted_features, y_train_strings)
    print('Filtered features size: ', filtered.shape)
    # print('Filtered features: ', filtered)
    print('Saving filtered to:', self.dataset.dataset_folder + self.config.ts_fresh_filtered_file)
    filtered.to_pickle(self.dataset.dataset_folder + self.config.ts_fresh_filtered_file)
# 1. Download tsfresh's demo dataset (robot execution failures).
download_robot_execution_failures()
timeseries, y = load_robot_execution_failures()

# 2. Peek at the shape of the raw data and the labels.
print(timeseries.head())
print(y.head())

# 3. Extract features (one row per id).
from tsfresh import extract_features
extracted_features = extract_features(timeseries, column_id='id', column_sort='time')
print(extracted_features.head())

# 4. Feature filtering.
# The features obtained in the previous step contain NaN values; these
# meaningless values must be removed and only useful features kept. The
# result shows the dimensionality shrinks considerably.
from tsfresh import select_features
from tsfresh.utilities.dataframe_functions import impute
impute(extracted_features)
features_filtered = select_features(extracted_features, y)
print(features_filtered.head())

# 5. Extraction and filtering in one step (a single call, avoiding the
# redundant computation of irrelevant features).
from tsfresh import extract_relevant_features
features_filtered_direct = extract_relevant_features(timeseries, y, column_id='id', column_sort='time')
print(features_filtered_direct.head())
# NOTE(review): this excerpt starts mid-loop -- temp_df is built per group
# above this chunk; the loop header is not visible here.
temp_extracted_features = extract_features(temp_df,
                                           column_id="id",
                                           column_sort="time",
                                           column_kind=None,
                                           column_value=None)
# NOTE(review): DataFrame.append is deprecated/removed in recent pandas;
# collect frames in a list and pd.concat once instead -- TODO migrate.
df = df.append(temp_extracted_features, ignore_index=True)
# break;

# Save the raw extracted features into a CSV keyed by the unix timestamp.
current_time = time.time()
df.to_csv(str(current_time) + '_no_index.csv', index=False)

# Class labels: the label column of the dataset array, wrapped as a Series
# for tsfresh's feature selection.
Y = dataset[:, feature_num]
y = pd.Series(Y)

# Impute features, so inf and nan are gone (in place).
impute(df)
df.to_csv(str(current_time) + '_imputed_no_index.csv', index=False)

# Remove useless features -- to be checked whether they are truly useless.
features_filtered = select_features(df, y)
features_filtered.to_csv(str(current_time) + '_imputed_feature_selected_no_index.csv', index=False)
# NOTE(review): tail of a dict literal opened above this excerpt; the
# flattened magnitude/timestamp arrays become tsfresh's value/sort columns.
'magnitude': mag_flatten,  # [idexes_to_get],
'timestamp': time_stamp_flatten  # [idexes_to_get]
}
dataset_df = pd.DataFrame(dataset_dict, columns=list(dataset_dict.keys()))

# Extract the comprehensive tsfresh feature set; impute during extraction
# (impute_function) and once more afterwards for safety.
extraction_settings = ComprehensiveFCParameters()
X = extract_features(dataset_df,
                     column_id='ids',
                     column_sort='time',
                     default_fc_parameters=extraction_settings,
                     impute_function=impute,
                     n_jobs=-1)
impute(X)

# Labels indexed 1..n -- presumably matching 1-based ids in the id column;
# TODO confirm against the id construction above this excerpt.
y = pd.Series(y_train_real, index=np.arange(y_train_real.shape[0]) + 1)
features_filtered = select_features(X, y)
x_train_real = features_filtered.values

# Standardize the selected features.
scaler = StandardScaler()
scaler.fit(x_train_real)
x_train_scaled = scaler.transform(x_train_real)

# Optional PCA step (disabled):
# pca = PCA()
# pca.fit(x_train_scaled)
# x_train_pca = pca.transform(x_train_scaled)
# variance_precentage = pca.explained_variance_ratio_
# cum_sum_variance = np.cumsum(variance_precentage)
# indx_important_pca_components = np.argmax(cum_sum_variance > 0.9)
#create initial array for output prediction probabilities rfc4outputfinal = np.zeros(16) mlp4outputfinal = np.zeros(16) #solve multi-processing problems on Windows if __name__ == "__main__": #extract training and test features using tsfresh, here minimalfcparameters mean the basic statistic e.g. mean, max, min, sd etc. train_features4 = extract_features( sub_pb, column_id='object_id', column_value='flux', column_sort='mjd', column_kind='passband', default_fc_parameters=MinimalFCParameters()) impute(train_features4) #read test set in chunks to avoid memory issues for chunk in pd.read_csv('../modules/cs342/Assignment2/test_set.csv', header=0, usecols=[0, 1, 2, 3], names=['object_id', 'mjd', 'passband', 'flux'], chunksize=10**6): sub_test_pb = chunk #Task2 & Task3 test_features4 = extract_features( sub_test_pb, column_id='object_id', column_value='flux',
classification_type = 'condition2'
shuffle = 1  # (1 = train and test on same subjects, 0 = test on new subjects, not supported)

# Load data into a table containing all inputs and all calculated features.
extracted_features_original = pd.read_csv(
    r'C:\Users\User\Documents\2017-2018\Project\network\current_use\all_features_original.csv'
)
# Labels file columns:
#   sub num; sub id (a different number for each time window);
#   condition (stress/no stress); level (1-5);
#   levelB (levels 1-3 are 0, levels 4-5 are 1);
#   levelB_condition = level (gets new values in the removeLevel3 function).
impute(extracted_features_original)  # takes care of nan and similar (in place)

# All labels.
labels = pd.read_csv(
    r'C:\Users\User\Documents\2017-2018\Project\network\current_use\all_subjects_labels_original.csv'
)

# Adjust data and labels to remove level-3 rows (not used for now).
# BUG FIX: the old test `classification_type == ("CL2levels" or
# "CL2Condition2")` never matched "CL2Condition2", because the parenthesized
# `or` short-circuits to "CL2levels". Membership test checks both values.
if classification_type in ("CL2levels", "CL2Condition2"):
    extracted_features_original, labels = removeLevel3(
        extracted_features_original, labels, classification_type)

y_for_filtering = labels.levelB  # will use to get relevant features

# rename data
global x_input
# NOTE(review): this excerpt starts mid-loop -- folder, filename, site_ID and
# m are loop variables defined above this chunk (one iteration per site).
data = pd.read_csv(folder + filename, index_col=0, date_parser=dateparse)
data['id'] = [site_ID for _ in range(data.shape[0])]
data.rename(columns={'AQI_': 'AQI'}, inplace=True)
data['time'] = data.index
data = data[['AQI', 'time', 'id']]
# data = data.iloc[0:40, :]
print(data.shape)
# data = drop_missing_weeks(data, years)
data_list[m] = data

# Concatenate all per-site frames into one long frame.
data = pd.concat(data_list, axis=0)
print(data.shape)

# Roll a sliding window of up to one week (7*24 hourly steps) per id and
# extract tsfresh features for every window.
data_rolled = roll_time_series(data, column_id="id", column_sort="time", max_timeshift=7*24, n_jobs=8)
features = extract_features(data_rolled, column_id="id", column_sort="time", n_jobs=8)
impute(features)  # remove NaN/inf in place
print(features.shape)

# Re-attach timestamps to drop incomplete weeks, then discard the helper column.
# NOTE(review): assumes features rows align 1:1 with data rows -- confirm.
features['time'] = data['time'].values
features = drop_missing_weeks(features, years, typical_index=False)
features.drop(['time'], axis=1, inplace=True)

# Target: raw AQI values aligned to the remaining feature rows.
# NOTE(review): assumes len(AQI) == len(features) after the week filtering -- confirm.
AQI = get_raw_AQI_data(path, years)
AQI_data = pd.Series(data=AQI['AQI'].values, index=features.index, name='AQI')
print(AQI.shape)

selected_features = select_features(features, AQI_data)
print(selected_features.shape)
selected_features.to_csv('./data/modified_data_after_feature_extraction/AQI_features.csv', index=False)