def testData(moving_average=False, moving_median=False, standard_deviation=False,
             moving_entropy=False, entropy=False, probability_distribution=False,
             moving_probability=False, probability_from_file=False,
             moving_k_closest_average=False, moving_threshold_average=False,
             moving_median_centered_average=False, moving_weighted_average=False,
             rul=True, bin_classification=False):
    """Build the engineered test set from ``datasets/test.csv``.

    Each boolean flag enables one feature-engineering step; every enabled step
    adds one new column per selected sensor column. After feature engineering,
    one representative row per unit is kept (the unit separation indices plus
    the frame's final row), the ground-truth RUL from ``datasets/rul.csv`` is
    attached, and the result is written to ``Testing.csv``.

    Args:
        moving_average..moving_weighted_average: feature-step switches,
            mirroring ``trainDataToFrame``.
        rul: attach the ground-truth 'RUL' column (default True).
        bin_classification: attach a binary 'BIN' label (1 when RUL < 30).

    Returns:
        pandas.DataFrame: the filtered, feature-engineered frame (also saved
        to ``Testing.csv``).
    """
    print("Testing frame process has started")
    print("---------------------------------")
    # Test data set and its ground-truth remaining-useful-life values.
    testing_frame = pd.read_csv("datasets/test.csv")
    ground_truth = pd.read_csv("datasets/rul.csv")
    # First five columns are identifiers/settings; the rest are the selected
    # sensor columns that feature engineering operates on.
    all_column_names = list(testing_frame.columns)
    selected_column_names = all_column_names[5:]
    # Per-unit separation points, so windowed operations can be applied
    # without crossing unit boundaries.
    indices = Select.indices_seperate(feature_name="UnitNumber", data_frame=testing_frame)
    # Total work units for the progress bar.
    total_work = len(selected_column_names)

    def _apply_per_unit(prefix, message, fn):
        # Apply `fn` separately to each unit's slice of every selected column
        # and store the concatenated result under `prefix + column_name`.
        print(message)
        for current_work, column_name in enumerate(selected_column_names, start=1):
            column = testing_frame[column_name]
            segments = Select.slice(data_column=column, indices=indices)
            parts = [np.array([])]  # seed keeps np.concatenate valid when empty
            for segment in segments:
                parts.append(fn(segment))
            testing_frame[prefix + column_name] = pd.Series(
                np.concatenate(parts, axis=0), index=testing_frame.index)
            Progress.printProgress(iteration=current_work, total=total_work,
                                   decimals=1, prefix="Progress", suffix="Complete")

    def _apply_whole_column(prefix, message, fn):
        # Apply `fn` to each whole selected column (the window may cross unit
        # boundaries — this matches the original behaviour of these steps).
        print(message)
        for current_work, column_name in enumerate(selected_column_names, start=1):
            column = testing_frame[column_name]
            testing_frame[prefix + column_name] = pd.Series(
                fn(column, column_name), index=testing_frame.index)
            Progress.printProgress(iteration=current_work, total=total_work,
                                   decimals=1, prefix="Progress", suffix="Complete")

    if moving_average:
        # Moving average, window 5.
        _apply_per_unit("ma_5_", "Applying Moving Average",
                        lambda s: Math.moving_average(series=s, window=5, default=True))
    if moving_median:
        # Moving median, window 5.
        _apply_per_unit("mm_5_", "Applying Moving Median",
                        lambda s: Math.moving_median(series=s, window=5, default=True))
    if standard_deviation:
        # Moving standard deviation, window 10 (original comment wrongly said
        # "Moving entropy").
        _apply_per_unit("sd_10_", "Applying Standard Deviation",
                        lambda s: Math.moving_standard_deviation(series=s, window=10, default=True))
    if moving_entropy:
        # Moving entropy, window 10, 5 bins.
        _apply_per_unit("me_10_5_", "Applying Moving Entropy",
                        lambda s: Math.moving_entropy(series=s, window=10, no_of_bins=5, default=True))
    if entropy:
        _apply_whole_column("entropy_250_", "Applying Entropy",
                            lambda c, _name: Math.entropy(series=c, no_of_bins=250))
    # NOTE(review): the next three steps all write "prob_*" columns — they
    # appear to be alternative producers of the same feature; enabling more
    # than one overwrites the earlier result.
    if probability_distribution:
        _apply_whole_column("prob_", "Applying Probability Distribution",
                            lambda c, _name: Math.probabilty_distribution(series=c, no_of_bins=250))
    if moving_probability:
        _apply_whole_column("prob_", "Applying Moving probability",
                            lambda c, _name: Math.moving_probability(series=c, window=10, no_of_bins=4, default=True))
    if probability_from_file:
        # Probabilities loaded from file; `from_file` needs the column name
        # to look up the right entry. (Removed unused local file_name.)
        _apply_whole_column("prob_", "Applying Probability From File",
                            lambda c, name: from_file(c, name))
    if moving_k_closest_average:
        _apply_whole_column("k_closest_", "Applying K Closest Average",
                            lambda c, _name: Math.moving_k_closest_average(series=c, window=5, kclosest=3, default=True))
    if moving_threshold_average:
        _apply_whole_column("threshold_", "Applying Threshold Average",
                            lambda c, _name: Math.moving_threshold_average(series=c, window=5, threshold=-1, default=True))
    if moving_median_centered_average:
        # FIX: previously reused the "threshold_" prefix (copy-paste bug),
        # silently overwriting the threshold-average columns when both flags
        # were enabled.
        _apply_whole_column("median_centered_", "Applying Median Centered Average",
                            lambda c, _name: Math.moving_median_centered_average(series=c, window=5, boundary=1, default=True))
    if moving_weighted_average:
        # FIX: same "threshold_" copy-paste prefix bug as above.
        _apply_whole_column("weighted_", "Applying Weighted Average",
                            lambda c, _name: Math.moving_weighted_average(series=c, window=5, weights=[5, 4, 3, 2, 1], default=True))

    # Keep one row per unit: the separation indices plus the frame's final row
    # (presumably each unit's last cycle — TODO confirm the semantics of
    # Select.indices_seperate).
    filtered_frame = pd.DataFrame(columns=testing_frame.columns)
    indices = np.insert(indices, len(indices), len(testing_frame['UnitNumber']) - 1, axis=0)
    for index in indices:
        filtered_frame.loc[len(filtered_frame)] = testing_frame.loc[index]

    if rul:
        # Ground-truth file holds one RUL value per unit, in unit order.
        filtered_frame['RUL'] = pd.Series(ground_truth['RUL'], index=filtered_frame.index)
        print("Applying RUL")
    if bin_classification:
        # 1 = failure expected within 30 cycles, 0 = healthy.
        label = [0 if x >= 30 else 1 for x in ground_truth['RUL']]
        filtered_frame['BIN'] = pd.Series(label, index=filtered_frame.index)
        print("Applying BIN")

    print("Testing frame process is completed\n")
    filtered_frame.to_csv("Testing.csv", index=False)
    return filtered_frame
# NOTE(review): this is a fragment of a Python 2 driver script (paren-less
# `print` statements), incompatible with the Python 3 code elsewhere in this
# file. It keeps only the training rows whose reconstruction error is below
# `threshold` (presumably from an H2O model — `hTrain`/`err_list` are defined
# outside this view; confirm), then re-runs feature engineering on the
# filtered frame. The final `pTest = ProcessData.testData(...` statement is
# truncated mid-call — its continuation is not visible here.
# TODO: port to Python 3 (`print(...)`, `pd.concat` instead of the removed
# `DataFrame.append`) and restore the missing tail before use.
print "Threshold Reconstruction Error :", threshold # Filter rows print "\nRemoving Anomalies" print "----------------------------------------------------------------------------------------------------------------" print "Reconstruction Error Array Size :", len(reconstruction_error) filtered_train = pd.DataFrame() count = 0 for i in range(hTrain.nrow): if err_list[i] < threshold: df1 = pTrain.iloc[i, :] filtered_train = filtered_train.append(df1, ignore_index=True) count += 1 Progress.printProgress(iteration=(i + 1), total=hTrain.nrow, decimals=1, prefix="Progress", suffix="Complete") print filtered_train print "Original Size :", hTrain.nrow print "Filtered Size :", len(filtered_train) print "Removed Rows :", (hTrain.nrow - len(filtered_train)) # Feature Engineering pTrain = ProcessData.trainDataToFrame(filtered_train, moving_k_closest_average=True, standard_deviation=True, probability_distribution=True) pTest = ProcessData.testData(moving_k_closest_average=True, standard_deviation=True,
def trainDataToFrame(training_frame, selected_column_names, moving_average=False,
                     moving_median=False, standard_deviation=False,
                     moving_entropy=False, entropy=False,
                     probability_distribution=False, moving_probability=False,
                     moving_k_closest_average=False, moving_threshold_average=False,
                     moving_median_centered_average=False, moving_weighted_average=False,
                     rul=False, bin_classification=False):
    """Feature-engineer the training frame in place and save it to ``Training.csv``.

    Each boolean flag enables one feature-engineering step; every enabled step
    adds one new column per name in ``selected_column_names``.

    Args:
        training_frame: pandas.DataFrame with at least 'UnitNumber' and
            'Time' columns plus the selected sensor columns.
        selected_column_names: columns the feature steps operate on.
        moving_average..moving_weighted_average: feature-step switches,
            mirroring ``testData``.
        rul: compute and attach the 'RUL' target column.
        bin_classification: compute and attach the binary 'BIN' label.

    Returns:
        pandas.DataFrame: the mutated ``training_frame`` (also written to
        ``Training.csv``).
    """
    print("Training frame process has started")
    print("----------------------------------")
    # Per-unit separation points, so windowed operations can be applied
    # without crossing unit boundaries.
    indices = Select.indices_seperate(feature_name="UnitNumber", data_frame=training_frame)
    # Total work units for the progress bar.
    total_work = len(selected_column_names)

    def _apply_per_unit(prefix, message, fn):
        # Apply `fn` separately to each unit's slice of every selected column
        # and store the concatenated result under `prefix + column_name`.
        print(message)
        for current_work, column_name in enumerate(selected_column_names, start=1):
            column = training_frame[column_name]
            segments = Select.slice(data_column=column, indices=indices)
            parts = [np.array([])]  # seed keeps np.concatenate valid when empty
            for segment in segments:
                parts.append(fn(segment))
            training_frame[prefix + column_name] = pd.Series(
                np.concatenate(parts, axis=0), index=training_frame.index)
            Progress.printProgress(iteration=current_work, total=total_work,
                                   decimals=1, prefix="Progress", suffix="Complete")

    def _apply_whole_column(prefix, message, fn):
        # Apply `fn` to each whole selected column (the window may cross unit
        # boundaries — this matches the original behaviour of these steps).
        print(message)
        for current_work, column_name in enumerate(selected_column_names, start=1):
            column = training_frame[column_name]
            training_frame[prefix + column_name] = pd.Series(
                fn(column), index=training_frame.index)
            Progress.printProgress(iteration=current_work, total=total_work,
                                   decimals=1, prefix="Progress", suffix="Complete")

    if moving_average:
        # Moving average, window 5.
        _apply_per_unit("ma_5_", "Applying Moving Average",
                        lambda s: Math.moving_average(series=s, window=5, default=True))
    if moving_median:
        # Moving median, window 5.
        _apply_per_unit("mm_5_", "Applying Moving Median",
                        lambda s: Math.moving_median(series=s, window=5, default=True))
    if standard_deviation:
        # Moving standard deviation, window 10.
        _apply_per_unit("sd_10_", "Applying Standard Deviation",
                        lambda s: Math.moving_standard_deviation(series=s, window=10, default=True))
    if moving_entropy:
        # Moving entropy, window 10, 5 bins.
        _apply_per_unit("me_10_5_", "Applying Moving Entropy",
                        lambda s: Math.moving_entropy(series=s, window=10, no_of_bins=5, default=True))
    if entropy:
        _apply_whole_column("entropy_250_", "Applying Entropy",
                            lambda c: Math.entropy(series=c, no_of_bins=250))
    # NOTE(review): the next two steps both write "prob_*" columns — they
    # appear to be alternative producers of the same feature; enabling both
    # overwrites the earlier result.
    if probability_distribution:
        _apply_whole_column("prob_", "Applying Probability Distribution",
                            lambda c: Math.probabilty_distribution(series=c, no_of_bins=250))
    if moving_probability:
        _apply_whole_column("prob_", "Applying Moving probability",
                            lambda c: Math.moving_probability(series=c, window=10, no_of_bins=4, default=True))
    if moving_k_closest_average:
        _apply_whole_column("k_closest_", "Applying K Closest Average",
                            lambda c: Math.moving_k_closest_average(series=c, window=5, kclosest=3, default=True))
    if moving_threshold_average:
        _apply_whole_column("threshold_", "Applying Threshold Average",
                            lambda c: Math.moving_threshold_average(series=c, window=5, threshold=-1, default=True))
    if moving_median_centered_average:
        # FIX: previously reused the "threshold_" prefix (copy-paste bug),
        # silently overwriting the threshold-average columns when both flags
        # were enabled.
        _apply_whole_column("median_centered_", "Applying Median Centered Average",
                            lambda c: Math.moving_median_centered_average(series=c, window=5, boundary=1, default=True))
    if moving_weighted_average:
        # FIX: same "threshold_" copy-paste prefix bug as above.
        _apply_whole_column("weighted_", "Applying Weighted Average",
                            lambda c: Math.moving_weighted_average(series=c, window=5, weights=[5, 4, 3, 2, 1], default=True))

    if rul:
        # Remaining useful life per row, derived from the Time column.
        # (Local renamed to rul_values: the original assigned to `rul`, which
        # shadowed the boolean parameter.)
        time_column = training_frame['Time']
        rul_values = DataSetSpecific.remaining_usefullifetime(indices=indices, time_series=time_column)
        training_frame['RUL'] = pd.Series(rul_values, index=training_frame.index)
        print("Applying RUL")
    if bin_classification:
        # Binary near-failure label derived from the Time column.
        time_column = training_frame['Time']
        label = DataSetSpecific.binary_classification(indices=indices, time_series=time_column)
        training_frame['BIN'] = pd.Series(label, index=training_frame.index)
        print("Applying BIN")

    print("Training frame process is completed\n")
    training_frame.to_csv("Training.csv", index=False)
    return training_frame