Beispiel #1
0
def testData(moving_average=False, moving_median=False, standard_deviation=False,
             moving_entropy=False, entropy=False, probability_distribution=False,
             moving_probability=False, probability_from_file=False,
             moving_k_closest_average=False, moving_threshold_average=False,
             moving_median_centered_average=False, moving_weighted_average=False,
             rul=True,bin_classification=False):
    """Build the testing frame: add requested features, pick the test rows, label them.

    Reads ``datasets/test.csv`` and ``datasets/rul.csv``. For every enabled
    feature flag one new column per selected column is added (selected columns
    are all columns from the sixth one onward). Afterwards only the rows
    labelled by ``Select.indices_seperate(feature_name="UnitNumber", ...)``
    plus the last row of the file are kept. The result is written to
    ``Testing.csv`` (without the index) and returned.

    Args:
        moving_average: add ``ma_5_<col>`` (``Math.moving_average``, window 5),
            computed separately on each slice from ``Select.slice``.
        moving_median: add ``mm_5_<col>`` (``Math.moving_median``, window 5),
            computed per slice.
        standard_deviation: add ``sd_10_<col>``
            (``Math.moving_standard_deviation``, window 10), computed per slice.
        moving_entropy: add ``me_10_5_<col>`` (``Math.moving_entropy``,
            window 10, 5 bins), computed per slice.
        entropy: add ``entropy_250_<col>`` (``Math.entropy``, 250 bins) over
            the whole column.
        probability_distribution: add ``prob_<col>``
            (``Math.probabilty_distribution``, 250 bins) over the whole column.
        moving_probability: add ``prob_<col>`` (``Math.moving_probability``,
            window 10, 4 bins) over the whole column.
        probability_from_file: add ``prob_<col>`` from
            ``from_file(column, column_name)``.
        moving_k_closest_average: add ``k_closest_<col>``
            (``Math.moving_k_closest_average``, window 5, kclosest 3) over the
            whole column.
        moving_threshold_average: add ``threshold_<col>``
            (``Math.moving_threshold_average``, window 5, threshold -1) over
            the whole column.
        moving_median_centered_average: add ``threshold_<col>``
            (``Math.moving_median_centered_average``, window 5, boundary 1)
            over the whole column.
        moving_weighted_average: add ``threshold_<col>``
            (``Math.moving_weighted_average``, window 5, weights 5..1) over
            the whole column.
        rul: add an ``RUL`` column taken from ``datasets/rul.csv``.
        bin_classification: add a ``BIN`` column that is 0 where the ground
            truth RUL is >= 30 and 1 otherwise.

    Returns:
        The filtered frame that was written to ``Testing.csv``.

    NOTE(review): the three ``prob_`` flags share one column prefix, and so do
    the three ``threshold_`` flags; when several of them are enabled the later
    one overwrites the earlier one. The names are kept as-is because the
    training-side code uses the same prefixes -- confirm before renaming.
    """
    print("Testing frame process has started")
    print("---------------------------------")

    # Test data set and its ground-truth remaining useful life
    testing_frame = pd.read_csv("datasets/test.csv")
    ground_truth = pd.read_csv("datasets/rul.csv")

    # Feature columns: everything after the first five columns
    selected_column_names = list(testing_frame.columns)[5:]

    # Separation points used to apply moving operations slice by slice
    indices = Select.indices_seperate(feature_name="UnitNumber", data_frame=testing_frame)

    # Total work - progress
    total_work = len(selected_column_names)

    def add_feature(message, prefix, compute):
        """Add column ``prefix + name`` for every selected column, reporting progress."""
        print(message)
        for done, column_name in enumerate(selected_column_names, start=1):
            values = compute(testing_frame[column_name], column_name)
            testing_frame[prefix + column_name] = pd.Series(values, index=testing_frame.index)
            Progress.printProgress(iteration=done, total=total_work, decimals=1, prefix="Progress",
                                   suffix="Complete")

    def per_slice(operation, **params):
        """Adapt ``operation`` so it runs on each slice of a column and the results are joined."""
        def compute(column, _column_name):
            # Seed with an empty float array (as the original accumulator did)
            # so an empty slice list still concatenates; join once at the end.
            pieces = [np.array([])]
            for unit_values in Select.slice(data_column=column, indices=indices):
                pieces.append(operation(series=unit_values, **params))
            return np.concatenate(pieces, axis=0)
        return compute

    def whole_column(operation, **params):
        """Adapt ``operation`` so it runs once over the complete column."""
        def compute(column, _column_name):
            return operation(series=column, **params)
        return compute

    # Math.* / from_file are only looked up inside the branch that needs them.
    if moving_average:
        # Moving average window 5
        add_feature("Applying Moving Average", "ma_5_",
                    per_slice(Math.moving_average, window=5, default=True))

    if moving_median:
        # Moving median window 5
        add_feature("Applying Moving Median", "mm_5_",
                    per_slice(Math.moving_median, window=5, default=True))

    if standard_deviation:
        # Moving standard deviation window 10
        add_feature("Applying Standard Deviation", "sd_10_",
                    per_slice(Math.moving_standard_deviation, window=10, default=True))

    if moving_entropy:
        # Moving entropy window 10, 5 bins
        add_feature("Applying Moving Entropy", "me_10_5_",
                    per_slice(Math.moving_entropy, window=10, no_of_bins=5, default=True))

    if entropy:
        # Entropy
        add_feature("Applying Entropy", "entropy_250_",
                    whole_column(Math.entropy, no_of_bins=250))

    if probability_distribution:
        # Probability distribution
        add_feature("Applying Probability Distribution", "prob_",
                    whole_column(Math.probabilty_distribution, no_of_bins=250))

    if moving_probability:
        # Moving probability distribution
        add_feature("Applying Moving probability", "prob_",
                    whole_column(Math.moving_probability, window=10, no_of_bins=4, default=True))

    if probability_from_file:
        # Load probabilities from file; from_file receives (column, column_name)
        add_feature("Applying Probability From File", "prob_", from_file)

    if moving_k_closest_average:
        # Moving k closest average
        add_feature("Applying K Closest Average", "k_closest_",
                    whole_column(Math.moving_k_closest_average, window=5, kclosest=3, default=True))

    if moving_threshold_average:
        # Moving threshold average
        add_feature("Applying Threshold Average", "threshold_",
                    whole_column(Math.moving_threshold_average, window=5, threshold=-1, default=True))

    if moving_median_centered_average:
        # Moving median centered average
        add_feature("Applying Median Centered Average", "threshold_",
                    whole_column(Math.moving_median_centered_average, window=5, boundary=1,
                                 default=True))

    if moving_weighted_average:
        # Moving weighted average
        add_feature("Applying Weighted Average", "threshold_",
                    whole_column(Math.moving_weighted_average, window=5, weights=[5, 4, 3, 2, 1],
                                 default=True))

    filtered_frame = pd.DataFrame(columns=testing_frame.columns)

    # Add the label of the last row to the separation points
    row_labels = np.insert(indices, len(indices), len(testing_frame['UnitNumber']) - 1, axis=0)

    # Select lines for test; rows are appended one by one so the output keeps
    # exactly the layout the original row-wise assignment produced.
    for row_label in row_labels:
        filtered_frame.loc[len(filtered_frame)] = testing_frame.loc[row_label]

    if rul:
        filtered_frame['RUL'] = pd.Series(ground_truth['RUL'], index=filtered_frame.index)
        print("Applying RUL")

    if bin_classification:
        bin_labels = [0 if x >= 30 else 1 for x in ground_truth['RUL']]
        filtered_frame['BIN'] = pd.Series(bin_labels, index=filtered_frame.index)
        print("Applying BIN")

    print("Testing frame process is completed\n")
    filtered_frame.to_csv("Testing.csv", index=False)
    return filtered_frame
Beispiel #2
0
# Report the reconstruction-error cut-off that the filter below compares against.
print "Threshold Reconstruction Error :", threshold

# Filter rows: keep only the rows of pTrain whose entry in err_list is strictly
# below the threshold; all other rows are treated as anomalies and dropped.
print "\nRemoving Anomalies"
print "----------------------------------------------------------------------------------------------------------------"
print "Reconstruction Error Array Size :", len(reconstruction_error)
filtered_train = pd.DataFrame()
count = 0  # number of rows kept so far (not read again within this excerpt)
# hTrain.nrow is used as the row count for both err_list and pTrain;
# presumably hTrain is the H2O counterpart of the pandas frame pTrain -- TODO confirm.
for i in range(hTrain.nrow):
    if err_list[i] < threshold:
        # Copy row i of pTrain into the result, renumbering the kept rows from 0.
        df1 = pTrain.iloc[i, :]
        filtered_train = filtered_train.append(df1, ignore_index=True)
        count += 1
    # Progress is reported for every inspected row, kept or not.
    Progress.printProgress(iteration=(i + 1),
                           total=hTrain.nrow,
                           decimals=1,
                           prefix="Progress",
                           suffix="Complete")

# Summary of the filtering step.
print filtered_train
print "Original Size :", hTrain.nrow
print "Filtered Size :", len(filtered_train)
print "Removed Rows  :", (hTrain.nrow - len(filtered_train))

# Feature Engineering: pTrain is rebound to the feature-engineered version of
# the filtered rows (k-closest average, standard deviation, probability distribution).
# NOTE(review): no selected_column_names argument is passed here -- confirm this
# matches the signature of the trainDataToFrame version in use.
pTrain = ProcessData.trainDataToFrame(filtered_train,
                                      moving_k_closest_average=True,
                                      standard_deviation=True,
                                      probability_distribution=True)
pTest = ProcessData.testData(moving_k_closest_average=True,
                             standard_deviation=True,
Beispiel #3
0
def trainDataToFrame(training_frame, selected_column_names, moving_average=False, moving_median=False,
                     standard_deviation=False, moving_entropy=False, entropy=False,
                     probability_distribution=False, moving_probability=False,
                     moving_k_closest_average=False, moving_threshold_average=False,
                     moving_median_centered_average=False, moving_weighted_average=False,
                     rul=False, bin_classification=False):
    """Add the requested feature and label columns to the training frame.

    ``training_frame`` is modified in place: for every enabled feature flag one
    new column per name in ``selected_column_names`` is added. The frame is
    then written to ``Training.csv`` (without the index) and returned.

    Args:
        training_frame: pandas DataFrame; must contain a ``UnitNumber`` column
            and, when ``rul`` or ``bin_classification`` is set, a ``Time`` column.
        selected_column_names: names of the columns to derive features from.
        moving_average: add ``ma_5_<col>`` (``Math.moving_average``, window 5),
            computed separately on each slice from ``Select.slice``.
        moving_median: add ``mm_5_<col>`` (``Math.moving_median``, window 5),
            computed per slice.
        standard_deviation: add ``sd_10_<col>``
            (``Math.moving_standard_deviation``, window 10), computed per slice.
        moving_entropy: add ``me_10_5_<col>`` (``Math.moving_entropy``,
            window 10, 5 bins), computed per slice.
        entropy: add ``entropy_250_<col>`` (``Math.entropy``, 250 bins) over
            the whole column.
        probability_distribution: add ``prob_<col>``
            (``Math.probabilty_distribution``, 250 bins) over the whole column.
        moving_probability: add ``prob_<col>`` (``Math.moving_probability``,
            window 10, 4 bins) over the whole column.
        moving_k_closest_average: add ``k_closest_<col>``
            (``Math.moving_k_closest_average``, window 5, kclosest 3) over the
            whole column.
        moving_threshold_average: add ``threshold_<col>``
            (``Math.moving_threshold_average``, window 5, threshold -1) over
            the whole column.
        moving_median_centered_average: add ``threshold_<col>``
            (``Math.moving_median_centered_average``, window 5, boundary 1)
            over the whole column.
        moving_weighted_average: add ``threshold_<col>``
            (``Math.moving_weighted_average``, window 5, weights 5..1) over
            the whole column.
        rul: add an ``RUL`` column from
            ``DataSetSpecific.remaining_usefullifetime``.
        bin_classification: add a ``BIN`` column from
            ``DataSetSpecific.binary_classification``.

    Returns:
        The same ``training_frame`` object, with the new columns added.

    NOTE(review): the two ``prob_`` flags share one column prefix, and the
    three ``threshold_`` flags share another; when several of them are enabled
    the later one overwrites the earlier one. The names are kept as-is because
    the testing-side code uses the same prefixes -- confirm before renaming.
    """
    print("Training frame process has started")
    print("----------------------------------")

    # Separation points used to apply moving operations slice by slice
    indices = Select.indices_seperate(feature_name="UnitNumber", data_frame=training_frame)

    # Total work - Progress
    total_work = len(selected_column_names)

    def add_feature(message, prefix, operation_on_column):
        """Add column ``prefix + name`` for every selected column, reporting progress."""
        print(message)
        for done, column_name in enumerate(selected_column_names, start=1):
            values = operation_on_column(training_frame[column_name])
            training_frame[prefix + column_name] = pd.Series(values, index=training_frame.index)
            Progress.printProgress(iteration=done, total=total_work, decimals=1, prefix="Progress",
                                   suffix="Complete")

    def per_slice(operation, **params):
        """Adapt ``operation`` so it runs on each slice of a column and the results are joined."""
        def compute(column):
            # Seed with an empty float array (as the original accumulator did)
            # so an empty slice list still concatenates; join once at the end.
            pieces = [np.array([])]
            for unit_values in Select.slice(data_column=column, indices=indices):
                pieces.append(operation(series=unit_values, **params))
            return np.concatenate(pieces, axis=0)
        return compute

    def whole_column(operation, **params):
        """Adapt ``operation`` so it runs once over the complete column."""
        def compute(column):
            return operation(series=column, **params)
        return compute

    # Math.* attributes are only looked up inside the branch that needs them.
    if moving_average:
        # Moving average window 5
        add_feature("Applying Moving Average", "ma_5_",
                    per_slice(Math.moving_average, window=5, default=True))

    if moving_median:
        # Moving median window 5
        add_feature("Applying Moving Median", "mm_5_",
                    per_slice(Math.moving_median, window=5, default=True))

    if standard_deviation:
        # Moving standard deviation 10
        add_feature("Applying Standard Deviation", "sd_10_",
                    per_slice(Math.moving_standard_deviation, window=10, default=True))

    if moving_entropy:
        # Moving entropy window 10, 5 bins
        add_feature("Applying Moving Entropy", "me_10_5_",
                    per_slice(Math.moving_entropy, window=10, no_of_bins=5, default=True))

    if entropy:
        # Entropy
        add_feature("Applying Entropy", "entropy_250_",
                    whole_column(Math.entropy, no_of_bins=250))

    if probability_distribution:
        # Probability distribution
        add_feature("Applying Probability Distribution", "prob_",
                    whole_column(Math.probabilty_distribution, no_of_bins=250))

    if moving_probability:
        # Moving probability distribution
        add_feature("Applying Moving probability", "prob_",
                    whole_column(Math.moving_probability, window=10, no_of_bins=4, default=True))

    if moving_k_closest_average:
        # Moving k closest average
        add_feature("Applying K Closest Average", "k_closest_",
                    whole_column(Math.moving_k_closest_average, window=5, kclosest=3, default=True))

    if moving_threshold_average:
        # Moving threshold average
        add_feature("Applying Threshold Average", "threshold_",
                    whole_column(Math.moving_threshold_average, window=5, threshold=-1, default=True))

    if moving_median_centered_average:
        # Moving median centered average
        add_feature("Applying Median Centered Average", "threshold_",
                    whole_column(Math.moving_median_centered_average, window=5, boundary=1,
                                 default=True))

    if moving_weighted_average:
        # Moving weighted average
        add_feature("Applying Weighted Average", "threshold_",
                    whole_column(Math.moving_weighted_average, window=5, weights=[5, 4, 3, 2, 1],
                                 default=True))

    if rul:
        # Use a separate local so the boolean ``rul`` parameter is not rebound.
        rul_values = DataSetSpecific.remaining_usefullifetime(indices=indices,
                                                              time_series=training_frame['Time'])
        training_frame['RUL'] = pd.Series(rul_values, index=training_frame.index)
        print("Applying RUL")

    if bin_classification:
        bin_labels = DataSetSpecific.binary_classification(indices=indices,
                                                           time_series=training_frame['Time'])
        training_frame['BIN'] = pd.Series(bin_labels, index=training_frame.index)
        print("Applying BIN")

    print("Training frame process is completed\n")
    training_frame.to_csv("Training.csv", index=False)
    return training_frame