def test_toplevel_impute(self):
        df = pd.DataFrame(np.transpose([[0, 1, 2, np.nan], [1, np.inf, 2, 3], [1, -3, -np.inf, 3]]),
                          columns=["value_a", "value_b", "value_c"])

        dataframe_functions.impute(df)

        self.assertEqual(list(df.value_a), [0, 1, 2, 1])
        self.assertEqual(list(df.value_b), [1, 3, 2, 3])
        self.assertEqual(list(df.value_c), [1, -3, -3, 3])

        df = pd.DataFrame(np.transpose([[0, 1, 2, np.nan], [1, np.inf, 2, np.nan], [np.nan, -3, -np.inf, 3]]),
                          columns=["value_a", "value_b", "value_c"])
        df = df.astype(np.float64)  # astype has no inplace parameter; reassign instead
        dataframe_functions.impute(df)

        self.assertEqual(list(df.value_a), [0, 1, 2, 1])
        self.assertEqual(list(df.value_b), [1, 2, 2, 1.5])
        self.assertEqual(list(df.value_c), [0, -3, -3, 3])

        df = pd.DataFrame(np.transpose([[0, 1, 2, np.nan], [1, np.inf, 2, 3], [np.inf, -3, -np.inf, 3]]),
                          columns=["value_a", "value_b", "value_c"])
        df = df.astype(np.float32)  # astype has no inplace parameter; reassign instead
        dataframe_functions.impute(df)

        self.assertEqual(list(df.value_a), [0, 1, 2, 1])
        self.assertEqual(list(df.value_b), [1, 3, 2, 3])
        self.assertEqual(list(df.value_c), [3, -3, -3, 3])
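
# The assertions above encode tsfresh's imputation rule: NaN -> column median,
# +inf -> column max, -inf -> column min, each computed over the column's
# finite values. A minimal standalone sketch (not part of the original tests):
import numpy as np
import pandas as pd
from tsfresh.utilities import dataframe_functions

demo = pd.DataFrame({"v": [1.0, np.inf, -np.inf, np.nan, 3.0]})
dataframe_functions.impute(demo)  # modifies demo in place
print(list(demo.v))  # [1.0, 3.0, 1.0, 2.0, 3.0]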
def main():
    config = Configuration()
    print("TS Fresh Feature Extraction Script started at: ", datetime.datetime.now())
    x_train = np.load(config.case_base_folder + 'train_features.npy')  # training data
    y_train_strings = np.expand_dims(np.load(config.case_base_folder + 'train_labels.npy'), axis=-1)
    feature_names = np.load(config.training_data_folder + 'feature_names.npy')
    columns = np.concatenate((['id', 'time'], feature_names))
    print(columns.shape)

    print(x_train.shape)
    examples = x_train.shape[0]
    time_series_length = x_train.shape[1]
    attributes = x_train.shape[2]

    # tsfresh_input_x_test = np.zeros([examples * time_series_length, attributes+2])
    tsfresh_input_x_test = np.zeros([1, attributes + 2])  # placeholder; overwritten at example == 0
    # the 2 extra columns hold id and timestamp

    for example in range(examples):
        print("example: ", example)
        id_vec = np.ones(x_train.shape[1]) * example
        time_vec = np.arange(x_train.shape[1])

        # stack id and time and example matrix together
        id_time_matrix = np.dstack((id_vec, time_vec)).squeeze()  # (1500,2)
        # print("id_time_matrix: ", id_time_matrix.shape)
        # print("x_test[example]: ", x_train[example,:,:].shape)
        curr_ex = np.concatenate((id_time_matrix, x_train[example, :, :]), axis=1)  # (1500, 63)
        print(example, " shape: ", curr_ex.shape)
        if example == 0:
            tsfresh_input_x_test = curr_ex
        else:
            tsfresh_input_x_test = np.concatenate((tsfresh_input_x_test, curr_ex), axis=0)
        # print("dummy: ", tsfresh_input_x_test.shape)


    df_timeSeries_container = pd.DataFrame(data=tsfresh_input_x_test, columns=columns)
    df_labels = pd.DataFrame(data=y_train_strings)
    print("TS Fresh Feature Extraction started at: ", datetime.datetime.now())
    extracted_features = extract_features(df_timeSeries_container, column_id="id", column_sort="time")
    extracted_features.to_pickle(config.case_base_folder + 'extractedFeatures_X_caseBase_unfiltered_4ms4sec.pkl')

    # extracted_features.to_csv('extractedFeatures_X_caseBase_unfiltered.csv', sep=',', encoding='WINDOWS-1252')
    print('extracted features size unfiltered: ', extracted_features.shape)

    from tsfresh.utilities.dataframe_functions import impute
    # Remove NaNs
    extracted_features = impute(extracted_features)
    print('extracted features size after impute: ', extracted_features.shape)

    from tsfresh import select_features
    X_filtered = select_features(extracted_features, np.squeeze(y_train_strings))  # select_features expects a 1-D y
    print('filtered features size: ', X_filtered.shape)
    print('filtered features: ', X_filtered)
    X_filtered.to_pickle(config.case_base_folder + 'extractedFeatures_X_filtered_4ms4sec.pkl')

    y_train_strings = np.squeeze(y_train_strings)
    print("y_train_strings: ", y_train_strings.shape)
    X = pd.read_pickle(config.case_base_folder + 'extractedFeatures_X_caseBase_unfiltered_4ms4sec.pkl')

    print("X shape: ", X.shape)

    print(X.head())
    # Remove NaNs (note: the unfiltered pickle was saved *before* imputing)
    # X = impute(X)
    print('loaded features size: ', X.shape)
    # (fc_parameters is defined in code elided above)
    kind_to_fc_parameters = {
        '0.0': fc_parameters,
        '1.0': fc_parameters,
        '2.0': fc_parameters,
        '3.0': fc_parameters,
        '4.0': fc_parameters,
        '5.0': fc_parameters
    }

    train_features4 = extract_features(
        sub_pb,
        column_id='object_id',
        column_value='flux',
        column_sort='mjd',
        column_kind='passband',
        default_fc_parameters=MinimalFCParameters())
    impute(train_features4)

    train_features5 = extract_features(
        sub_pb,
        column_id='object_id',
        column_value='flux',
        column_sort='mjd',
        column_kind='passband',
        default_fc_parameters=fc_parameters,
        kind_to_fc_parameters=kind_to_fc_parameters)
    impute(train_features5)

    # Task 5: model pruning
    training5_X = train_features5
    training5_Y = sub_meta['target']
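
    # The snippet ends after preparing the Task 5 matrices. A hedged sketch of
    # one way training might continue (assumption: scikit-learn is available;
    # the original pruning code is not shown here):
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.model_selection import cross_val_score

    clf5 = RandomForestClassifier(n_estimators=100, random_state=0)
    print(cross_val_score(clf5, training5_X, training5_Y, cv=5).mean())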
def main(argv):
    if not argv:
        print("Missing path argument")
        return
    path = argv[0]
    print(path)

    if not os.path.exists(path):
        print "Invalid Path"
        return

    #Load our ML Model
    clf = joblib.load('bestrandomforest.pkl')

    #Create a list with which we will append values to
    class_paths = []
    classes = []
    truths = []

    #Bad TSFresh features to filter out
    bad_features = []
    for i in range(8):
        langevin = str(i) + "__max_langevin_fixed_point__m_3__r_30"
        bad_features.append(langevin)
        for j in range(9):
            quantile = (j + 1) * 0.1
            if quantile != 0.5:
                feature_name = str(i) + "__index_mass_quantile__q_" + str(
                    quantile)
                bad_features.append(feature_name)

    total_predictions = 0
    true_predictions = 0
    for file in os.listdir(path):

        if sys.platform == "win32" or sys.platform == "win64":
            filepath = path + '\\' + file
        else:
            filepath = path + '/' + file

        true_label = int(file[7])

        if file.endswith(".txt") or file.endswith(".csv"):
            try:
                sample = pd.read_csv(filepath, header=None)
            except Exception:
                continue  # skip unreadable files instead of using an undefined `sample`

            #Preprocess the data
            sample[8] = sample.index.astype(float)
            sample[9] = 1.0
            sample = extract_features(sample, column_id=9, column_sort=8)
            impute(sample)
            sample = sample.fillna(0)
            sample.columns = sample.columns.map(lambda t: str(t))
            sample = sample.sort_index(axis=1)
            sample = sample.drop(bad_features, axis=1)

            # Predict the class
            gesture = clf.predict(sample.loc[0:1])[0]
            label_map = {"One": 1, "Two": 2, "Three": 3,
                         "Four": 4, "Five": 5, "Six": 6}
            predicted_label = label_map.get(gesture, 0)  # 0 = no matching gesture

            if true_label == predicted_label:
                true_predictions += 1
            total_predictions += 1

            class_paths.append(file)
            classes.append(predicted_label)
            truths.append(true_label)

    accuracy = (float(true_predictions) / total_predictions) * 100
    print "Accuracy:", accuracy, "%"

    #Save Output as CSV File
    output = pd.DataFrame()
    output['Filename'] = class_paths
    output['Predicted'] = classes
    output['True'] = truths
    output = output.assign(Accuracy="")
    accuracy_string = str(accuracy) + "%"
    output['Accuracy'][0] = accuracy_string

    output.to_csv('ClassificationResults.csv', index=False)
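
# main(argv) is defined above but never invoked in this snippet; a conventional
# entry point (an assumption, not part of the original) would be:
if __name__ == "__main__":
    main(sys.argv[1:])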
Example 5
def main_HS():
    # Hydraulic Systems Dataset, using top 2 WINDOW/STEP and TSFRESH
    name = 'HS_X1_20-10'
    X1, y_true = load_hydraulicsystems_data(True, 20, 10)
    with open(f'results/{name}.txt', 'w') as f:
        print(name, file=f)
    plot_corr(X1, name)
    X1_pca = create_pca(X1, name)
    # for each condition we need a model:
    X1_models = {}
    for condition in list(y_true):
        clf_lr, clf_dtc, clf_lsvm, clf_polsvm, clf_rbfsvm, clf_rfc, clf_gbtc =\
            training_models(X1_pca, y_true[condition], name, cond=condition)
        X1_models[condition] = {
            'clf_lr': clf_lr,
            'clf_dtc': clf_dtc,
            'clf_lsvm': clf_lsvm,
            'clf_polsvm': clf_polsvm,
            'clf_rbfsvm': clf_rbfsvm
        }

    name = 'HS_X1_30-15'
    X1, y_true = load_hydraulicsystems_data(True, 30, 15)
    with open(f'results/{name}.txt', 'w') as f:
        print(name, file=f)
    plot_corr(X1, name)
    X1_pca = create_pca(X1, name)
    # for each condition we need a model:
    X1_models = {}
    for condition in list(y_true):
        clf_lr, clf_dtc, clf_lsvm, clf_polsvm, clf_rbfsvm, clf_rfc, clf_gbtc =\
            training_models(X1_pca, y_true[condition], name, cond=condition)
        X1_models[condition] = {
            'clf_lr': clf_lr,
            'clf_dtc': clf_dtc,
            'clf_lsvm': clf_lsvm,
            'clf_polsvm': clf_polsvm,
            'clf_rbfsvm': clf_rbfsvm
        }

    df, y_true = load_hydraulicsystems_data(tsf=True)

    name = 'HS_X2_TSFRESH1'
    with open(f'results/{name}.txt', 'w') as f:
        print(name, file=f)
    # for each condition we need a model:
    X2_models = {}
    X2 = extract_features(df, column_id="cycle", column_sort="time")
    impute(X2)
    # plot_corr(X2, name)
    for condition in list(y_true):
        X2_pca = create_pca(X2, name, cond=condition)
        clf_lr, clf_dtc, clf_lsvm, clf_polsvm, clf_rbfsvm, clf_rfc, clf_gbtc =\
            training_models(X2_pca, y_true[condition], name, cond=condition)
        X2_models[condition] = {
            'clf_lr': clf_lr,
            'clf_dtc': clf_dtc,
            'clf_lsvm': clf_lsvm,
            'clf_polsvm': clf_polsvm,
            'clf_rbfsvm': clf_rbfsvm,
            'clf_rfc': clf_rfc,
            'clf_gbtc': clf_gbtc
        }

    name = 'HS_X3_TSFRESH2'
    with open(f'results/{name}.txt', 'w') as f:
        print(name, file=f)
    # for each condition we need a model:
    X3_models = {}
    for condition in list(y_true):
        X3 = extract_relevant_features(df,
                                       y_true[condition],
                                       column_id="cycle",
                                       column_sort="time")
        impute(X3)
        X3_pca = create_pca(X3, name, cond=condition)
        clf_lr, clf_dtc, clf_lsvm, clf_polsvm, clf_rbfsvm, clf_rfc, clf_gbtc =\
            training_models(X3_pca, y_true[condition], name, cond=condition)
        X3_models[condition] = {
            'clf_lr': clf_lr,
            'clf_dtc': clf_dtc,
            'clf_lsvm': clf_lsvm,
            'clf_polsvm': clf_polsvm,
            'clf_rbfsvm': clf_rbfsvm,
            'clf_rfc': clf_rfc,
            'clf_gbtc': clf_gbtc
        }

    return None
    def create_representation(self, for_case_base=False):
        print()
        print("TS Fresh Feature Extraction Script started at: ",
              datetime.datetime.now())
        print()

        x_train = self.dataset.x_train  # data training
        y_train_strings = self.dataset.y_train_strings
        feature_names = self.dataset.feature_names_all

        columns = np.concatenate((['id', 'time'], feature_names))
        # tsfresh_input_x_test = np.zeros([examples * time_series_length, attributes+2])
        tsfresh_input_x_test = np.zeros([1, x_train.shape[2] + 2])  # placeholder; overwritten at example == 0
        # the 2 extra columns hold id and timestamp

        # FIXME: Why is the output array named "test" when it iterates over the training examples?

        print('Training example preparations running ...')
        for example in range(self.dataset.num_train_instances):
            id_vec = np.ones(x_train.shape[1]) * example
            time_vec = np.arange(x_train.shape[1])

            # stack id and time and example matrix together
            id_time_matrix = np.dstack(
                (id_vec, time_vec)).squeeze()  # (1500,2)
            curr_ex = np.concatenate((id_time_matrix, x_train[example, :, :]),
                                     axis=1)  # (1500, 63)

            # print('Example number:', example, "\tShape: ", curr_ex.shape)

            if example == 0:
                tsfresh_input_x_test = curr_ex
            else:
                tsfresh_input_x_test = np.concatenate(
                    (tsfresh_input_x_test, curr_ex), axis=0)

        # noinspection PyTypeChecker
        df_timeSeries_container = pd.DataFrame(data=tsfresh_input_x_test,
                                               columns=columns)

        print("TS Fresh Feature Extraction started at: ",
              datetime.datetime.now())

        extracted_features = tsfresh.extract_features(df_timeSeries_container,
                                                      column_id="id",
                                                      column_sort="time")

        print('Extraction finished at:', datetime.datetime.now())
        print('Extracted features (unfiltered): ', extracted_features.shape)

        print('Saving unfiltered to:',
              self.dataset.dataset_folder + self.config.ts_fresh_unfiltered_file)
        extracted_features.to_pickle(self.dataset.dataset_folder +
                                     self.config.ts_fresh_unfiltered_file)

        # Remove NaNs
        extracted_features = impute(extracted_features)
        print('Extracted features (imputed): ', extracted_features.shape)

        filtered = tsfresh.select_features(extracted_features, y_train_strings)
        print('Filtered features size: ', filtered.shape)
        # print('Filtered features: ', filtered)

        print('Saving filtered to:',
              self.dataset.dataset_folder + self.config.ts_fresh_filtered_file)
        filtered.to_pickle(self.dataset.dataset_folder +
                           self.config.ts_fresh_filtered_file)
download_robot_execution_failures()
timeseries, y = load_robot_execution_failures()

# 2. Take a look at the data
print(timeseries.head())
print(y.head())

# 3. Extract features
from tsfresh import extract_features
extracted_features = extract_features(timeseries,
                                      column_id='id',
                                      column_sort='time')
print(extracted_features.head())

# 4. Filter features
# The features extracted in the previous step contain NaN values; these meaningless
# values must be removed, keeping only the useful features. The output shows that
# the dimensionality drops considerably.
from tsfresh import select_features
from tsfresh.utilities.dataframe_functions import impute

impute(extracted_features)
features_filtered = select_features(extracted_features, y)
print(features_filtered.head())

# 5. Extract and filter features in one step (avoids redundant computation)
from tsfresh import extract_relevant_features

features_filtered_direct = extract_relevant_features(timeseries,
                                                     y,
                                                     column_id='id',
                                                     column_sort='time')
print(features_filtered_direct.head())
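
# Per the tsfresh docs, steps 3-4 (extract, impute, select) and step 5
# (extract_relevant_features) should yield the same feature set; a hedged
# sanity check, not part of the original tutorial:
assert set(features_filtered.columns) == set(features_filtered_direct.columns)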
        temp_extracted_features = extract_features(temp_df,
                                                   column_id="id",
                                                   column_sort="time",
                                                   column_kind=None,
                                                   column_value=None)

        df = pd.concat([df, temp_extracted_features], ignore_index=True)  # DataFrame.append was removed in pandas 2.0

    # save the features into a csv file
    current_time = time.time()
    df.to_csv(str(current_time) + '_no_index.csv', index=False)

    # list the class label in Y
    Y = dataset[:, feature_num]
    y = pd.Series(Y)

    #        # for test purpose
    #        df = pd.read_csv(r"C:\WinPython-64bit-3.6.1.0Qt5\notebooks\Chair Sensor\user_identifcaiton_imputed_no_index.csv")
    #        copy_df = df

    # impute features, so inf and nan are gone
    impute(df)
    df.to_csv(str(current_time) + '_imputed_no_index.csv', index=False)

    # remove useless features - to be checked whether they are truly useless
    features_filtered = select_features(df, y)
    features_filtered.to_csv(str(current_time) +
                             '_imputed_feature_selected_no_index.csv',
                             index=False)
        'magnitude': mag_flatten,  #[idexes_to_get],
        'timestamp': time_stamp_flatten  #[idexes_to_get]
    }

    dataset_df = pd.DataFrame(dataset_dict, columns=list(dataset_dict.keys()))

    extraction_settings = ComprehensiveFCParameters()

    X = extract_features(dataset_df,
                         column_id='ids',
                         column_sort='time',
                         default_fc_parameters=extraction_settings,
                         impute_function=impute,
                         n_jobs=-1)

    # a second impute(X) here would be redundant: impute_function=impute above already imputed the result
    y = pd.Series(y_train_real, index=np.arange(y_train_real.shape[0]) + 1)
    features_filtered = select_features(X, y)

    x_train_real = features_filtered.values
    scaler = StandardScaler()
    scaler.fit(x_train_real)
    x_train_scaled = scaler.transform(x_train_real)
    #
    # pca = PCA()
    # pca.fit(x_train_scaled)
    # x_train_pca = pca.transform(x_train_scaled)
    #
    # variance_precentage = pca.explained_variance_ratio_
    # cum_sum_variance = np.cumsum(variance_precentage)
    # indx_important_pca_components = np.argmax(cum_sum_variance > 0.9)
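    #
    # A hedged alternative to the commented-out block above (assumption:
    # scikit-learn's PCA, whose float n_components keeps just enough
    # components to explain that fraction of the variance):
    #
    # pca = PCA(n_components=0.9)
    # x_train_pca = pca.fit_transform(x_train_scaled)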
#create initial array for output prediction probabilities
rfc4outputfinal = np.zeros(16)
mlp4outputfinal = np.zeros(16)

# guard needed on Windows: tsfresh uses multiprocessing, so worker spawning
# must happen under the main-module check
if __name__ == "__main__":

    # extract training and test features using tsfresh; MinimalFCParameters
    # restricts extraction to basic statistics (e.g. mean, max, min, std)
    train_features4 = extract_features(
        sub_pb,
        column_id='object_id',
        column_value='flux',
        column_sort='mjd',
        column_kind='passband',
        default_fc_parameters=MinimalFCParameters())
    impute(train_features4)
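
    # A hedged peek at what MinimalFCParameters() actually contains (not in
    # the original): it maps feature-calculator names to parameter dicts,
    # covering basic statistics only.
    print(sorted(MinimalFCParameters()))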

    #read test set in chunks to avoid memory issues
    for chunk in pd.read_csv('../modules/cs342/Assignment2/test_set.csv',
                             header=0,
                             usecols=[0, 1, 2, 3],
                             names=['object_id', 'mjd', 'passband', 'flux'],
                             chunksize=10**6):
        sub_test_pb = chunk

        #Task2 & Task3

        test_features4 = extract_features(
            sub_test_pb,
            column_id='object_id',
            column_value='flux',
            column_sort='mjd',
            column_kind='passband',
            default_fc_parameters=MinimalFCParameters())
Example 11
classification_type = 'condition2'
shuffle = 1  # (1 = train and test on same subjects, 0 = test on new subjects, not supported)

# load  data in a table containing all inputs and all features calculated
extracted_features_original = pd.read_csv(
    r'C:\Users\User\Documents\2017-2018\Project\network\current_use\all_features_original.csv'
)

# load labels with the following columns:
#   sub num
#   sub id (a different number for each time window)
#   condition (stress / no stress)
#   level (1-5)
#   levelB (levels 1-3 become 0, levels 4-5 become 1)
#   levelB_condition = level (gets new values in the removeLevel3 function)
impute(extracted_features_original)  # takes care of nan and similar (in place)

#All labels
labels = pd.read_csv(
    r'C:\Users\User\Documents\2017-2018\Project\network\current_use\all_subjects_labels_original.csv'
)

# adjust data and labels to remove level 3 rows (No using for now)
if classification_type in ("CL2levels", "CL2Condition2"):
    extracted_features_original, labels = removeLevel3(
        extracted_features_original, labels, classification_type)

y_for_filtering = labels.levelB  # will be used to select relevant features

# rename data
global x_input
Example 12
        data = pd.read_csv(folder + filename, index_col=0, date_parser=dateparse)
        data['id'] = [site_ID for _ in range(data.shape[0])]
        data.rename(columns={'AQI_': 'AQI'}, inplace=True)
        data['time'] = data.index

        data = data[['AQI', 'time', 'id']]
        # data = data.iloc[0:40, :]
        print(data.shape)
        # data = drop_missing_weeks(data, years)
        data_list[m] = data

    data = pd.concat(data_list, axis=0)
    print(data.shape)

    data_rolled = roll_time_series(data, column_id="id", column_sort="time", max_timeshift=7*24, n_jobs=8)
    features = extract_features(data_rolled, column_id="id", column_sort="time", n_jobs=8)
    impute(features)
    print(features.shape)
    features['time'] = data['time'].values
    features = drop_missing_weeks(features, years, typical_index=False)
    features.drop(['time'], axis=1, inplace=True)

    AQI = get_raw_AQI_data(path, years)
    AQI_data = pd.Series(data=AQI['AQI'].values, index=features.index, name='AQI')
    print(AQI.shape)

    selected_features = select_features(features, AQI_data)
    print(selected_features.shape)
    selected_features.to_csv('./data/modified_data_after_feature_extraction/AQI_features.csv', index=False)