Code example #1
0
def main():
    """Normalize every CSV in the configured input folder and combine them.

    Reads folder paths and column settings from the JSON config named on
    the command line, writes one normalized CSV per input file into the
    normalized-output folder, then concatenates every file in that folder
    into a single combined CSV (utf-8-sig, no index column).
    """
    json_file_path = process_command_line_args()
    json_manager = JsonManager(json_file_path)

    csv_folder = json_manager.get_csv_path()
    normalized_folder = json_manager.get_normalized_path()
    feature_columns = json_manager.get_feature_columns()
    label_columns = json_manager.get_label_columns()
    lag_features = json_manager.get_lag_features()
    lag_window_length = json_manager.get_sliding_window_length()

    destination_path = constants.add_folder_to_directory(
        constants.NORMALIZED_CSV_FOLDER_NAME, normalized_folder)

    for file in os.listdir(csv_folder):
        complete_file_path = os.fsdecode(os.path.join(csv_folder, file))

        if not is_file_CSV(file):
            continue

        normalized_filename = make_modified_filename(
            file, CSV_NAME_EXTENSION)
        normalized_file_path = os.fsdecode(
            os.path.join(destination_path, normalized_filename))

        # Context managers guarantee both handles are closed even if a
        # row raises; the original open()/close() pairs leaked on error.
        with open(complete_file_path) as current_csv_obj, \
                open(normalized_file_path, mode='w') as normalized_csv_obj:
            csv_reader = csv.reader(
                current_csv_obj, delimiter=constants.CSV_DELIMITER)
            csv_writer = csv.writer(
                normalized_csv_obj,
                delimiter=constants.CSV_DELIMITER,
                quotechar=constants.CSV_QUOTECHAR,
                quoting=csv.QUOTE_MINIMAL)

            # One sliding-window queue per lag feature, pre-filled with
            # empty strings so early rows emit blanks.
            all_lag_queues = [
                [""] * lag_window_length for _ in lag_features
            ]

            header_row = list(feature_columns)
            header_row.append(constants.LABEL_COLUMN_NAME)
            csv_writer.writerow(header_row)

            label_indices = list(label_columns.values())
            header_row_being_read = True
            for timeseries_row in csv_reader:
                if header_row_being_read:
                    # Skip the input file's own header line.
                    header_row_being_read = False
                    continue

                # First non-empty label value on this row, if any.
                label_values = [
                    timeseries_row[index] for index in label_indices
                ]
                label_value = next(
                    (value for value in label_values if value), None)

                if label_value:
                    new_normalize_row = []
                    for column_name, column_index in feature_columns.items():
                        if column_name in lag_features:
                            lag_index = lag_features.index(column_name)
                            lagged_feature = update_lag_feature_queue(
                                all_lag_queues[lag_index],
                                timeseries_row[column_index])
                            new_normalize_row.append(lagged_feature)
                        else:
                            new_normalize_row.append(
                                timeseries_row[column_index])
                    new_normalize_row.append(label_value)
                    csv_writer.writerow(new_normalize_row)
                else:
                    # No label: still push values through the lag queues
                    # so the sliding window stays aligned with the data.
                    for queue_index, column_name in enumerate(lag_features):
                        value = timeseries_row[feature_columns[column_name]]
                        update_lag_feature_queue(
                            all_lag_queues[queue_index], value)

    combined_csv_file_path = os.path.join(destination_path,
                                          constants.COMBINED_CSV_FILENAME)

    # Remove a stale combined file first so it is not concatenated into
    # the fresh one.
    if os.path.exists(combined_csv_file_path):
        os.remove(combined_csv_file_path)
    combined_csv = pd.concat(
        [pd.read_csv(os.fsdecode(os.path.join(destination_path, f)))
         for f in os.listdir(destination_path)])
    combined_csv.to_csv(os.fsdecode(combined_csv_file_path),
                        index=False, encoding='utf-8-sig')
Code example #2
0
def run_normalize(json_file_path):
    """Normalize every CSV configured in *json_file_path* and combine them.

    Column indices are derived from each file's header row (unlike the
    plain `main` variant, which takes them from the config). When
    `add_last_action_taken` is enabled, a last-action column is appended
    to the header and the previously emitted label is carried into it.
    Results land in the configured normalized folder plus one combined CSV.
    """
    global add_last_action_taken
    print(f"Normalizing started using {json_file_path}")

    json_manager = JsonManager(json_file_path)
    csv_folder = json_manager.get_csv_path()
    normalized_folder = json_manager.get_normalized_path()
    feature_list = json_manager.get_feature_columns()
    label_columns = json_manager.get_label_columns()
    lag_features = json_manager.get_lag_features()
    lag_window_length = json_manager.get_sliding_window_length()
    add_last_action_taken = json_manager.get_add_last_action_taken()

    # Start from a clean output folder each run.
    constants.remove_folder_if_exists(
        constants.NORMALIZED_CSV_FOLDER_NAME, normalized_folder)

    destination_path = constants.add_folder_to_directory(
        constants.NORMALIZED_CSV_FOLDER_NAME, normalized_folder)

    for file in os.listdir(csv_folder):
        complete_file_path = os.fsdecode(os.path.join(csv_folder, file))
        last_action_taken = None

        if not is_file_CSV(file):
            continue

        print(f"Reading in csv: {complete_file_path}")
        normalized_filename = make_modified_filename(
            file, CSV_NAME_EXTENSION)
        normalized_file_path = os.fsdecode(
            os.path.join(destination_path, normalized_filename))

        # Context managers close both handles even when a row raises;
        # the original open()/close() pairs leaked on error.
        with open(complete_file_path) as current_csv_obj, \
                open(normalized_file_path, mode='w') as normalized_csv_obj:
            csv_reader = csv.reader(
                current_csv_obj, delimiter=constants.CSV_DELIMITER)
            csv_writer = csv.writer(
                normalized_csv_obj,
                delimiter=constants.CSV_DELIMITER,
                quotechar=constants.CSV_QUOTECHAR,
                quoting=csv.QUOTE_MINIMAL)

            # One sliding-window queue per lag feature, pre-filled with
            # empty strings so early rows emit blanks.
            all_lag_queues = [
                [""] * lag_window_length for _ in lag_features
            ]

            header_row = list(feature_list)
            if add_last_action_taken:
                header_row.append(constants.LAST_ACTION_TAKEN_COLUMN_NAME)
            header_row.append(constants.LABEL_COLUMN_NAME)
            csv_writer.writerow(header_row)

            header_row_being_read = True
            for timeseries_row in csv_reader:
                if header_row_being_read:
                    # Resolve column indices from this file's header.
                    feature_columns = generate_feature_col_dictionary(
                        timeseries_row, feature_list, False)
                    label_indices = list(
                        generate_feature_col_dictionary(
                            timeseries_row, label_columns, True).values())
                    header_row_being_read = False
                    continue

                # First non-empty label value on this row, if any.
                label_values = [
                    timeseries_row[index] for index in label_indices
                ]
                label_value = next(
                    (value for value in label_values if value), None)

                if label_value:
                    new_normalize_row = []
                    for column_name, column_index in feature_columns.items():
                        if column_name in lag_features:
                            lag_index = lag_features.index(column_name)
                            lagged_feature = update_lag_feature_queue(
                                all_lag_queues[lag_index],
                                timeseries_row[column_index])
                            new_normalize_row.append(lagged_feature)
                        elif column_name == constants.LAST_ACTION_TAKEN_COLUMN_NAME:
                            # Carry the previously emitted label forward.
                            new_normalize_row.append(last_action_taken)
                        else:
                            new_normalize_row.append(
                                timeseries_row[column_index])
                    new_normalize_row.append(label_value)
                    last_action_taken = label_value
                    csv_writer.writerow(new_normalize_row)
                else:
                    # No label: still push values through the lag queues
                    # so the sliding window stays aligned with the data.
                    for queue_index, column_name in enumerate(lag_features):
                        value = timeseries_row[feature_columns[column_name]]
                        update_lag_feature_queue(
                            all_lag_queues[queue_index], value)

    combined_csv_file_path = os.path.join(destination_path,
                                          constants.COMBINED_CSV_FILENAME)

    # Remove a stale combined file first so it is not concatenated into
    # the fresh one.
    if os.path.exists(combined_csv_file_path):
        os.remove(combined_csv_file_path)
    combined_csv = pd.concat(
        [pd.read_csv(os.fsdecode(os.path.join(destination_path, f)))
         for f in os.listdir(destination_path)])
    combined_csv.to_csv(os.fsdecode(combined_csv_file_path),
                        index=False, encoding='utf-8-sig')
    # Report the output folder: the original printed the per-file
    # `normalized_file_path`, which is unbound (NameError) when the
    # input folder contains no CSV files.
    print(f"Normalizing finished, results in {destination_path}")
Code example #3
0
def main():
    """Hot-encode the combined normalized CSV into a numeric training file.

    Binary features are filled with -1 where missing and coerced to ints,
    categorical features are one-hot encoded, labels are integer-encoded,
    and the resulting array is written with a per-column format string.
    The format string and the label classes are appended to the output log.
    """
    json_file_path = process_command_line_args()
    json_manager = JsonManager(json_file_path)
    feature_columns = json_manager.get_feature_columns()
    categorical_features = json_manager.get_categorical_features()
    binary_features = json_manager.get_binary_features()
    hot_encoded_path = json_manager.get_hot_encoded_path()

    normalized_folder = os.fsdecode(os.path.join(
        json_manager.get_normalized_path(),
        constants.NORMALIZED_CSV_FOLDER_NAME))
    combined_csv_file = os.fsdecode(os.path.join(
        normalized_folder,
        constants.COMBINED_CSV_FILENAME))

    features_data = pd.read_csv(combined_csv_file, usecols=feature_columns)

    # Missing binary values become -1; `* 1` coerces booleans to 0/1 ints.
    for binary_variable in binary_features:
        features_data[binary_variable] = features_data[binary_variable].fillna(
            value=-1)
        features_data[binary_variable] = features_data[binary_variable] * 1
    # BUG FIX: this array was previously bound as `true_false_columns_array`
    # while the concatenation below referenced the undefined name
    # `binary_columns_array`, raising NameError at runtime.
    binary_columns_array = features_data[binary_features].to_numpy()

    # hot encoded features
    hot_encoded_array, hot_encoded_header = hot_encode_features(
        features_data, categorical_features)

    # remove hot encoded features from features_data dataframe
    features_data = features_data.drop(columns=categorical_features +
                                       binary_features)
    features_data_array = features_data.to_numpy()

    # encode labels
    labels_data = pd.read_csv(combined_csv_file,
                              usecols=[constants.LABEL_COLUMN_NAME])
    label_encoder, labels_column_array = encode_label_column(labels_data)

    # add hot_encoded columns, then numerical columns, then encoded labels
    # into one array
    final_csv = np.concatenate(
        (hot_encoded_array, binary_columns_array,
         features_data_array, labels_column_array),
        axis=constants.COLUMN_AXIS)

    hot_encoded_folder = constants.add_folder_to_directory(
        constants.HOT_ENCODED_CSV_FOLDER_NAME, hot_encoded_path)
    hot_encoded_file_path = os.fsdecode(os.path.join(
        hot_encoded_folder, constants.HOT_ENCODED_CSV_FILENAME))

    if os.path.exists(hot_encoded_file_path):
        os.remove(hot_encoded_file_path)

    # Per-column printf formats: ints for the hot-encoded/binary columns,
    # 3-decimal floats for the numeric features, int for the label.
    hot_encode_fmt = "%i," * len(hot_encoded_header + binary_features)
    feature_data_fmt = "%1.3f," * len(features_data.columns)
    total_fmt = hot_encode_fmt + feature_data_fmt + "%i"  # for label

    final_header = ','.join(
        str(i) for i in (hot_encoded_header + binary_features +
                         list(features_data.columns)))
    final_header += "," + constants.LABEL_COLUMN_NAME  # for label

    np.savetxt(hot_encoded_file_path, final_csv,
               fmt=total_fmt,
               header=final_header,
               delimiter=constants.CSV_DELIMITER,
               comments='')

    # `with` closes the log file even if a write fails.
    with open(OUTPUT_LOG_FILE, "w") as log_file:
        log_file.write("{}\n".format(total_fmt))
        log_file.write(str(label_encoder.classes_.tolist()))
Code example #4
0
def run_hotencode(json_file_path):
    """Hot-encode the combined normalized CSV into a numeric training file.

    Binary features are filled with 0 where missing and cast to bool,
    categorical features (optionally including the last-action-taken
    column) are one-hot encoded, labels are integer-encoded, and the
    result is written via np.savetxt with a per-column format string.
    The format string and the label classes are appended to the output log.
    """
    global add_last_action_taken
    print(f"Hot encoding started using {json_file_path}")

    json_manager = JsonManager(json_file_path)
    feature_list = json_manager.get_feature_columns()
    categorical_features = json_manager.get_categorical_features()
    add_last_action_taken = json_manager.get_add_last_action_taken()

    if add_last_action_taken:
        # Treat the carried-forward action column as categorical too.
        categorical_features.append(constants.LAST_ACTION_TAKEN_COLUMN_NAME)
    binary_features = json_manager.get_binary_features()
    hot_encoded_path = json_manager.get_hot_encoded_path()

    # Start from a clean output folder each run.
    constants.remove_folder_if_exists(
        constants.HOT_ENCODED_CSV_FOLDER_NAME, hot_encoded_path)

    hot_encoded_folder = constants.add_folder_to_directory(
        constants.HOT_ENCODED_CSV_FOLDER_NAME, hot_encoded_path)
    hot_encoded_file_path = os.fsdecode(os.path.join(
        hot_encoded_folder, constants.HOT_ENCODED_CSV_FILENAME))

    normalized_folder = os.fsdecode(os.path.join(
        json_manager.get_normalized_path(),
        constants.NORMALIZED_CSV_FOLDER_NAME))

    combined_csv_file = os.fsdecode(os.path.join(
        normalized_folder,
        constants.COMBINED_CSV_FILENAME))

    # Resolve column indices from the combined file's header row.
    feature_columns = generate_feature_col_dictionary(
        get_header_row(combined_csv_file), feature_list, False)

    features_data = pd.read_csv(combined_csv_file, usecols=feature_columns)

    # Missing binary values become 0, then the columns are cast to bool.
    features_data[binary_features] = features_data[binary_features].fillna(0)
    features_data[binary_features] = features_data[binary_features].astype(
        bool)
    binary_columns_array = features_data[binary_features].to_numpy()

    # hot encoded features
    hot_encoded_array, hot_encoded_header = hot_encode_features(
        features_data, categorical_features)

    # remove hot encoded features from features_data dataframe
    features_data = features_data.drop(columns=categorical_features +
                                       binary_features)
    features_data_array = features_data.to_numpy()

    # encode labels
    labels_data = pd.read_csv(combined_csv_file,
                              usecols=[constants.LABEL_COLUMN_NAME])
    label_encoder, labels_column_array = encode_label_column(labels_data)

    # add hot_encoded columns, then numerical columns, then encoded labels
    # into one array
    final_csv = np.concatenate(
        (hot_encoded_array, binary_columns_array,
         features_data_array, labels_column_array),
        axis=constants.COLUMN_AXIS)

    # Per-column printf formats: %s for hot-encoded/binary columns,
    # 3-decimal floats for the numeric features, int for the label.
    hot_encode_fmt = "%s," * len(hot_encoded_header + binary_features)
    feature_data_fmt = "%1.3f," * len(features_data.columns)
    total_fmt = hot_encode_fmt + feature_data_fmt + "%i"  # for label

    final_header = ','.join(
        str(i) for i in (hot_encoded_header + binary_features +
                         list(features_data.columns)))
    final_header += "," + constants.LABEL_COLUMN_NAME  # for label

    np.savetxt(hot_encoded_file_path, final_csv,
               fmt=total_fmt,
               header=final_header,
               delimiter=constants.CSV_DELIMITER,
               comments='')

    # `with` closes the log file even if a write fails; the original
    # open()/close() pair leaked the handle on error.
    with open(OUTPUT_LOG_FILE, "w") as log_file:
        log_file.write("{}\n".format(total_fmt))
        log_file.write(str(label_encoder.classes_.tolist()))
    print(f"Hot Encoding finished, results in {hot_encoded_file_path}")