Example #1
def segment(dir_path):
    """Create segments of time series."""

    target = yaml.safe_load(open("params.yaml"))["clean"]["target"]

    filepaths = find_files(dir_path, file_extension=".csv")

    output_columns = np.array(
        pd.read_csv(DATA_PATH / OUTPUT_FEATURES_PATH, index_col=0)).reshape(-1)

    dfs = []

    for filepath in filepaths:
        df = pd.read_csv(filepath, index_col=0)
        dfs.append(df)

    combined_df = pd.concat(dfs, ignore_index=True)

    # Downsample by keeping every tenth row.
    combined_df = combined_df[::10]

    # Assign a segment id to each row: segment 1 covers the first
    # 'segment_size' rows, segment 2 the next 'segment_size' rows, and so on.
    n_rows = len(combined_df)
    segment_size = 100
    n_segments = n_rows // segment_size
    ids = np.arange(1, n_segments + 1)

    idlist = np.repeat(ids, segment_size).astype(np.int32)

    # Truncate trailing rows that do not fill a whole segment, and assign ids.
    combined_df = combined_df.iloc[:len(idlist), :].copy()
    combined_df["id"] = idlist

    # TODO: Extract one target value per segment (e.g. the last value of
    # 'target' within each segment) to use as y.

    print(combined_df)

    df_rolled = roll_time_series(combined_df, column_id="id", column_sort=None)
    print(df_rolled)
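
# This and the following examples rely on a find_files helper that is not
# shown. A hypothetical sketch of what it might look like (the real
# implementation may differ):
from pathlib import Path


def find_files(dir_path, file_extension=".csv"):
    """Hypothetical sketch: return a sorted list of paths (as strings) to the
    files in 'dir_path' with the given extension."""
    return sorted(str(p) for p in Path(dir_path).glob(f"*{file_extension}"))
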
Example #2
def clean(dir_path):
    """Clean up inputs.

    Args:
        dir_path (str): Path to directory containing files.

    """

    # Load parameters
    dataset = yaml.safe_load(open("params.yaml"))["profile"]["dataset"]
    """Name of the data set, which must be the name of a subfolder of
    'assets/data/raw' in which to look for data."""

    combine_files = yaml.safe_load(
        open("params.yaml"))["clean"]["combine_files"]

    # If no data set name is given, all files present in 'assets/data/raw'
    # will be used.
    if dataset is not None:
        dir_path += "/" + dataset

    filepaths = find_files(dir_path, file_extension=".csv")

    DATA_CLEANED_PATH.mkdir(parents=True, exist_ok=True)

    # Find removable variables from profiling report
    removable_variables = parse_profile_warnings()

    dfs = []

    for filepath in filepaths:

        # Read csv
        df = pd.read_csv(filepath, index_col=0)

        for c in removable_variables:
            del df[c]

        df.dropna(inplace=True)

        dfs.append(df)

    combined_df = pd.concat(dfs, ignore_index=True)

    if combine_files:
        # Guard against 'dataset' being None when all raw files are used.
        filename = (dataset if dataset is not None else "data") + "-cleaned.csv"
        combined_df.to_csv(DATA_CLEANED_PATH / os.path.basename(filename))
    else:
        for filepath, df in zip(filepaths, dfs):
            df.to_csv(DATA_CLEANED_PATH /
                      (os.path.basename(filepath).replace(".", "-cleaned.")))
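
# The loop above removes each flagged column with 'del df[c]', which raises a
# KeyError if one of the files does not contain that column. A more forgiving
# alternative (a sketch, not the project's actual code) drops the whole list
# at once and ignores columns that are absent:
import pandas as pd

df = pd.DataFrame({"a": [1, 2], "b": [3, 4]})
removable_variables = ["b", "not_present"]

# errors="ignore" skips columns that are missing from this particular file.
df = df.drop(columns=removable_variables, errors="ignore")
print(df.columns.tolist())  # ['a']
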
Example #3
def combine(dir_path):
    """Combine data from multiple input files into one dataset.

    Args:
        dir_path (str): Path to directory containing files.

    """

    filepaths = find_files(dir_path, file_extension=".npz")

    DATA_COMBINED_PATH.mkdir(parents=True, exist_ok=True)

    train_inputs = []
    train_outputs = []
    test_inputs = []
    test_outputs = []
    calibrate_inputs = []
    calibrate_outputs = []

    for filepath in filepaths:
        infile = np.load(filepath)

        if "train" in filepath:
            train_inputs.append(infile["X"])
            train_outputs.append(infile["y"])
        elif "test" in filepath:
            test_inputs.append(infile["X"])
            test_outputs.append(infile["y"])
        elif "calibrate" in filepath:
            calibrate_inputs.append(infile["X"])
            calibrate_outputs.append(infile["y"])

    X_train = np.concatenate(train_inputs)
    y_train = np.concatenate(train_outputs)
    X_test = np.concatenate(test_inputs)
    y_test = np.concatenate(test_outputs)

    if len(calibrate_inputs) > 0:
        X_calibrate = np.concatenate(calibrate_inputs)
        y_calibrate = np.concatenate(calibrate_outputs)

    np.savez(DATA_COMBINED_PATH / "train.npz", X=X_train, y=y_train)
    np.savez(DATA_COMBINED_PATH / "test.npz", X=X_test, y=y_test)

    if len(calibrate_inputs) > 0:
        np.savez(DATA_COMBINED_PATH / "calibrate.npz",
                 X=X_calibrate,
                 y=y_calibrate)
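
# A minimal usage sketch of how a later pipeline stage might read the files
# written above (assuming the same DATA_COMBINED_PATH constant):
import numpy as np

infile = np.load(DATA_COMBINED_PATH / "train.npz")
X_train, y_train = infile["X"], infile["y"]
print(X_train.shape, y_train.shape)
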
Example #4
def profile(dir_path):
    """Create a profile report of a data set.

    Reads data from a set of input files and creates a report profiling the
    data, i.e. summarizing various statistical properties. The report is
    stored in two formats:

    - HTML: For visual inspection
    - JSON: For subsequent automatic processing of results

    Args:
        dir_path (str): Path to directory containing files.

    """

    dataset = yaml.safe_load(open("params.yaml"))["profile"]["dataset"]
    """Name of the data set, which must be the name of a subfolder of
    'assets/data/raw' in which to look for data."""

    # If no data set name is given, all files present in 'assets/data/raw'
    # will be used.
    if dataset is not None:
        dir_path += "/" + dataset

    filepaths = find_files(dir_path, file_extension=".csv")

    dfs = []

    for filepath in filepaths:
        dfs.append(pd.read_csv(filepath))

    combined_df = pd.concat(dfs, ignore_index=True)

    # Generate report.
    profile = ProfileReport(combined_df,
                            title="Profiling Analysis",
                            config_file="src/profile.yaml",
                            lazy=False,
                            sort=None)

    # Create folder for profiling report
    PROFILE_PATH.mkdir(parents=True, exist_ok=True)

    # Save report to files.
    profile.to_file(PROFILE_PATH / "profile.html")
    profile.to_file(PROFILE_PATH / "profile.json")
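
# A minimal sketch of the "subsequent automatic processing" mentioned in the
# docstring: load the JSON report and inspect its top-level structure
# (assuming the same PROFILE_PATH constant used above).
import json

with open(PROFILE_PATH / "profile.json") as infile:
    report = json.load(infile)

print(list(report.keys()))
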
Example #5
def sequentialize(dir_path):
    """Create input/output sequences from the data.

    Args:
        dir_path (str): Path to directory containing files.

    """

    filepaths = find_files(dir_path, file_extension=".npz")

    DATA_SEQUENTIALIZED_PATH.mkdir(parents=True, exist_ok=True)

    params = yaml.safe_load(open("params.yaml"))["sequentialize"]
    net = yaml.safe_load(open("params.yaml"))["train"]["net"]

    hist_size = params["hist_size"]
    target_size = params["target_size"]

    if target_size > hist_size:
        raise ValueError("target_size cannot be larger than hist_size.")

    for filepath in filepaths:

        infile = np.load(filepath)

        X = infile["X"]
        y = infile["y"]

        # Combine y and X to get correct format for sequentializing
        data = np.hstack((y, X))

        # Split into sequences
        X, y = split_sequences(data, hist_size, target_size=target_size)

        if net == "dnn":
            X = flatten_sequentialized(X)

        # Save X and y into a binary file
        np.savez(
            DATA_SEQUENTIALIZED_PATH / (os.path.basename(filepath).replace(
                "scaled.csv", "sequentialized.npz")),
            X=X,
            y=y,
        )
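
# split_sequences is a project helper that is not shown above. One plausible
# sliding-window reading, given that column 0 of 'data' holds the target (a
# sketch only; the real helper may instead take the targets from inside the
# window, which would explain the check that target_size cannot exceed
# hist_size):
import numpy as np


def split_sequences(data, hist_size, target_size=1):
    """Hypothetical sketch: each sample is 'hist_size' rows of inputs and the
    'target_size' target values that follow the window."""
    X, y = [], []
    for i in range(len(data) - hist_size - target_size + 1):
        X.append(data[i:i + hist_size, 1:])
        y.append(data[i + hist_size:i + hist_size + target_size, 0])
    return np.array(X), np.array(y)
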
Example #6
def split(dir_path):
    """Split data into train and test set.

    Training files and test files are saved to different folders.

    Args:
        dir_path (str): Path to directory containing files.

    """

    params = yaml.safe_load(open("params.yaml"))["split"]

    DATA_SPLIT_PATH.mkdir(parents=True, exist_ok=True)

    filepaths = find_files(dir_path, file_extension=".csv")

    # Handle special case where there is only one workout file.
    if isinstance(filepaths, str) or len(filepaths) == 1:
        filepath = filepaths if isinstance(filepaths, str) else filepaths[0]

        df = pd.read_csv(filepath, index_col=0)

        train_size = int(len(df) * params["train_split"])

        # This is used with conformal predictors and specifies the size of the
        # calibration set. Set to 0 in params.yaml if no calibration is to be
        # done.
        calibrate_size = int(len(df) * params["calibrate_split"])

        df_train = None
        df_test = None
        df_calibrate = None

        if params["calibrate_split"] == 0:
            df_train = df.iloc[:train_size]
            df_test = df.iloc[train_size:]
        else:
            df_train = df.iloc[:train_size]
            df_calibrate = df.iloc[train_size:train_size + calibrate_size]
            df_test = df.iloc[train_size + calibrate_size:]

        df_train.to_csv(
            DATA_SPLIT_PATH /
            (os.path.basename(filepath).replace("featurized", "train")))

        df_test.to_csv(
            DATA_SPLIT_PATH /
            (os.path.basename(filepath).replace("featurized", "test")))

        if params["calibrate_split"] != 0:
            df_calibrate.to_csv(DATA_SPLIT_PATH /
                                (os.path.basename(filepath).replace(
                                    "featurized", "calibrate")))

    else:
        # Parameter 'train_split' determines the number of files in the
        # training set.
        file_split = int(len(filepaths) * params["train_split"])
        file_split_calibrate = int(len(filepaths) * params["calibrate_split"])

        training_files = []
        test_files = []
        calibrate_files = []

        if file_split_calibrate == 0:
            training_files = filepaths[:file_split]
            test_files = filepaths[file_split:]
        else:
            training_files = filepaths[:file_split]
            calibrate_files = filepaths[file_split:file_split +
                                        file_split_calibrate]
            test_files = filepaths[file_split + file_split_calibrate:]

        for filepath in filepaths:

            df = pd.read_csv(filepath, index_col=0)

            if filepath in training_files:
                df.to_csv(DATA_SPLIT_PATH /
                          (os.path.basename(filepath).replace(
                              "featurized", "train")))
            elif filepath in test_files:
                df.to_csv(
                    DATA_SPLIT_PATH /
                    (os.path.basename(filepath).replace("featurized", "test")))
            elif filepath in calibrate_files:
                df.to_csv(DATA_SPLIT_PATH /
                          (os.path.basename(filepath).replace(
                              "featurized", "calibrate")))
Example #7
def clean(dir_path, save_results_to_file=True):
    """Clean up inputs.

    Args:
        dir_path (str): Path to directory containing files.
        save_results_to_file (bool): When creating a virtual sensor, the
            results should be saved to file for more efficient reruns of the
            pipeline. When running the virtual sensor, there is no need to save
            these intermediate results to file.

    """

    # Load parameters
    params = yaml.safe_load(open("params.yaml"))
    dataset = params["profile"]["dataset"]
    combine_files = params["clean"]["combine_files"]
    target = params["clean"]["target"]
    classification = params["clean"]["classification"]
    onehot_encode_target = params["clean"]["onehot_encode_target"]

    # If no name of data set is given, all files present in 'assets/data/raw'
    # will be used.
    if dataset is not None:
        dir_path += "/" + dataset

    filepaths = find_files(dir_path, file_extension=".csv")

    DATA_CLEANED_PATH.mkdir(parents=True, exist_ok=True)

    # Find removable variables from profiling report
    removable_variables = parse_profile_warnings()

    dfs = []

    for filepath in filepaths:

        # Read csv
        df = pd.read_csv(filepath)

        # If the first column is an index column, remove it.
        if df.iloc[:, 0].is_monotonic_increasing:
            df = df.iloc[:, 1:]

        for column in removable_variables:
            del df[column]

        df.dropna(inplace=True)

        dfs.append(df)

    combined_df = pd.concat(dfs, ignore_index=True)

    if classification:

        if onehot_encode_target and len(np.unique(combined_df[target])) > 2:
            encoder = LabelBinarizer()
        else:
            if onehot_encode_target:
                raise ValueError(
                    "Parameter 'onehot_encode_target' is set to True, but target is binary. Change parameter to False in order to use this pipeline."
                )
            encoder = LabelEncoder()

        target_col = np.array(combined_df[target]).reshape(-1)
        encoder.fit(target_col)
        # print(f"Classes: {encoder.classes_}")
        # print(f"Encoded classes: {encoder.transform(encoder.classes_)}")

        combined_df, output_columns = encode_target(encoder, combined_df, target)

        for i in range(len(dfs)):
            dfs[i], _ = encode_target(encoder, dfs[i], target)

    else:
        output_columns = [target]

    if combine_files:
        combined_df.to_csv(DATA_CLEANED_PATH / "data-cleaned.csv")
    else:
        for filepath, df in zip(filepaths, dfs):
            df.to_csv(
                DATA_CLEANED_PATH
                / (os.path.basename(filepath).replace(".", "-cleaned."))
            )

    pd.DataFrame(output_columns).to_csv(DATA_PATH / OUTPUT_FEATURES_PATH)
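
# encode_target is a project helper that is not shown here. A hypothetical
# sketch of what it might do for the two encoders used above (the real
# implementation may differ):
import numpy as np
import pandas as pd


def encode_target(encoder, df, target):
    """Hypothetical sketch: replace the target column with its encoded form
    and return the resulting output column names."""
    encoded = encoder.transform(np.array(df[target]).reshape(-1))

    if encoded.ndim == 1:
        # LabelEncoder: the target stays a single column.
        df = df.copy()
        df[target] = encoded
        output_columns = [target]
    else:
        # LabelBinarizer: one output column per class.
        output_columns = [f"{target}_{c}" for c in encoder.classes_]
        onehot = pd.DataFrame(encoded, columns=output_columns, index=df.index)
        df = pd.concat([df.drop(columns=[target]), onehot], axis=1)

    return df, output_columns
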
Example #8
def featurize(dir_path):
    """Clean up inputs and add features to data set.

    Args:
        dir_path (str): Path to directory containing files.

    """

    # Load parameters
    params = yaml.safe_load(open("params.yaml"))["featurize"]

    features = params["features"]
    """Features to include in data set."""

    target = yaml.safe_load(open("params.yaml"))["clean"]["target"]
    """Variable to use as target."""

    filepaths = find_files(dir_path, file_extension=".csv")

    DATA_FEATURIZED_PATH.mkdir(parents=True, exist_ok=True)

    # Read all data to fit one-hot encoder
    dfs = []

    for filepath in filepaths:
        df = pd.read_csv(filepath)
        dfs.append(df)

    combined_df = pd.concat(dfs, ignore_index=True)
    categorical_variables = find_categorical_variables()

    print(f"Columns: {combined_df.columns}")
    print(f"Cat: {categorical_variables}")

    # Check if some categorical variables have been removed in the cleaning
    # process, and if so, remove them from the list
    # removables = []
    # for v in categorical_variables:
    #     if v not in combined_df.columns:
    #         removables.append(v)
    #         # categorical_variables.remove(v)
    # print(removables)
    # categorical_variables.remove(removables)

    # print(f"Cat: {categorical_variables}")
    # print(combined_df[categorical_variables])
    # categorical_encoder = OneHotEncoder()
    # categorical_encoder.fit(combined_df)

    for filepath in filepaths:

        # Read csv
        df = pd.read_csv(filepath)

        # Move target column to the beginning of dataframe
        df = move_column(df, column_name=target, new_idx=0)

        # If no features are specified, use all columns as features.
        # TODO: Maybe not the most robust way to test this.
        if not isinstance(params["features"], list):
            features = df.columns

        # Check that the features requested in params.yaml exist in the data.
        for feature in features:
            if feature not in df.columns:
                print(f"Feature {feature} not found!")

        # TODO: Engineer features. At the moment no engineered features exist!
        df = add_features(df, features)

        for col in df.columns:
            # Remove feature from input. This is useful in the case that a raw
            # feature is used to engineer a feature, but the raw feature itself
            # should not be a part of the input.
            if col not in features and col != target:
                del df[col]

            # Remove feature if it is non-numeric.
            elif not is_numeric_dtype(df[col]):
                del df[col]

        # Save data
        df.to_csv(DATA_FEATURIZED_PATH /
                  (os.path.basename(filepath).replace(".", "-featurized.")))

    # Save list of features used
    pd.DataFrame(df.columns).to_csv(DATA_PATH / "input_columns.csv")
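
# move_column and add_features are project helpers that are not shown here.
# A hypothetical sketch of move_column (the real implementation may differ):


def move_column(df, column_name, new_idx=0):
    """Hypothetical sketch: reorder columns so that 'column_name' ends up at
    position 'new_idx'."""
    columns = list(df.columns)
    columns.insert(new_idx, columns.pop(columns.index(column_name)))
    return df[columns]
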
Example #9
def scale(dir_path):
    """Scale training and test data.

    Args:
        dir_path (str): Path to directory containing files.

    """

    filepaths = find_files(dir_path, file_extension=".csv")

    DATA_SCALED_PATH.mkdir(parents=True, exist_ok=True)

    params = yaml.safe_load(open("params.yaml"))["scale"]
    input_method = params["input"]
    output_method = params["output"]
    
    if input_method == "standard":
        scaler = StandardScaler()
    elif input_method == "minmax":
        scaler = MinMaxScaler()
    elif input_method == "robust":
        scaler = RobustScaler()
    elif input_method == "none":
        # Placeholder; the inputs are left unscaled further down.
        scaler = StandardScaler()
    else:
        raise NotImplementedError(f"{input_method} not implemented.")

    if output_method == "standard":
        output_scaler = StandardScaler()
    elif output_method == "minmax":
        output_scaler = MinMaxScaler()
    elif output_method == "robust":
        output_scaler = RobustScaler()
    elif output_method == "none":
        # Placeholder; the outputs are left unscaled further down.
        output_scaler = StandardScaler()
    else:
        raise NotImplementedError(f"{output_method} not implemented.")

    train_inputs = []
    train_outputs = []

    data_overview = {}

    for filepath in filepaths:

        df = pd.read_csv(filepath, index_col=0)
        
        # Convert to numpy
        data = df.to_numpy()

        # Split into input (X) and output/target (y)
        X = data[:, 1:].copy()
        y = data[:, 0].copy().reshape(-1, 1)

        if "train" in filepath:
            train_inputs.append(X)
            train_outputs.append(y)
            category = "train"
        elif "test" in filepath:
            category = "test"
        elif "calibrate" in filepath:
            category = "calibrate"
        else:
            # Skip files that do not belong to any known category.
            continue

        data_overview[filepath] = {"X": X, "y": y, "category": category}

    X_train = np.concatenate(train_inputs)
    y_train = np.concatenate(train_outputs)

    # Fit a scaler to the training data
    scaler = scaler.fit(X_train)
    output_scaler = output_scaler.fit(y_train)

    for filepath in data_overview:

        # Scale inputs
        if input_method == "none":
            X = data_overview[filepath]["X"]
        else:
            X = scaler.transform(data_overview[filepath]["X"])

        # Scale outputs
        if output_method == "none":
            y = data_overview[filepath]["y"]
        else:
            y = output_scaler.transform(data_overview[filepath]["y"])

        # Save X and y into a binary file.
        np.savez(
            DATA_SCALED_PATH
            / (
                os.path.basename(filepath).replace(
                    data_overview[filepath]["category"] + ".csv",
                    data_overview[filepath]["category"] + "-scaled.npz",
                )
            ),
            X=X,
            y=y,
        )
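
# All the stages above read their configuration from params.yaml. A sketch of
# the keys they access, collected in one place (the values below are
# illustrative assumptions, not the project's actual defaults):
import yaml

params_sketch = {
    "profile": {"dataset": None},
    "clean": {
        "combine_files": True,
        "target": "my_target",  # assumed column name
        "classification": False,
        "onehot_encode_target": False,
    },
    "featurize": {"features": None},
    "split": {"train_split": 0.6, "calibrate_split": 0.0},
    "scale": {"input": "standard", "output": "none"},
    "sequentialize": {"hist_size": 100, "target_size": 1},
    "train": {"net": "dnn"},
}

print(yaml.safe_dump(params_sketch, sort_keys=False))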