Code Example #1
File: score.py  Project: jhchein/freezer-ml-pipeline
def run(data):
    has_error = False
    logging.info("started run.")

    # CONVERT STREAM ANALYTICS TO SKTIME FORMAT
    logging.info("loading json.")
    data = json.loads(data)
    logging.info("json loaded.")

    # Parse timestamps and temperature data
    time_created_start = data.get("allevents")[0].get("timeCreated")
    time_created_end = data.get("allevents")[-1].get("timeCreated")
    temperature_data = [
        event.get("temperature") for event in data.get("allevents")
    ]

    logging.info(f"time_created_start: {time_created_start}")
    logging.info(f"time_created_end: {time_created_end}")
    logging.info(f"temperature_data: {temperature_data}")

    # Check connection_device_id
    connection_device_id, has_error, error_message = get_connection_device_id(
        data)
    if has_error:
        return create_response(has_error=has_error,
                               error_message=error_message)

    # Assert time series has at least TIMESERIESLENGTH elements
    if len(temperature_data) < TIMESERIESLENGTH:
        error_message = f"Time series of length {len(temperature_data)} does not have enough samples ({TIMESERIESLENGTH} samples required)."
        logging.warning(error_message)
        return create_response(has_error=True, error_message=error_message)

    # Convert data to sktime format
    case_id, dim_id = 0, 0
    try:
        long_data = [[case_id, dim_id, reading_id, reading_data]
                     for reading_id, reading_data in enumerate(
                         temperature_data[-TIMESERIESLENGTH:])]
    except Exception as e:
        error_message = (
            f"Could not convert dataset to long format due to exception: '{e}'"
        )
        logging.error(error_message)
        return create_response(has_error=True, error_message=error_message)

    # Predict
    long_df = pd.DataFrame(
        long_data, columns=["case_id", "dim_id", "reading_id", "value"])
    sktime_df = from_long_to_nested(long_df)
    prediction = model.predict(sktime_df).tolist()[0]

    return create_response(
        prediction=prediction,
        connection_device_id=connection_device_id,
        time_created_start=time_created_start,
        time_created_end=time_created_end,
    )
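
For reference, the list comprehension above builds the four-column long format (case_id, dim_id, reading_id, value) that from_long_to_nested consumes. Below is a minimal, self-contained sketch of the same conversion, with toy readings standing in for the parsed "allevents" payload (import path as in Example #6 below, i.e. an older sktime release):

import pandas as pd
from sktime.utils.load_data import from_long_to_nested

temperature_data = [3.1, 2.9, 3.4, 3.0, 2.8]  # toy readings

case_id, dim_id = 0, 0
long_data = [[case_id, dim_id, reading_id, reading]
             for reading_id, reading in enumerate(temperature_data)]
long_df = pd.DataFrame(long_data,
                       columns=["case_id", "dim_id", "reading_id", "value"])

# one row per case; each "dim_*" cell holds a pd.Series with that case's readings
nested_df = from_long_to_nested(long_df)
print(nested_df)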
Code Example #2
def reformatData(target, file_name):
    print("reformatting the data...")
    raw_df = pd.read_csv(file_name)

    # collapse the time columns into a single column
    long_table_df = raw_df.melt(
        id_vars=["event", "name", "start time", "end time", "channel"],
        var_name="anindex",
        value_name="value")

    sorted_long_table_df = long_table_df.sort_values(
        by=['event', 'name', 'start time', 'channel'], axis=0)

    unique_dim_ids = sorted_long_table_df.iloc[:, 4].unique()

    for i in range(len(unique_dim_ids)):
        my_channel = unique_dim_ids[i]
        sorted_long_table_df['channel'] = sorted_long_table_df[
            'channel'].replace({my_channel: i})
    unique_start_time = sorted_long_table_df.iloc[:, 2].unique()

    for i in range(len(unique_start_time)):
        my_time = unique_start_time[i]
        sorted_long_table_df['start time'] = sorted_long_table_df[
            'start time'].replace({my_time: i})

    sorted_long_table_df_stripped = sorted_long_table_df.drop(
        columns=['event', 'name', 'end time'])

    sorted_long_table_df_stripped.head()
    df_nested = from_long_to_nested(sorted_long_table_df_stripped)

    target = 'event'  # note: hard-codes the label column, overriding the target argument
    new_unique_start_time = sorted_long_table_df.iloc[:, 2].unique()
    labels = []
    for e in new_unique_start_time:
        x = sorted_long_table_df.loc[sorted_long_table_df['start time'] == e,
                                     [target]].iloc[0][0]
        labels.append(x)

    np_labels = np.asarray(labels, dtype=str)  # np.str was removed in newer NumPy releases

    return df_nested, np_labels
Code Example #3
def prepare_dataframe(processed_json_df: pd.DataFrame, time_series_length: int,
                      threshold: float):
    # Parse the JSON strings in "allevents" into Python objects
    processed_json_df["allevents"] = processed_json_df["allevents"].apply(
        lambda x: json.loads(x))

    # Reset PartitionDate Index to a simple range. We'll use it to index our "cases" ("samples")
    processed_json_df.reset_index(drop=True, inplace=True)

    # sktime expects a specific format. For now the easiest way is to convert our DataFrame to a long format
    # and then use the sktime parser.
    def dataframe_to_long(df, size=time_series_length):
        case_id = 0
        for _, case in df.iterrows():
            events = case["allevents"]

            # We ignore cases with insufficient readings
            if len(events) < size:
                continue

            # We also slice samples with too many readings ([-size:])
            for reading_id, values in enumerate(events[-size:]):
                yield case_id, 0, reading_id, values["temperature"]
                # We can add more dimensions later on.
                # yield case_id, 1, reading_id, values["ambienttemperature"]

            case_id += 1  # can't use the row index because we skip rows.

    df_long = pd.DataFrame(
        dataframe_to_long(processed_json_df, size=time_series_length),
        columns=["case_id", "dim_id", "reading_id", "value"],
    )

    # Convert to Sktime "nested" Format
    df_nested = from_long_to_nested(df_long)

    # Fake some labels
    # We simply explore the data, set an arbitrary threshold and define all series above that threshold as "True".
    df_nested["label"] = df_nested["dim_0"].apply(
        lambda x: x.max()) > threshold

    return df_nested
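
A hypothetical call to prepare_dataframe on two toy cases (the temperature values, series length and threshold below are made up; the "allevents" column name and JSON layout follow the function above):

import json
import pandas as pd

sample = pd.DataFrame({
    "allevents": [
        json.dumps([{"temperature": t} for t in [2.0, 2.5, 3.0, 9.5]]),
        json.dumps([{"temperature": t} for t in [1.0, 1.5, 2.0, 2.5]]),
    ]
})
nested = prepare_dataframe(sample, time_series_length=4, threshold=5.0)
print(nested[["dim_0", "label"]])  # first case exceeds the threshold (label True), second does not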
Code Example #4
# Imports assumed by this snippet (paths follow an older sktime release; generate_long_table is project-defined):
import numpy as np
from sklearn.model_selection import train_test_split
from sktime.classification.compose import ColumnEnsembleClassifier
from sktime.classification.interval_based import TimeSeriesForestClassifier
from sktime.utils.load_data import from_long_to_nested

# Each case is (list of (timestamp, [sensor readings]) tuples, label);
# case1's earlier readings are assumed to mirror case2/case3.
case1 = ([(23534575, [10, -1, 2]), (23545575, [10, +1, 3]),
          (23564575, [10, -1, 4]), (23564575, [10, +1, 5]),
          (23564575, [10, -1, 6])], 'A')
case2 = ([(23534575, [10, -1, 2]), (23545575, [10, +1, 3]),
          (23564575, [10, -1, 4]), (23564575, [10, +1, 5]),
          (23564575, [10, -1, 6])], 'B')
case3 = ([(23534575, [10, -1, 2]), (23545575, [10, +1, 3]),
          (23564575, [10, -1, 4]), (23564575, [10, +1, 5]),
          (23564575, [10, -1, 6])], 'A')

data = [case1, case2, case3]

# data -> our function -> (X_nested, y)

X = generate_long_table(data)  # project-defined helper ("our function" above); assumed to take the list of cases
X.head()

X_nested = from_long_to_nested(X)
X_nested.head()
y = np.array([label for _, label in data])  # one label per case: 'A', 'B', 'A'

print(X_nested)

X_train, X_test, y_train, y_test = train_test_split(X_nested, y)
print(X.head())
classifier = ColumnEnsembleClassifier(estimators=[
    ("TSF1", TimeSeriesForestClassifier(n_estimators=100), [1]),
    ("TSF2", TimeSeriesForestClassifier(n_estimators=100), [2]),
])
classifier.fit(X_train, y_train)

# Use the held-out test portion for prediction so we can see how well the model learned
y_pred = classifier.predict(X_test)
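
To put a number on how well the model learned, a quick follow-up with scikit-learn's accuracy_score (sketch; assumes the split above produced at least one test case):

from sklearn.metrics import accuracy_score

print("test accuracy:", accuracy_score(y_test, y_pred))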
Code Example #5
def reformatData(target, file_name):

    print("reformatting the data...")
    raw_df = pd.read_csv(file_name)

    # find count of events in the datasets
    events_count = raw_df['event'].value_counts().to_dict()

    # collapse the time columns into a single column to match the rest of the columns
    long_table_df = raw_df.melt(
        id_vars=["event", "name", "start time", "end time", "channel"],
        var_name="anindex",
        value_name="value")

    sorted_long_table_df = long_table_df.sort_values(
        by=['event', 'name', 'start time', 'channel'], axis=0)

    # combine start time and subject into 1 column
    sorted_long_table_df['case_key'] = sorted_long_table_df[
        'start time'].astype(str) + sorted_long_table_df['name']

    # case_key serves as the unique case index for the reformatted table
    unique_dim_ids = sorted_long_table_df.iloc[:, 4].unique()  # distinct channel names

    # get a mapping of channels
    channels_map = {}
    # replace channel names with numeric values (required by the from_long_to_nested function)
    for i in range(len(unique_dim_ids)):
        my_channel = unique_dim_ids[i]
        sorted_long_table_df['channel'] = sorted_long_table_df[
            'channel'].replace({my_channel: i})
        channels_map[i] = my_channel

    unique_case_key = sorted_long_table_df.iloc[:, -1].unique()
    for i in range(len(unique_case_key)):
        my_case_key = unique_case_key[i]
        sorted_long_table_df['case_key'] = sorted_long_table_df[
            'case_key'].replace({my_case_key: i})

    # might need to delete this check if it takes too long
    time_map = {}  # a map index for time
    unique_start_time = sorted_long_table_df.iloc[:, 2].unique()
    for i in range(len(unique_start_time)):
        my_time = unique_start_time[i]
        sorted_long_table_df['start time'] = sorted_long_table_df[
            'start time'].replace({my_time: i})
        time_map[i] = my_time

    # excess columns are dropped for the from_long_to_nested function
    sorted_long_table_df_stripped = sorted_long_table_df.drop(
        columns=['event', 'name', 'end time', 'start time'])

    # reorder columns: move case_key from last to first
    sorted_long_table_df_stripped = sorted_long_table_df_stripped[[
        'case_key', 'channel', 'anindex', 'value'
    ]]
    #table goes from long to nested
    # returns a sktime-formatted dataset with individual dimensions represented by columns of the output dataframe:
    df_nested = from_long_to_nested(sorted_long_table_df_stripped)

    # create a list of labels
    new_unique_case_key = sorted_long_table_df.iloc[:, -1].unique()
    labels = []
    for e in new_unique_case_key:
        x = sorted_long_table_df.loc[sorted_long_table_df['case_key'] == e,
                                     [target]].iloc[0][0]
        labels.append(x)

    np_labels = np.asarray(labels, dtype=str)  # np.str was removed in newer NumPy releases

    return df_nested, np_labels, events_count
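
A hypothetical call to reformatData on a tiny CSV (column names follow the function above; the subjects, channels and values are made up, and the reading columns are named "0"/"1" so they can serve directly as numeric reading ids):

import pandas as pd

toy = pd.DataFrame({
    "event": ["walk", "walk", "run", "run"],
    "name": ["s1", "s1", "s2", "s2"],
    "start time": [0, 0, 10, 10],
    "end time": [5, 5, 15, 15],
    "channel": ["acc_x", "acc_y", "acc_x", "acc_y"],
    "0": [0.1, 0.2, 0.3, 0.4],
    "1": [0.5, 0.6, 0.7, 0.8],
})
toy.to_csv("toy.csv", index=False)

X_nested, y, events_count = reformatData(target="event", file_name="toy.csv")
print(X_nested.shape, y, events_count)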
Code Example #6
import argparse
import os

import pandas as pd

from sktime.utils.load_data import from_long_to_nested

parser = argparse.ArgumentParser()
parser.add_argument("--input", type=str, help="output data")
parser.add_argument("--output", type=str, help="output data")
parser.add_argument("--threshold", type=float, help="threshold cutoff")
args = parser.parse_args()

# Get input data
pickle_path = os.path.join(args.input, "df_long.pkl")
df_long = pd.read_pickle(pickle_path)

# Convert to Sktime "nested" Format
df_nested = from_long_to_nested(df_long)

# Fake some labels
# We simply explore the data, set an arbitrary threshold and define all series above that threshold as "True".
df_nested["label"] = df_nested["dim_0"].apply(
    lambda x: x.max()) > args.threshold

# Define output
if args.output is not None:
    os.makedirs(args.output, exist_ok=True)
    print("%s created" % args.output)
df_nested.to_pickle(os.path.join(args.output, "df_nested.pkl"))
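
The script is driven entirely by its three flags, e.g. --input ./prepared --output ./nested --threshold 30.0 (paths and threshold here are illustrative); it expects df_long.pkl in the input folder and writes df_nested.pkl to the output folder.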