def run(data):
    has_error = False
    logging.info("started run.")

    # CONVERT STREAM ANALYTICS TO SKTIME FORMAT
    logging.info("loading json.")
    data = json.loads(data)
    logging.info("json loaded.")

    # Parse timestamps and temperature data
    time_created_start = data.get("allevents")[0].get("timeCreated")
    time_created_end = data.get("allevents")[-1].get("timeCreated")
    temperature_data = [
        event.get("temperature") for event in data.get("allevents")
    ]
    logging.info(f"time_created_start: {time_created_start}")
    logging.info(f"time_created_end: {time_created_end}")
    logging.info(f"temperature_data: {temperature_data}")

    # Check connection_device_id
    connection_device_id, has_error, error_message = get_connection_device_id(
        data)
    if has_error:
        return create_response(has_error=has_error,
                               error_message=error_message)

    # Assert time series has at least TIMESERIESLENGTH elements
    if len(temperature_data) < TIMESERIESLENGTH:
        error_message = (
            f"Time series of length {len(temperature_data)} does not have "
            f"enough samples ({TIMESERIESLENGTH} samples required).")
        logging.warning(error_message)
        return create_response(has_error=True, error_message=error_message)

    # Convert data to sktime format
    case_id, dim_id = 0, 0
    try:
        long_data = [[case_id, dim_id, reading_id, reading_data]
                     for reading_id, reading_data in enumerate(
                         temperature_data[-TIMESERIESLENGTH:])]
    except Exception as e:
        error_message = (
            f"Could not convert dataset to long format due to exception: '{e}'"
        )
        logging.error(error_message)
        return create_response(has_error=True, error_message=error_message)

    # Predict
    long_df = pd.DataFrame(
        long_data, columns=["case_id", "dim_id", "reading_id", "value"])
    sktime_df = from_long_to_nested(long_df)
    prediction = model.predict(sktime_df).tolist()[0]

    return create_response(
        prediction=prediction,
        connection_device_id=connection_device_id,
        time_created_start=time_created_start,
        time_created_end=time_created_end,
    )
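# A minimal sketch (not from the original module) of the payload shape run()
# expects, assuming the surrounding module provides logging, model,
# TIMESERIESLENGTH, get_connection_device_id and create_response. The event keys
# mirror the ones read in run(); the values are made up, and a real payload would
# also need whatever device-id field get_connection_device_id looks for.
import json

sample_payload = json.dumps({
    "allevents": [
        {"timeCreated": "2021-01-01T00:00:00Z", "temperature": 21.5},
        {"timeCreated": "2021-01-01T00:01:00Z", "temperature": 21.7},
        # ... at least TIMESERIESLENGTH events in total
    ]
})
# response = run(sample_payload)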
def reformatData(target, file_name):
    print("reformatting the data...")
    raw_df = pd.read_csv(file_name)

    # collapse the time columns into a single value column
    long_table_df = raw_df.melt(
        id_vars=["event", "name", "start time", "end time", "channel"],
        var_name="anindex",
        value_name="value")
    sorted_long_table_df = long_table_df.sort_values(
        by=['event', 'name', 'start time', 'channel'], axis=0)

    # replace channel names with numeric dimension ids
    unique_dim_ids = sorted_long_table_df.iloc[:, 4].unique()
    for i in range(len(unique_dim_ids)):
        my_channel = unique_dim_ids[i]
        sorted_long_table_df['channel'] = sorted_long_table_df[
            'channel'].replace({my_channel: i})

    # replace start times with numeric case ids
    unique_start_time = sorted_long_table_df.iloc[:, 2].unique()
    for i in range(len(unique_start_time)):
        my_time = unique_start_time[i]
        sorted_long_table_df['start time'] = sorted_long_table_df[
            'start time'].replace({my_time: i})

    # drop the columns from_long_to_nested does not expect
    sorted_long_table_df_stripped = sorted_long_table_df.drop(
        columns=['event', 'name', 'end time'])

    df_nested = from_long_to_nested(sorted_long_table_df_stripped)

    # build one label per case from the target column
    target = 'event'
    new_unique_start_time = sorted_long_table_df.iloc[:, 2].unique()
    labels = []
    for e in new_unique_start_time:
        x = sorted_long_table_df.loc[sorted_long_table_df['start time'] == e,
                                     [target]].iloc[0][0]
        labels.append(x)
    np_labels = np.asarray(labels, dtype=str)
    return df_nested, np_labels
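# A small illustration (not from the original) of what the melt above produces,
# assuming a toy frame with two time-step columns "0" and "1"; the values are
# made up.
import pandas as pd

toy = pd.DataFrame({
    "event": ["walk"], "name": ["s1"], "start time": [0], "end time": [10],
    "channel": ["acc_x"], "0": [0.1], "1": [0.2],
})
print(toy.melt(
    id_vars=["event", "name", "start time", "end time", "channel"],
    var_name="anindex", value_name="value"))
# -> one row per (id_vars, time step) pair, with the step column name in "anindex"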
def prepare_dataframe(processed_json_df: pd.DataFrame, time_series_length: int,
                      threshold: float):
    # Parse the JSON string stored in each "allevents" cell
    processed_json_df["allevents"] = processed_json_df["allevents"].apply(
        lambda x: json.loads(x))

    # Reset PartitionDate index to a simple range. We'll use it to index our "cases" ("samples").
    processed_json_df.reset_index(drop=True, inplace=True)

    # sktime expects a specific format. For now the easiest way is to convert our
    # DataFrame to a long format and then use the sktime parser.
    def dataframe_to_long(df, size=time_series_length):
        case_id = 0
        for _, case in df.iterrows():
            events = case["allevents"]
            # We ignore cases with insufficient readings
            if len(events) < size:
                continue
            # We also slice samples with too many readings ([-size:])
            for reading_id, values in enumerate(events[-size:]):
                yield case_id, 0, reading_id, values["temperature"]
                # We can add more dimensions later on.
                # yield case_id, 1, reading_id, values["ambienttemperature"]
            case_id += 1  # can't use the row index because we skip rows.

    df_long = pd.DataFrame(
        dataframe_to_long(processed_json_df, size=time_series_length),
        columns=["case_id", "dim_id", "reading_id", "value"],
    )

    # Convert to sktime "nested" format
    df_nested = from_long_to_nested(df_long)

    # Fake some labels: we simply explore the data, set an arbitrary threshold and
    # define all series above that threshold as "True".
    df_nested["label"] = df_nested["dim_0"].apply(
        lambda x: x.max()) > threshold

    return df_nested
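# A minimal usage sketch (not from the original) for prepare_dataframe, assuming
# the function above is in scope and that "allevents" holds a JSON-encoded list of
# readings per row, as in run() above; the demo values below are made up.
import json
import pandas as pd

demo_df = pd.DataFrame({
    "allevents": [
        json.dumps([{"temperature": 20.0 + 0.1 * i} for i in range(10)]),
        json.dumps([{"temperature": 30.0 + i} for i in range(12)]),
    ]
})
demo_nested = prepare_dataframe(demo_df, time_series_length=10, threshold=25.0)
print(demo_nested[["dim_0", "label"]])  # only the second series exceeds the threshold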
          (23564575, [10, -1, 6])], 'A')
case2 = ([(23534575, [10, -1, 2]), (23545575, [10, +1, 3]),
          (23564575, [10, -1, 4]), (23564575, [10, +1, 5]),
          (23564575, [10, -1, 6])], 'B')
case3 = ([(23534575, [10, -1, 2]), (23545575, [10, +1, 3]),
          (23564575, [10, -1, 4]), (23564575, [10, +1, 5]),
          (23564575, [10, -1, 6])], 'A')
data = [case1, case2, case3]

# data -> our function -> (X_nested, y)
X = generate_long_table(ts)
X.head()

X_nested = from_long_to_nested(X)
X_nested.head()

y = np.array(['a'])  # , 'b', 'a', 'b', 'a', 'b', 'a', 'b'])
print(X_nested)

X_train, X_test, y_train, y_test = train_test_split(X_nested, y)
print(X.head())

classifier = ColumnEnsembleClassifier(estimators=[
    ("TSF1", TimeSeriesForestClassifier(n_estimators=100), [1]),
    ("TSF2", TimeSeriesForestClassifier(n_estimators=100), [2]),
])
classifier.fit(X_train, y_train)

# Use the test portion of the data for prediction so we can see how accurately the model learned
y_pred = classifier.predict(X_test)
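# A short follow-up sketch (not in the original): score the predictions from the
# split above with scikit-learn's accuracy_score. With the single-label toy y
# above the split is not meaningful, so treat this as illustrative only.
from sklearn.metrics import accuracy_score

print("test accuracy:", accuracy_score(y_test, y_pred))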
def reformatData(target, file_name):
    print("reformatting the data...")
    raw_df = pd.read_csv(file_name)

    # find the count of events in the dataset
    events_count = raw_df['event'].value_counts().to_dict()

    # collapse the time columns into one single time column to match the rest of the columns
    long_table_df = raw_df.melt(
        id_vars=["event", "name", "start time", "end time", "channel"],
        var_name="anindex",
        value_name="value")
    sorted_long_table_df = long_table_df.sort_values(
        by=['event', 'name', 'start time', 'channel'], axis=0)

    # combine start time and subject into one column;
    # case_key will be the unique index for the reformatted table
    sorted_long_table_df['case_key'] = sorted_long_table_df[
        'start time'].astype(str) + sorted_long_table_df['name']

    unique_dim_ids = sorted_long_table_df.iloc[:, 4].unique()

    # get a mapping of channels
    channels_map = {}
    # replace channel names with numeric values (needed for the from_long_to_nested function)
    for i in range(len(unique_dim_ids)):
        my_channel = unique_dim_ids[i]
        sorted_long_table_df['channel'] = sorted_long_table_df[
            'channel'].replace({my_channel: i})
        channels_map[i] = my_channel

    unique_case_key = sorted_long_table_df.iloc[:, -1].unique()
    for i in range(len(unique_case_key)):
        my_case_key = unique_case_key[i]
        sorted_long_table_df['case_key'] = sorted_long_table_df[
            'case_key'].replace({my_case_key: i})

    # might need to delete this check if it takes too long
    time_map = {}  # a map index for time
    unique_start_time = sorted_long_table_df.iloc[:, 2].unique()
    for i in range(len(unique_start_time)):
        my_time = unique_start_time[i]
        sorted_long_table_df['start time'] = sorted_long_table_df[
            'start time'].replace({my_time: i})
        time_map[i] = my_time

    # excess columns are dropped for the from_long_to_nested function
    sorted_long_table_df_stripped = sorted_long_table_df.drop(
        columns=['event', 'name', 'end time', 'start time'])

    # reorder columns, moving the case_key column from last to first
    sorted_long_table_df_stripped = sorted_long_table_df_stripped[[
        'case_key', 'channel', 'anindex', 'value'
    ]]

    # the table goes from long to nested:
    # from_long_to_nested returns a sktime-formatted dataset with individual
    # dimensions represented by columns of the output dataframe
    df_nested = from_long_to_nested(sorted_long_table_df_stripped)

    # create a list of labels
    new_unique_case_key = sorted_long_table_df.iloc[:, -1].unique()
    labels = []
    for e in new_unique_case_key:
        x = sorted_long_table_df.loc[sorted_long_table_df['case_key'] == e,
                                     [target]].iloc[0][0]
        labels.append(x)
    np_labels = np.asarray(labels, dtype=str)
    return df_nested, np_labels, events_count
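# A minimal usage sketch (not from the original) for reformatData, assuming a CSV
# whose columns are "event", "name", "start time", "end time", "channel" followed
# by one column per time step (the columns melted into "anindex" above);
# "sensor_data.csv" is a placeholder file name.
df_nested, labels, events_count = reformatData(target="event",
                                               file_name="sensor_data.csv")
print(events_count)      # how many rows each event class contributes
print(df_nested.head())  # one dim_<k> column per channel, one row per case_key
print(labels[:5])        # per-case labels aligned with df_nested's rows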
import argparse
import os

import pandas as pd
from sktime.utils.load_data import from_long_to_nested

parser = argparse.ArgumentParser()
parser.add_argument("--input", type=str, help="input data")
parser.add_argument("--output", type=str, help="output data")
parser.add_argument("--threshold", type=float, help="threshold cutoff")
args = parser.parse_args()

# Get input data
pickle_path = os.path.join(args.input, "df_long.pkl")
df_long = pd.read_pickle(pickle_path)

# Convert to sktime "nested" format
df_nested = from_long_to_nested(df_long)

# Fake some labels: we simply explore the data, set an arbitrary threshold and
# define all series above that threshold as "True".
df_nested["label"] = df_nested["dim_0"].apply(lambda x: x.max()) > args.threshold

# Define output
if args.output is not None:
    os.makedirs(args.output, exist_ok=True)
    print("%s created" % args.output)
    df_nested.to_pickle(os.path.join(args.output, "df_nested.pkl"))
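# Example invocation of this step (not from the original), assuming the script is
# saved as label_data.py (placeholder name) and the --input folder contains the
# df_long.pkl written by an upstream step; 25.0 is an arbitrary threshold:
#
#   python label_data.py --input ./long_out --output ./nested_out --threshold 25.0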