deviceid start_date end_date activity 73003 2018-11-18-23.09.20.325289 2018-11-19-00.06.38.195689 PM 73003 2018-11-21-23.09.20.325289 2018-11-21-23.10.42.203689 UM ''' activity_data = { 'deviceid': '100', 'start_date': dt.datetime.utcnow() - dt.timedelta(days=1), 'end_date': dt.datetime.utcnow(), 'activity': 'maintenance' } activity_df = pd.DataFrame(data=activity_data, index=[0]) # write activity data to a table db.write_frame(df=activity_df, table_name=activity_name, if_exists='replace') entity.exec_local_pipeline(start_ts=start_date) ''' Here is an example of the outputs: id evt_timestamp deviceid duration maintenance_duration shift_day shift_end_date shift_id shift_start_date 100 2019/25/8 19:29 100 90.13333333 90.13333333 2019/25/8 0:00 2019/25/8 21:00 2 2019/25/8 14:00 100 2019/25/8 21:00 100 510 510 2019/25/8 0:00 2019/26/8 5:30 3 2019/25/8 21:00 100 2019/26/8 5:30 100 510 510 2019/26/8 0:00 2019/26/8 14:00 1 2019/26/8 5:30 100 2019/26/8 14:00 100 329.8666667 329.8666667 2019/26/8 0:00 2019/26/8 21:00 2 2019/26/8 14:00 As per the example into the intro, the single maintenance task has been apportioned into 4 shifts. Durations may be apportioned by changes in the values of other dimensions too.
def load_metrics_data_from_csv(entity_type_name, file_path, credentials=None, **kwargs):
    """
    reads metrics data from csv and stores in entity type metrics table

    Note: make sure 'deviceid' and 'evt_timestamp' columns are present in csv
    'evt_timestamp' column will be inferred to be current time if None present

    :param entity_type_name: str name of entity we want to load data for
    :param file_path: str path to csv file
    :param credentials: dict analytics-service dev credentials
    :param **kwargs {
        db_schema str if no schema is provided will use the default schema
        if_exists str default:append
    }
    :return: None
    :raises RuntimeError: if the entity type does not exist or the csv lacks
        the required 'deviceid' column
    """
    # load csv in dataframe
    df = pd.read_csv(file_path)

    # required columns are lower case — normalize all column names
    df.columns = [col.lower() for col in df.columns]

    # DATABASE CONNECTION
    # :description: to access Watson IOT Platform Analytics DB.
    logger.debug('Connecting to Database')
    db = Database(credentials=credentials, entity_type=entity_type_name)

    # optional schema override; default schema is used when not supplied
    db_schema = kwargs.get('db_schema')

    # get the entity type to add data to
    entity_type_metadata = db.entity_type_metadata.get(entity_type_name)
    logger.debug(entity_type_metadata)
    if entity_type_metadata is None:
        # FIX: original adjacent f-strings had no separating spaces, producing
        # "found.Make sure ... csv.Refer to ..." in the error message
        raise RuntimeError(
            f'No entity type {entity_type_name} found. '
            f'Make sure you create entity type before loading data using csv. '
            f'Refer to create_custom_entitytype() to create the entity type first'
        )

    # find required columns
    timestamp_col_name = entity_type_metadata['metricTimestampColumn']
    logical_name = entity_type_metadata['name']
    table_name = db_table_name(entity_type_metadata['metricTableName'], db.db_type)
    deviceid_col = 'deviceid'

    required_cols = db.get_column_names(table=table_name, schema=db_schema)
    missing_cols = list(set(required_cols) - set(df.columns))
    logger.debug(f'missing_cols : {missing_cols}')

    # Add data for missing columns that are required.
    # Required columns that can't be NULL:
    # {'evt_timestamp', 'device_id', 'updated_utc', 'devicetype', 'rcv_timestamp_utc'}
    for m in missing_cols:
        if m == timestamp_col_name or m == 'rcv_timestamp_utc':
            # get possible timestamp columns and select the first one from all candidates
            df_timestamp = df.filter(like='_timestamp')
            if not df_timestamp.empty:
                timestamp_col = df_timestamp.columns[0]
                df[m] = pd.to_datetime(df_timestamp[timestamp_col])
                logger.debug(f'Inferred column {timestamp_col} as missing column {m}')
            else:
                # no timestamp candidate in csv — fall back to (roughly) current time
                df[m] = dt.datetime.utcnow() - dt.timedelta(seconds=15)
                logger.debug(f'Adding data: current time to missing column {m}')
        elif m == 'devicetype':
            df[m] = logical_name
            logger.debug(f'Adding data: {logical_name} to missing column {m}')
        elif m == 'updated_utc':
            logger.debug(f'Adding data: current time to missing column {m}')
            df[m] = dt.datetime.utcnow() - dt.timedelta(seconds=15)
        elif m == deviceid_col:
            raise RuntimeError(f'Missing required column {m}')
        else:
            df[m] = None

    # DATA CHECKS
    # 1. Check pd.DataFrame data types against entitytype/database data types
    # coerce data frame object data type to corresponding database-data_type
    # Add None for missing columns (Not added to the db)
    logger.debug(f'Dataframe columns before data check 1. {df.columns}')
    entity_type_columns = entity_type_metadata['dataItemDto']
    df = change_df_dtype_to_db_dtype(df, entity_type_columns)
    logger.debug(f'Dataframe columns after data check 1. {df.columns}')

    # 2. allowed device_id name: alpha-numeric + hypen + underscore + period + between [1,36] length
    # Drop rows with un-allowed device_id names.
    # FIX: na=False makes NaN deviceids fail the match and be dropped; without it
    # the NaN mask entries make boolean indexing raise instead of ignoring the row
    logger.debug(f'Dataframe has {len(df.index)} rows of data before data check 2')
    df = df[df[deviceid_col].str.contains(r'^[A-Za-z0-9._-]+$', na=False)]
    df = df[df[deviceid_col].str.len() <= 36]
    logger.warning('This function will ignore rows where deviceid has values that are not allowed')
    logger.warning(
        '(NOTE) Allowed characters in deviceid string are: alpha-numeric/hypen/underscore/period with '
        'length of 1 to 36 characters')
    logger.debug(f'Dataframe has {len(df.index)} rows of data after data check 2')

    # remove columns that are not required/ in entity type definition
    logger.debug(f'Updating columns: {required_cols}')
    df = df[required_cols]
    logger.debug(f'Top 5 elements of the df written to the db: \n{df.head(5)}')

    # write the dataframe to the database table
    db.write_frame(df=df, table_name=table_name)
    logger.debug(f'Generated {len(df.index)} rows of data and inserted into {table_name}')

    # CLOSE DB CONNECTION
    db.release_resource()
def load_metrics_data_from_csv(entity_type_name, file_path, credentials=None, **kwargs):
    """
    reads metrics data from csv and stores in entity type metrics table

    Note: make sure 'deviceid' and 'evt_timestamp' columns are present in csv
    'evt_timestamp' column will be inferred to be current time if None present

    :param entity_type_name: str name of entity we want to load data for
    :param file_path: str path to csv file
    :param credentials: dict analytics-service dev credentials
    :param **kwargs {
        db_schema str if no schema is provided will use the default schema
        if_exists str default:append
    }
    :return: None
    :raises Exception: if the entity type does not exist or the csv lacks
        the entity-id column
    """
    # load csv in dataframe
    df = pd.read_csv(file_path)

    # Map the lowering function to all column names
    # required columns are lower case
    df.columns = map(str.lower, df.columns)

    # DATABASE CONNECTION
    # :description: to access Watson IOT Platform Analytics DB.
    logger.debug('Connecting to Database')
    db = Database(credentials=credentials)

    # optional schema override; default schema is used when not supplied
    db_schema = kwargs.get('db_schema')

    # get the entity type to add data to
    try:
        entity_type = db.get_entity_type(entity_type_name)
    except Exception as exc:
        # FIX: was a bare `except:` that also caught KeyboardInterrupt/SystemExit
        # and discarded the root cause; narrow the catch and chain the original
        # error. Also added the missing spaces between the concatenated f-strings.
        raise Exception(
            f'No entity type {entity_type_name} found. '
            f'Make sure you create entity type before loading data using csv. '
            f'Refer to create_custom_entitytype() to create the entity type first'
        ) from exc

    # find required columns
    required_cols = db.get_column_names(table=entity_type.name, schema=db_schema)
    missing_cols = list(set(required_cols) - set(df.columns))
    logger.debug(f'missing_cols : {missing_cols}')

    # Add data for missing columns that are required.
    # required columns that can't be NULL: {'evt_timestamp', 'updated_utc', 'devicetype'}
    for m in missing_cols:
        if m == entity_type._timestamp:
            # get possible timestamp columns and select the first one from all candidates
            df_timestamp = df.filter(like='_timestamp')
            if not df_timestamp.empty:
                timestamp_col = df_timestamp.columns[0]
                df[m] = pd.to_datetime(df_timestamp[timestamp_col])
                logger.debug(f'Inferred column {timestamp_col} as missing column {m}')
            else:
                # no timestamp candidate in csv — fall back to (roughly) current time
                df[m] = dt.datetime.utcnow() - dt.timedelta(seconds=15)
                logger.debug(f'Adding data: current time to missing column {m}')
        elif m == 'devicetype':
            df[m] = entity_type.logical_name
            logger.debug(f'Adding data: {entity_type.logical_name} to missing column {m}')
        elif m == 'updated_utc':
            logger.debug(f'Adding data: current time to missing column {m}')
            df[m] = dt.datetime.utcnow() - dt.timedelta(seconds=15)
        elif m == entity_type._entity_id:
            raise Exception(f'Missing required column {m}')
        else:
            df[m] = None

    # remove columns that are not required
    df = df[required_cols]

    # write the dataframe to the database table
    db.write_frame(df=df, table_name=entity_type.name)
    logger.debug(f'Generated {len(df.index)} rows of data and inserted into {entity_type.name}')

    # CLOSE DB CONNECTION
    db.release_resource()