Example #1
deviceid    start_date                  end_date                    activity
73003	    2018-11-18-23.09.20.325289	2018-11-19-00.06.38.195689	PM
73003	    2018-11-21-23.09.20.325289	2018-11-21-23.10.42.203689	UM

'''

activity_data = {
    'deviceid': '100',
    'start_date': dt.datetime.utcnow() - dt.timedelta(days=1),
    'end_date': dt.datetime.utcnow(),
    'activity': 'maintenance'
}

activity_df = pd.DataFrame(data=activity_data, index=[0])
# write activity data to a table
db.write_frame(df=activity_df, table_name=activity_name, if_exists='replace')

entity.exec_local_pipeline(start_ts=start_date)
'''

Here is an example of the outputs:

id   evt_timestamp    deviceid  duration     maintenance_duration  shift_day       shift_end_date   shift_id  shift_start_date
100  2019/25/8 19:29  100       90.13333333  90.13333333           2019/25/8 0:00  2019/25/8 21:00  2         2019/25/8 14:00
100  2019/25/8 21:00  100       510          510                   2019/25/8 0:00  2019/26/8 5:30   3         2019/25/8 21:00
100  2019/26/8 5:30   100       510          510                   2019/26/8 0:00  2019/26/8 14:00  1         2019/26/8 5:30
100  2019/26/8 14:00  100       329.8666667  329.8666667           2019/26/8 0:00  2019/26/8 21:00  2         2019/26/8 14:00

As in the example in the intro, the single maintenance task has been apportioned across 4 shifts.

Durations may be apportioned by changes in the values of other dimensions too.
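
To make the apportioning concrete, here is a minimal sketch (not part of the library) that splits a single activity interval at shift boundaries and reports the minutes per slice. The shift start times 05:30, 14:00 and 21:00 are read off the shift_start_date values in the output table above; all function and variable names are illustrative.

import datetime as dt
import pandas as pd

# Assumed shift start times (shift 1: 05:30, shift 2: 14:00, shift 3: 21:00),
# taken from the output table above; this helper is a sketch, not library code.
SHIFT_STARTS = [dt.time(5, 30), dt.time(14, 0), dt.time(21, 0)]

def shift_boundaries(start, end):
    """Yield every shift-start datetime strictly inside (start, end)."""
    day = start.date()
    while day <= end.date():
        for t in SHIFT_STARTS:
            boundary = dt.datetime.combine(day, t)
            if start < boundary < end:
                yield boundary
        day += dt.timedelta(days=1)

def apportion(deviceid, start, end):
    """Split [start, end) at shift boundaries; one row of minutes per slice."""
    points = [start, *shift_boundaries(start, end), end]
    rows = [{'deviceid': deviceid,
             'evt_timestamp': s,
             'duration': (e - s).total_seconds() / 60.0}
            for s, e in zip(points[:-1], points[1:])]
    return pd.DataFrame(rows)

print(apportion('100',
                dt.datetime(2019, 8, 25, 19, 29),
                dt.datetime(2019, 8, 26, 19, 29)))

Run on a one-day interval starting at 19:29, this yields four rows of roughly 90, 510, 510 and 330 minutes, matching the durations in the table above.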
Example #2
def load_metrics_data_from_csv(entity_type_name,
                               file_path,
                               credentials=None,
                               **kwargs):
    """
    reads metrics data from csv and stores in entity type metrics table
    Note: make sure 'deviceid' and 'evt_timestamp' columns are present in csv
    if no timestamp column is present, 'evt_timestamp' defaults to the current time

    :param entity_type_name: str name of entity we want to load data for
    :param file_path: str path to csv file
    :param credentials: dict analytics-service dev credentials
    :param **kwargs {
        db_schema str if no schema is provided will use the default schema
        if_exists str default:append
    }
    :return:
    """
    # load csv in dataframe
    df = pd.read_csv(file_path)

    # Map the lowering function to all column names
    # required columns are lower case
    df.columns = map(str.lower, df.columns)

    # DATABASE CONNECTION
    # :description: to access Watson IOT Platform Analytics DB.
    logger.debug('Connecting to Database')
    db = Database(credentials=credentials, entity_type=entity_type_name)
    # check if entity type table exists
    db_schema = None
    if 'db_schema' in kwargs:
        db_schema = kwargs['db_schema']
    #get the entity type to add data to
    entity_type_metadata = db.entity_type_metadata.get(entity_type_name)
    logger.debug(entity_type_metadata)
    if entity_type_metadata is None:
        raise RuntimeError(
            f'No entity type {entity_type_name} found. '
            f'Make sure you create the entity type before loading data from csv. '
            f'Refer to create_custom_entitytype() to create the entity type first.'
        )

    # find required columns
    timestamp_col_name = entity_type_metadata['metricTimestampColumn']
    logical_name = entity_type_metadata['name']
    table_name = db_table_name(entity_type_metadata['metricTableName'],
                               db.db_type)
    deviceid_col = 'deviceid'

    required_cols = db.get_column_names(table=table_name, schema=db_schema)
    missing_cols = list(set(required_cols) - set(df.columns))
    logger.debug(f'missing_cols : {missing_cols}')
    # Add data for missing columns that are required
    # required columns that can't be NULL: {'evt_timestamp', 'deviceid', 'updated_utc', 'devicetype', 'rcv_timestamp_utc'}
    for m in missing_cols:
        if m == timestamp_col_name or m == 'rcv_timestamp_utc':
            # get possible timestamp columns and select the first one from all candidates
            df_timestamp = df.filter(like='_timestamp')
            if not df_timestamp.empty:
                df_timestamp_columns = df_timestamp.columns
                timestamp_col = df_timestamp_columns[0]
                df[m] = pd.to_datetime(df_timestamp[timestamp_col])
                logger.debug(
                    f'Inferred column {timestamp_col} as missing column {m}')
            else:
                df[m] = dt.datetime.utcnow() - dt.timedelta(seconds=15)
                logger.debug(
                    f'Adding data: current time to missing column {m}')
        elif m == 'devicetype':
            df[m] = logical_name
            logger.debug(f'Adding data: {logical_name} to missing column {m}')
        elif m == 'updated_utc':
            logger.debug(f'Adding data: current time to missing column {m}')
            df[m] = dt.datetime.utcnow() - dt.timedelta(seconds=15)
        elif m == deviceid_col:
            raise RuntimeError(f'Missing required column {m}')
        else:
            df[m] = None

    # DATA CHECKS
    # 1. Check pd.DataFrame data types against entitytype/database data types
    # coerce data frame object data type to corresponding database-data_type
    # Add None for missing columns (Not added to the db)
    logger.debug(f'Dataframe columns before data check 1. {df.columns}')
    entity_type_columns = entity_type_metadata['dataItemDto']
    df = change_df_dtype_to_db_dtype(df, entity_type_columns)
    logger.debug(f'Dataframe columns after data check 1. {df.columns}')

    # 2. allowed deviceid names: alpha-numeric + hyphen + underscore + period, length between 1 and 36
    # Drop rows with disallowed deviceid values
    logger.debug(
        f'Dataframe has {len(df.index)} rows of data before data check 2')
    df = df[df[deviceid_col].str.contains(r'^[A-Za-z0-9._-]+$')]
    df = df[df[deviceid_col].str.len() <= 36]
    logger.warning(
        f'This function will ignore rows where deviceid has values that are not allowed'
    )
    logger.warning(
        f'(NOTE) Allowed characters in deviceid string are: alpha-numeric/hyphen/underscore/period with '
        f'length of 1 to 36 characters')
    logger.debug(
        f'Dataframe has {len(df.index)} rows of data after data check 2')

    # remove columns that are not required/ in entity type definition
    logger.debug(f'Updating columns: {required_cols}')
    df = df[required_cols]
    logger.debug(f'Top 5 elements of the df written to the db: \n{df.head(5)}')
    # write the dataframe to the database table
    db.write_frame(df=df, table_name=table_name)
    logger.debug(
        f'Generated {len(df.index)} rows of data and inserted into {table_name}'
    )

    # CLOSE DB CONNECTION
    db.release_resource()
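
A minimal calling sketch, assuming the entity type already exists on the service; the credentials file, entity type name, schema and csv path below are placeholders rather than values taken from this example:

import json

# Hypothetical usage of load_metrics_data_from_csv(); 'credentials.json',
# 'Equipment', 'metrics.csv' and 'BLUADMIN' are placeholder names.
with open('credentials.json') as f:   # analytics-service dev credentials
    credentials = json.load(f)

load_metrics_data_from_csv(entity_type_name='Equipment',
                           file_path='metrics.csv',
                           credentials=credentials,
                           db_schema='BLUADMIN')

The variant below does the same job but resolves the entity type with db.get_entity_type() instead of reading the entity type metadata cache.
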
def load_metrics_data_from_csv(entity_type_name,
                               file_path,
                               credentials=None,
                               **kwargs):
    """
    reads metrics data from csv and stores in entity type metrics table
    Note: make sure 'deviceid' and 'evt_timestamp' columns are present in csv
    if no timestamp column is present, 'evt_timestamp' defaults to the current time

    :param entity_type_name: str name of entity we want to load data for
    :param file_path: str path to csv file
    :param credentials: dict analytics-service dev credentials
    :param **kwargs {
        db_schema str if no schema is provided will use the default schema
        if_exists str default:append
    }
    :return:
    """
    # load csv in dataframe
    df = pd.read_csv(file_path)

    # Map the lowering function to all column names
    # required columns are lower case
    df.columns = map(str.lower, df.columns)

    # DATABASE CONNECTION
    # :description: to access Watson IOT Platform Analytics DB.
    logger.debug('Connecting to Database')
    db = Database(credentials=credentials)
    # check if entity type table exists
    db_schema = None
    if 'db_schema' in kwargs:
        db_schema = kwargs['db_schema']
    #get the entity type to add data to
    try:
        entity_type = db.get_entity_type(entity_type_name)
    except Exception as exc:
        raise Exception(
            f'No entity type {entity_type_name} found. '
            f'Make sure you create the entity type before loading data from csv. '
            f'Refer to create_custom_entitytype() to create the entity type first.'
        ) from exc

    # find required columns
    required_cols = db.get_column_names(table=entity_type.name,
                                        schema=db_schema)
    missing_cols = list(set(required_cols) - set(df.columns))
    logger.debug(f'missing_cols : {missing_cols}')
    # Add data for missing columns that are required
    # required columns that can't be NULL: {'evt_timestamp', 'updated_utc', 'devicetype'}
    for m in missing_cols:
        if m == entity_type._timestamp:
            # get possible timestamp columns and select the first one from all candidates
            df_timestamp = df.filter(like='_timestamp')
            if not df_timestamp.empty:
                df_timestamp_columns = df_timestamp.columns
                timestamp_col = df_timestamp_columns[0]
                df[m] = pd.to_datetime(df_timestamp[timestamp_col])
                logger.debug(
                    f'Inferred column {timestamp_col} as missing column {m}')
            else:
                df[m] = dt.datetime.utcnow() - dt.timedelta(seconds=15)
                logger.debug(
                    f'Adding data: current time to missing column {m}')
        elif m == 'devicetype':
            df[m] = entity_type.logical_name
            logger.debug(
                f'Adding data: {entity_type.logical_name} to missing column {m}'
            )
        elif m == 'updated_utc':
            logger.debug(f'Adding data: current time to missing column {m}')
            df[m] = dt.datetime.utcnow() - dt.timedelta(seconds=15)
        elif m == entity_type._entity_id:
            raise Exception(f'Missing required column {m}')
        else:
            df[m] = None

    # remove columns that are not required
    df = df[required_cols]
    # write the dataframe to the database table
    db.write_frame(df=df, table_name=entity_type.name)
    logger.debug(
        f'Generated {len(df.index)} rows of data and inserted into {entity_type.name}'
    )

    # CLOSE DB CONNECTION
    db.release_resource()

    return
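
Both variants expect the csv to carry at least a 'deviceid' column; if no '*_timestamp' column is present, the event timestamp is filled with roughly the current UTC time. A tiny illustrative input file, with made-up rows and a hypothetical 'temperature' metric column standing in for whatever metrics the entity type defines:

import pandas as pd

# Made-up rows; only 'deviceid' and 'evt_timestamp' are required column names,
# 'temperature' is a placeholder for the entity type's own metric columns.
pd.DataFrame({
    'deviceid': ['73003', '73004'],
    'evt_timestamp': ['2018-11-18 23:09:20', '2018-11-18 23:10:42'],
    'temperature': [21.4, 22.1],
}).to_csv('metrics.csv', index=False)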