import ast
import datetime as dt
import getopt
import json
import logging
import sys

import ibm_db
import pandas as pd

from iotfunctions.db import Database
from iotfunctions.dbtables import DBModelStore
from iotfunctions import estimator
from iotfunctions import pipeline as pp
# Assumption: adjust this import to wherever GBMRegressor is defined in your project
from mmfunctions.anomaly import GBMRegressor

logger = logging.getLogger(__name__)

# `credentials` (analytics-service credentials dict) and `DB2ConnString`
# (native DB2 connection string) are expected at module level; see the
# load_credentials() sketch below.


def main(argv):
    global db, db_connection, entityType, featureC, targetC, predictC, metric, \
        startTime, endTime, startTimeV, endTimeV, helpString

    # parse the CLI options into the module-level globals (get_options is defined elsewhere)
    get_options(argv)

    # endTime == None means "now"
    if endTime is None:
        endTimeV = 0
    else:
        endTimeV = ast.literal_eval(endTime)
    # the start offset is relative to the end offset (both in days)
    startTimeV = ast.literal_eval(startTime) + endTimeV

    # db_schema = None
    db = Database(credentials=credentials)
    print(db)

    # establish a native connection to db2 to store the model
    db_connection = ibm_db.connect(DB2ConnString, '', '')
    print(db_connection)

    model_store = DBModelStore(credentials['tenantId'], entityType,
                               credentials['db2']['username'], db_connection, 'db2')
    db.model_store = model_store

    # with open('output.json', 'w+', encoding='utf-8') as G:
    #     json.dump(db.entity_type_metadata, G)

    logger.info('Connected to database - SQL alchemy and native')

    meta = None
    try:
        meta = db.get_entity_type(entityType)
        print('Entity is ', meta)
    except Exception as e:
        logger.error('Failed to retrieve information about entityType ' + str(entityType) +
                     ' from the database because of ' + str(e))

    # make sure the results of the python expression are saved to the derived metrics table
    if metric == '':
        # take the first suitable choice if there is no metric
        sourceTableName = ''
        for di in meta['dataItemDto']:
            sourceTableName = di['sourceTableName']
            if len(sourceTableName) > 0:
                break
        if len(sourceTableName) > 0:
            meta._data_items.append({'columnName': predictC, 'columnType': 'NUMBER', 'kpiFunctionId': 22856,
                                     'kpiFunctionDto': {'output': {'name': predictC}},
                                     'name': predictC, 'parentDataItemName': None,
                                     'sourceTableName': sourceTableName, 'tags': {},
                                     'transient': True, 'type': 'DERIVED_METRIC'})
        else:
            logger.error('No suitable derived metric table found')
            return
    else:
        found = False
        try:
            for di in meta['dataItemDto']:
                if di['name'] == metric:
                    found = True
                    predictC = di['columnName']
                    break
            if not found:
                logger.error('Metric does not exist')
                return
        except Exception:
            pass

    print('Feature ', featureC, ', target ', targetC)

    gbm = GBMRegressor(features=[featureC], targets=[targetC], predictions=[predictC],
                       max_depth=20, num_leaves=40, n_estimators=4000, learning_rate=0.001)
    # re-assert the hyperparameters on the instance (redundant with the constructor arguments)
    setattr(gbm, 'n_estimators', 4000)
    setattr(gbm, 'max_depth', 20)
    setattr(gbm, 'num_leaves', 40)
    setattr(gbm, 'learning_rate', 0.001)
    gbm.delete_existing_models = True
    logger.info('Created Regressor')

    jobsettings = {'db': db,
                   '_production_mode': False,
                   '_start_ts_override': (dt.datetime.utcnow() - dt.timedelta(days=startTimeV)),
                   '_end_ts_override': (dt.datetime.utcnow() - dt.timedelta(days=endTimeV)),
                   '_db_schema': credentials['db2']['username'],
                   'save_trace_to_file': True}

    if meta is not None:
        meta._functions = [gbm]
    else:
        logger.error('No valid entity')
        return

    logger.info('Instantiated training job')

    job = pp.JobController(meta, **jobsettings)
    job.execute()

    logger.info('Model trained')
    return
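# The trainers in this script assume `credentials` and `DB2ConnString` exist at
# module level. A minimal sketch of how they might be built, assuming the
# credentials JSON downloaded from the analytics service; the file name and the
# exact db2 key names are assumptions, not part of the original script:
def load_credentials(credentials_file='credentials_as.json'):
    global credentials, DB2ConnString
    with open(credentials_file, encoding='utf-8') as f:
        credentials = json.load(f)
    db2 = credentials['db2']
    # ibm_db expects a keyword=value connection string of this form
    DB2ConnString = ('DATABASE=' + db2['databaseName'] +
                     ';HOSTNAME=' + db2['host'] +
                     ';PORT=' + str(db2['port']) +
                     ';PROTOCOL=TCPIP' +
                     ';UID=' + db2['username'] +
                     ';PWD=' + db2['password'] +
                     ';SECURITY=SSL')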
# Alternative trainer variant using the built-in SimpleRegressor instead of GBMRegressor
# (note: if both variants are kept in one module, this definition shadows the one above)
def main(argv):
    # entityType = 'Clients04'
    entityType = ''
    featureC = 'pressure'
    targetC = 'temperature'
    predictC = 'predict'
    startTime = None
    endTime = None
    startTimeV = dt.datetime.utcnow()
    endTimeV = dt.datetime.utcnow()
    helpString = ('train.py -E <entityType> -f <feature column> -t <target column> '
                  '-p <prediction column> -s <starttime> -e <endtime>')

    try:
        opts, args = getopt.getopt(argv, "hf:t:p:s:e:E:",
                                   ["featureC=", "targetC=", "predictC=", "startTime=", "endTime=", "entityType="])
    except getopt.GetoptError:
        print(helpString)
        sys.exit(2)

    for opt, arg in opts:
        if opt == '-h':
            print(helpString)
            sys.exit()
        elif opt in ("-E", "--entityType"):
            entityType = arg
        elif opt in ("-f", "--featureC"):
            featureC = arg
        elif opt in ("-t", "--targetC"):
            targetC = arg
        elif opt in ("-p", "--predictC"):
            predictC = arg
        elif opt in ("-s", "--startTime"):
            startTime = arg
        elif opt in ("-e", "--endTime"):
            endTime = arg

    print('EntityType: ', entityType)
    print('Feature Column (X): ', featureC)
    print('Target Column (Y): ', targetC)
    print('Predictor Column: ', predictC)
    print('StartTime: ', startTime)
    print('EndTime: ', endTime)

    if entityType == '':
        print('entityType name is missing')
        print(helpString)
        sys.exit(3)

    # endTime == None means "now"
    if startTime is None:
        print('startTime is missing, please specify in days relative to now (3 means 3 days ago)')
        print(helpString)
        sys.exit(4)
    else:
        startTimeV = dt.datetime.utcnow() - dt.timedelta(days=int(startTime))

    # db_schema = None
    db = Database(credentials=credentials)
    print(db)

    meta = db.get_entity_type(entityType)
    logger.info('Connected to database')

    est = estimator.SimpleRegressor(features=[featureC], targets=[targetC], predictions=[predictC])
    est.delete_existing_models = True
    meta._functions = [est]
    logger.info('Created Regressor')

    # make sure the results of the python expression are saved to the derived metrics table
    # (the derived metric table name is hard-coded for the sample Clients04 entity)
    meta._data_items.append({'columnName': predictC, 'columnType': 'NUMBER', 'kpiFunctionId': 22856,
                             'kpiFunctionDto': {'output': {'name': predictC}},
                             'name': predictC, 'parentDataItemName': None,
                             'sourceTableName': 'DM_CLIENTS04', 'tags': {},
                             'transient': True, 'type': 'DERIVED_METRIC'})

    # note: the training window below is hard-coded to "10 days ago until 1 day ago";
    # the startTimeV computed from -s is not applied here
    jobsettings = {'_production_mode': False,
                   '_start_ts_override': dt.datetime.utcnow() - dt.timedelta(days=10),
                   '_end_ts_override': (dt.datetime.utcnow() - dt.timedelta(days=1)),  # .strftime('%Y-%m-%d %H:%M:%S'),
                   '_db_schema': 'BLUADMIN',
                   'save_trace_to_file': True}

    logger.info('Instantiated training job')

    job = pp.JobController(meta, **jobsettings)
    job.execute()

    logger.info('Model trained')
    return
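# Standard entry point so the trainer can be run from the command line, e.g.
# (entity and column names are placeholders):
#   python train.py -E Clients04 -f pressure -t temperature -p predict -s 3
if __name__ == '__main__':
    main(sys.argv[1:])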
def load_metrics_data_from_csv(entity_type_name, file_path, credentials=None, **kwargs):
    """
    Reads metrics data from a csv file and stores it in the entity type's metrics table

    Note: make sure 'deviceid' and 'evt_timestamp' columns are present in the csv.
    If no 'evt_timestamp' column is present, it is inferred from another
    *_timestamp column or set to the current time.

    :param entity_type_name: str name of entity we want to load data for
    :param file_path: str path to csv file
    :param credentials: dict analytics-service dev credentials
    :param **kwargs {
        db_schema str if no schema is provided will use the default schema
        if_exists str default: append
    }
    :return:
    """
    # load csv into a dataframe
    df = pd.read_csv(file_path)

    # lower-case all column names (required columns are lower case)
    df.columns = map(str.lower, df.columns)

    # DATABASE CONNECTION
    # :description: to access Watson IOT Platform Analytics DB.
    logger.debug('Connecting to Database')
    db = Database(credentials=credentials)

    # check if entity type table exists
    db_schema = None
    if 'db_schema' in kwargs:
        db_schema = kwargs['db_schema']

    # get the entity type to add data to
    try:
        entity_type = db.get_entity_type(entity_type_name)
    except Exception:
        raise Exception(
            f'No entity type {entity_type_name} found. '
            f'Make sure you create the entity type before loading data using csv. '
            f'Refer to create_custom_entitytype() to create the entity type first')

    # find required columns
    required_cols = db.get_column_names(table=entity_type.name, schema=db_schema)
    missing_cols = list(set(required_cols) - set(df.columns))
    logger.debug(f'missing_cols : {missing_cols}')

    # add data for missing columns that are required
    # required columns that can't be NULL: {'evt_timestamp', 'updated_utc', 'devicetype'}
    for m in missing_cols:
        if m == entity_type._timestamp:
            # get possible timestamp columns and select the first candidate
            df_timestamp = df.filter(like='_timestamp')
            if not df_timestamp.empty:
                df_timestamp_columns = df_timestamp.columns
                timestamp_col = df_timestamp_columns[0]
                df[m] = pd.to_datetime(df_timestamp[timestamp_col])
                logger.debug(f'Inferred column {timestamp_col} as missing column {m}')
            else:
                df[m] = dt.datetime.utcnow() - dt.timedelta(seconds=15)
                logger.debug(f'Adding data: current time to missing column {m}')
        elif m == 'devicetype':
            df[m] = entity_type.logical_name
            logger.debug(f'Adding data: {entity_type.logical_name} to missing column {m}')
        elif m == 'updated_utc':
            logger.debug(f'Adding data: current time to missing column {m}')
            df[m] = dt.datetime.utcnow() - dt.timedelta(seconds=15)
        elif m == entity_type._entity_id:
            raise Exception(f'Missing required column {m}')
        else:
            df[m] = None

    # remove columns that are not required
    df = df[required_cols]

    # write the dataframe to the database table
    db.write_frame(df=df, table_name=entity_type.name)
    logger.debug(f'Generated {len(df.index)} rows of data and inserted into {entity_type.name}')

    # close db connection
    db.release_resource()
    return
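# Example usage, a minimal sketch: the entity type name, csv path, and schema
# below are placeholders taken from the samples above, not fixed values:
#
#   load_metrics_data_from_csv('Clients04', './clients04_metrics.csv',
#                              credentials=credentials, db_schema='BLUADMIN')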