Ejemplo n.º 1
0
def load(configProperties):
    """Load the scoring dataset and engineer features for prediction.

    Reads the scoring dataset through ``DataSetReader`` using the IMS
    credentials in ``configProperties``.  When a ``timeframe`` (minutes)
    is configured, only rows from that trailing window are loaded.

    :param configProperties: dict with 'scoringDataSetId',
        'ML_FRAMEWORK_IMS_TENANT_ID' and the IMS client/user/service
        credentials; 'timeframe' (minutes, stringly-typed) is optional.
    :return: pandas.DataFrame of engineered features, indexed by date.
    """
    print("Scoring Data Load Start")

    #########################################
    # Load Data
    #########################################
    prodreader = DataSetReader(
        client_id=configProperties['ML_FRAMEWORK_IMS_USER_CLIENT_ID'],
        user_token=configProperties['ML_FRAMEWORK_IMS_TOKEN'],
        service_token=configProperties['ML_FRAMEWORK_IMS_ML_TOKEN'])

    timeframe = configProperties.get("timeframe")

    if timeframe is not None:
        # 'timeframe' is interpreted as a trailing window in minutes.
        date_before = datetime.utcnow().date()
        date_after = date_before - timedelta(minutes=int(timeframe))
        dataframe = prodreader.load(
            data_set_id=configProperties['scoringDataSetId'],
            ims_org=configProperties['ML_FRAMEWORK_IMS_TENANT_ID'],
            date_after=date_after,
            date_before=date_before)
    else:
        dataframe = prodreader.load(
            data_set_id=configProperties['scoringDataSetId'],
            ims_org=configProperties['ML_FRAMEWORK_IMS_TENANT_ID'])

    #########################################
    # Data Preparation/Feature Engineering
    #########################################
    if '_id' in dataframe.columns:
        # Strip the tenantId prefix (text up to the first '.') from every
        # column name, then drop platform bookkeeping columns.
        dataframe = dataframe.rename(
            columns=lambda x: str(x)[str(x).find('.') + 1:])
        dataframe.drop(['_id', 'eventType', 'timestamp'], axis=1, inplace=True)

    dataframe.date = pd.to_datetime(dataframe.date)
    # Series.dt.week was removed in pandas 2.0; use the ISO calendar week.
    dataframe['week'] = dataframe.date.dt.isocalendar().week
    dataframe['year'] = dataframe.date.dt.year

    # One-hot encode the store type and binarize the holiday flag.
    dataframe = pd.concat(
        [dataframe, pd.get_dummies(dataframe['storeType'])], axis=1)
    dataframe.drop('storeType', axis=1, inplace=True)
    dataframe['isHoliday'] = dataframe['isHoliday'].astype(int)

    # Shifting by +/-45 rows pairs each row with next/previous week's
    # sales -- presumably 45 stores per date; verify against the dataset.
    dataframe['weeklySalesAhead'] = dataframe.shift(-45)['weeklySales']
    dataframe['weeklySalesLag'] = dataframe.shift(45)['weeklySales']
    dataframe['weeklySalesDiff'] = (
        dataframe['weeklySales'] -
        dataframe['weeklySalesLag']) / dataframe['weeklySalesLag']
    # dropna's axis argument is keyword-only since pandas 2.0.
    dataframe.dropna(axis=0, inplace=True)

    dataframe = dataframe.set_index(dataframe.date)
    dataframe.drop('date', axis=1, inplace=True)

    print("Scoring Data Load Finish")

    return dataframe
    def split(self, configProperties=None):
        """Load the training dataset, engineer features, and split it into
        train/test partitions on fixed date boundaries.

        :param configProperties: dict with 'trainingDataSetId',
            'ML_FRAMEWORK_IMS_TENANT_ID' and IMS credentials; 'timeframe'
            (minutes) is optional.  Defaults to an empty dict.
        :return: ``(train, test)`` tuple of pandas.DataFrames indexed by
            date; train covers 2010-02-12..2012-01-27, test 2012-02-03 on.
        """
        # Avoid a mutable default argument shared between calls.
        if configProperties is None:
            configProperties = {}

        #########################################
        # Load Data
        #########################################
        prodreader = DataSetReader(
            client_id=configProperties['ML_FRAMEWORK_IMS_USER_CLIENT_ID'],
            user_token=configProperties['ML_FRAMEWORK_IMS_TOKEN'],
            service_token=configProperties['ML_FRAMEWORK_IMS_ML_TOKEN'])

        training_data_set_id = configProperties.get("trainingDataSetId")
        timeframe = configProperties.get("timeframe")

        if timeframe is not None:
            # 'timeframe' is interpreted as a trailing window in minutes.
            date_before = datetime.utcnow().date()
            date_after = date_before - timedelta(minutes=int(timeframe))
            df = prodreader.load(
                data_set_id=training_data_set_id,
                ims_org=configProperties['ML_FRAMEWORK_IMS_TENANT_ID'],
                date_after=date_after,
                date_before=date_before)
        else:
            df = prodreader.load(
                data_set_id=training_data_set_id,
                ims_org=configProperties['ML_FRAMEWORK_IMS_TENANT_ID'])

        #########################################
        # Data Preparation/Feature Engineering
        #########################################
        df.date = pd.to_datetime(df.date)
        # Series.dt.week was removed in pandas 2.0; use ISO calendar week.
        df['week'] = df.date.dt.isocalendar().week
        df['year'] = df.date.dt.year

        # One-hot encode the store type and binarize the holiday flag.
        df = pd.concat([df, pd.get_dummies(df['storeType'])], axis=1)
        df.drop('storeType', axis=1, inplace=True)
        df['isHoliday'] = df['isHoliday'].astype(int)

        # Shifting by +/-45 rows pairs each row with next/previous week's
        # sales -- presumably 45 stores per date; verify against the data.
        df['weeklySalesAhead'] = df.shift(-45)['weeklySales']
        df['weeklySalesLag'] = df.shift(45)['weeklySales']
        df['weeklySalesDiff'] = (df['weeklySales'] -
                                 df['weeklySalesLag']) / df['weeklySalesLag']
        # dropna's axis argument is keyword-only since pandas 2.0.
        df.dropna(axis=0, inplace=True)

        df = df.set_index(df.date)
        df.drop('date', axis=1, inplace=True)

        # Split on fixed date boundaries (date is the index).
        train_start = '2010-02-12'
        train_end = '2012-01-27'
        test_start = '2012-02-03'
        train = df[train_start:train_end]
        test = df[test_start:]

        return train, test
Ejemplo n.º 3
0
def load(configProperties):
    """Load the scoring dataset and engineer features for prediction.

    When 'timeframe' (minutes) is present in ``configProperties``, only
    rows from that trailing window are loaded.

    :param configProperties: dict with 'scoringDataSetId',
        'ML_FRAMEWORK_IMS_TENANT_ID' and IMS credentials; 'timeframe'
        is optional.
    :return: pandas.DataFrame of engineered features, indexed by date.
    """
    print("Scoring Data Load Start")

    #########################################
    # Load Data
    #########################################
    prodreader = DataSetReader(
        client_id=configProperties['ML_FRAMEWORK_IMS_USER_CLIENT_ID'],
        user_token=configProperties['ML_FRAMEWORK_IMS_TOKEN'],
        service_token=configProperties['ML_FRAMEWORK_IMS_ML_TOKEN'])

    scoring_data_set_id = configProperties.get("scoringDataSetId")
    timeframe = configProperties.get("timeframe")

    if timeframe is not None:
        # 'timeframe' is interpreted as a trailing window in minutes.
        date_before = datetime.utcnow().date()
        date_after = date_before - timedelta(minutes=int(timeframe))
        df = prodreader.load(
            data_set_id=scoring_data_set_id,
            ims_org=configProperties['ML_FRAMEWORK_IMS_TENANT_ID'],
            date_after=date_after,
            date_before=date_before)
    else:
        df = prodreader.load(
            data_set_id=scoring_data_set_id,
            ims_org=configProperties['ML_FRAMEWORK_IMS_TENANT_ID'])

    #########################################
    # Data Preparation/Feature Engineering
    #########################################
    df.date = pd.to_datetime(df.date)
    # Series.dt.week was removed in pandas 2.0; use the ISO calendar week.
    df['week'] = df.date.dt.isocalendar().week
    df['year'] = df.date.dt.year

    # One-hot encode the store type and binarize the holiday flag.
    df = pd.concat([df, pd.get_dummies(df['storeType'])], axis=1)
    df.drop('storeType', axis=1, inplace=True)
    df['isHoliday'] = df['isHoliday'].astype(int)

    # Shifting by +/-45 rows pairs each row with next/previous week's
    # sales -- presumably 45 stores per date; verify against the dataset.
    df['weeklySalesAhead'] = df.shift(-45)['weeklySales']
    df['weeklySalesLag'] = df.shift(45)['weeklySales']
    df['weeklySalesDiff'] = (df['weeklySales'] -
                             df['weeklySalesLag']) / df['weeklySalesLag']
    # dropna's axis argument is keyword-only since pandas 2.0.
    df.dropna(axis=0, inplace=True)

    df = df.set_index(df.date)
    df.drop('date', axis=1, inplace=True)

    print("Scoring Data Load Finish")

    return df
Ejemplo n.º 4
0
def load(configProperties):
    """Load the full scoring dataset and engineer features for prediction.

    :param configProperties: dict with 'scoringDataSetId',
        'ML_FRAMEWORK_IMS_TENANT_ID' and IMS credentials.
    :return: pandas.DataFrame of engineered features, indexed by date.
    """
    print("Scoring Data Load Start")

    #########################################
    # Load Data
    #########################################
    prodreader = DataSetReader(
        client_id=configProperties['ML_FRAMEWORK_IMS_USER_CLIENT_ID'],
        user_token=configProperties['ML_FRAMEWORK_IMS_TOKEN'],
        service_token=configProperties['ML_FRAMEWORK_IMS_ML_TOKEN'])

    df = prodreader.load(
        data_set_id=configProperties['scoringDataSetId'],
        ims_org=configProperties['ML_FRAMEWORK_IMS_TENANT_ID'])

    #########################################
    # Data Preparation/Feature Engineering
    #########################################
    df.date = pd.to_datetime(df.date)
    # Series.dt.week was removed in pandas 2.0; use the ISO calendar week.
    df['week'] = df.date.dt.isocalendar().week
    df['year'] = df.date.dt.year

    # One-hot encode the store type and binarize the holiday flag.
    df = pd.concat([df, pd.get_dummies(df['storeType'])], axis=1)
    df.drop('storeType', axis=1, inplace=True)
    df['isHoliday'] = df['isHoliday'].astype(int)

    # Shifting by +/-45 rows pairs each row with next/previous week's
    # sales -- presumably 45 stores per date; verify against the dataset.
    df['weeklySalesAhead'] = df.shift(-45)['weeklySales']
    df['weeklySalesLag'] = df.shift(45)['weeklySales']
    df['weeklySalesDiff'] = (df['weeklySales'] -
                             df['weeklySalesLag']) / df['weeklySalesLag']
    # dropna's axis argument is keyword-only since pandas 2.0.
    df.dropna(axis=0, inplace=True)

    df = df.set_index(df.date)
    df.drop('date', axis=1, inplace=True)

    print("Scoring Data Load Finish")

    return df
    def train(self, config=None):
        """Train a BoostedTreesRegressor on the training dataset.

        Loads the training data, engineers the same features used at
        scoring time, fits the estimator on the pre-2012 partition,
        evaluates on the 2012 hold-out, and pickles MAPE/MAE/RMSE next
        to the saved model.

        :param config: dict of platform settings and hyperparameters
            ('trainingDataSetId', 'modelPATH', 'learning_rate',
            'n_estimators', 'max_depth', IMS credentials).  Defaults to
            an empty dict.
        """
        # Avoid a mutable default argument shared between calls.
        if config is None:
            config = {}

        #########################################
        # Set Up
        #########################################
        tf.logging.set_verbosity(tf.logging.ERROR)
        # Force single-node execution by clearing TF_CONFIG.  The previous
        # code parsed os.environ['TF_CONFIG'] only to discard the result,
        # raising KeyError whenever the variable was unset.
        os.environ['TF_CONFIG'] = json.dumps({})

        #########################################
        # Load Data
        #########################################
        prodreader = DataSetReader(
            client_id=config['ML_FRAMEWORK_IMS_USER_CLIENT_ID'],
            user_token=config['ML_FRAMEWORK_IMS_TOKEN'],
            service_token=config['ML_FRAMEWORK_IMS_ML_TOKEN'])

        dataframe = prodreader.load(data_set_id=config['trainingDataSetId'],
                                    ims_org=config['ML_FRAMEWORK_IMS_ORG_ID'])

        #########################################
        # Data Preparation/Feature Engineering
        #########################################
        if '_id' in dataframe.columns:
            # Strip the tenantId prefix (text up to the first '.') and
            # drop platform bookkeeping columns.
            dataframe = dataframe.rename(
                columns=lambda x: str(x)[str(x).find('.') + 1:])
            dataframe.drop(['_id', 'eventType', 'timestamp'],
                           axis=1,
                           inplace=True)

        dataframe.date = pd.to_datetime(dataframe.date)
        # Series.dt.week was removed in pandas 2.0; use ISO calendar week.
        dataframe['week'] = dataframe.date.dt.isocalendar().week
        dataframe['year'] = dataframe.date.dt.year

        # Deterministic row order so the +/-45 shifts below line up.
        dataframe = dataframe.sort_values(by=['date', 'store'])

        # One-hot encode the store type and binarize the holiday flag.
        dataframe = pd.concat(
            [dataframe, pd.get_dummies(dataframe['storeType'])], axis=1)
        dataframe.drop('storeType', axis=1, inplace=True)
        dataframe['isHoliday'] = dataframe['isHoliday'].astype(int)

        # Shifting by +/-45 rows pairs each row with next/previous week's
        # sales -- presumably 45 stores per date; verify against the data.
        dataframe['weeklySalesAhead'] = dataframe.shift(-45)['weeklySales']
        dataframe['weeklySalesLag'] = dataframe.shift(45)['weeklySales']
        dataframe['weeklySalesDiff'] = (
            dataframe['weeklySales'] -
            dataframe['weeklySalesLag']) / dataframe['weeklySalesLag']
        # dropna's axis argument is keyword-only since pandas 2.0.
        dataframe.dropna(axis=0, inplace=True)

        dataframe = dataframe.set_index(dataframe.date)
        dataframe.drop('date', axis=1, inplace=True)

        #########################################
        # Train / Validation Split
        #########################################
        train_start = '2010-02-12'
        train_end = '2012-01-27'
        val_start = '2012-02-03'
        train = dataframe[train_start:train_end]
        val = dataframe[val_start:]

        X_train = train.drop('weeklySalesAhead', axis=1)
        y_train = train['weeklySalesAhead'].values

        X_val = val.drop('weeklySalesAhead', axis=1)
        y_val = val['weeklySalesAhead'].values

        # Every remaining column is treated as a numeric feature.
        features = []
        for feature in X_train.columns:
            features.append(fc.numeric_column(feature, dtype=tf.float32))

        def gen_input_fn(features,
                         labels,
                         epochs=10,
                         shuffle=True,
                         batch_size=32):
            # Build a tf.data input_fn closure over the given frame/labels.
            def input_function():
                dataset = tf.data.Dataset.from_tensor_slices(
                    (dict(features), labels))
                if shuffle:
                    dataset = dataset.shuffle(1000)
                dataset = dataset.batch(batch_size).repeat(epochs)
                return dataset

            return input_function

        train_input_fn = gen_input_fn(X_train, y_train)
        eval_input_fn = gen_input_fn(X_val, y_val, shuffle=False, epochs=1)

        #########################################
        # BoostedTreesRegressor Model
        #########################################
        learning_rate = float(config['learning_rate'])
        n_estimators = int(config['n_estimators'])
        max_depth = int(config['max_depth'])

        filename = config['modelPATH'] + '/my_model'
        os.makedirs(os.path.dirname(filename), exist_ok=True)

        model = tf.estimator.BoostedTreesRegressor(features,
                                                   model_dir=filename,
                                                   n_batches_per_layer=5,
                                                   n_trees=n_estimators,
                                                   max_depth=max_depth,
                                                   learning_rate=learning_rate)

        model.train(train_input_fn, max_steps=n_estimators)

        #########################################
        # Process Metrics
        #########################################
        pred_dict = list(model.predict(eval_input_fn))
        y_pred = pd.Series([pred['predictions'][0] for pred in pred_dict])

        mape = np.mean(np.abs((y_val - y_pred) / y_val))
        mae = np.mean(np.abs(y_val - y_pred))
        rmse = np.sqrt(np.mean((y_val - y_pred)**2))

        metrics_dict = {
            'MAPE': round(mape, 3),
            'MAE': round(mae, 3),
            'RMSE': round(rmse, 3),
        }

        # Use a context manager so the pickle file handle is closed
        # (the previous open(...) inside pickle.dump leaked it).
        with open(os.path.join(config['modelPATH'], 'metrics_dict.pkl'),
                  'wb') as metrics_file:
            pickle.dump(metrics_dict, metrics_file)
    def score(self, config=None):
        """Score the trained model on the scoring dataset and write the
        per-store predictions back to the platform.

        Rebuilds the same engineered features used in training, restores
        the BoostedTreesRegressor from ``config['modelPATH']``, predicts
        ``weeklySalesAhead``, and writes tenant-prefixed JSON results via
        DataSetWriter.

        :param config: dict of platform settings ('scoringDataSetId',
            'scoringResultsDataSetId', 'modelPATH', 'tenant_id', IMS
            credentials).  Defaults to an empty dict.
        """
        # Avoid a mutable default argument shared between calls.
        if config is None:
            config = {}

        tf.logging.set_verbosity(tf.logging.ERROR)
        # Force single-node execution by clearing TF_CONFIG.  The previous
        # code parsed os.environ['TF_CONFIG'] only to discard the result,
        # raising KeyError whenever the variable was unset.
        os.environ['TF_CONFIG'] = json.dumps({})

        #########################################
        # Load Data
        #########################################
        prodreader = DataSetReader(
            client_id=config['ML_FRAMEWORK_IMS_USER_CLIENT_ID'],
            user_token=config['ML_FRAMEWORK_IMS_TOKEN'],
            service_token=config['ML_FRAMEWORK_IMS_ML_TOKEN'])

        dataframe = prodreader.load(data_set_id=config['scoringDataSetId'],
                                    ims_org=config['ML_FRAMEWORK_IMS_ORG_ID'])

        #########################################
        # Data Preparation/Feature Engineering
        #########################################
        if '_id' in dataframe.columns:
            # Strip the tenantId prefix (text up to the first '.') and
            # drop platform bookkeeping columns.
            dataframe = dataframe.rename(
                columns=lambda x: str(x)[str(x).find('.') + 1:])
            dataframe.drop(['_id', 'eventType', 'timestamp'],
                           axis=1,
                           inplace=True)

        dataframe.date = pd.to_datetime(dataframe.date)
        # Series.dt.week was removed in pandas 2.0; use ISO calendar week.
        dataframe['week'] = dataframe.date.dt.isocalendar().week
        dataframe['year'] = dataframe.date.dt.year

        # Deterministic row order so the +/-45 shifts below line up.
        dataframe = dataframe.sort_values(by=['date', 'store'])

        # One-hot encode the store type and binarize the holiday flag.
        dataframe = pd.concat(
            [dataframe, pd.get_dummies(dataframe['storeType'])], axis=1)
        dataframe.drop('storeType', axis=1, inplace=True)
        dataframe['isHoliday'] = dataframe['isHoliday'].astype(int)

        # Shifting by +/-45 rows pairs each row with next/previous week's
        # sales -- presumably 45 stores per date; verify against the data.
        dataframe['weeklySalesAhead'] = dataframe.shift(-45)['weeklySales']
        dataframe['weeklySalesLag'] = dataframe.shift(45)['weeklySales']
        dataframe['weeklySalesDiff'] = (
            dataframe['weeklySales'] -
            dataframe['weeklySalesLag']) / dataframe['weeklySalesLag']
        # dropna's axis argument is keyword-only since pandas 2.0.
        dataframe.dropna(axis=0, inplace=True)

        dataframe = dataframe.set_index(dataframe.date)
        dataframe.drop('date', axis=1, inplace=True)

        #########################################
        # Feature / Label Split
        #########################################
        X_test = dataframe.drop('weeklySalesAhead', axis=1)
        y_test = dataframe['weeklySalesAhead'].values

        # Every remaining column is treated as a numeric feature.
        features = []
        for feature in X_test.columns:
            features.append(fc.numeric_column(feature, dtype=tf.float32))

        def gen_input_fn(features,
                         labels,
                         epochs=10,
                         shuffle=True,
                         batch_size=32):
            # Build a tf.data input_fn closure over the given frame/labels.
            def input_function():
                dataset = tf.data.Dataset.from_tensor_slices(
                    (dict(features), labels))
                if shuffle:
                    dataset = dataset.shuffle(1000)
                dataset = dataset.batch(batch_size).repeat(epochs)
                return dataset

            return input_function

        test_input_fn = gen_input_fn(X_test, y_test, shuffle=False, epochs=1)

        #########################################
        # BoostedTreesRegressor Model
        #########################################
        # Restores the trained estimator from the checkpoint directory.
        model = tf.estimator.BoostedTreesRegressor(
            features, n_batches_per_layer=5, model_dir=config['modelPATH'])

        #########################################
        # Write Results
        #########################################
        pred_dict = list(model.predict(test_input_fn))
        y_pred = pd.Series([pred['predictions'][0] for pred in pred_dict])

        X_test['prediction'] = y_pred.values
        output = X_test[['store', 'prediction']].reset_index()
        output['date'] = output['date'].astype(str)

        writer = DataSetWriter(
            client_id=config['ML_FRAMEWORK_IMS_USER_CLIENT_ID'],
            user_token=config['ML_FRAMEWORK_IMS_TOKEN'],
            service_token=config['ML_FRAMEWORK_IMS_ML_TOKEN'])

        print('Writer Configured')

        # The platform expects tenant-prefixed column names plus the XDM
        # bookkeeping columns (_id, timestamp, eventType).
        tenant_id = config['tenant_id']
        output = output.add_prefix(tenant_id + '.')
        output = output.join(
            pd.DataFrame(
                {
                    '_id': '',
                    'timestamp': '2019-01-01T00:00:00',
                    'eventType': ''
                },
                index=output.index))

        writer.write(data_set_id=config['scoringResultsDataSetId'],
                     dataframe=output,
                     ims_org=config['ML_FRAMEWORK_IMS_ORG_ID'],
                     file_format='json')

        print('Write Done')