Example 1
    def upload_predictions(self):
        logger.info('Storing predicted data')
        client_secrets_path = os.getenv('GCLOUD_CREDENTIALS_SERVICE_ACCOUNT_JSON_KEY_PATH')
        database = os.getenv('BIGQUERY_PROJECT_ID')
        _, db_connection = create_connection(
            f'bigquery://{database}',
            engine_kwargs={'credentials_path': client_secrets_path}
        )

        credentials = service_account.Credentials.from_service_account_file(
            client_secrets_path,
        )

        self.predictions.to_gbq(
            destination_table=f'{os.getenv("BIGQUERY_DATASET")}.churn_predictions_log',
            project_id=database,
            credentials=credentials,
            if_exists='append'
        )

        self.prediction_job_log = self.predictions[
            ['outcome_date', 'model_version', 'created_at']].head(1)
        self.prediction_job_log['rows_predicted'] = len(self.predictions)
        self.prediction_job_log.rename({'outcome_date': 'date'}, axis=1, inplace=True)

        self.prediction_job_log.to_gbq(
            destination_table=f'{os.getenv("BIGQUERY_DATASET")}.prediction_job_log',
            project_id=database,
            credentials=credentials,
            if_exists='append',
        )
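
Note: create_connection is a project helper that is not included in these snippets. Every example discards its first return value and uses the second as a SQLAlchemy engine (it is what gets passed to sessionmaker and get_sqla_table below). A minimal sketch of a compatible helper, assuming it simply wraps sqlalchemy.create_engine; the real implementation may differ.

from typing import Any, Dict, Optional, Tuple

import sqlalchemy
from sqlalchemy.engine import Engine


def create_connection(connection_string: str,
                      engine_kwargs: Optional[Dict[str, Any]] = None
                      ) -> Tuple[Any, Engine]:
    # Hypothetical stand-in: build the engine with any dialect kwargs
    # (e.g. credentials_path for the bigquery:// dialect) and return a
    # (connection, engine) pair so callers can unpack `_, engine`.
    engine = sqlalchemy.create_engine(connection_string, **(engine_kwargs or {}))
    return engine.connect(), engine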
Example 2
def get_sqlalchemy_tables_w_session(db_connection_string_name: str, schema: str, table_names: List[str]) -> Dict:
    table_mapping = {}
    _, db_connection = create_connection(os.getenv(db_connection_string_name))

    for table in table_names:
        table_mapping[table] = get_sqla_table(table_name=table, engine=db_connection, schema=os.getenv(schema))

    table_mapping['session'] = sessionmaker(bind=db_connection)()

    return table_mapping
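
A hypothetical call for the snippet above; the environment-variable names and table names are placeholders, not values taken from the source.

# Both string arguments name environment variables: one holds the full
# connection string, the other the schema name. Names are illustrative only.
tables = get_sqlalchemy_tables_w_session(
    db_connection_string_name='POSTGRES_CONNECTION_STRING',
    schema='POSTGRES_SCHEMA',
    table_names=['payments', 'subscriptions'],
)
payments_table = tables['payments']
session = tables['session']  # ORM session bound to the same engine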
Example 3
    def handle_table(schema: List[bigquery.SchemaField], table_name: str):
        client_secrets_path = os.getenv(
            'GCLOUD_CREDENTIALS_SERVICE_ACCOUNT_JSON_KEY_PATH')
        database = os.getenv('BIGQUERY_PROJECT_ID')
        _, db_connection = create_connection(
            f'bigquery://{database}',
            engine_kwargs={'credentials_path': client_secrets_path})

        credentials = service_account.Credentials.from_service_account_file(
            client_secrets_path)

        handler = TableHandler(database.replace('bigquery://', ''),
                               credentials, table_name, schema)

        handler.create_table(logger)
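
handle_table takes a list of bigquery.SchemaField objects. An illustrative invocation with hypothetical field names and types; the real schemas are defined elsewhere in the project.

from google.cloud import bigquery

schema = [
    bigquery.SchemaField('user_id', 'STRING', mode='REQUIRED'),
    bigquery.SchemaField('outcome_date', 'DATE'),
    bigquery.SchemaField('outcome', 'STRING'),
]
handle_table(schema, 'churn_predictions_log')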
Example 4
def get_sqlalchemy_tables_w_session(table_names: List[str]) -> Tuple[Dict, Engine]:
    table_mapping = {}
    database = os.getenv('BIGQUERY_PROJECT_ID')
    _, db_connection = create_connection(
        f'bigquery://{database}', {
            'credentials_path':
            os.getenv('GCLOUD_CREDENTIALS_SERVICE_ACCOUNT_JSON_KEY_PATH')
        })
    schema = os.getenv('BIGQUERY_DATASET')
    for table in table_names:
        table_mapping[table] = get_sqla_table(
            table_name=f'{database}.{schema}.{table}',
            engine=db_connection,
        )

    table_mapping['session'] = sessionmaker(bind=db_connection)()

    return table_mapping, db_connection
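
Unlike Example 2, this variant also returns the engine, so callers can issue raw queries as well as work with the mapped tables. A hypothetical call; the table name is borrowed from Example 6.

tables, engine = get_sqlalchemy_tables_w_session(['rolling_daily_user_profile'])
rolling_daily_user_profile = tables['rolling_daily_user_profile']
session = tables['session']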
Example 5
    def upload_model_meta(self):
        logger.info('Storing model metadata')
        client_secrets_path = os.getenv(
            'GCLOUD_CREDENTIALS_SERVICE_ACCOUNT_JSON_KEY_PATH')
        database = os.getenv('BIGQUERY_PROJECT_ID')
        _, db_connection = create_connection(
            f'bigquery://{database}',
            engine_kwargs={'credentials_path': client_secrets_path})

        credentials = service_account.Credentials.from_service_account_file(
            client_secrets_path)

        models = pd.DataFrame(
            {
                'train_date': datetime.utcnow().date(),
                'min_date': self.min_date,
                'max_date': self.max_date,
                'model_type': self.model_type,
                'model_version': self.current_model_version,
                'window_days': self.moving_window,
                'event_lookahead': self.positive_event_lookahead,
            },
            index=[0])

        feature_sets = {
            'numeric_columns': self.feature_columns.numeric_columns,
            'profile_numeric_columns_from_json_fields': self.feature_columns.profile_numeric_columns_from_json_fields,
            'time_based_columns': self.feature_columns.time_based_columns,
            'categorical_columns': self.feature_columns.categorical_columns,
            'bool_columns': self.feature_columns.bool_columns,
            'numeric_columns_with_window_variants': self.feature_columns.numeric_columns_window_variants,
            'device_based_columns': self.feature_columns.device_based_features,
        }

        for feature_set_name, feature_set in feature_sets.items():
            if isinstance(feature_set, list):
                if feature_set_name != 'categorical_columns':
                    feature_set_elements = feature_set
                else:
                    # Categorical variables get transformed into dummy variables with <column_name>_<column_value>
                    # naming
                    feature_set_elements = [
                        column for column in self.variable_importances.index
                        for column_prefix in feature_set
                        if column_prefix in column
                    ]

                models[f'importances__{feature_set_name}'] = str(
                    self.variable_importances[feature_set_elements].to_dict())
            elif isinstance(feature_set, dict):
                feature_set_elements = sum(feature_set.values(), [])
                models[f'importances__{feature_set_name}'] = str(
                    self.variable_importances[feature_set_elements].to_dict())

        models.to_gbq(
            destination_table=f'{os.getenv("BIGQUERY_DATASET")}.models',
            project_id=database,
            credentials=credentials,
            if_exists='append')

        logger.info('Model metadata stored')
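
A hedged sketch of reading the stored metadata back with pandas_gbq, assuming the same database and credentials objects built above; the query itself is illustrative.

import pandas_gbq

# `database` and `credentials` as created inside upload_model_meta above.
recent_models = pandas_gbq.read_gbq(
    f'SELECT * FROM {os.getenv("BIGQUERY_DATASET")}.models ORDER BY train_date DESC LIMIT 10',
    project_id=database,
    credentials=credentials,
)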
Example 6
def get_feature_frame_via_sqlalchemy(start_time: datetime,
                                     end_time: datetime,
                                     moving_window_length: int = 30,
                                     positive_event_lookahead: int = 33):
    rolling_daily_user_profile = get_user_profiles_table()

    query = f'''
        SELECT
            user_id,
            date,
            outcome_date,
            outcome,
            feature_aggregation_functions,
            {','.join([column.name for column in rolling_daily_user_profile.columns if 'features' in column.name])}
        FROM
            {os.getenv('BIGQUERY_DATASET')}.rolling_daily_user_profile
        WHERE
            outcome_date >= @start_time
            AND outcome_date <= @end_time
            AND window_days = @window_days
            AND event_lookahead = @event_lookahead
    '''

    client_secrets_path = os.getenv(
        'GCLOUD_CREDENTIALS_SERVICE_ACCOUNT_JSON_KEY_PATH')
    database = os.getenv('BIGQUERY_PROJECT_ID')
    _, db_connection = create_connection(
        f'bigquery://{database}',
        engine_kwargs={'credentials_path': client_secrets_path})

    credentials = service_account.Credentials.from_service_account_file(
        client_secrets_path)

    import pandas_gbq
    feature_frame = pandas_gbq.read_gbq(
        query,
        project_id=database,
        credentials=credentials,
        use_bqstorage_api=True,
        configuration={
            'query': {
                'parameterMode': 'NAMED',
                'queryParameters': [
                    {'name': 'start_time', 'parameterType': {'type': 'DATE'},
                     'parameterValue': {'value': str(start_time.date())}},
                    {'name': 'end_time', 'parameterType': {'type': 'DATE'},
                     'parameterValue': {'value': str(end_time.date())}},
                    {'name': 'window_days', 'parameterType': {'type': 'INT64'},
                     'parameterValue': {'value': moving_window_length}},
                    {'name': 'event_lookahead', 'parameterType': {'type': 'INT64'},
                     'parameterValue': {'value': positive_event_lookahead}},
                ],
            },
        })

    return feature_frame
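
A hypothetical call for the function above; the date range is a placeholder and the keyword values simply restate the defaults.

from datetime import datetime, timedelta

feature_frame = get_feature_frame_via_sqlalchemy(
    start_time=datetime.utcnow() - timedelta(days=60),
    end_time=datetime.utcnow(),
    moving_window_length=30,
    positive_event_lookahead=33,
)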