def _save_predicted_data(self, data_uri: 'BigQueryLocation', partition: str):
     """Prepare the select query and job configuration for storing predictions.

     A copy of ``data_uri`` gets its column list extended with constant
     metadata columns (experiment name, model id, model version, run id and
     data partition) plus a ``CURRENT_TIMESTAMP`` load date, and the select
     query is built from that copy.

     Args:
         data_uri: Location describing the predicted data. Only a copy is
             modified; the column list of the object passed in is never
             extended in place.
         partition: Value emitted as the constant ``data_partition`` column.

     NOTE(review): in the visible code ``query`` and ``job_config`` are not
     consumed after being prepared -- presumably the remainder of the
     method is missing from this snippet; confirm against the full source.
     """
     data_uri = data_uri.copy()
     # (column alias, constant value) pairs rendered as quoted SQL literals.
     metadata = (
         ('mlflow_experiment_name', self.mlflow_experiment_name),
         ('model_id', self.model_id),
         ('model_version', self.model_version),
         ('run_id', self.run_id),
         ('data_partition', partition),
     )
     extra_columns = [f'"{value}" as {alias}' for alias, value in metadata]
     extra_columns.append('CURRENT_TIMESTAMP as load_date')
     # Rebind to a fresh list instead of using ``+=``: an in-place extend
     # would also modify the caller's list whenever ``copy()`` is shallow
     # and both objects share the same ``data_columns`` list.
     data_uri.data_columns = [*data_uri.data_columns, *extra_columns]
     query = data_uri.get_select_query(include_id=True)
     destination = f'{self.predictions_dataset}.{self.mlflow_experiment_name}'
     # The destination is only applied to the default config built here; a
     # user-supplied ``save_data_job_config`` is taken as-is.
     if (job_config := self.save_data_job_config) is None:
         job_config = set_defaults_save_job_config(
             bigquery.job.QueryJobConfig())
         job_config.destination = destination
Esempio n. 2
0
    def load_data(self):
        """Fetch the train and eval sets and hand them to the parent loader.

        Does nothing when ``self.data_loaded`` is already truthy. Otherwise
        the train/eval location settings are deep-copied, turned into
        select queries (id column included), executed through
        ``self.client`` and converted to dataframes, which are passed to the
        parent class's ``load_data``. ``self.data_loaded`` is set to ``True``
        afterwards.
        """
        if self.data_loaded:
            return

        # Work on copies so the stored settings are never touched.
        train_spec = deepcopy(self.train_uri)
        eval_spec = deepcopy(self.eval_uri)

        train_location = BigQueryLocation(
            **train_spec,
            target_column=self.target_column,
            id_column=self.row_id_column)
        train_sql = train_location.get_select_query(include_id=True)

        eval_location = BigQueryLocation(
            **eval_spec,
            target_column=self.target_column,
            id_column=self.row_id_column)
        eval_sql = eval_location.get_select_query(include_id=True)

        # Each query/to_dataframe pair is an API request.
        train_job = self.client.query(train_sql)
        train_frame = train_job.to_dataframe()
        eval_job = self.client.query(eval_sql)
        eval_frame = eval_job.to_dataframe()

        super().load_data(train_df=train_frame, eval_df=eval_frame)
        self.data_loaded = True
Esempio n. 3
0
 def fit(self, train_x, train_y, eval_x, eval_y) -> None:
     """Build the model-creation query and run it when training is enabled.

     The label column is taken from ``train_y.target_column`` and stored on
     ``self.input_label_col``. The parameters returned by
     ``get_model_parameters()`` are extended with the custom data-split
     settings and the label column, rendered as ``key=value`` lines, and
     substituted into ``self.create_model_query`` together with the select
     queries for the train and eval locations.

     Args:
         train_x: Location-like object providing the training columns, id
             column, table, order and limit.
         train_y: Object providing ``target_column`` for the training data.
         eval_x: Same as ``train_x`` for the evaluation data.
         eval_y: Same as ``train_y`` for the evaluation data.
     """
     def render(value):
         # String option values are wrapped in single quotes; everything
         # else is interpolated with its default formatting.
         return f"'{value}'" if isinstance(value, str) else f'{value}'

     def location_for(features, labels):
         # Combine the feature location with the label column of ``labels``.
         return BigQueryLocation(data_columns=features.data_columns,
                                 id_column=features.id_column,
                                 table=features.table,
                                 order=features.order,
                                 target_column=labels.target_column,
                                 limit=features.limit)

     self.input_label_col = train_y.target_column

     params = self.get_model_parameters()
     params.update({
         'data_split_method': 'CUSTOM',
         'data_split_col': 'is_eval',
         'input_label_cols': [self.input_label_col],
     })
     options = ',\n'.join(
         f'{name}={render(value)}' for name, value in params.items())

     train_location = location_for(train_x, train_y)
     eval_location = location_for(eval_x, eval_y)
     statement = self.create_model_query.format(
         train_query=train_location.get_select_query(),
         eval_data=eval_location.get_select_query(),
         sql_model_path=self.sql_model_path,
         options=options)

     # The query is always assembled; only its execution is optional.
     if self.training_enabled:
         self.run_query_and_wait(
             statement, job_id_prefix=self.job_id_prefix)
     else:
         print("Training is disabled, skipping training step")
Esempio n. 4
0
 def predict(self, x_uri: 'BigQueryLocation', *args,
             **kwargs) -> 'BigQueryLocation':
     """Run the prediction query and describe where its results landed.

     ``self.predict_query`` is formatted with the id column, order and
     select query (id included) of ``x_uri``, plus the label column stored
     in ``self.input_label_col`` (presumably set by an earlier ``fit``
     call -- confirm) and ``self.sql_model_path``. The query is executed
     via ``run_query_and_wait`` and the job's destination table is wrapped
     in a new location object.

     Args:
         x_uri: Location of the rows to predict on. It must expose
             ``id_column``, ``order`` and ``get_select_query``; the previous
             ``str`` annotation did not match this usage.
         *args: Accepted and ignored.
         **kwargs: Accepted and ignored.

     Returns:
         A ``BigQueryLocation`` pointing at the query job's destination
         table, with the prediction column as its only data column and the
         same id column and order as ``x_uri``.
     """
     target_column = self.input_label_col
     predict_query = self.predict_query.format(
         id_column=x_uri.id_column,
         target_column=target_column,
         sql_model_path=self.sql_model_path,
         predict_query=x_uri.get_select_query(include_id=True),
         order=x_uri.order,
     )
     query_job = self.run_query_and_wait(
         predict_query, job_id_prefix=self.job_id_prefix)
     destination = query_job.destination
     prediction_column = self.prediction_column_name.format(
         target_column=target_column)
     return BigQueryLocation(
         data_columns=[prediction_column],
         id_column=x_uri.id_column,
         table=f'{destination.project}.{destination.dataset_id}.{destination.table_id}',
         order=x_uri.order,
     )
 def get_eval_y(self) -> 'BigQueryLocation':
     """Return the evaluation-target location.

     Builds a ``BigQueryLocation`` from the ``self.eval_y`` keyword
     settings, adding ``self.target_column`` as the target column and
     ``self.row_id_column`` as the id column. The return annotation was
     ``str`` although a location object is returned.
     """
     return BigQueryLocation(**self.eval_y,
                             target_column=self.target_column,
                             id_column=self.row_id_column)
 def get_eval_x(self) -> 'BigQueryLocation':
     """Return the evaluation-features location.

     Builds a ``BigQueryLocation`` from the ``self.eval_x`` keyword
     settings with ``self.row_id_column`` as the id column. The return
     annotation was ``str`` although a location object is returned.
     """
     return BigQueryLocation(**self.eval_x, id_column=self.row_id_column)
 def get_train_x(self) -> 'BigQueryLocation':
     """Return the training-features location.

     Builds a ``BigQueryLocation`` from the ``self.train_x`` keyword
     settings with ``self.row_id_column`` as the id column. The return
     annotation was ``str`` although a location object is returned.
     """
     return BigQueryLocation(**self.train_x, id_column=self.row_id_column)