Example 1
    def _get_existing_transfer(self,
                               data_source_id: str,
                               destination_dataset_id: str = None,
                               params: Dict[str, str] = None,
                               name: str = None
                               ) -> Optional[types.TransferConfig]:
        """Gets data transfer if it already exists.

    Args:
      data_source_id: Data source id.
      destination_dataset_id: BigQuery dataset id.
      params: Data transfer specific parameters.
      name: Expected display name of the transfer.

    Returns:
      Transfer config if the transfer already exists, None otherwise.
    """
        parent = self.client.location_path(
            self.project_id, config_parser.get_dataset_location())
        for transfer_config in self.client.list_transfer_configs(parent):
            if transfer_config.data_source_id != data_source_id:
                continue
            if destination_dataset_id and transfer_config.destination_dataset_id != destination_dataset_id:
                continue
            # Ignore transfer configs that are in a failed state.
            is_valid_state = transfer_config.state in (_PENDING_STATE,
                                                       _RUNNING_STATE,
                                                       _SUCCESS_STATE)
            params_match = self._check_params_match(transfer_config, params)
            name_matches = name is None or name == transfer_config.display_name
            if params_match and is_valid_state and name_matches:
                return transfer_config
        return None
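The helper _check_params_match used above is not included in these examples. A minimal sketch, assuming it simply verifies that every supplied parameter is present in the transfer config with the same value (the real implementation may differ):

    def _check_params_match(self, transfer_config: types.TransferConfig,
                            params: Dict[str, str]) -> bool:
        """Returns True if all given params match the transfer config (sketch)."""
        if not params:
            return True
        for key, value in params.items():
            config_params = transfer_config.params
            # Missing keys or differing values mean the configs do not match.
            if key not in config_params or config_params[key] != value:
                return False
        return True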
Example 2
    def create_merchant_center_transfer(
            self, merchant_id: str, destination_dataset: str,
            enable_market_insights: bool) -> types.TransferConfig:
        """Creates a new merchant center transfer.

    Merchant center allows retailers to store product info into Google. This
    method creates a data transfer config to copy the product data to BigQuery.

    Args:
      merchant_id: Google Merchant Center(GMC) account id.
      destination_dataset: BigQuery dataset id.
      enable_market_insights: Whether to deploy market insights solution.

    Returns:
      Transfer config.
    """
        logging.info('Creating Merchant Center Transfer.')
        parameters = struct_pb2.Struct()
        parameters['merchant_id'] = merchant_id
        parameters['export_products'] = True
        if enable_market_insights:
            parameters['export_price_benchmarks'] = True
            parameters['export_best_sellers'] = True
        data_transfer_config = self._get_existing_transfer(
            _MERCHANT_CENTER_ID, destination_dataset, parameters)
        if data_transfer_config:
            logging.info(
                'Data transfer for merchant id %s to destination dataset %s '
                'already exists.', merchant_id, destination_dataset)
            return self._update_existing_transfer(data_transfer_config,
                                                  parameters)
        logging.info(
            'Creating data transfer for merchant id %s to destination dataset %s',
            merchant_id, destination_dataset)
        has_valid_credentials = self._check_valid_credentials(
            _MERCHANT_CENTER_ID)
        authorization_code = None
        if not has_valid_credentials:
            authorization_code = self._get_authorization_code(
                _MERCHANT_CENTER_ID)
        dataset_location = config_parser.get_dataset_location()
        parent = self.client.location_path(self.project_id, dataset_location)
        transfer_config_input = {
            'display_name': f'Merchant Center Transfer - {merchant_id}',
            'data_source_id': _MERCHANT_CENTER_ID,
            'destination_dataset_id': destination_dataset,
            'params': parameters,
            'data_refresh_window_days': 0,
        }
        transfer_config = self.client.create_transfer_config(
            parent, transfer_config_input, authorization_code)
        logging.info(
            'Data transfer created for merchant id %s to destination dataset %s',
            merchant_id, destination_dataset)
        return transfer_config
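A usage sketch, assuming the surrounding class is instantiated elsewhere (its constructor is not shown in these examples); deployer, the merchant id, and the dataset name are placeholders:

# Hypothetical usage; 'deployer' stands in for an instance of the class above.
transfer_config = deployer.create_merchant_center_transfer(
    merchant_id='1234567',
    destination_dataset='markup',
    enable_market_insights=False)
deployer.wait_for_transfer_completion(transfer_config)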
Example 3
    def _get_data_source(self, data_source_id: str) -> types.DataSource:
        """Returns data source.

    Args:
      data_source_id: Data source id.
    """
        dataset_location = config_parser.get_dataset_location()
        name = self.client.location_data_source_path(self.project_id,
                                                     dataset_location,
                                                     data_source_id)
        return self.client.get_data_source(name)
Example 4
    def _check_valid_credentials(self, data_source_id: str) -> bool:
        """Returns true if valid credentials exist for the given data source.

    Args:
      data_source_id: Data source id.
    """
        dataset_location = config_parser.get_dataset_location()
        name = self.client.location_data_source_path(self.project_id,
                                                     dataset_location,
                                                     data_source_id)
        response = self.client.check_valid_creds(name)
        return response.has_valid_creds
Example 5
    def schedule_query(self, name: str,
                       query_string: str) -> types.TransferConfig:
        """Schedules query to run every day.

    Args:
      name: Name of the scheduled query.
      query_string: The query to be run.
    """
        data_transfer_config = self._get_existing_transfer('scheduled_query',
                                                           name=name)
        parameters = struct_pb2.Struct()
        parameters['query'] = query_string
        if data_transfer_config:
            logging.info(
                'Data transfer for scheduling query "%s" already exists.',
                name)
            updated_transfer_config = self._update_existing_transfer(
                data_transfer_config, parameters)
            logging.info('Data transfer for scheduling query "%s" updated.',
                         name)
            start_time_pb = timestamp_pb2.Timestamp()
            start_time = datetime.datetime.now(tz=pytz.utc)
            start_time_pb.FromDatetime(start_time)
            self.client.start_manual_transfer_runs(
                parent=updated_transfer_config.name,
                requested_run_time=start_time_pb)
            logging.info(
                'One-time manual run started. It might take up to 1 hour for '
                'performance data to be reflected on the dashboard.')
            return updated_transfer_config
        dataset_location = config_parser.get_dataset_location()
        parent = self.client.location_path(self.project_id, dataset_location)
        params = {
            'query': query_string,
        }
        transfer_config_input = google.protobuf.json_format.ParseDict(
            {
                'display_name': name,
                'data_source_id': 'scheduled_query',
                'params': params,
                'schedule': 'every 24 hours',
            },
            bigquery_datatransfer_v1.types.TransferConfig(),
        )
        has_valid_credentials = self._check_valid_credentials(
            'scheduled_query')
        authorization_code = ''
        if not has_valid_credentials:
            authorization_code = self._get_authorization_code(
                'scheduled_query')
        transfer_config = self.client.create_transfer_config(
            parent, transfer_config_input, authorization_code)
        return transfer_config
Example 6
    def wait_for_transfer_completion(
            self, transfer_config: types.TransferConfig) -> None:
        """Waits for the completion of data transfer operation.

    This method retrieves data transfer operation and checks for its status. If
    the operation is not completed, then the operation is re-checked after
    `_SLEEP_SECONDS` seconds.

    Args:
      transfer_config: Resource representing data transfer.

    Raises:
      DataTransferError: If the data transfer is not successfully completed.
    """
        # TODO: Use exponential back-off for polling.
        transfer_config_name = transfer_config.name
        transfer_config_id = transfer_config_name.split('/')[-1]
        poll_counter = 0  # Counter to keep polling count.
        while True:
            transfer_config_path = self.client.location_transfer_config_path(
                self.project_id, config_parser.get_dataset_location(),
                transfer_config_id)
            response = self.client.list_transfer_runs(transfer_config_path)
            latest_transfer = None
            for transfer in response:
                latest_transfer = transfer
                break
            if not latest_transfer:
                return
            if latest_transfer.state == _SUCCESS_STATE:
                logging.info('Transfer %s was successful.',
                             transfer_config_name)
                return
            if (latest_transfer.state == _FAILED_STATE
                    or latest_transfer.state == _CANCELLED_STATE):
                error_message = (
                    f'Transfer {transfer_config_name} was not successful. '
                    f'Error - {latest_transfer.error_status}')
                logging.error(error_message)
                raise DataTransferError(error_message)
            logging.info(
                'Transfer %s still in progress. Sleeping for %s seconds before '
                'checking again.', transfer_config_name, _SLEEP_SECONDS)
            time.sleep(_SLEEP_SECONDS)
            poll_counter += 1
            if poll_counter >= _MAX_POLL_COUNTER:
                error_message = (
                    f'Transfer {transfer_config_name} is taking too long'
                    ' to finish. Hence failing the request.')
                logging.error(error_message)
                raise DataTransferError(error_message)
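The TODO above mentions exponential back-off for polling. A minimal sketch of a helper that could replace the fixed _SLEEP_SECONDS wait, assuming base and maximum delays chosen by the caller (the helper name and defaults are hypothetical, not part of the original module):

import random
import time


def _backoff_sleep(poll_counter: int,
                   base_seconds: float = 10.0,
                   max_seconds: float = 300.0) -> None:
    """Sleeps with exponential back-off and jitter before the next poll."""
    delay = min(base_seconds * (2 ** poll_counter), max_seconds)
    # Full jitter keeps concurrent callers from polling in lockstep.
    time.sleep(random.uniform(0, delay))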
Example 7
def create_dataset_if_not_exists(project_id: str, dataset_id: str) -> None:
    """Creates BigQuery dataset if it doesn't exists.

  Args:
    project_id: A cloud project id.
    dataset_id: BigQuery dataset id.
  """
    # Construct a BigQuery client object.
    client = bigquery.Client(project=project_id)
    fully_qualified_dataset_id = f'{project_id}.{dataset_id}'
    try:
        client.get_dataset(fully_qualified_dataset_id)
        logging.info('Dataset %s already exists.', fully_qualified_dataset_id)
    except exceptions.NotFound:
        logging.info('Dataset %s is not found.', fully_qualified_dataset_id)
        dataset = bigquery.Dataset(fully_qualified_dataset_id)
        dataset.location = config_parser.get_dataset_location()
        client.create_dataset(dataset)
        logging.info('Dataset %s created.', fully_qualified_dataset_id)
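An alternative sketch: recent versions of google-cloud-bigquery accept exists_ok on create_dataset, which collapses the get-then-create pattern above into a single call (this variant is assumed from the library's API, not taken from the original code):

def create_dataset_if_not_exists(project_id: str, dataset_id: str) -> None:
    """Creates the dataset, doing nothing if it already exists (sketch only)."""
    client = bigquery.Client(project=project_id)
    dataset = bigquery.Dataset(f'{project_id}.{dataset_id}')
    dataset.location = config_parser.get_dataset_location()
    # exists_ok=True returns the existing dataset instead of raising Conflict.
    client.create_dataset(dataset, exists_ok=True)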
Example 8
def execute_queries(project_id: str, dataset_id: str, merchant_id: str,
                    customer_id: str, enable_market_insights: bool) -> None:
    """Executes list of queries."""
    # Sql files to be executed in a specific order. The prefix "scripts" should be omitted.
    sql_files = [
        '1_product_view.sql',
        'targeted_products/targeted_product_ddl.sql',
        'targeted_products/construct_parsed_criteria.sql',
        '2_product_metrics_view.sql',
        '3_customer_view.sql',
        '4_product_detailed_view.sql',
        'materialize_product_detailed.sql',
        'materialize_product_historical.sql',
    ]
    if enable_market_insights:
        market_insights_sql_files = [
            'market_insights/snapshot_view.sql',
            'market_insights/historical_view.sql'
        ]
        sql_files.extend(market_insights_sql_files)
    prefix = 'scripts'
    query_params = {
        'project_id': project_id,
        'dataset': dataset_id,
        'merchant_id': merchant_id,
        'external_customer_id': customer_id
    }
    location = config_parser.get_dataset_location()
    client = bigquery.Client(project=project_id)
    for sql_file in sql_files:
        try:
            query = configure_sql(os.path.join(prefix, sql_file), query_params)
            query_job = client.query(query, location=location)
            query_job.result()
        except Exception:
            logging.exception('Error in %s', sql_file)
            raise
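configure_sql is called above but not included in these examples. A minimal sketch, assuming it reads the SQL file and fills in the query parameters with Python string formatting (the real helper and its placeholder syntax may differ):

def configure_sql(sql_path: str, query_params: Dict[str, str]) -> str:
    """Reads a SQL file and substitutes the query parameters (sketch only)."""
    with open(sql_path, 'r') as f:
        sql_script = f.read()
    # Assumes placeholders such as {project_id} and {dataset} in the SQL files.
    return sql_script.format(**query_params)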
Example 9
    def create_google_ads_transfer(
            self,
            customer_id: str,
            destination_dataset: str,
            backfill_days: int = 30) -> types.TransferConfig:
        """Creates a new Google Ads transfer.

    This method creates a data transfer config to copy Google Ads data to a
    BigQuery dataset.

    Args:
      customer_id: Google Ads customer id.
      destination_dataset: BigQuery dataset id.
      backfill_days: Number of days to backfill.

    Returns:
      Transfer config.
    """
        logging.info('Creating Google Ads Transfer.')

        parameters = struct_pb2.Struct()
        parameters['customer_id'] = customer_id
        data_transfer_config = self._get_existing_transfer(
            _GOOGLE_ADS_ID, destination_dataset, parameters)
        if data_transfer_config:
            logging.info(
                'Data transfer for Google Ads customer id %s to destination dataset '
                '%s already exists.', customer_id, destination_dataset)
            return data_transfer_config
        logging.info(
            'Creating data transfer for Google Ads customer id %s to destination '
            'dataset %s', customer_id, destination_dataset)
        has_valid_credentials = self._check_valid_credentials(_GOOGLE_ADS_ID)
        authorization_code = None
        if not has_valid_credentials:
            authorization_code = self._get_authorization_code(_GOOGLE_ADS_ID)
        dataset_location = config_parser.get_dataset_location()
        parent = self.client.location_path(self.project_id, dataset_location)
        transfer_config_input = {
            'display_name': f'Google Ads Transfer - {customer_id}',
            'data_source_id': _GOOGLE_ADS_ID,
            'destination_dataset_id': destination_dataset,
            'params': parameters,
            'data_refresh_window_days': 1,
        }
        transfer_config = self.client.create_transfer_config(
            parent, transfer_config_input, authorization_code)
        logging.info(
            'Data transfer created for Google Ads customer id %s to destination '
            'dataset %s', customer_id, destination_dataset)
        if backfill_days:
            transfer_config_name = transfer_config.name
            transfer_config_id = transfer_config_name.split('/')[-1]
            start_time = datetime.datetime.now(
                tz=pytz.utc) - datetime.timedelta(days=backfill_days)
            end_time = datetime.datetime.now(tz=pytz.utc)
            start_time = start_time.replace(hour=0,
                                            minute=0,
                                            second=0,
                                            microsecond=0)
            end_time = end_time.replace(hour=0,
                                        minute=0,
                                        second=0,
                                        microsecond=0)
            parent = self.client.location_transfer_config_path(
                self.project_id, dataset_location, transfer_config_id)
            start_time_pb = timestamp_pb2.Timestamp()
            end_time_pb = timestamp_pb2.Timestamp()
            start_time_pb.FromDatetime(start_time)
            end_time_pb.FromDatetime(end_time)
            self.client.schedule_transfer_runs(parent, start_time_pb,
                                               end_time_pb)
        return transfer_config