Example #1
    def _bq_get_data(self):
        """Fetch the source table in batches of ``batch_size`` rows, yielding one batch at a time."""

        hook = BigQueryHook(
            bigquery_conn_id=self.gcp_conn_id,
            delegate_to=self.delegate_to,
            location=self.location,
            impersonation_chain=self.impersonation_chain,
        )
        table_ref = TableReference.from_string(self.source_project_dataset_table)
        self.log.info('Fetching Data from:')
        self.log.info('Dataset: %s, Table: %s', table_ref.dataset_id, table_ref.table_id)

        conn = hook.get_conn()
        cursor = conn.cursor()
        i = 0
        while True:
            response = cursor.get_tabledata(
                dataset_id=table_ref.dataset_id,
                table_id=table_ref.table_id,
                max_results=self.batch_size,
                selected_fields=self.selected_fields,
                start_index=i * self.batch_size,
            )

            if 'rows' not in response:
                self.log.info('Job Finished')
                return

            rows = response['rows']

            self.log.info('Total Extracted rows: %s', len(rows) + i * self.batch_size)

            # each row in the API response is {'f': [{'v': value}, ...]}; flatten to plain value lists
            table_data = [[fields['v'] for fields in dict_row['f']] for dict_row in rows]

            yield table_data
            i += 1
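
The method above is a generator: it pages through the source table in steps of batch_size rows and yields one flattened batch per iteration until the API returns no more rows. Outside of Airflow, the same paging pattern can be sketched directly against the google-cloud-bigquery client; the project and table names below are placeholders, not part of the original code:

def fetch_in_batches(table_id, batch_size=1000, project="my-project"):
    """Yield rows from a table in fixed-size batches (minimal sketch)."""
    from google.cloud import bigquery
    from google.cloud.bigquery import TableReference

    client = bigquery.Client(project=project)
    table_ref = TableReference.from_string(table_id, default_project=project)
    start_index = 0
    while True:
        # list_rows supports offset-based paging via start_index / max_results
        rows = list(client.list_rows(table_ref, max_results=batch_size, start_index=start_index))
        if not rows:
            return
        yield [list(row.values()) for row in rows]
        start_index += batch_size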
Example #2
    def _table_reference(self, provided_schema_name, provided_table_name,
                         client_project):
        """Build a TableReference, reconciling project and dataset ids taken from
        the schema name, the table name, and the client's default project."""
        project_id_from_table, dataset_id_from_table, table_id = self._split_table_name(
            provided_table_name)
        project_id_from_schema = None
        dataset_id_from_schema = None
        if provided_schema_name is not None:
            provided_schema_name_split = provided_schema_name.split(".")
            if len(provided_schema_name_split) == 1:
                if dataset_id_from_table:
                    project_id_from_schema = provided_schema_name_split[0]
                else:
                    dataset_id_from_schema = provided_schema_name_split[0]
            elif len(provided_schema_name_split) == 2:
                project_id_from_schema = provided_schema_name_split[0]
                dataset_id_from_schema = provided_schema_name_split[1]
            else:
                raise ValueError("Did not understand schema: {}".format(
                    provided_schema_name))
        if (dataset_id_from_schema and dataset_id_from_table
                and dataset_id_from_schema != dataset_id_from_table):
            raise ValueError(
                "dataset_id specified in schema and table_name disagree: "
                "got {} in schema, and {} in table_name".format(
                    dataset_id_from_schema, dataset_id_from_table))
        if (project_id_from_schema and project_id_from_table
                and project_id_from_schema != project_id_from_table):
            raise ValueError(
                "project_id specified in schema and table_name disagree: "
                "got {} in schema, and {} in table_name".format(
                    project_id_from_schema, project_id_from_table))
        project_id = project_id_from_schema or project_id_from_table or client_project
        dataset_id = dataset_id_from_schema or dataset_id_from_table or self.dataset_id

        table_ref = TableReference.from_string("{}.{}.{}".format(
            project_id, dataset_id, table_id))
        return table_ref
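
The precedence here is: values taken from the schema name win over those parsed out of the table name, which in turn win over the client's defaults. For reference, TableReference.from_string itself applies a similar fallback when the project part is omitted; a minimal, self-contained illustration (names are placeholders):

from google.cloud.bigquery import TableReference

# Fully qualified "project.dataset.table" string: everything comes from the string.
ref = TableReference.from_string("my-project.my_dataset.my_table")
assert (ref.project, ref.dataset_id, ref.table_id) == ("my-project", "my_dataset", "my_table")

# "dataset.table" string: default_project fills in the missing project id.
ref = TableReference.from_string("my_dataset.my_table", default_project="my-project")
assert ref.project == "my-project"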
Example #3
    def execute(self, context):
        self.log.info(
            'Executing extract of %s into: %s',
            self.source_project_dataset_table,
            self.destination_cloud_storage_uris,
        )
        hook = BigQueryHook(
            bigquery_conn_id=self.gcp_conn_id,
            delegate_to=self.delegate_to,
            location=self.location,
            impersonation_chain=self.impersonation_chain,
        )

        table_ref = TableReference.from_string(
            self.source_project_dataset_table, hook.project_id)

        configuration: Dict[str, Any] = {
            'extract': {
                'sourceTable': table_ref.to_api_repr(),
                'compression': self.compression,
                'destinationUris': self.destination_cloud_storage_uris,
                'destinationFormat': self.export_format,
            }
        }

        if self.labels:
            configuration['labels'] = self.labels

        if self.export_format == 'CSV':
            # Only set fieldDelimiter and printHeader fields if using CSV.
            # Google does not like it if you set these fields for other export
            # formats.
            configuration['extract']['fieldDelimiter'] = self.field_delimiter
            configuration['extract']['printHeader'] = self.print_header

        hook.insert_job(configuration=configuration)
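
For a CSV export, the configuration passed to insert_job ends up looking roughly like this (bucket, project and table names are placeholders; sourceTable is the to_api_repr() form of the table reference):

configuration = {
    'extract': {
        'sourceTable': {
            'projectId': 'my-project',
            'datasetId': 'my_dataset',
            'tableId': 'my_table',
        },
        'compression': 'NONE',
        'destinationUris': ['gs://my-bucket/export-*.csv'],
        'destinationFormat': 'CSV',
        'fieldDelimiter': ',',
        'printHeader': True,
    }
}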
Example #4
def bq_insert(rows: List):
    """
    Insert rows into BigQuery.

    :param rows: list of dictionaries, each representing one row
    :return: None
    """
    from google.cloud import bigquery
    from google.cloud.bigquery import Table, TableReference

    if not rows:
        logging.error("no rows to upload")
        return
    bq = bigquery.Client(project=GCP_PROJECT)
    table_ref = TableReference.from_string(
        f"{GCP_PROJECT}.live.om_state_latencies")

    schema = [
        {"name": "date", "type": "DATE"},
        {"name": "sym", "type": "STRING"},
        {"name": "from_state", "type": "STRING"},
        {"name": "to_state", "type": "STRING"},
        {"name": "count", "type": "INTEGER"},
        {"name": "average", "type": "FLOAT"},
        {"name": "percentile_10", "type": "FLOAT"},
        {"name": "percentile_50", "type": "FLOAT"},
        {"name": "percentile_90", "type": "FLOAT"},
        {"name": "percentile_99", "type": "FLOAT"},
        {"name": "percentile_99_99", "type": "FLOAT"},
    ]

    table = Table(table_ref)
    table.schema = schema
    table = bq.create_table(table, exists_ok=True)
    logging.info("inserting {} rows".format(len(rows)))
    # insert_rows returns a list of per-row errors; an empty list means every row was accepted
    res = bq.insert_rows(table, rows)
    logging.info(res)
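
A hypothetical call, with each row keyed by the column names from the schema above (all values are made up for illustration):

import datetime

bq_insert([
    {
        "date": datetime.date(2021, 1, 4),
        "sym": "AAPL",
        "from_state": "NEW",
        "to_state": "FILLED",
        "count": 128,
        "average": 1.7,
        "percentile_10": 0.4,
        "percentile_50": 1.2,
        "percentile_90": 3.1,
        "percentile_99": 6.5,
        "percentile_99_99": 9.8,
    }
])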