def side_effect(request):
     if (request == bigquery.BigqueryTablesGetRequest(
             projectId='project',
             datasetId='dataset',
             tableId='table__sample_info')):
         raise exceptions.HttpError(response={'status': '404'},
                                    url='',
                                    content='')
     return bigquery.Table(tableReference=bigquery.TableReference(
         projectId='project',
         datasetId='dataset',
         tableId='table__chr1_part1'))
Example #2
def parse_table_reference(table, dataset=None, project=None):
  """Parses a table reference into a (project, dataset, table) tuple.

  Args:
    table: The ID of the table. The ID must contain only letters
      (a-z, A-Z), numbers (0-9), or underscores (_). If the dataset argument is
      None then the table argument must contain the entire table reference:
      'DATASET.TABLE' or 'PROJECT:DATASET.TABLE'. This argument can also be a
      bigquery.TableReference instance, in which case dataset and project are
      ignored and the reference is returned as the result; a callable or a
      value_provider.ValueProvider is likewise returned unchanged, to be
      resolved at pipeline runtime. Additionally, for date-partitioned tables,
      appending '$YYYYmmdd' to the table name is supported,
      e.g. 'DATASET.TABLE$YYYYmmdd'.
    dataset: The ID of the dataset containing this table, or None if the table
      reference is specified entirely by the table argument.
    project: The ID of the project containing this table, or None if the table
      reference is specified entirely by the table (and possibly dataset)
      argument.

  Returns:
    A TableReference object from the bigquery API. The object has the following
    attributes: projectId, datasetId, and tableId.

  Raises:
    ValueError: if the table reference as a string does not match the expected
      format.
  """

  if isinstance(table, bigquery.TableReference):
    return table
  elif callable(table):
    return table
  elif isinstance(table, value_provider.ValueProvider):
    return table

  table_reference = bigquery.TableReference()
  # If dataset argument is not specified, the expectation is that the
  # table argument will contain a full table reference instead of just a
  # table name.
  if dataset is None:
    match = re.match(
        r'^((?P<project>.+):)?(?P<dataset>\w+)\.(?P<table>[\w\$]+)$', table)
    if not match:
      raise ValueError(
          'Expected a table reference (PROJECT:DATASET.TABLE or '
          'DATASET.TABLE) instead of %s.' % table)
    table_reference.projectId = match.group('project')
    table_reference.datasetId = match.group('dataset')
    table_reference.tableId = match.group('table')
  else:
    table_reference.projectId = project
    table_reference.datasetId = dataset
    table_reference.tableId = table
  return table_reference
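
A minimal usage sketch for parse_table_reference as defined above; the project, dataset, and table IDs below are placeholders for illustration, not values from the surrounding examples:

# Illustrative only: the IDs are made up.
ref = parse_table_reference('my-project:my_dataset.my_table')
print(ref.projectId, ref.datasetId, ref.tableId)  # my-project my_dataset my_table

# Without the 'PROJECT:' prefix, the regex leaves projectId as None.
ref = parse_table_reference('my_dataset.my_table')
print(ref.projectId)  # None

# When dataset (and optionally project) are given, the table argument is
# treated as a bare table name and no string parsing happens.
ref = parse_table_reference('my_table', dataset='my_dataset', project='my-project')

# A '$YYYYmmdd' partition decorator is accepted as part of the table ID.
ref = parse_table_reference('my_dataset.my_table$20200101')
print(ref.tableId)  # my_table$20200101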
 def create_table(cls, table_name, data, table_schema):
     table = bigquery.Table(tableReference=bigquery.TableReference(
         projectId=cls.project,
         datasetId=cls.dataset_id,
         tableId=table_name),
                            schema=table_schema)
     request = bigquery.BigqueryTablesInsertRequest(
         projectId=cls.project, datasetId=cls.dataset_id, table=table)
     cls.bigquery_client.client.tables.Insert(request)
     cls.bigquery_client.insert_rows(cls.project, cls.dataset_id,
                                     table_name, data)
     return table_schema
Example #4
 def expand(self, pcoll):
     table_spec = bigquery.TableReference(
         projectId='iotpubsub-1536350750202',
         datasetId='baybenames',
         # tableId='relation_extraction_data'
         tableId='relation_data_sample')
     return (
         pcoll
         | 'Read input table' >> beam.io.Read(beam.io.BigQuerySource(table_spec))
         | 'Split words' >> beam.ParDo(SplitSentence_Updated_Table())
         | 'Split test and training data' >> beam.Partition(
             lambda element, _: 0 if randint(0, 100) < 80 else 1, 2))
Example #5
    def _create_table(cls, table_name):
        table_schema = bigquery.TableSchema()

        number = bigquery.TableFieldSchema()
        number.name = 'number'
        number.type = 'INTEGER'
        table_schema.fields.append(number)

        string = bigquery.TableFieldSchema()
        string.name = 'string'
        string.type = 'STRING'
        table_schema.fields.append(string)

        time = bigquery.TableFieldSchema()
        time.name = 'time'
        time.type = 'TIME'
        table_schema.fields.append(time)

        datetime = bigquery.TableFieldSchema()
        datetime.name = 'datetime'
        datetime.type = 'DATETIME'
        table_schema.fields.append(datetime)

        rec = bigquery.TableFieldSchema()
        rec.name = 'rec'
        rec.type = 'RECORD'
        rec_datetime = bigquery.TableFieldSchema()
        rec_datetime.name = 'rec_datetime'
        rec_datetime.type = 'DATETIME'
        rec.fields.append(rec_datetime)
        rec_rec = bigquery.TableFieldSchema()
        rec_rec.name = 'rec_rec'
        rec_rec.type = 'RECORD'
        rec_rec_datetime = bigquery.TableFieldSchema()
        rec_rec_datetime.name = 'rec_rec_datetime'
        rec_rec_datetime.type = 'DATETIME'
        rec_rec.fields.append(rec_rec_datetime)
        rec.fields.append(rec_rec)
        table_schema.fields.append(rec)

        table = bigquery.Table(tableReference=bigquery.TableReference(
            projectId=cls.project,
            datasetId=cls.dataset_id,
            tableId=table_name),
                               schema=table_schema)
        request = bigquery.BigqueryTablesInsertRequest(
            projectId=cls.project, datasetId=cls.dataset_id, table=table)
        cls.bigquery_client.client.tables.Insert(request)
        cls.bigquery_client.insert_rows(cls.project, cls.dataset_id,
                                        table_name, cls.TABLE_DATA)
 def test_table_with_write_disposition_append(self):
     client = mock.Mock()
     table = bigquery.Table(tableReference=bigquery.TableReference(
         projectId='project', datasetId='dataset', tableId='table'),
                            schema=bigquery.TableSchema())
     client.tables.Get.return_value = table
     client.tables.Insert.return_value = table
     write_disposition = beam.io.BigQueryDisposition.WRITE_APPEND
     with beam.io.BigQuerySink(
             'project:dataset.table',
             write_disposition=write_disposition).writer(client):
         pass
     self.assertTrue(client.tables.Get.called)
     self.assertFalse(client.tables.Delete.called)
     self.assertFalse(client.tables.Insert.called)
Example #7
    def _create_parquet_file(self, blob_name, staging_table_util,
                             destination_prefix):
        """Creates a parquet file from a staging table and stores in GCS.

        The parquet file is generated using Dataflow, since BigQuery Extract
        Jobs do not support the parquet file type as a destination format.

        Args:
            blob_name(str): Name of the file (or blob) to be generated. Starts
                with 'fileType=' and ends with the file extension.
                Ex: fileType=csv/compression=none/numColumns=10/columnTypes=100_STRING/numFiles=10000/tableSize=2147MB/file3876.csv  # pylint: disable=line-too-long
            staging_table_util(benchmark_tools.table_util.TableUtil): Util
                object for interacting with the staging table that the parquet
                file will be generated from.
            destination_prefix(str): String containing the 'gs://' prefix, the
                bucket name, and the path of the file, without the extension.
                This is needed by the WriteToParquet class.
                Ex: gs://annarudy_test_files/fileType=csv/compression=none/numColumns=10/columnTypes=100_STRING/numFiles=10000/tableSize=2147MB/file3876 # pylint: disable=line-too-long
        """
        logging.info('Attempting to create file {0:s}'.format(blob_name))
        pipeline_args = [
            '--project', self.project_id, '--staging_location',
            self.dataflow_staging_location, '--temp_location',
            self.dataflow_temp_location, '--save_main_session',
            '--worker_machine_type', 'n1-highcpu-32', '--runner',
            'DataflowRunner', '--setup_file', './setup.py'
        ]
        options = pipeline_options.PipelineOptions(pipeline_args)
        table_spec = beam_bigquery.TableReference(
            projectId=self.project_id,
            datasetId=self.primitive_staging_dataset_id,
            tableId=staging_table_util.table_id)
        bq_schema = staging_table_util.table.schema
        pa_schema = parquet_util.ParquetUtil(
            bq_schema).get_pa_translated_schema()
        p = beam.Pipeline(options=options)
        table = (
            p
            | 'ReadTable' >> beam.io.Read(beam.io.BigQuerySource(table_spec)))
        (table | beam.io.WriteToParquet(
            file_path_prefix=destination_prefix,
            schema=pa_schema,
            file_name_suffix='.parquet',
            num_shards=1,
            shard_name_template='',
        ))
        p.run().wait_until_finish()
        logging.info('Created file: {0:s}'.format(blob_name))
Example #8
 def _setup_new_types_env(self):
     table_schema = bigquery.TableSchema()
     table_field = bigquery.TableFieldSchema()
     table_field.name = 'bytes'
     table_field.type = 'BYTES'
     table_schema.fields.append(table_field)
     table_field = bigquery.TableFieldSchema()
     table_field.name = 'date'
     table_field.type = 'DATE'
     table_schema.fields.append(table_field)
     table_field = bigquery.TableFieldSchema()
     table_field.name = 'time'
     table_field.type = 'TIME'
     table_schema.fields.append(table_field)
     table = bigquery.Table(tableReference=bigquery.TableReference(
         projectId=self.project,
         datasetId=self.dataset_id,
         tableId=NEW_TYPES_INPUT_TABLE),
                            schema=table_schema)
     request = bigquery.BigqueryTablesInsertRequest(
         projectId=self.project, datasetId=self.dataset_id, table=table)
     self.bigquery_client.client.tables.Insert(request)
     table_data = [{
         'bytes': b'xyw',
         'date': '2011-01-01',
         'time': '23:59:59.999999'
     }, {
         'bytes': b'abc',
         'date': '2000-01-01',
         'time': '00:00:00'
     }, {
         'bytes': b'\xe4\xbd\xa0\xe5\xa5\xbd',
         'date': '3000-12-31',
         'time': '23:59:59.990000'
     }, {
         'bytes': b'\xab\xac\xad',
         'date': '2000-01-01',
         'time': '00:00:00'
     }]
     # the API Tools bigquery client expects byte values to be base-64 encoded
     # TODO https://github.com/apache/beam/issues/19073: upgrade to
     # google-cloud-bigquery which does not require handling the encoding in
     # beam
     for row in table_data:
         row['bytes'] = base64.b64encode(row['bytes']).decode('utf-8')
     passed, errors = self.bigquery_client.insert_rows(
         self.project, self.dataset_id, NEW_TYPES_INPUT_TABLE, table_data)
     self.assertTrue(passed, 'Error in BQ setup: %s' % errors)
Example #9
 def test_table_not_empty_and_write_disposition_empty(
     self, patched_time_sleep):
   client = mock.Mock()
   client.tables.Get.return_value = bigquery.Table(
       tableReference=bigquery.TableReference(
           projectId='project', datasetId='dataset', tableId='table'),
       schema=bigquery.TableSchema())
   client.tabledata.List.return_value = bigquery.TableDataList(totalRows=1)
   write_disposition = beam.io.BigQueryDisposition.WRITE_EMPTY
   with self.assertRaisesRegexp(
       RuntimeError, r'Table project:dataset\.table is not empty but write '
                     r'disposition is WRITE_EMPTY'):
     with beam.io.BigQuerySink(
         'project:dataset.table',
         write_disposition=write_disposition).writer(client):
       pass
def run(argv=None, save_main_session=True):
    """Main entry point; defines and runs the wordcount pipeline."""
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--input',
        dest='input',
        default=
        '/home/shravan/Desktop/gcp_files/2020-10-02-11-34-19-EA6C5E314B70B157',
        help='Input file to process.')
    parser.add_argument('--output',
                        dest='output',
                        required=False,
                        default='output',
                        help='Output file to write results to.')
    known_args, pipeline_args = parser.parse_known_args(argv)

    # We use the save_main_session option because one or more DoFn's in this
    # workflow rely on global context (e.g., a module imported at module level).
    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(
        SetupOptions).save_main_session = save_main_session

    # The pipeline will be run on exiting the with block.
    with beam.Pipeline(options=pipeline_options) as p:

        bq_details = "justlikethat-294122:log_analysis.aws_log"
        SCHEMA = "bucket:string,date:datetime,operation:string,key:string,request_uri:string,http_status:string,error_code:string,bytes_sent:string,total_time:string,turnaround_time:string,referrer:string,user_agent:string,request_header:string"
        log_parser = AwsLogParser()
        # Read the text file[pattern] into a PCollection.
        lines = p | 'Read' >> ReadFromText(known_args.input)

        data = (lines
                | 'Parse AWS Log File' >>
                beam.Map(lambda log: log_parser.process(log)))

        table_spec = bigquery.TableReference(projectId='justlikethat-294122',
                                             datasetId='log_analysis',
                                             tableId='aws_log_test_1')

        data | beam.io.WriteToBigQuery(
            table_spec,
            schema=SCHEMA,
            write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE,
            create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED)

        #data | 'Write' >> WriteToText(known_args.output)
        """
 def test_no_table_and_create_if_needed(self):
     client = mock.Mock()
     table = bigquery.Table(tableReference=bigquery.TableReference(
         projectId='project', datasetId='dataset', tableId='table'),
                            schema=bigquery.TableSchema())
     client.tables.Get.side_effect = HttpError(response={'status': '404'},
                                               url='',
                                               content='')
     client.tables.Insert.return_value = table
     create_disposition = beam.io.BigQueryDisposition.CREATE_IF_NEEDED
     with beam.io.BigQuerySink(
             'project:dataset.table',
             schema='somefield:INTEGER',
             create_disposition=create_disposition).writer(client):
         pass
     self.assertTrue(client.tables.Get.called)
     self.assertTrue(client.tables.Insert.called)
def run(argv=None, save_main_session=True):
  """Main entry point; defines and runs the wordcount pipeline."""
  parser = argparse.ArgumentParser()
  parser.add_argument(
      '--input',
      dest='input',
      default='/home/shravan/Desktop/gcp_files/2020-10-02-11-34-19-EA6C5E314B70B157',
      help='Input file to process.')
  parser.add_argument(
      '--output',
      dest='output',
      required=False,
      default='output',
      help='Output file to write results to.')
  known_args, pipeline_args = parser.parse_known_args(argv)

  # We use the save_main_session option because one or more DoFn's in this
  # workflow rely on global context (e.g., a module imported at module level).
  pipeline_options = PipelineOptions(pipeline_args)
  pipeline_options.view_as(SetupOptions).save_main_session = save_main_session

  # The pipeline will be run on exiting the with block.
  with beam.Pipeline(options=pipeline_options) as p:
    """
    quotes = p | beam.Create([
      {
        'source': 'Mahatma Gandhi', 'quote': 'My life is my message.'
      }
    ])
    """
    var = AwsLogParser()
    quotes = p | beam.Create(var.my_json())

    table_spec = bigquery.TableReference(
        projectId='justlikethat-294122',
        datasetId='log_analysis',
        tableId='quotes_2')

    table_schema = 'source:STRING,  quote:STRING'

    quotes | beam.io.WriteToBigQuery(
        table_spec,
        schema=table_schema,
        write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE,
        create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED)
    def test_existing_sample_table(self):
        args = self._make_args([
            '--append', 'False', '--output_table', 'project:dataset.table',
            '--sharding_config_path',
            'gcp_variant_transforms/testing/data/sharding_configs/'
            'residual_at_end.yaml'
        ])

        client = mock.Mock()
        client.tables.Get.return_value = bigquery.Table(
            tableReference=bigquery.TableReference(
                projectId='project',
                datasetId='dataset',
                tableId='table__sample_info'))
        with self.assertRaisesRegexp(
                ValueError,
                'project:dataset.table__sample_info already exists'):
            self._options.validate(args, client)
Example #14
 def _create_table(self,
                   project_id,
                   dataset_id,
                   table_id,
                   schema,
                   additional_parameters=None):
     additional_parameters = additional_parameters or {}
     table = bigquery.Table(tableReference=bigquery.TableReference(
         projectId=project_id, datasetId=dataset_id, tableId=table_id),
                            schema=schema,
                            **additional_parameters)
     request = bigquery.BigqueryTablesInsertRequest(projectId=project_id,
                                                    datasetId=dataset_id,
                                                    table=table)
     response = self.client.tables.Insert(request)
     logging.debug("Created the table with id %s", table_id)
     # The response is a bigquery.Table instance.
     return response
 def test_table_empty_and_write_disposition_empty(self):
     client = mock.Mock()
     table = bigquery.Table(tableReference=bigquery.TableReference(
         projectId='project', datasetId='dataset', tableId='table'),
                            schema=bigquery.TableSchema())
     client.tables.Get.return_value = table
     client.tabledata.List.return_value = bigquery.TableDataList(
         totalRows=0)
     client.tables.Insert.return_value = table
     write_disposition = beam.io.BigQueryDisposition.WRITE_EMPTY
     with beam.io.BigQuerySink(
             'project:dataset.table',
             write_disposition=write_disposition).writer(client):
         pass
     self.assertTrue(client.tables.Get.called)
     self.assertTrue(client.tabledata.List.called)
     self.assertFalse(client.tables.Delete.called)
     self.assertFalse(client.tables.Insert.called)
 def create_table_new_types(self, table_name):
     table_schema = bigquery.TableSchema()
     table_field = bigquery.TableFieldSchema()
     table_field.name = 'bytes'
     table_field.type = 'BYTES'
     table_schema.fields.append(table_field)
     table_field = bigquery.TableFieldSchema()
     table_field.name = 'date'
     table_field.type = 'DATE'
     table_schema.fields.append(table_field)
     table_field = bigquery.TableFieldSchema()
     table_field.name = 'time'
     table_field.type = 'TIME'
     table_schema.fields.append(table_field)
     table = bigquery.Table(tableReference=bigquery.TableReference(
         projectId=self.project,
         datasetId=self.dataset_id,
         tableId=table_name),
                            schema=table_schema)
     request = bigquery.BigqueryTablesInsertRequest(
         projectId=self.project, datasetId=self.dataset_id, table=table)
     self.bigquery_client.client.tables.Insert(request)
     table_data = [{
         'bytes': b'xyw',
         'date': '2011-01-01',
         'time': '23:59:59.999999'
     }, {
         'bytes': b'abc',
         'date': '2000-01-01',
         'time': '00:00:00'
     }, {
         'bytes': b'\xe4\xbd\xa0\xe5\xa5\xbd',
         'date': '3000-12-31',
         'time': '23:59:59'
     }, {
         'bytes': b'\xab\xac\xad',
         'date': '2000-01-01',
         'time': '00:00:00'
     }]
     # bigquery client expects base64 encoded bytes
     for row in table_data:
         row['bytes'] = base64.b64encode(row['bytes']).decode('utf-8')
     self.bigquery_client.insert_rows(self.project, self.dataset_id,
                                      table_name, table_data)
    def test_table_exist(self):
        client = mock.Mock()
        client.tables.Get.return_value = bigquery.Table(
            tableReference=bigquery.TableReference(
                projectId='project', datasetId='dataset', tableId='table'))
        self.assertEqual(
            bigquery_util.table_exist(client, 'project', 'dataset', 'table'),
            True)

        client.tables.Get.side_effect = exceptions.HttpError(
            response={'status': '404'}, url='', content='')
        self.assertEqual(
            bigquery_util.table_exist(client, 'project', 'dataset', 'table'),
            False)

        client.tables.Get.side_effect = exceptions.HttpError(
            response={'status': '401'}, url='', content='')
        self.assertRaises(exceptions.HttpError, bigquery_util.table_exist,
                          client, 'project', 'dataset', 'table')
def run(argv=None, save_main_session=True):
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--input',
        dest='input',
        default=
        '/home/shravan/Desktop/gcp_files/2020-10-02-11-34-19-EA6C5E314B70B157',
        help='Input file to process.')
    parser.add_argument('--output',
                        dest='output',
                        required=False,
                        default='output',
                        help='Output file to write results to.')
    known_args, pipeline_args = parser.parse_known_args(argv)

    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(
        SetupOptions).save_main_session = save_main_session

    with beam.Pipeline(options=pipeline_options) as p:

        quotes = p | beam.Create([{
            'source': 'Mahatma Gandhi',
            'quote': 'My life is my message.'
        }, {
            'source': 'Mahatma Gandhi',
            'quote': 'My life is my message.'
        }])

        table_spec = bigquery.TableReference(projectId='justlikethat-294122',
                                             datasetId='mydataset',
                                             tableId='quotes')

        table_schema = 'source:STRING,  quote:STRING'

        quotes | beam.io.WriteToBigQuery(
            table_spec,
            schema=table_schema,
            write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE,
            create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED)

        quotes | WriteToText(known_args.output)
Example #19
 def create_table(cls, table_name):
     table_schema = bigquery.TableSchema()
     table_field = bigquery.TableFieldSchema()
     table_field.name = 'number'
     table_field.type = 'INTEGER'
     table_schema.fields.append(table_field)
     table_field = bigquery.TableFieldSchema()
     table_field.name = 'str'
     table_field.type = 'STRING'
     table_schema.fields.append(table_field)
     table = bigquery.Table(tableReference=bigquery.TableReference(
         projectId=cls.project,
         datasetId=cls.dataset_id,
         tableId=table_name),
                            schema=table_schema)
     request = bigquery.BigqueryTablesInsertRequest(
         projectId=cls.project, datasetId=cls.dataset_id, table=table)
     cls.bigquery_client.client.tables.Insert(request)
     cls.bigquery_client.insert_rows(cls.project, cls.dataset_id,
                                     table_name, cls.TABLE_DATA)
Example #20
    def test_dofn_client_start_bundle_called(self):
        client = mock.Mock()
        client.tables.Get.return_value = bigquery.Table(
            tableReference=bigquery.TableReference(projectId='project_id',
                                                   datasetId='dataset_id',
                                                   tableId='table_id'))
        create_disposition = beam.io.BigQueryDisposition.CREATE_NEVER
        write_disposition = beam.io.BigQueryDisposition.WRITE_APPEND
        fn = beam.io.gcp.bigquery.BigQueryWriteFn(
            table_id='table_id',
            dataset_id='dataset_id',
            project_id='project_id',
            batch_size=2,
            schema='month:INTEGER',
            create_disposition=create_disposition,
            write_disposition=write_disposition,
            client=client)

        fn.start_bundle()
        self.assertTrue(client.tables.Get.called)
Example #21
  def test_dofn_client_process_performs_batching(self):
    client = mock.Mock()
    client.tables.Get.return_value = bigquery.Table(
        tableReference=bigquery.TableReference(
            projectId='project_id', datasetId='dataset_id', tableId='table_id'))
    client.tabledata.InsertAll.return_value = \
      bigquery.TableDataInsertAllResponse(insertErrors=[])
    create_disposition = beam.io.BigQueryDisposition.CREATE_NEVER
    write_disposition = beam.io.BigQueryDisposition.WRITE_APPEND

    fn = beam.io.gcp.bigquery.BigQueryWriteFn(
        batch_size=2,
        create_disposition=create_disposition,
        write_disposition=write_disposition,
        kms_key=None,
        test_client=client)

    fn.process(('project_id:dataset_id.table_id', {'month': 1}))

    # InsertRows not called as batch size is not hit yet
    self.assertFalse(client.tabledata.InsertAll.called)
Example #22
    def test_rows_are_written(self):
        client = mock.Mock()
        table = bigquery.Table(tableReference=bigquery.TableReference(
            projectId='project', datasetId='dataset', tableId='table'),
                               schema=bigquery.TableSchema())
        client.tables.Get.return_value = table
        write_disposition = beam.io.BigQueryDisposition.WRITE_APPEND

        client.insert_rows_json.return_value = []

        with beam.io.BigQuerySink(
                'project:dataset.table',
                write_disposition=write_disposition).writer(client) as writer:
            writer.Write({'i': 1, 'b': True, 's': 'abc', 'f': 3.14})

        sample_row = {'i': 1, 'b': True, 's': 'abc', 'f': 3.14}
        client.insert_rows_json.assert_called_with(gcp_bigquery.TableReference(
            gcp_bigquery.DatasetReference('project', 'dataset'), 'table'),
                                                   json_rows=[sample_row],
                                                   row_ids=['_1'],
                                                   skip_invalid_rows=True)
Example #23
def run():
    argv = [
        '--project={0}'.format(PROJECT),
        '--job_name={}'.format('wcount' + get_time()), '--save_main_session',
        '--region=europe-west1', '--requirements_file=requirements.txt',
        '--staging_location={}/staging/'.format(BUCKET),
        '--temp_location={}/staging/'.format(BUCKET), '--runner=DataflowRunner'
    ]

    p = beam.Pipeline(argv=argv)  # sys.argv
    # input = ''
    table_spec = bigquery.TableReference(projectId='patstat2016a',
                                         datasetId='raw',
                                         tableId='tls2012_cp')
    output_prefix = '{}/countWord/fullCW'.format(BUCKET)

    query = 'SELECT\
            appln_title_lg,\
            appln_title,\
            appln_auth,\
            year\
            FROM\
            `patstat2016a.raw.tls2012_cp`\
            WHERE\
            appln_title_lg="en"'

    (p
     | 'ReadTable' >> beam.io.Read(
         beam.io.BigQuerySource(query=query,
                                use_standard_sql=True))  # more efficient
     # |'ReadTable' >> beam.io.Read(beam.io.BigQuerySource(table_spec))
     # |'FilterLg' >> beam.Filter(lambda line: line['appln_title_lg']=='en')
     | 'FormatPairTitle' >> beam.Map(lambda line: formatPairTitle(line))
     | 'FormatPairWord' >> beam.ParDo(SplitAndPairWithKey())
     | 'GroupAndSum' >> beam.CombinePerKey(sum)
     # Tuple-unpacking lambdas are Python 2 only; index into the (key, count) pair instead.
     | 'FilterSW' >> beam.Filter(
         lambda word_count: filterStopWords(word_count[0][-1]))
     | 'FormatCSV' >> beam.Map(lambda pair: formatCSV(pair))
     | 'Write' >> beam.io.WriteToText(output_prefix, file_name_suffix='.txt'))

    p.run()
Example #24
def run(argv=None, save_main_session=True):
    parser = argparse.ArgumentParser()

    parser.add_argument(
        '--input',
        dest='input',
        default=
        '/home/shravan/Desktop/gcp_files/2020-10-02-11-34-19-EA6C5E314B70B157',
        help='Input file to process.')

    parser.add_argument('--output',
                        dest='output',
                        required=False,
                        default='output',
                        help='Output file to write results to.')

    known_args, pipeline_args = parser.parse_known_args(argv)

    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(
        SetupOptions).save_main_session = save_main_session

    with beam.Pipeline(options=pipeline_options) as p:

        utility = Utility()
        quotes = (p
                  | 'Read' >> ReadFromText(known_args.input)
                  | 'ParDo Dealings' >> beam.ParDo(Split()))

        table_spec = bigquery.TableReference(projectId='justlikethat-294122',
                                             datasetId='log_analysis',
                                             tableId='quotes')

        table_schema = 'source:STRING,  quote:STRING'

        quotes | beam.io.WriteToBigQuery(
            table_spec,
            table_schema,
            write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE,
            create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED)
Example #25
    def __init__(self):

        self.table_spec = bigquery.TableReference(
            projectId='justlikethat-294122',
            datasetId='medium_dataset',
            tableId='mytable')

        # Use implicit string concatenation so the continuation lines do not
        # leak leading whitespace into the field names.
        self.schema = ('bucket:string,date:datetime,remote_ip:string,operation:string,'
                       'key:string,request_uri:string,http_status:string,error_code:string,'
                       'bytes_sent:string,object_size:string,total_time:string,turn_aroundtime:string,'
                       'referrer:string')

        self.log = {
            # 'bucket_owner': '',
            'bucket': '',
            'date': '',
            #'time_offset': '',
            'remote_ip': '',
            #'request_arn': '',
            #'request_id': '',
            'operation': '',
            'key': '',
            'request_uri': '',
            'http_status': '',
            'error_code': '',
            'bytes_sent': '',
            'object_size': '',
            'total_time': '',
            'turn_aroundtime': '',
            'referrer': '',
            #'user_agent': '',
            #'version_id': '',
            #'host_id': '',
            #'signature_version': '',
            #'cipher_suite': '',
            #'authentication_type': '',
            #'host_header': '',
            #'tls_version': ''
        }
  def test_dofn_client_process_flush_called(self):
    client = mock.Mock()
    client.tables.Get.return_value = bigquery.Table(
        tableReference=bigquery.TableReference(
            projectId='project_id', datasetId='dataset_id', tableId='table_id'))
    client.tabledata.InsertAll.return_value = (
        bigquery.TableDataInsertAllResponse(insertErrors=[]))
    create_disposition = beam.io.BigQueryDisposition.CREATE_NEVER
    write_disposition = beam.io.BigQueryDisposition.WRITE_APPEND

    fn = beam.io.gcp.bigquery.BigQueryWriteFn(
        batch_size=2,
        create_disposition=create_disposition,
        write_disposition=write_disposition,
        kms_key=None,
        test_client=client)

    fn.start_bundle()
    fn.process(('project_id:dataset_id.table_id', ({'month': 1}, 'insertid1')))
    fn.process(('project_id:dataset_id.table_id', ({'month': 2}, 'insertid2')))
    # InsertRows called as batch size is hit
    self.assertTrue(client.tabledata.InsertAll.called)
Example #27
def run(argv=None, save_main_session=True):
    """Main entry point; defines and runs the wordcount pipeline."""
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--input',
        dest='input',
        default=
        '/home/shravan/Desktop/gcp_files/2020-10-02-11-34-19-EA6C5E314B70B157',
        help='Input file to process.')
    parser.add_argument('--output',
                        dest='output',
                        required=False,
                        default='output',
                        help='Output file to write results to.')
    known_args, pipeline_args = parser.parse_known_args(argv)

    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(
        SetupOptions).save_main_session = save_main_session

    with beam.Pipeline(options=pipeline_options) as p:

        #obj = Utility()
        table_spec = bigquery.TableReference(projectId='justlikethat-294122',
                                             datasetId='log_analysis',
                                             tableId='quotes')

        table_schema = 'source:STRING,  quote:STRING'

        data_ingestion = dataingestion()
        (p | 'Read from a File' >> beam.io.ReadFromText(known_args.input)
         | 'String To BigQuery Row' >>
         beam.Map(lambda s: data_ingestion.parse_method(s))
         | 'Write to BigQuery' >> WriteToBigQuery(
             table_spec,
             schema='source:STRING,  quote:STRING',
             create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
             write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE))
        """
 def create_table(self, table_name):
     table_schema = bigquery.TableSchema()
     table_field = bigquery.TableFieldSchema()
     table_field.name = 'bytes'
     table_field.type = 'BYTES'
     table_schema.fields.append(table_field)
     table_field = bigquery.TableFieldSchema()
     table_field.name = 'date'
     table_field.type = 'DATE'
     table_schema.fields.append(table_field)
     table_field = bigquery.TableFieldSchema()
     table_field.name = 'time'
     table_field.type = 'TIME'
     table_schema.fields.append(table_field)
     table = bigquery.Table(tableReference=bigquery.TableReference(
         projectId=self.project,
         datasetId=self.dataset_id,
         tableId=table_name),
                            schema=table_schema)
     request = bigquery.BigqueryTablesInsertRequest(
         projectId=self.project, datasetId=self.dataset_id, table=table)
     self.bigquery_client.client.tables.Insert(request)
Example #29
  def test_dofn_client_start_bundle_called(self):
    client = mock.Mock()
    client.tables.Get.return_value = bigquery.Table(
        tableReference=bigquery.TableReference(
            projectId='project_id', datasetId='dataset_id', tableId='table_id'))
    create_disposition = beam.io.BigQueryDisposition.CREATE_NEVER
    write_disposition = beam.io.BigQueryDisposition.WRITE_APPEND
    schema = {'fields': [
        {'name': 'month', 'type': 'INTEGER', 'mode': 'NULLABLE'}]}

    fn = beam.io.gcp.bigquery.BigQueryWriteFn(
        table_id='table_id',
        dataset_id='dataset_id',
        project_id='project_id',
        batch_size=2,
        schema=schema,
        create_disposition=create_disposition,
        write_disposition=write_disposition,
        kms_key=None,
        test_client=client)

    fn.start_bundle()
    self.assertTrue(client.tables.Get.called)
    def test_rows_are_written(self):
        client = mock.Mock()
        table = bigquery.Table(tableReference=bigquery.TableReference(
            projectId='project', datasetId='dataset', tableId='table'),
                               schema=bigquery.TableSchema())
        client.tables.Get.return_value = table
        write_disposition = beam.io.BigQueryDisposition.WRITE_APPEND

        insert_response = mock.Mock()
        insert_response.insertErrors = []
        client.tabledata.InsertAll.return_value = insert_response

        with beam.io.BigQuerySink(
                'project:dataset.table',
                write_disposition=write_disposition).writer(client) as writer:
            writer.Write({'i': 1, 'b': True, 's': 'abc', 'f': 3.14})

        sample_row = {'i': 1, 'b': True, 's': 'abc', 'f': 3.14}
        expected_rows = []
        json_object = bigquery.JsonObject()
        for k, v in iteritems(sample_row):
            json_object.additionalProperties.append(
                bigquery.JsonObject.AdditionalProperty(key=k,
                                                       value=to_json_value(v)))
        expected_rows.append(
            bigquery.TableDataInsertAllRequest.RowsValueListEntry(
                insertId='_1',  # First row ID generated with prefix ''
                json=json_object))
        client.tabledata.InsertAll.assert_called_with(
            bigquery.BigqueryTabledataInsertAllRequest(
                projectId='project',
                datasetId='dataset',
                tableId='table',
                tableDataInsertAllRequest=bigquery.TableDataInsertAllRequest(
                    rows=expected_rows,
                    skipInvalidRows=False,
                )))