def test_one_job_fails_all_jobs_fail(self):

    # If one of the import jobs fails, then other jobs must not be performed.
    # This is to avoid reinsertion of some records when a pipeline fails and
    # is rerun.
    output_table_1 = '%s%s' % (self.output_table, 1)
    output_table_2 = '%s%s' % (self.output_table, 2)

    self.bigquery_client.get_or_create_table(
        self.project,
        self.dataset_id,
        output_table_1.split('.')[1],
        bigquery_tools.parse_table_schema_from_json(self.BIG_QUERY_SCHEMA),
        None,
        None)
    self.bigquery_client.get_or_create_table(
        self.project,
        self.dataset_id,
        output_table_2.split('.')[1],
        bigquery_tools.parse_table_schema_from_json(self.BIG_QUERY_SCHEMA_2),
        None,
        None)

    pipeline_verifiers = [
        BigqueryFullResultMatcher(
            project=self.project,
            query="SELECT name, language FROM %s" % output_table_1,
            data=[]),
        BigqueryFullResultMatcher(
            project=self.project,
            query="SELECT name, foundation FROM %s" % output_table_2,
            data=[])
    ]

    args = self.test_pipeline.get_full_options_as_args(
        experiments='use_beam_bq_sink')

    with self.assertRaises(Exception):
      # The pipeline below fails because neither a schema nor SCHEMA_AUTODETECT
      # is specified.
      with beam.Pipeline(argv=args) as p:
        input = p | beam.Create(_ELEMENTS)
        input2 = p | "Broken record" >> beam.Create(['language_broken_record'])

        input = (input, input2) | beam.Flatten()

        _ = (
            input | "WriteWithMultipleDests" >> bigquery.WriteToBigQuery(
                table=lambda x:
                (output_table_1 if 'language' in x else output_table_2),
                create_disposition=(
                    beam.io.BigQueryDisposition.CREATE_IF_NEEDED),
                write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND,
                temp_file_format=bigquery_tools.FileFormat.JSON))

    hamcrest_assert(p, all_of(*pipeline_verifiers))
Example #2
def run_bq_pipeline(argv=None):
  """Run the sample BigQuery pipeline.

  Args:
    argv: Arguments to the run function.
  """
  parser = argparse.ArgumentParser()
  parser.add_argument('--query', required=True,
                      help='Query to process for the table.')
  parser.add_argument('--output', required=True,
                      help='Output BQ table to write results to.')
  parser.add_argument('--output_schema', dest='output_schema', required=True,
                      help='Schema for output BQ table.')
  parser.add_argument('--use_standard_sql', action='store_true',
                      dest='use_standard_sql',
                      help='Use standard SQL syntax for the query.')
  known_args, pipeline_args = parser.parse_known_args(argv)

  table_schema = parse_table_schema_from_json(known_args.output_schema)

  p = TestPipeline(options=PipelineOptions(pipeline_args))

  # pylint: disable=expression-not-assigned
  # pylint: disable=bad-continuation
  (p | 'read' >> beam.io.Read(beam.io.BigQuerySource(
      query=known_args.query, use_standard_sql=known_args.use_standard_sql))
   | 'write' >> beam.io.Write(beam.io.BigQuerySink(
           known_args.output,
           schema=table_schema,
           create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
           write_disposition=beam.io.BigQueryDisposition.WRITE_EMPTY)))

  result = p.run()
  result.wait_until_finish()
Example #3
 def test_parse_table_schema_from_json(self):
   string_field = bigquery.TableFieldSchema(
       name='s', type='STRING', mode='NULLABLE', description='s description')
   number_field = bigquery.TableFieldSchema(
       name='n', type='INTEGER', mode='REQUIRED', description='n description')
   record_field = bigquery.TableFieldSchema(
       name='r',
       type='RECORD',
       mode='REQUIRED',
       description='r description',
       fields=[string_field, number_field])
   expected_schema = bigquery.TableSchema(fields=[record_field])
   json_str = json.dumps({
       'fields': [{
           'name': 'r',
           'type': 'RECORD',
           'mode': 'REQUIRED',
           'description': 'r description',
           'fields': [{
               'name': 's',
               'type': 'STRING',
               'mode': 'NULLABLE',
               'description': 's description'
           }, {
               'name': 'n',
               'type': 'INTEGER',
               'mode': 'REQUIRED',
               'description': 'n description'
           }]
       }]
   })
   self.assertEqual(parse_table_schema_from_json(json_str), expected_schema)
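
The test above covers a nested RECORD schema. As a quick point of reference, a flat schema round-trips the same way; this is a minimal sketch (the field names are illustrative, not taken from the test):

import json

from apache_beam.io.gcp.bigquery_tools import parse_table_schema_from_json

# Illustrative flat schema.
flat_json = json.dumps({
    'fields': [
        {'name': 'name', 'type': 'STRING', 'mode': 'NULLABLE'},
        {'name': 'count', 'type': 'INTEGER', 'mode': 'REQUIRED'},
    ]
})

schema = parse_table_schema_from_json(flat_json)
# The result is a bigquery.TableSchema whose entries are TableFieldSchema objects.
print([(f.name, f.type, f.mode) for f in schema.fields])
# -> [('name', 'STRING', 'NULLABLE'), ('count', 'INTEGER', 'REQUIRED')]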
Example #4
    def setUp(self):
        self.test_pipeline = TestPipeline(is_integration_test=True)
        self.project = self.test_pipeline.get_option('project')

        # Set up PubSub environment.
        from google.cloud import pubsub
        self.pub_client = pubsub.PublisherClient()
        self.pubsub_setup_client = PubSubSetupClient(project=self.project)

        self.input_topic = self.pubsub_setup_client.create_topic(INPUT_TOPIC)
        self.output_topic = self.pubsub_setup_client.create_topic(OUTPUT_TOPIC)

        self.input_sub = self.pubsub_setup_client.create_subscription(
            self.input_topic, INPUT_SUB)
        self.output_sub = self.pubsub_setup_client.create_subscription(
            self.output_topic, OUTPUT_SUB)

        # Set up BigQuery tables
        self.dataset_ref = utils.create_bq_dataset(self.project,
                                                   OUTPUT_DATASET)
        self.bq_wrapper = BigQueryWrapper()
        table_schema = parse_table_schema_from_json(schemas.get_test_schema())

        def _create_table(table_id, schema):
            return self.bq_wrapper.get_or_create_table(
                project_id=self.project,
                dataset_id=self.dataset_ref.dataset_id,
                table_id=table_id,
                schema=schema,
                create_disposition='CREATE_IF_NEEDED',
                write_disposition='WRITE_APPEND')

        self.table_ref = _create_table(OUTPUT_TABLE, table_schema)
Example #5
def import_json_bq_schema():
    path = os.path.join(
        os.path.dirname(inspect.getfile(inspect.currentframe())),
        'mimic_cxr_bigquery_labels_schema.json')

    with open(path) as fp:
        return parse_table_schema_from_json(fp.read())
Example #6
    def _create_input_data(self):
        """Runs an additional pipeline which creates test data and waits for its
        completion.
        """
        SCHEMA = parse_table_schema_from_json(
            '{"fields": [{"name": "data", "type": "BYTES"}]}')

        def format_record(record):
            # Synthetic Source returns each record as a key/value pair, so we
            # only keep the value part (record[1]).
            import base64
            return {'data': base64.b64encode(record[1])}

        with TestPipeline() as p:
            (  # pylint: disable=expression-not-assigned
                p
                | 'Produce rows' >> Read(
                    SyntheticSource(self.parse_synthetic_source_options()))
                | 'Format' >> Map(format_record)
                | 'Write to BigQuery' >> WriteToBigQuery(
                    dataset=self.input_dataset,
                    table=self.input_table,
                    schema=SCHEMA,
                    create_disposition=BigQueryDisposition.CREATE_IF_NEEDED,
                    write_disposition=BigQueryDisposition.WRITE_EMPTY))
Example #7
    def parse_method(self, string_input):
        """This method translates a single line of comma separated values to a
        dictionary which can be loaded into BigQuery.
        """
        # Strip out return characters and quote characters.
        schema = parse_table_schema_from_json(self.schema_str)

        field_map = [f for f in schema.fields]

        # Use a CSV Reader which can handle quoted strings etc.
        reader = csv.reader(string_input.split('\n'))
        for csv_row in reader:
            month = '01'
            day = '01'
            year = csv_row[2]

            row = {}
            i = 0
            for value in csv_row:
                if field_map[i].type == 'DATE':
                    # Format the date to YYYY-MM-DD format which BigQuery
                    # accepts.
                    value = '-'.join((year, month, day))

                row[field_map[i].name] = value
                i += 1

            return row
Example #8
def run_bq_pipeline(argv=None):
    """Run the sample BigQuery pipeline.

  Args:
    argv: Arguments to the run function.
  """
    parser = argparse.ArgumentParser()
    parser.add_argument('--query',
                        required=True,
                        help='Query to process for the table.')
    parser.add_argument('--output',
                        required=True,
                        help='Output BQ table to write results to.')
    parser.add_argument('--output_schema',
                        dest='output_schema',
                        required=True,
                        help='Schema for output BQ table.')
    parser.add_argument('--use_standard_sql',
                        action='store_true',
                        dest='use_standard_sql',
                        help='Use standard SQL syntax for the query.')
    parser.add_argument('--kms_key',
                        default=None,
                        help='Use this Cloud KMS key with BigQuery.')
    parser.add_argument('--native',
                        default=False,
                        action='store_true',
                        help='Use NativeSources and Sinks.')
    known_args, pipeline_args = parser.parse_known_args(argv)

    table_schema = parse_table_schema_from_json(known_args.output_schema)
    kms_key = known_args.kms_key

    p = TestPipeline(options=PipelineOptions(pipeline_args))

    # Note to future modifiers: Keep using BigQuerySource if known_args.native is
    # True.
    data = p | 'read' >> beam.io.Read(
        beam.io.BigQuerySource(query=known_args.query,
                               use_standard_sql=known_args.use_standard_sql,
                               kms_key=kms_key))
    if known_args.native:
        _ = data | 'write' >> beam.io.Write(
            beam.io.BigQuerySink(
                known_args.output,
                schema=table_schema,
                create_disposition=beam.io.BigQueryDisposition.
                CREATE_IF_NEEDED,
                write_disposition=beam.io.BigQueryDisposition.WRITE_EMPTY,
                kms_key=kms_key))
    else:
        _ = data | 'write' >> beam.io.WriteToBigQuery(
            known_args.output,
            schema=table_schema,
            create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
            write_disposition=beam.io.BigQueryDisposition.WRITE_EMPTY,
            kms_key=kms_key)

    result = p.run()
    result.wait_until_finish()
Example #9
  def test_one_job_fails_all_jobs_fail(self):

    # If one of the import jobs fails, then other jobs must not be performed.
    # This is to avoid reinsertion of some records when a pipeline fails and
    # is rerun.
    output_table_1 = '%s%s' % (self.output_table, 1)
    output_table_2 = '%s%s' % (self.output_table, 2)

    self.bigquery_client.get_or_create_table(
        self.project, self.dataset_id, output_table_1.split('.')[1],
        bigquery_tools.parse_table_schema_from_json(self.BIG_QUERY_SCHEMA),
        None, None)
    self.bigquery_client.get_or_create_table(
        self.project, self.dataset_id, output_table_2.split('.')[1],
        bigquery_tools.parse_table_schema_from_json(self.BIG_QUERY_SCHEMA_2),
        None, None)

    pipeline_verifiers = [
        BigqueryFullResultMatcher(
            project=self.project,
            query="SELECT * FROM %s" % output_table_1,
            data=[]),
        BigqueryFullResultMatcher(
            project=self.project,
            query="SELECT * FROM %s" % output_table_2,
            data=[])]

    args = self.test_pipeline.get_full_options_as_args(
        experiments='use_beam_bq_sink')

    with self.assertRaises(Exception):
      with beam.Pipeline(argv=args) as p:
        input = p | beam.Create(_ELEMENTS)
        input2 = p | "Broken record" >> beam.Create(['language_broken_record'])

        input = (input, input2) | beam.Flatten()

        _ = (input |
             "WriteWithMultipleDests" >> bigquery.WriteToBigQuery(
                 table=lambda x: (output_table_1
                                  if 'language' in x
                                  else output_table_2),
                 create_disposition=(
                     beam.io.BigQueryDisposition.CREATE_IF_NEEDED),
                 write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND))

    hamcrest_assert(p, all_of(*pipeline_verifiers))
Example #10
 def _validate_schema(self, expected_fields, actual_schema):
   super()._validate_schema(expected_fields, actual_schema)
   json_schema = schema_converter.convert_table_schema_to_json_bq_schema(
       actual_schema)
   # Beam expects the schema to be a dict whose 'fields' item is the list of
   # columns, while the 'bq mk' command expects just the bare list of fields.
   updated_json_schema = json.dumps({"fields": json.loads(json_schema)})
   schema_from_json = parse_table_schema_from_json(updated_json_schema)
   self.assertEqual(schema_from_json, actual_schema)
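
For reference, the same wrapping step in isolation, as a minimal sketch with an illustrative field list:

import json

from apache_beam.io.gcp.bigquery_tools import parse_table_schema_from_json

# 'bq mk' style schema: a bare list of field definitions.
bq_mk_fields = [{'name': 'name', 'type': 'STRING', 'mode': 'NULLABLE'}]

# Beam style: wrap the list in an object under 'fields' before parsing.
schema = parse_table_schema_from_json(json.dumps({'fields': bq_mk_fields}))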
Example #11
def run(argv=None):
    """The main function which creates the pipeline and runs it."""
    parser = argparse.ArgumentParser()
    # Here we add some specific command line arguments we expect. Specifically
    # we have the input file to load and the output table to write to.
    parser.add_argument(
        '--input', dest='input', required=False,
        help='Input file to read.  This can be a local file or '
             'a file in a Google Storage Bucket.',
        # This example file contains a total of only 10 lines.
        # It is useful for developing on a small set of data
        default='gs://spls/gsp290/data_files/head_usa_names.csv')
    # This defaults to the temp dataset in your BigQuery project.  You'll have
    # to create the temp dataset yourself using bq mk temp
    parser.add_argument('--output', dest='output', required=False,
                        help='Output BQ table to write results to.',
                        default='lake.usa_names_transformed')

    # Parse arguments from the command line.
    known_args, pipeline_args = parser.parse_known_args(argv)
    # DataTransformation is a class we built in this script to hold the logic for
    # transforming the file into a BigQuery table.
    data_ingestion = DataTransformation()

    # Initiate the pipeline using the pipeline arguments passed in from the
    # command line.  This includes information like where Dataflow should
    # store temp files, and what the project id is.
    p = beam.Pipeline(options=PipelineOptions(pipeline_args))
    schema = parse_table_schema_from_json(data_ingestion.schema_str)

    (p
     # Read the file.  This is the source of the pipeline.  All further
     # processing starts with lines read from the file.  We use the input
     # argument from the command line.  We also skip the first line which is a
     # header row.
     | 'Read From Text' >> beam.io.ReadFromText(known_args.input,
                                                skip_header_lines=1)
     # This stage of the pipeline translates a single CSV row, read as a
     # string, into a dictionary object consumable by BigQuery.
     # It refers to a function we have written.  This function will
     # be run in parallel on different workers using input from the
     # previous stage of the pipeline.
     | 'String to BigQuery Row' >> beam.Map(lambda s:
                                            data_ingestion.parse_method(s))
     | 'Write to BigQuery' >> beam.io.Write(
        beam.io.BigQuerySink(
            # The table name is a required argument for the BigQuery sink.
            # In this case we use the value passed in from the command line.
            known_args.output,
            # Here we use the JSON schema read in from a JSON file.
            # Specifying the schema allows the API to create the table correctly if it does not yet exist.
            schema=schema,
            # Creates the table in BigQuery if it does not yet exist.
            create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
            # Deletes all data in the BigQuery table before writing.
            write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE)))
    p.run().wait_until_finish()
Example #12
  def get_table_schema(schema):
    """Transform the table schema into a bigquery.TableSchema instance.

    Args:
      schema: The schema to be used if the BigQuery table to write has to be
        created. This is a dictionary object created in the WriteToBigQuery
        transform.
    Returns:
      table_schema: The schema to be used if the BigQuery table to write has
         to be created but in the bigquery.TableSchema format.
    """
    if schema is None:
      return schema
    elif isinstance(schema, (str, unicode)):
      return bigquery_tools.parse_table_schema_from_json(schema)
    elif isinstance(schema, dict):
      return bigquery_tools.parse_table_schema_from_json(json.dumps(schema))
    else:
      raise TypeError('Unexpected schema argument: %s.' % schema)
Example #13
  def get_table_schema(schema):
    """Transform the table schema into a bigquery.TableSchema instance.

    Args:
      schema: The schema to be used if the BigQuery table to write has to be
        created. This is a dictionary object created in the WriteToBigQuery
        transform.
    Returns:
      table_schema: The schema to be used if the BigQuery table to write has
         to be created but in the bigquery.TableSchema format.
    """
    if schema is None:
      return schema
    elif isinstance(schema, (str, unicode)):
      return bigquery_tools.parse_table_schema_from_json(schema)
    elif isinstance(schema, dict):
      return bigquery_tools.parse_table_schema_from_json(json.dumps(schema))
    else:
      raise TypeError('Unexpected schema argument: %s.' % schema)
Example #14
    def parse_method(self, string_input):
        """This method translates a single line of comma separated values to a
        dictionary which can be loaded into BigQuery.

        Args:
            string_input: A comma separated list of values in the form of
            state_abbreviation,gender,year,name,count_of_babies,dataset_created_date
                example string_input: KS,F,1923,Dorothy,654,11/28/2016

        Returns:
            A dict mapping BigQuery column names as keys to the corresponding value
            parsed from string_input. The only transformation applied is converting
            the source year into the YYYY-MM-DD format BigQuery expects for DATE
            columns.

                example output:
                      {'state': 'KS',
                       'gender': 'F',
                       'year': '1923-01-01', <- This is the BigQuery date format.
                       'name': 'Dorothy',
                       'number': '654',
                       'created_date': '11/28/2016'
                       }
        """
        # Strip out return characters and quote characters.
        schema = parse_table_schema_from_json(self.schema_str)

        field_map = [f for f in schema.fields]

        # Use a CSV Reader which can handle quoted strings etc.
        reader = csv.reader(string_input.split('\n'))
        for csv_row in reader:
            # Our source data only contains year, so default January 1st as the
            # month and day.
            month = '01'
            day = '01'
            # The year comes from our source data.
            year = csv_row[2]

            row = {}
            i = 0
            # Iterate over the values from our csv file, applying any transformation logic.
            for value in csv_row:
                # If the schema indicates this field is a date format, we must
                # transform the date from the source data into a format that
                # BigQuery can understand.
                if field_map[i].type == 'DATE':
                    # Format the date to YYYY-MM-DD format which BigQuery
                    # accepts.
                    value = '-'.join((year, month, day))

                row[field_map[i].name] = value
                i += 1

            return row
Example #15
def run_bq_pipeline(argv=None):
    """Run the sample BigQuery pipeline.

  Args:
    argv: Arguments to the run function.
  """
    parser = argparse.ArgumentParser()
    parser.add_argument('--query',
                        required=True,
                        help='Query to process for the table.')
    parser.add_argument('--output',
                        required=True,
                        help='Output BQ table to write results to.')
    parser.add_argument('--output_schema',
                        dest='output_schema',
                        required=True,
                        help='Schema for output BQ table.')
    parser.add_argument('--use_standard_sql',
                        action='store_true',
                        dest='use_standard_sql',
                        help='Use standard SQL syntax for the query.')
    parser.add_argument('--kms_key',
                        default=None,
                        help='Use this Cloud KMS key with BigQuery.')
    parser.add_argument('--bq_temp_location',
                        default=None,
                        help=('GCS bucket to use to store files for '
                              'loading data into BigQuery.'))
    known_args, pipeline_args = parser.parse_known_args(argv)

    table_schema = parse_table_schema_from_json(known_args.output_schema)
    kms_key = known_args.kms_key

    p = TestPipeline(options=PipelineOptions(pipeline_args))

    # pylint: disable=expression-not-assigned
    # pylint: disable=bad-continuation
    (p | 'read' >> beam.io.Read(
        beam.io.BigQuerySource(query=known_args.query,
                               use_standard_sql=known_args.use_standard_sql,
                               kms_key=kms_key))
     | 'write' >> beam.io.WriteToBigQuery(
         known_args.output,
         schema=table_schema,
         create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
         write_disposition=beam.io.BigQueryDisposition.WRITE_EMPTY,
         gs_location=known_args.bq_temp_location))

    result = p.run()
    result.wait_until_finish()
Example #16
def run(argv=None):
    known_args, pipeline_args = _parse_user_args(argv)

    options = get_pipeline_options(pipeline_args)

    # Load the schema file and wrap the bare field list in a "fields" object.
    with open(known_args.schema_path, "r") as schema_file:
        schema = '{"fields": ' + schema_file.read() + '}'

    schema = parse_table_schema_from_json(schema)

    with beam.Pipeline(options=options) as p:
        # Get message from pubsub and split it by identifier
        formated_messages = (
            p
            | "Read from PubSub" >> beam.io.ReadFromPubSub(known_args.topic)
            | "Windowing" >> beam.WindowInto(window.FixedWindows(30))
            | "Decoder" >> beam.Map(lambda e: e.decode())
            | "Split into List" >> beam.ParDo(SplitWords(",")))

        # Pipeline split:
        # 1. Write to FS
        # 2. Snooze for 10 sec, and change data locally

        # Write to FS
        writer_messages = (
            formated_messages
            | "Write to FS" >> beam.ParDo(WriteToFS())
            | "Get FS keys" >> beam.Map(lambda val: (val["uniqe_id"], val)))

        # Snooze for 10 sec, and change data locally
        do_something_that_takes_time = (
            formated_messages
            | "Snooze For 10 Seconds" >> beam.ParDo(Snooze())
            | "Add Data" >> beam.ParDo(ChangeData("changed!"))
            |
            "Get Update keys" >> beam.Map(lambda val: (val["uniqe_id"], val)))

        # Pipeline group by id and update data in FS after changed locally
        results = ((writer_messages, do_something_that_takes_time)
                   | "Group by key" >> beam.CoGroupByKey()
                   | "Update FS" >> beam.ParDo(UpdateToFS()))

        # Write updated data to Big Query
        (results
         | "Read Document From FS" >> beam.ParDo(ReadFromFS())
         | "Format For BQ" >> beam.ParDo(FormatForBQ())
         | "Write to BigQuery" >> beam.io.WriteToBigQuery("saar.messaging",
                                                          schema=schema))
Example #17
def parse_table_schema(schema):
    """
    Accepts a BigQuery table schema as a string, dict (from JSON), or
    bigquery.TableSchema, and returns a bigquery.TableSchema.

    String Format

    "[FIELD]:[DATA_TYPE],[FIELD]:[DATA_TYPE]"

    dict format

    {
      "fields": [
        {
          "name": "[FIELD]",
          "type": "[DATA_TYPE]"
        },
        {
          "name": "[FIELD]",
          "type": "[DATA_TYPE]"
        }
    ]}

    see https://cloud.google.com/bigquery/data-types
    see https://cloud.google.com/bigquery/docs/schemas#specifying_a_schema_file


    """
    if schema is None:
        return schema
    elif isinstance(schema, bq.TableSchema):
        return schema
    elif isinstance(schema, six.string_types):
        # try to parse json into dict
        try:
            schema = ujson.loads(schema)
        except ValueError:
            pass

    if isinstance(schema, six.string_types):
        # If it is still a string, it was not valid JSON; assume it is the
        # compact 'FIELD:DATA_TYPE' string representation.
        return WriteToBigQuery.get_table_schema_from_string(schema)
    elif isinstance(schema, dict):
        # either it came in as a dict or it got converted from json earlier
        return parse_table_schema_from_json(ujson.dumps(schema))
    else:
        raise TypeError('Unexpected schema argument: %s.' % schema)
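
A short usage sketch for the helper above, assuming it is called from the same module so its imports (ujson, six, bq, WriteToBigQuery) are in scope; the field names are illustrative:

# Compact string form, handled by WriteToBigQuery.get_table_schema_from_string.
schema_from_string = parse_table_schema('name:STRING,score:FLOAT')

# Dict form (or its JSON-encoded equivalent), handled by
# parse_table_schema_from_json.
schema_from_dict = parse_table_schema({
    'fields': [
        {'name': 'name', 'type': 'STRING'},
        {'name': 'score', 'type': 'FLOAT'},
    ]
})

# A bq.TableSchema passed in is returned unchanged.
assert parse_table_schema(schema_from_dict) is schema_from_dict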
Example #18
    def test(self):
        SCHEMA = parse_table_schema_from_json(
            '{"fields": [{"name": "data", "type": "BYTES"}]}')

        def format_record(record):
            # Synthetic Source returns each record as a key/value pair, so we
            # only keep the value part (record[1]).
            return {'data': base64.b64encode(record[1])}

        # pylint: disable=expression-not-assigned
        (self.pipeline
         | 'ProduceRows' >> Read(
             SyntheticSource(self.parseTestPipelineOptions()))
         | 'Format' >> Map(format_record)
         | 'WriteToBigQuery' >> WriteToBigQuery(
             self.output_dataset + '.' + self.output_table,
             schema=SCHEMA,
             create_disposition=BigQueryDisposition.CREATE_IF_NEEDED,
             write_disposition=BigQueryDisposition.WRITE_EMPTY))
Example #19
def run_bq_pipeline(argv=None):
  """Run the sample BigQuery pipeline.

  Args:
    argv: Arguments to the run function.
  """
  parser = argparse.ArgumentParser()
  parser.add_argument('--query', required=True,
                      help='Query to process for the table.')
  parser.add_argument('--output', required=True,
                      help='Output BQ table to write results to.')
  parser.add_argument('--output_schema', dest='output_schema', required=True,
                      help='Schema for output BQ table.')
  parser.add_argument('--use_standard_sql', action='store_true',
                      dest='use_standard_sql',
                      help='Use standard SQL syntax for the query.')
  parser.add_argument('--kms_key', default=None,
                      help='Use this Cloud KMS key with BigQuery.')
  parser.add_argument('--bq_temp_location',
                      default=None,
                      help=('GCS bucket to use to store files for '
                            'loading data into BigQuery.'))
  known_args, pipeline_args = parser.parse_known_args(argv)

  table_schema = parse_table_schema_from_json(known_args.output_schema)
  kms_key = known_args.kms_key

  p = TestPipeline(options=PipelineOptions(pipeline_args))

  # pylint: disable=expression-not-assigned
  # pylint: disable=bad-continuation
  (p | 'read' >> beam.io.Read(beam.io.BigQuerySource(
      query=known_args.query, use_standard_sql=known_args.use_standard_sql,
      kms_key=kms_key))
   | 'write' >> beam.io.WriteToBigQuery(
           known_args.output,
           schema=table_schema,
           create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
           write_disposition=beam.io.BigQueryDisposition.WRITE_EMPTY,
           gs_location=known_args.bq_temp_location))

  result = p.run()
  result.wait_until_finish()
Example #20
def run(argv=None):
    """The main function which creates the pipeline and runs it."""
    parser = argparse.ArgumentParser()
    parser.add_argument('--input',
                        dest='input',
                        required=False,
                        help='Input file to read',
                        default='gs://linux-etl/data_files/head_usa_names.csv')

    parser.add_argument('--output',
                        dest='output',
                        required=False,
                        help='Output BQ table to write results to.',
                        default='lake.usa_names_transformed')

    # Parse arguments from the command line.
    known_args, pipeline_args = parser.parse_known_args(argv)
    # DataTransformation is a class we built in this script to hold the logic for
    # transforming the file into a BigQuery table.

    data_ingestion = DataTransformation()

    # Initiate the pipeline using the pipeline arguments passed in from the
    # command line.

    p = beam.Pipeline(options=PipelineOptions(pipeline_args))
    schema = parse_table_schema_from_json(data_ingestion.schema_str)

    (p
     | 'Read From Text' >> beam.io.ReadFromText(known_args.input,
                                                skip_header_lines=1)
     | 'String to BigQuery Row' >>
     beam.Map(lambda s: data_ingestion.parse_method(s))
     | 'Write to BigQuery' >> beam.io.Write(
         beam.io.BigQuerySink(
             known_args.output,
             schema=schema,
             create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
             write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE)))
    p.run().wait_until_finish()
Example #21
    def test(self):
        SCHEMA = parse_table_schema_from_json(
            '{"fields": [{"name": "data", "type": "BYTES"}]}')

        def format_record(record):
            # Synthetic Source returns each record as a key/value pair, so we
            # only keep the value part (record[1]).
            return {'data': base64.b64encode(record[1])}

        (  # pylint: disable=expression-not-assigned
            self.pipeline
            | 'Produce rows' >> Read(
                SyntheticSource(self.parse_synthetic_source_options()))
            | 'Count messages' >> ParDo(CountMessages(self.metrics_namespace))
            | 'Format' >> Map(format_record)
            | 'Measure time' >> ParDo(MeasureTime(self.metrics_namespace))
            | 'Write to BigQuery' >> WriteToBigQuery(
                dataset=self.output_dataset,
                table=self.output_table,
                schema=SCHEMA,
                create_disposition=BigQueryDisposition.CREATE_IF_NEEDED,
                write_disposition=BigQueryDisposition.WRITE_TRUNCATE))
Example #22
      def expand(self, pcoll):
        from apache_beam.io.gcp.bigquery_tools import parse_table_schema_from_json
        import json

        schema = None
        if self.schema:
          schema = parse_table_schema_from_json(json.dumps(self.schema))

        out = pcoll | io.Write(
            io.BigQuerySink(
                self.table_reference.tableId,
                self.table_reference.datasetId,
                self.table_reference.projectId,
                schema,
                self.create_disposition,
                self.write_disposition,
                kms_key=self.kms_key))

        # WriteToBigQuery can have different outputs depending on whether it
        # runs in batch or streaming mode. This retrieves the output keys from
        # the node and replaces them here so they stay consistent.
        return {key: out for key in self.outputs}
Example #23
    def test_multiple_destinations_transform(self):
        output_table_1 = '%s%s' % (self.output_table, 1)
        output_table_2 = '%s%s' % (self.output_table, 2)
        output_table_3 = '%s%s' % (self.output_table, 3)
        output_table_4 = '%s%s' % (self.output_table, 4)
        schema1 = bigquery.WriteToBigQuery.get_dict_table_schema(
            bigquery_tools.parse_table_schema_from_json(self.BIG_QUERY_SCHEMA))
        schema2 = bigquery.WriteToBigQuery.get_dict_table_schema(
            bigquery_tools.parse_table_schema_from_json(
                self.BIG_QUERY_SCHEMA_2))

        schema_kv_pairs = [
            (output_table_1, schema1), (output_table_2, schema2),
            (output_table_3, schema1), (output_table_4, schema2)
        ]
        pipeline_verifiers = [
            BigqueryFullResultMatcher(
                project=self.project,
                query="SELECT name, language FROM %s" % output_table_1,
                data=[(d['name'], d['language']) for d in _ELEMENTS
                      if 'language' in d]),
            BigqueryFullResultMatcher(
                project=self.project,
                query="SELECT name, foundation FROM %s" % output_table_2,
                data=[(d['name'], d['foundation']) for d in _ELEMENTS
                      if 'foundation' in d]),
            BigqueryFullResultMatcher(
                project=self.project,
                query="SELECT name, language FROM %s" % output_table_3,
                data=[(d['name'], d['language']) for d in _ELEMENTS
                      if 'language' in d]),
            BigqueryFullResultMatcher(
                project=self.project,
                query="SELECT name, foundation FROM %s" % output_table_4,
                data=[(d['name'], d['foundation']) for d in _ELEMENTS
                      if 'foundation' in d])
        ]

        args = self.test_pipeline.get_full_options_as_args(
            on_success_matcher=all_of(*pipeline_verifiers),
            experiments='use_beam_bq_sink')

        with beam.Pipeline(argv=args) as p:
            input = p | beam.Create(_ELEMENTS)

            schema_map_pcv = beam.pvalue.AsDict(
                p | "MakeSchemas" >> beam.Create(schema_kv_pairs))

            table_record_pcv = beam.pvalue.AsDict(
                p | "MakeTables" >> beam.Create([('table1', output_table_1),
                                                 ('table2', output_table_2)]))

            # Gather all the input on the same worker.
            input = (input
                     | beam.Map(lambda x: (None, x))
                     | beam.GroupByKey()
                     | beam.FlatMap(lambda elm: elm[1]))

            _ = (
                input
                | "WriteWithMultipleDestsFreely" >> bigquery.WriteToBigQuery(
                    table=lambda x, tables:
                    (tables['table1']
                     if 'language' in x else tables['table2']),
                    table_side_inputs=(table_record_pcv, ),
                    schema=lambda dest, schema_map: schema_map.get(dest, None),
                    schema_side_inputs=(schema_map_pcv, ),
                    create_disposition=beam.io.BigQueryDisposition.
                    CREATE_IF_NEEDED,
                    write_disposition=beam.io.BigQueryDisposition.WRITE_EMPTY))

            _ = (input | "WriteWithMultipleDests" >> bigquery.WriteToBigQuery(
                table=lambda x:
                (output_table_3 if 'language' in x else output_table_4),
                schema=lambda dest, schema_map: schema_map.get(dest, None),
                schema_side_inputs=(schema_map_pcv, ),
                create_disposition=beam.io.BigQueryDisposition.
                CREATE_IF_NEEDED,
                write_disposition=beam.io.BigQueryDisposition.WRITE_EMPTY,
                max_file_size=20,
                max_files_per_bundle=-1))
Example #24
def parse_table_schema_from_json(schema_string):
    return bigquery_tools.parse_table_schema_from_json(schema_string)
Example #25
def run(args, pipeline_args=None):
    """Builds and runs the pipeline.

    :param args: Parsed command-line arguments for this job.
    :param pipeline_args: Remaining arguments passed to PipelineOptions.
    """
    # We use the save_main_session option because one or more DoFn's in this
    # workflow rely on global context (e.g., a module imported at module level).
    pipeline_options = PipelineOptions(pipeline_args,
                                       streaming=True,
                                       save_main_session=True)
    pipeline_options.view_as(StandardOptions).runner = args.runner
    # Run on Cloud DataFlow by default
    google_cloud_options = pipeline_options.view_as(GoogleCloudOptions)
    google_cloud_options.project = PROJECT_ID
    google_cloud_options.job_name = 'pubsub-api-bigquery'
    google_cloud_options.staging_location = args.staging_location
    google_cloud_options.temp_location = args.temp_location
    google_cloud_options.region = args.region

    p = beam.Pipeline(options=pipeline_options)

    lines = p | 'read in tweets' >> beam.io.ReadFromPubSub(
        topic=args.input_topic, with_attributes=False,
        id_label='tweet_id')  # TODO: Change to PubSub id.

    # Window them, and batch them into batches. (Not too large)
    output_tweets = (
        lines | 'assign window key' >> beam.WindowInto(
            window.FixedWindows(args.window_size))
        | 'batch into n batches' >> BatchElements(
            min_batch_size=args.min_batch_size,
            max_batch_size=args.max_batch_size)
        | 'predict sentiment' >>
        beam.FlatMap(lambda messages: prediction_helper(messages)))

    # Make explicit BQ schema for output tables:
    bq_schema_json = {
        "fields": [
            {
                "name": "id",
                "type": "STRING"
            },
            {
                "name": "text",
                "type": "STRING"
            },
            {
                "name": "user_id",
                "type": "STRING"
            },
            {
                "name": "sentiment",
                "type": "FLOAT"
            },
            {
                "name": "posted_at",
                "type": "TIMESTAMP"
            },
            {
                "name": "favorite_count",
                "type": "INTEGER"
            },
            {
                "name": "retweet_count",
                "type": "INTEGER"
            },
            {
                "name": "media",
                "type": "STRING"
            },
        ]
    }
    bq_schema = parse_table_schema_from_json(json.dumps(bq_schema_json))

    # Write to BigQuery
    output_tweets | 'store twitter posts' >> beam.io.WriteToBigQuery(
        table=args.bigquery_table,
        dataset=args.bigquery_dataset,
        schema=bq_schema,
        write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND,
        create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
        project=PROJECT_ID)
    result = p.run()
    result.wait_until_finish()
Example #26
def parse_table_schema_from_json(schema_string):
  return bigquery_tools.parse_table_schema_from_json(schema_string)
Example #27
def main(argv=None):
    parser = argparse.ArgumentParser()

    parser.add_argument('--input',
                        dest='input',
                        required=True,
                        help='Input path')

    parser.add_argument('--output',
                        dest='output',
                        required=True,
                        help='Output file to write results to.')

    # parser.add_argument('--host',
    #                     dest='host',
    #                     required=False,
    #                     help='Database host')

    known_args, pipeline_args = parser.parse_known_args(argv)

    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(SetupOptions).save_main_session = True

    table_schema = parse_table_schema_from_json(json.dumps(_FIELDS))

    # additional_bq_parameters = {
    #   'timePartitioning': {
    #                         'type': 'DAY',
    #                         'field': 'orderdate'}}

    # Script to connect to the database
    # db_config = DBConfig(drivername='postgresql',
    #                                 username='******',
    #                                 password='******',
    #                                 database='perfectorder',
    #                                 host=known_args.host,
    #                                 port = 5432)

    ####### FILTERS EXAMPLES #######################
    #
    # today = date.today()
    # filters = "orders_order.due_date >= TO_DATE('{}', 'YYYY-MM-DD')".format(today)
    #
    # filters = "orders_order.customer_id = 3 AND orders_order.due_date >= TO_DATE('{}', 'YYYY-MM-DD')".format(today)
    #
    #####################################################################################################################

    p = beam.Pipeline(options=pipeline_options)

    #########################################################
    #
    #       GENERAL DATA
    ############################################################

    _region = f"{known_args.input}dss_region.csv"
    _region_columns = ['r_regionkey', 'r_name', 'r_comment']
    logging.info('Reading region data..')
    pregion = (
        p
        | 'Reading region data' >> beam.io.ReadFromText(_region,
                                                        skip_header_lines=1)
        | 'Mapping region data to Json' >> beam.ParDo(
            Split(columns=_region_columns))
        | 'Mapping region values' >>
        beam.Map(lambda element: (element['r_regionkey'], element['r_name'])))

    _nation = f"{known_args.input}dss_nation.csv"
    _nation_columns = ['nationkey', 'nation_name', 'regionkey', 'n_comment']
    logging.info('Reading nation data..')
    pnation = (
        p
        | 'Reading nation data' >> beam.io.ReadFromText(_nation,
                                                        skip_header_lines=1)
        | 'Mapping nation data to Json' >> beam.ParDo(
            Split(columns=_nation_columns))
        # | 'Getting Region Name' >> ApplyMap('region', 'n_regionkey', pregion)
        # | 'Mapping nation values' >> beam.Map(lambda element: {
        #                             'nationkey' : element['n_nationkey'],
        #                             'nation': element['n_name'],
        #                             'region': element['region'],
        #                             })
    )

    #########################################################
    #
    #       CUSTOMER DATA
    ############################################################

    _customer = f"{known_args.input}dss_customer.csv"
    _customer_columns = [
        'custkey', 'customer_name', 'customer_addres', 'nationkey', 'phone',
        'acctbal', 'mktsegment', 'comment'
    ]
    logging.info('Reading customer data..')
    pcustomers = (
        p
        | 'Reading customer data' >> beam.io.ReadFromText(_customer,
                                                          skip_header_lines=1)
        | 'Reshuffling customer data to be parallel' >> beam.Reshuffle()
        | 'Mapping customer data to Json' >> beam.ParDo(
            Split(columns=_customer_columns)))

    ## Enrich Customer Data
    logging.info('Enrich Customer Data..')
    pipeline_dict = {'customer': pcustomers, 'nation': pnation}
    pcustomer_nation = (
        pipeline_dict
        | 'Join Customer with Nations' >> LeftJoin(
            'customer', pcustomers, 'nation', pnation, 'nationkey')
        | 'Getting Region Name to Customer' >> ApplyMap(
            'customer_region', 'regionkey', pregion)
        | 'Mapping customers values' >> beam.Map(
            lambda element: {
                'custkey': element['custkey'],
                'customer_name': element['customer_name'],
                'customer_addres': element['customer_addres'],
                'mktsegment': element['mktsegment'],
                'customer_nation': element['nation_name'],
                'customer_region': element['customer_region']
            }))

    #########################################################
    #
    #       ORDER DATA
    ############################################################

    # Getting order data
    logging.info('Reading order data..')
    _order_colums = [
        'orderkey', 'custkey', 'orderstatus', 'totalprice', 'orderdate',
        'orderpriority', 'clerk', 'shippriority', 'comment'
    ]
    _order = f"{known_args.input}dss_order.csv"
    porder = (p
              | 'Reading order data' >> beam.io.ReadFromText(
                  _order, skip_header_lines=1)
              | 'Reshuffling order data to be parallel' >> beam.Reshuffle()
              | 'Mapping order data to Json' >> beam.ParDo(
                  Split(columns=_order_colums))
              | 'Cleaning unnecessary fields from order' >> beam.Map(
                  lambda element: {
                      'orderkey':
                      element['orderkey'],
                      'custkey':
                      element['custkey'],
                      'orderstatus':
                      element['orderstatus'],
                      'totalprice':
                      element['totalprice'],
                      'orderdate':
                      datetime.strptime(element['orderdate'], '"%Y-%m-%d"').
                      strftime('%Y-%m-%d'),
                      'orderpriority':
                      element['orderpriority'],
                      'shippriority':
                      element['shippriority']
                  }))

    logging.info('Join order data with customer data')
    pipeline_dict = {'orders': porder, 'customers': pcustomer_nation}
    porder_customer = (
        pipeline_dict
        | 'Join Order with Customer' >> LeftJoin('orders', porder, 'customers',
                                                 pcustomer_nation, 'custkey'))

    #########################################################
    #
    #       ITEMS DATA
    # NESTED FIELDS
    ############################################################

    #########################################################
    #
    #       SUPPLIER DATA
    ############################################################

    logging.info('Reading Supplier data..')
    _supplier_colums = [
        'suppkey', 'supplier_name', 'supplier_address', 'nationkey', 'phone',
        'acctbal', 's_comment'
    ]
    _supplier = f"{known_args.input}dss_supplier.csv"
    psupplier = (p
                 | 'Reading supplier data' >> beam.io.ReadFromText(
                     _supplier, skip_header_lines=1)
                 | 'Mapping supplier data to Json' >> beam.ParDo(
                     Split(columns=_supplier_colums)))

    ## Enrich Supplier Data
    logging.info('Enrich Supplier Data..')
    pipeline_dict = {'supplier': psupplier, 'supplier_nation': pnation}
    psupplier_nation = (
        pipeline_dict
        | 'Join Supplier with Nations' >> LeftJoin(
            'supplier', psupplier, 'supplier_nation', pnation, 'nationkey')
        | 'Getting Region Name to Supplier' >> ApplyMap(
            'supplier_region', 'regionkey', pregion)
        | 'Mapping supplier fields' >> beam.Map(
            lambda element: {
                'suppkey': element['suppkey'],
                'supplier_name': element['supplier_name'],
                'supplier_address': element['supplier_address'],
                'supplier_nation': element['nation_name'],
                'supplier_region': element['supplier_region']
            }))

    #########################################################
    #
    #       PRODUCT DATA
    ############################################################

    logging.info('Reading Product data..')
    _product_colums = [
        'partkey', 'product_name', 'product_manufacture', 'product_brand',
        'product_type', 'product_size', 'product_container', 'retailprice',
        'product_comment'
    ]
    _product = f"{known_args.input}dss_part.csv"
    pproduct = (p
                | 'Reading product data' >> beam.io.ReadFromText(
                    _product, skip_header_lines=1)
                | 'Mapping product data to Json' >> beam.ParDo(
                    Split(columns=_product_colums))
                | 'Product mapping values' >> beam.Map(
                    lambda element: {
                        'partkey': element['partkey'],
                        'product_name': element['product_name'],
                        'product_manufacture': element['product_manufacture'],
                        'product_brand': element['product_brand'],
                        'product_type': element['product_type'],
                        'product_size': element['product_size'],
                        'product_container': element['product_container'],
                        'retailprice': element['retailprice']
                    }))

    #########################################################
    #
    #       PRODUCT AVAILABILITY BY SUPPLIER
    ############################################################

    logging.info('Reading Product Availability data..')
    _psupp_colums = [
        'partkey', 'suppkey', 'availqty', 'supplycost', 'ps_comment'
    ]
    _psupp = f"{known_args.input}dss_partsupp.csv"
    ppsupp = (
        p
        | 'Reading product Availability data' >> beam.io.ReadFromText(
            _psupp, skip_header_lines=1)
        | 'Mapping product Availability data to Json' >> beam.ParDo(
            Split(columns=_psupp_colums))
        | 'Creating Complex Key for Product and Supplier' >> beam.Map(
            lambda element: {
                'ckey': "{}|{}".format(element['partkey'], element['suppkey']),
                'availqty': element['availqty'],
                'supplycost': element['supplycost']
            }))

    #########################################################
    #
    #       ITEMS DATA
    ############################################################

    logging.info('Reading items data..')
    _items_colums = [
        'orderkey', 'partkey', 'suppkey', 'l_linenumber', 'l_quantity',
        'l_extendedprice', 'l_discount', 'l_tax', 'l_returnflag',
        'l_linestatus', 'l_shipdate', 'l_commitdate', 'l_receiptdate',
        'l_shipinstruct', 'l_shipmode', 'l_comment'
    ]
    _items = f"{known_args.input}dss_lineitem.csv"
    pitems = (p
              | 'Reading items data' >> beam.io.ReadFromText(
                  _items, skip_header_lines=1)
              | 'Reshuffling items data to be parallel' >> beam.Reshuffle()
              | 'Mapping items data to Json' >> beam.ParDo(
                  Split(columns=_items_colums))
              | 'Mapping items fields' >> beam.Map(
                  lambda element: {
                      'ckey':
                      "{}|{}".format(element['partkey'], element['suppkey']),
                      'orderkey':
                      element['orderkey'],
                      'partkey':
                      element['partkey'],
                      'suppkey':
                      element['suppkey'],
                      'linenumber':
                      element['l_linenumber'],
                      'quantity':
                      element['l_quantity'],
                      'extendedprice':
                      element['l_extendedprice'],
                      'discount':
                      element['l_discount'],
                      'tax':
                      element['l_tax'],
                      'returnflag':
                      element['l_returnflag'],
                      'linestatus':
                      element['l_linestatus'],
                      'shipdate':
                      datetime.strptime(element['l_shipdate'], '"%Y-%m-%d"').
                      strftime('%Y-%m-%d'),
                      'commitdate':
                      datetime.strptime(element['l_commitdate'], '"%Y-%m-%d"'
                                        ).strftime('%Y-%m-%d'),
                      'receiptdate':
                      datetime.strptime(element['l_receiptdate'], '"%Y-%m-%d"'
                                        ).strftime('%Y-%m-%d'),
                      'delay': (datetime.strptime(element[
                          'l_commitdate'], '"%Y-%m-%d"') - datetime.strptime(
                              element['l_receiptdate'], '"%Y-%m-%d"')).days,
                      'shipinstruct':
                      element['l_shipinstruct'],
                      'shipmode':
                      element['l_shipmode']
                  }))

    ## Enrich Items Data
    logging.info('Enrich Items Data..')
    pipeline_dict = {'item': pitems, 'product': pproduct}
    pitems_product = (pipeline_dict
                      | 'Join Items with Product' >> LeftJoin(
                          'item', pitems, 'product', pproduct, 'partkey'))

    ## Enrich Items Data
    logging.info('Enrich Items Data..')
    pipeline_dict = {'item': pitems_product, 'supplier': psupplier_nation}
    pitems_supp = (
        pipeline_dict
        | 'Join items with Supplier' >> LeftJoin(
            'item', pitems_product, 'supplier', psupplier_nation, 'suppkey'))

    # Enrich Items Data
    logging.info('Enrich Items Data..')
    pipeline_dict = {'item': pitems_supp, 'avail': ppsupp}
    pitems_availability = (pipeline_dict
                           | 'Join items with avail' >> LeftJoin(
                               'item', pitems_supp, 'avail', ppsupp, 'ckey'))

    #########################################################
    #
    #       ADD ITEMS TO ORDERS AND WRITE
    ############################################################

    pipeline_dict = {'orders': porder_customer, 'items': pitems_availability}
    results = (
        pipeline_dict
        | 'Join Order with Items' >>
        JoinNested('orders', porder_customer, 'items', pitems_availability,
                   'orderkey')
        # | 'Writing to BQ' >> beam.io.WriteToText(known_args.output)
        | 'Writing to BQ' >> beam.io.WriteToBigQuery(
            known_args.output,
            'santodigital',
            'perfect-order-api',
            schema=table_schema,
            create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
            write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE))

    p.run().wait_until_finish()
Example #28
def run(argv=None):
    """The main function which creates the pipeline and runs it."""

    parser = argparse.ArgumentParser()

    parser.add_argument(
        '--input',
        dest='input',
        required=True,
        help='Input file to read. This can be a local file or '
        'a file in a Google Storage Bucket.')

    # This defaults to the lake dataset in your BigQuery project. You'll have
    # to create the lake dataset yourself using this command:
    # bq mk lake
    parser.add_argument('--output',
                        dest='output',
                        required=True,
                        help='Output BQ table to write results to.')


    parser.add_argument('--temp_bucket',
                        dest='temp_bucket',
                        required=True,
                        help='temp bucket name.')
    

    parser.add_argument('--credential',
                        dest='credential',
                        required=True,
                        help='credential json key.')


    parser.add_argument('--schema',
                        dest='schema_string',
                        required=True,
                        help='data schema json format.')


    parser.add_argument('--skip_json_lines',
                        dest='skip_json_lines',
                        type=int,
                        required=False,
                        help='skip csv lines.',
                        default=0)

    # Parse arguments from the command line.
    known_args, pipeline_args = parser.parse_known_args(argv)

    pipeline_args += ["--runner=DataflowRunner", 
                      "--save_main_session", 
                      #"--staging_location=gs://%s/staging" % (known_args.temp_bucket),
                      "--temp_location=gs://%s/temp" % (known_args.temp_bucket)]

    os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = known_args.credential
    #schema_string='{"fields":[{"name":"usage","type":"record","fields":[{"name":"cpu","type":"STRING"},{"name":"mem","type":"STRING"}]},{"name":"_proc_PID_io","type":"record","fields":[{"name":"syscw","type":"INTEGER","mode":"repeated"},{"name":"cancelled_write_bytes","type":"INTEGER","mode":"repeated"},{"name":"wchar","type":"INTEGER","mode":"repeated"},{"name":"syscr","type":"INTEGER","mode":"repeated"},{"name":"read_bytes","type":"INTEGER","mode":"repeated"},{"name":"rchar","type":"INTEGER","mode":"repeated"},{"name":"write_bytes","type":"INTEGER","mode":"repeated"}]},{"name":"_proc_PID_stat","type":"record","fields":[{"name":"ds_agent","type":"STRING","mode":"repeated"}]},{"name":"_proc_PID_status","type":"record","fields":[{"name":"ShdPnd","type":"INTEGER","mode":"repeated"},{"name":"CapInh","type":"INTEGER","mode":"repeated"},{"name":"Cpus_allowed_list","type":"STRING","mode":"repeated"},{"name":"SigBlk","type":"INTEGER","mode":"repeated"},{"name":"State","type":"STRING","mode":"repeated"},{"name":"TracerPid","type":"INTEGER","mode":"repeated"},{"name":"FDSize","type":"INTEGER","mode":"repeated"},{"name":"VmRSS","type":"INTEGER","mode":"repeated"},{"name":"Gid","type":"INTEGER","mode":"repeated"},{"name":"CapBnd","type":"STRING","mode":"repeated"},{"name":"Utrace","type":"INTEGER","mode":"repeated"},{"name":"VmExe","type":"INTEGER","mode":"repeated"},{"name":"Pid","type":"INTEGER","mode":"repeated"},{"name":"SigIgn","type":"INTEGER","mode":"repeated"},{"name":"Groups","type":"INTEGER","mode":"repeated"},{"name":"Name","type":"STRING","mode":"repeated"},{"name":"Uid","type":"INTEGER","mode":"repeated"},{"name":"VmSwap","type":"INTEGER","mode":"repeated"},{"name":"SigCgt","type":"STRING","mode":"repeated"},{"name":"VmStk","type":"INTEGER","mode":"repeated"},{"name":"VmPeak","type":"INTEGER","mode":"repeated"},{"name":"VmData","type":"INTEGER","mode":"repeated"},{"name":"nonvoluntary_ctxt_switches","type":"INTEGER","mode":"repeated"},{"name":"voluntary_ctxt_switches","type":"INTEGER","mode":"repeated"},{"name":"Mems_allowed_list","type":"STRING","mode":"repeated"},{"name":"Mems_allowed","type":"STRING","mode":"repeated"},{"name":"SigQ","type":"STRING","mode":"repeated"},{"name":"Tgid","type":"INTEGER","mode":"repeated"},{"name":"Cpus_allowed","type":"STRING","mode":"repeated"},{"name":"CapEff","type":"STRING","mode":"repeated"},{"name":"VmLck","type":"INTEGER","mode":"repeated"},{"name":"VmPTE","type":"INTEGER","mode":"repeated"},{"name":"VmSize","type":"INTEGER","mode":"repeated"},{"name":"CapPrm","type":"STRING","mode":"repeated"},{"name":"PPid","type":"INTEGER","mode":"repeated"},{"name":"SigPnd","type":"INTEGER","mode":"repeated"},{"name":"Threads","type":"INTEGER","mode":"repeated"},{"name":"VmHWM","type":"INTEGER","mode":"repeated"},{"name":"VmLib","type":"INTEGER","mode":"repeated"}]}]}'
    #schema = parse_table_schema_from_json(known_args.schema_string)
    schema = parse_table_schema_from_json('{"fields":%s}' % (known_args.schema_string))
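    # Given the wrapping above, the --schema_string argument is expected to be
    # a JSON list of BigQuery field definitions, for example (a hypothetical
    # value): '[{"name": "usage_cpu", "type": "STRING"}]'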
    
    # DataIngestion is a class we built in this script to hold the logic for
    # transforming the file into a BigQuery table.
    data_ingestion = DataIngestion()

    # Initiate the pipeline using the pipeline arguments passed in from the
    # command line. This includes information such as the project ID and
    # where Dataflow should store temp files.
    p = beam.Pipeline(options=PipelineOptions(pipeline_args))

    (
     p | 'Read from a File' >> beam.io.ReadFromText(
         known_args.input, skip_header_lines=known_args.skip_json_lines)
    
     # This stage of the pipeline translates a single CSV row, read in as a
     # string, into a dictionary object consumable by BigQuery. It refers to
     # a function we have written (an illustrative sketch of such a parse
     # method follows this pipeline). That function runs in parallel on
     # different workers using input from the previous stage of the pipeline.
     | 'String To BigQuery Row' >> beam.Map(
         lambda s: data_ingestion.parse_method(s))
     | 'Write to BigQuery' >> beam.io.Write(
         beam.io.BigQuerySink(
             # The table name is a required argument for the BigQuery sink.
             # In this case we use the value passed in from the command line.
             known_args.output,
             # The schema is the TableSchema object parsed above from the
             # JSON string supplied on the command line.
             schema=schema,
             # Creates the table in BigQuery if it does not yet exist.
             create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
             # Appends rows to the table; any existing data is kept.
             write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND)))
    p.run().wait_until_finish()
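

# The DataIngestion.parse_method used above is defined elsewhere in this
# script and is not shown here. The sketch below only illustrates the idea:
# the class name, the column names, and the comma delimiter are assumptions,
# not the original implementation.
class DataIngestionSketch:
    """Illustrative stand-in for the DataIngestion class used above."""

    def parse_method(self, string_input):
        # Split one CSV line and pair the values with (assumed) column names
        # so the result can be loaded by BigQuery as a single row.
        values = string_input.strip().split(',')
        columns = ['usage_cpu', 'usage_mem']  # assumed column names
        return dict(zip(columns, values))

# Example: DataIngestionSketch().parse_method('0.5,2048')
# returns {'usage_cpu': '0.5', 'usage_mem': '2048'}
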
def run_bq_pipeline(argv=None):
    """Run the sample BigQuery pipeline.

  Args:
    argv: Arguments to the run function.
  """
    parser = argparse.ArgumentParser()
    parser.add_argument('--query',
                        required=True,
                        help='Query to process for the table.')
    parser.add_argument('--output',
                        required=True,
                        help='Output BQ table to write results to.')
    parser.add_argument('--output_schema',
                        dest='output_schema',
                        required=True,
                        help='Schema for output BQ table.')
    parser.add_argument('--use_standard_sql',
                        action='store_true',
                        dest='use_standard_sql',
                        help='Use Standard SQL for the input query.')
    parser.add_argument('--kms_key',
                        default=None,
                        help='Use this Cloud KMS key with BigQuery.')
    parser.add_argument('--native',
                        default=False,
                        action='store_true',
                        help='Use NativeSources and Sinks.')
    parser.add_argument('--use_json_exports',
                        default=False,
                        action='store_true',
                        help='Use JSON as the file format for exports.')
    known_args, pipeline_args = parser.parse_known_args(argv)

    table_schema = parse_table_schema_from_json(known_args.output_schema)
    kms_key = known_args.kms_key

    options = PipelineOptions(pipeline_args)
    p = TestPipeline(options=options)

    # Note to future modifiers: Keep using BigQuerySource if known_args.native is
    # True.
    if known_args.native:
        data = p | 'read' >> beam.io.Read(
            beam.io.BigQuerySource(
                query=known_args.query,
                use_standard_sql=known_args.use_standard_sql,
                kms_key=kms_key))
    else:
        data = p | 'read' >> beam.io.gcp.bigquery.ReadFromBigQuery(
            query=known_args.query,
            project=options.view_as(GoogleCloudOptions).project,
            use_standard_sql=known_args.use_standard_sql,
            use_json_exports=known_args.use_json_exports,
            kms_key=kms_key)

    temp_file_format = ('NEWLINE_DELIMITED_JSON'
                        if known_args.use_json_exports else 'AVRO')
    _ = data | 'write' >> beam.io.WriteToBigQuery(
        known_args.output,
        schema=table_schema,
        create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
        write_disposition=beam.io.BigQueryDisposition.WRITE_EMPTY,
        temp_file_format=temp_file_format,
        kms_key=kms_key)

    result = p.run()
    result.wait_until_finish()
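

# A hypothetical, test-style invocation of run_bq_pipeline. The query, table
# spec, and schema below are placeholder values chosen only to show the
# expected shape of the arguments; in particular, --output_schema takes the
# JSON form accepted by parse_table_schema_from_json.
if __name__ == '__main__':
    run_bq_pipeline([
        '--query',
        'SELECT word, word_count '
        'FROM `bigquery-public-data.samples.shakespeare` LIMIT 10',
        '--output', 'my-project:my_dataset.word_counts',
        '--output_schema',
        '{"fields": [{"name": "word", "type": "STRING"},'
        ' {"name": "word_count", "type": "INTEGER"}]}',
        '--use_standard_sql',
    ])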
Exemple #30
        "CORE_LB_ZoneEntityFR": "STRING",
        "CORE_LB_ZoneEntityStatus": "STRING",
        "CORE_LB_Order": "INTEGER",
        "CORE_ID_ZoneSourceId": "STRING",
        "CORE_DT_LastMod": "DATETIME",
        "CORE_DT_RecordCreationDate": "DATETIME",
        "CORE_DT_RecordModificationDate": "DATETIME",
        "CORE_FL_IsDeleted": "INTEGER",
        "CORE_FL_Latitude": "FLOAT",
        "CORE_FL_Longitude": "FLOAT"
    }
    mapping_list = [{"name": k, "type": mapping[k]} for k in mapping.keys()]
    return json.JSONEncoder(sort_keys=True).encode({"fields": mapping_list})


table_schema = parse_table_schema_from_json(make_sink_schema())

source = BigQuerySource(
    query=
    "SELECT  ROW_NUMBER() over(order by Code) as CORE_Id_ZoneId, Parent as CORE_LB_ZoneParentCode, Level as CORE_LB_Level, Code as CORE_LB_ZoneCode, EntityEN as CORE_LB_ZoneEntityEN, EntityFR as CORE_LB_ZoneEntityFR, EntityStatus as CORE_LB_ZoneEntityStatus, `Order` as CORE_LB_Order, Id as CORE_ID_ZoneSourceId, LastMod as CORE_DT_LastMod, current_date() as CORE_DT_RecordCreationDate, current_date() as CORE_DT_RecordModificationDate, 0 as CORE_FL_IsDeleted, geo.Latitude as CORE_FL_Latitude, geo.Longitude as CORE_FL_Longitude FROM `studied-client-307710.SMT_STG.SecondAxis` ax left outer join `studied-client-307710.SMT_STG.Geographic_Coordinates` geo on geo.SubZone = ax.EntityEN",
    use_standard_sql=True)  # a query is used here; a plain table spec also works (see below)
#source = BigQuerySource(source_table_spec)
target = BigQuerySink(sink_table_spec, schema=table_schema)
#target = beam.io.WriteToText("output.txt")


def run(argv=None):
    parser = argparse.ArgumentParser()
    known_args, pipeline_args = parser.parse_known_args(argv)
    with beam.Pipeline(argv=pipeline_args) as p:
        # The example is truncated here in the source; a plausible (assumed)
        # continuation reads from `source` and writes the rows to `target`.
        raw_values = (
            p
            | 'read source' >> beam.io.Read(source))
        raw_values | 'write target' >> beam.io.Write(target)
Exemple #31
def parse_table_schema_from_json(schema_string):
    import warnings
    warnings.warn("This function is deprecated and will be permanently moved "
                  "to the bigquery_tools module in a future version of beam")
    return bigquery_tools.parse_table_schema_from_json(schema_string)
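

# Because of the deprecation warning above, new code would typically call the
# helper in bigquery_tools directly instead of going through this wrapper.
# A minimal sketch (the schema JSON here is an arbitrary example):
from apache_beam.io.gcp import bigquery_tools

name_schema = bigquery_tools.parse_table_schema_from_json(
    '{"fields": [{"name": "name", "type": "STRING"}]}')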
Exemple #32
def run_pipeline(pipeline_options, known_args):
    p = beam.Pipeline(options=pipeline_options)
    lines = p | "read in tweets" >> beam.io.ReadFromPubSub(
        subscription=known_args.input_subscription,
        with_attributes=False
        # id_label="tweet_id" # not for direct runner
    )
    output_tweets = (
        lines
        | 'add window key' >> beam.WindowInto(
            window.FixedWindows(10))  # 10 seconds
        | 'batch messages' >> BatchElements(
            min_batch_size=2, max_batch_size=50)
        | 'predict sentiment' >>
        beam.FlatMap(lambda messages: predict_sentiment(messages)))
    bq_schema_tweets = json.dumps({
        "fields": [{
            "name": "id",
            "type": "STRING"
        }, {
            "name": "time_stamp",
            "type": "TIMESTAMP"
        }, {
            "name": "text",
            "type": "STRING"
        }, {
            "name": "username",
            "type": "STRING"
        }, {
            "name": "sentiment",
            "type": "INTEGER"
        }, {
            "name": "sentiment_score",
            "type": "FLOAT"
        }, {
            "name": "sentiment_magnitude",
            "type": "FLOAT"
        }, {
            "name": "language",
            "type": "STRING"
        }, {
            "name": "n_followers",
            "type": "INTEGER"
        }]
    })
    output_tweets | 'write to BQ' >> beam.io.WriteToBigQuery(
        table=os.getenv('BQ_TABLE'),
        dataset=os.getenv('BQ_DATASET'),
        schema=parse_table_schema_from_json(bq_schema_tweets),
        write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND,
        create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
        project=os.getenv('GC_PROJECT'))

    output_batch = (
        lines
        | 'add window key 2' >> beam.WindowInto(window.FixedWindows(
            1 * 60))  # 1 minute
        | 'batch messages 2' >> BatchElements(min_batch_size=10)
        | 'analyze in batch' >>
        beam.Map(lambda messages: analyze_batch(messages)))

    bq_schema_batches = json.dumps({
        "fields": [{
            "name": "time_stamp",
            "type": "TIMESTAMP"
        }, {
            "name": "batch_size",
            "type": "INTEGER"
        }, {
            "name": "top_words",
            "type": "STRING"
        }, {
            "name": "top_languages",
            "type": "STRING"
        }, {
            "name": "avg_num_words",
            "type": "FLOAT"
        }, {
            "name": "avg_num_characters",
            "type": "FLOAT"
        }, {
            "name": "avg_sentiment_score",
            "type": "FLOAT"
        }]
    })
    output_batch | 'write to BQ 2' >> beam.io.WriteToBigQuery(
        table=os.getenv('BQ_TABLE_BATCH'),
        dataset=os.getenv('BQ_DATASET'),
        schema=parse_table_schema_from_json(bq_schema_batches),
        write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND,
        create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
        project=os.getenv('GC_PROJECT'))

    return p.run()
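

# A sketch of how run_pipeline above might be launched. The --input_subscription
# argument name and the streaming flag are assumptions based on how known_args
# and the Pub/Sub read are used inside run_pipeline.
def main(argv=None):
    import argparse

    from apache_beam.options.pipeline_options import PipelineOptions

    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--input_subscription',
        required=True,
        help='Pub/Sub subscription to read from, in the form '
        'projects/<project>/subscriptions/<subscription>.')
    known_args, pipeline_args = parser.parse_known_args(argv)
    pipeline_options = PipelineOptions(pipeline_args, streaming=True)
    run_pipeline(pipeline_options, known_args).wait_until_finish()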
Exemple #33
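# The indented lines below are the tail of a DoFn-style process method whose
# class definition is not shown in this excerpt: they build a single-row
# DataFrame from the incoming element, score it with a model held on the
# worker, and emit a small result dict.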
        new_x = pd.DataFrame.from_dict(element, orient="index").T.fillna(0)
        weight = self.model.predict(new_x.iloc[:, :8])[0]
        yield {
            'guid': element['guid'],
            'weight': weight,
            'time': str(element['time'])
        }


schema = parse_table_schema_from_json(
    json.dumps({
        'fields': [{
            'name': 'guid',
            'type': 'STRING'
        }, {
            'name': 'weight',
            'type': 'FLOAT64'
        }, {
            'name': 'time',
            'type': 'STRING'
        }]
    }))
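

# A minimal sketch of how the schema above could be used to append the
# prediction dicts to BigQuery. The table spec is a placeholder, and
# `predictions` is assumed to be a PCollection of dicts shaped like the
# elements yielded above.
def write_predictions(predictions):
    return predictions | 'write predictions' >> beam.io.WriteToBigQuery(
        'my-project:natality.weight_predictions',
        schema=schema,
        create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
        write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND)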


class CreateEntityDoFn(beam.DoFn):
    def process(self, element):
        key = Key(['natality-guid', element['guid']])
        entity = Entity(key)
        entity.set_properties({
            'weight': element['weight'],
            'time': element['time']
        })
        # The original example is truncated here; yielding the entity is the
        # assumed continuation.
        yield entity


def run(argv=None):
    """ Método principal """

    parser = argparse.ArgumentParser()
    parser.add_argument('--input_a',
                        dest='input_a',
                        required=False,
                        help='Input file',
                        default='gs://dotz-exam/raw/price_quote.csv')
    parser.add_argument('--input_b',
                        dest='input_b',
                        required=False,
                        help='Input file',
                        default='gs://dotz-exam/raw/bill_of_materials.csv')
    parser.add_argument('--input_c',
                        dest='input_c',
                        required=False,
                        help='Input file',
                        default='gs://dotz-exam/raw/comp_boss.csv')
    parser.add_argument('--output_table',
                        dest='output_table',
                        required=False,
                        help='Output BigQuery table',
                        default='price_bill_comp')

    known_args, pipeline_args = parser.parse_known_args(argv)

    data_ingestion = DataIngestion()
    price_key = 'tube_assembly_id'
    bill_key = 'tube_assembly_id'

    p = beam.Pipeline(options=PipelineOptions(pipeline_args))
    schema = parse_table_schema_from_json(
        data_ingestion.price_bill_comp_schema)

    price_data = (
        p
        | 'price quote - Read text file' >> beam.io.ReadFromText(
            known_args.input_a, skip_header_lines=1)
        | 'price quote - Convert to dict' >> beam.ParDo(price_quote_to_dict()))
    # | 'price quote - Filter by id' >> beam.ParDo(filter_by_key(), 'tube_assembly_id', 'TA-11583'))

    bill_data = (p
                 | 'bill - Read text file' >> beam.io.ReadFromText(
                     known_args.input_b, skip_header_lines=1)
                 | 'bill - Convert to dict' >> beam.ParDo(bill_to_dict()))
    # | 'bill - Filter by id' >> beam.ParDo(filter_by_key(), 'tube_assembly_id', 'TA-11583'))

    price_bill_data = (
        {
            'price': price_data,
            'bill': bill_data
        }
        | 'Left join {0} and {1} on {2}'.format(
            'price', 'bill', 'tube_assembly_id') >> LeftJoin(
                price_data, bill_data, 'price', 'bill', 'tube_assembly_id'))

    comp_data1 = (p
                  | 'comp1 - Read text file' >> beam.io.ReadFromText(
                      known_args.input_c, skip_header_lines=1)
                  | 'comp1 - Convert to dict' >> beam.ParDo(comp1_to_dict()))

    # comp_data2 = (p
    #               | 'comp2 - Read text file' >> beam.io.ReadFromText(known_args.input_c, skip_header_lines=1)
    #               | 'comp2 - Convert to dict' >> beam.ParDo(comp2_to_dict()))

    price_bill_comp_data1 = (
        {
            'price_bill': price_bill_data,
            'comp1': comp_data1
        }
        | 'Left join {0} and {1} on {2}'.format(
            'price_bill', 'comp1', 'component_id_1') >> LeftJoin(
                price_bill_data, comp_data1, 'price_bill', 'comp1',
                'component_id_1'))

    # Write the joined result twice: once locally and once to GCS. The two
    # sinks branch from the same PCollection instead of being chained, since
    # WriteToText does not produce an output that can feed another transform.
    price_bill_comp_data1 | 'Comp1 - Save' >> beam.io.WriteToText(
        './tmp/', 'comp1')
    price_bill_comp_data1 | 'Save to GCS' >> beam.io.WriteToText(
        file_path_prefix='gs://dotz-exam/work/',
        file_name_suffix='.json',
        append_trailing_newlines=True)

    # TODO: Join the remaining component information.

    # price_bill_comp_data2 = ({'price_bill_comp_data1': price_bill_comp_data1, 'comp2': comp_data2}
    #                          | 'Left join {0} and {1} on {2}'.format('price_bill_comp_data1', 'comp2', 'component_id_2')
    #                          >> LeftJoin(price_bill_comp_data1, comp_data2, 'price_bill_comp_data1', 'comp2', 'component_id_2')
    #                          | 'Comp2 - Save' >> beam.io.WriteToText('./tmp/', 'comp2')
    #                          )

    p.run().wait_until_finish()
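

# The LeftJoin transform applied above is defined elsewhere in this script and
# is not shown in this excerpt. As an illustration only, a left join of two
# PCollections of dicts on a shared key is usually built on CoGroupByKey, as
# in the sketch below; the class name and exact behavior are assumptions, not
# the original implementation.
class LeftJoinSketch(beam.PTransform):
    """Left-joins pcolls[left_name] with pcolls[right_name] on join_key."""

    def __init__(self, left_name, right_name, join_key):
        super().__init__()
        self.left_name = left_name
        self.right_name = right_name
        self.join_key = join_key

    def expand(self, pcolls):
        def key_by(row, key):
            return (row[key], row)

        def merge(keyed_group):
            _, grouped = keyed_group
            # Every left row is emitted; matching right rows are merged in,
            # otherwise the left row passes through unchanged.
            right_rows = list(grouped[self.right_name]) or [{}]
            for left_row in grouped[self.left_name]:
                for right_row in right_rows:
                    merged = dict(left_row)
                    merged.update(right_row)
                    yield merged

        keyed = {
            name: pcolls[name]
            | 'key %s by %s' % (name, self.join_key) >> beam.Map(
                key_by, self.join_key)
            for name in (self.left_name, self.right_name)
        }
        return (keyed
                | 'group on %s' % self.join_key >> beam.CoGroupByKey()
                | 'merge rows' >> beam.FlatMap(merge))

# Usage mirrors the custom LeftJoin above, e.g.:
# {'price': price_data, 'bill': bill_data} | LeftJoinSketch(
#     'price', 'bill', 'tube_assembly_id')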