Example #1
 def generate_cooccur_schema():
     """BigQuery schema for the word co-occurrence table."""
     json_str = json.dumps({
         'fields': [{
             'name': 'w1',
             'type': 'STRING',
             'mode': 'NULLABLE'
         }, {
             'name': 'w2',
             'type': 'STRING',
             'mode': 'NULLABLE'
         }, {
             'name': 'count',
             'type': 'INTEGER',
             'mode': 'NULLABLE'
         }, {
             'name': 'log_weight',
             'type': 'FLOAT',
             'mode': 'NULLABLE'
         }, {
             'name': 'ts',
             'type': 'TIMESTAMP',
             'mode': 'NULLABLE'
         }]
     })
     return parse_table_schema_from_json(json_str)
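The TableSchema returned by a helper like this is normally handed straight to a BigQuery sink. A minimal usage sketch follows; the table name and sample row are placeholders, not part of the original example.

import apache_beam as beam

# Usage sketch only: the table name and sample row are hypothetical, the rest
# reuses the schema helper defined above.
with beam.Pipeline() as p:
    (p
     | beam.Create([{'w1': 'hello', 'w2': 'world', 'count': 3,
                     'log_weight': 1.0986, 'ts': '2016-11-28 00:00:00'}])
     | 'write cooccur' >> beam.io.WriteToBigQuery(
         'my-project:my_dataset.word_cooccurrence',
         schema=generate_cooccur_schema(),
         create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
         write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND))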
Example #2
    def parse_method(self, string_input):
        # Strip out carriage return, newline and quote characters.
        values = re.split(",",
                          re.sub('\r\n', '', re.sub(u'"', '', string_input)))
        row = dict(zip(('name', 'email'), values))
        return row
        """This method translates a single line of comma separated values to a
    dictionary which can be loaded into BigQuery.

        Args:
            string_input: A comma separated list of values in the form of
            state_abbreviation,gender,year,name,count_of_babies,dataset_created_date
                example string_input: KS,F,1923,Dorothy,654,11/28/2016

        Returns:
            A dict mapping BigQuery column names as keys to the corresponding value
            parsed from string_input.  In this example, the data is not transformed, and
            remains in the same format as the CSV.  There are no date format transformations.
                example output:
                      {'state': 'KS',
                       'gender': 'F',
                       'year': '1923-01-01', <- This is the BigQuery date format.
                       'name': 'Dorothy',
                       'number': '654',
                       'created_date': '11/28/2016'
                       }
     """
        # Strip out return characters and quote characters.
        schema = bigquery.parse_table_schema_from_json(self.schema_str)
def run_bq_pipeline(argv=None):
  """Run the sample BigQuery pipeline.

  Args:
    argv: Arguments to the run function.
  """
  parser = argparse.ArgumentParser()
  parser.add_argument('--query', required=True,
                      help='Query to process for the table.')
  parser.add_argument('--output', required=True,
                      help='Output BQ table to write results to.')
  parser.add_argument('--output_schema', dest='output_schema', required=True,
                      help='Schema for output BQ table.')
  parser.add_argument('--use_standard_sql', action='store_true',
                      dest='use_standard_sql',
                      help='Use BigQuery Standard SQL for the input query.')
  known_args, pipeline_args = parser.parse_known_args(argv)

  table_schema = parse_table_schema_from_json(known_args.output_schema)

  p = TestPipeline(options=PipelineOptions(pipeline_args))

  # pylint: disable=expression-not-assigned
  # pylint: disable=bad-continuation
  (p | 'read' >> beam.io.Read(beam.io.BigQuerySource(
      query=known_args.query, use_standard_sql=known_args.use_standard_sql))
   | 'write' >> beam.io.Write(beam.io.BigQuerySink(
           known_args.output,
           schema=table_schema,
           create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
           write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE)))

  result = p.run()
  result.wait_until_finish()
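For reference, the --output_schema flag takes the same JSON string form that parse_table_schema_from_json expects. A hedged invocation sketch with placeholder query, table, and schema values:

import json

# Placeholder query, table, and schema; only the argument shapes matter here.
run_bq_pipeline([
    '--query', 'SELECT 1 AS value',
    '--output', 'my_dataset.my_output_table',
    '--output_schema', json.dumps({
        'fields': [{'name': 'value', 'type': 'INTEGER', 'mode': 'NULLABLE'}]}),
    '--use_standard_sql',
])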
 def get_bq_schema(self):
     """
     This helper function serializes self.schema to JSON and parses it into
     a TableSchema for the BQ
     """
     return beam_bigquery.parse_table_schema_from_json(
         json.dumps(self.schema))
Example #5
 def get_bq_schema(self):
     """
     This helper function parses a list of bq.schema.SchemaField for the BQ
     api.
     """
     return beam_bigquery.parse_table_schema_from_json(
         json.dumps(self.schema))
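Both helpers above serialize self.schema with json.dumps, so self.schema is presumably already the object shape that parse_table_schema_from_json expects. A sketch of that assumed shape, with hypothetical field names:

# Hypothetical constructor showing the assumed shape of self.schema:
# parse_table_schema_from_json expects a JSON object with a 'fields' list.
def __init__(self):
    self.schema = {
        'fields': [
            {'name': 'id', 'type': 'STRING', 'mode': 'REQUIRED'},
            {'name': 'amount', 'type': 'FLOAT', 'mode': 'NULLABLE'},
        ]
    }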
def run(argv=None):
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--input',
        dest='input',
        required=False,
        help='Input file to read.  This can be a local file or '
        'a file in a Google Storage Bucket.',
        default='gs://python-dataflow-example/data_files/head_usa_names.csv')
    parser.add_argument('--output',
                        dest='output',
                        required=False,
                        help='Output BQ table to write results to.',
                        default='lake_gig2.usa_names_transformed')

    known_args, pipeline_args = parser.parse_known_args(argv)
    data_ingestion = DataTransformation()

    p = beam.Pipeline(options=PipelineOptions(pipeline_args))
    schema = parse_table_schema_from_json(data_ingestion.schema_str)

    (p
     | 'Read From Text' >> beam.io.ReadFromText(known_args.input,
                                                skip_header_lines=1)
     | 'String to BigQuery Row' >>
     beam.Map(lambda s: data_ingestion.parse_method(s))
     | 'Write to BigQuery' >> beam.io.Write(
         beam.io.WriteToBigQuery(
             known_args.output,
             schema=schema,
             create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
             write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE)))
    p.run().wait_until_finish()
    def parse_method(self, string_input):
        """This method translates a single line of comma separated values to a
    dictionary which can be loaded into BigQuery.
        
        Args:
            string_input: A comma separated list of values in the form of 
            state_abbreviation,gender,year,name,count_of_babies,dataset_created_date
                example string_input: KS,F,1923,Dorothy,654,11/28/2016
        
        Returns:
            A dict mapping BigQuery column names as keys to the corresponding value
            parsed from string_input.  In this example, the data is not transformed, and 
            remains in the same format as the CSV.  There are no date format transformations. 

                example output:
                      {'state': 'KS',
                       'gender': 'F',
                       'year': '1923-01-01', <- This is the BigQuery date format.
                       'name': 'Dorothy',
                       'number': '654',
                       'created_date': '11/28/2016'
                       }

     """
        # Strip out return characters and quote characters.
        schema = bigquery.parse_table_schema_from_json(self.schema_str)

        field_map = [f for f in schema.fields]

        # Use a CSV Reader which can handle quoted strings etc.
        reader = csv.reader(string_input.split('\n'))
        for csv_row in reader:
            if sys.version_info.major < 3:
                values = [x.decode('utf8') for x in csv_row]
            else:
                values = csv_row
            # Our source data only contains year, so default January 1st as the
            # month and day.
            month = u'01'
            day = u'01'
            # The year comes from our source data.
            year = values[2]
            row = {}
            i = 0
            # Iterate over the values from our csv file, applying any transformation logic.
            for value in values:
                # If the schema indicates this field is a date format, we must
                # transform the date from the source data into a format that
                # BigQuery can understand.
                if field_map[i].type == 'DATE':
                    # Format the date to YYYY-MM-DD format which BigQuery
                    # accepts.
                    value = u'-'.join((year, month, day))

                row[field_map[i].name] = value
                i += 1

            return row
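The transformation above depends on self.schema_str listing fields in the same order as the CSV columns, with 'year' declared as a DATE. The following sketch of such a schema string is inferred from the docstring above rather than taken from the source:

import json

# Inferred from the docstring above; field order must match the CSV columns.
schema_str = json.dumps({'fields': [
    {'name': 'state', 'type': 'STRING', 'mode': 'NULLABLE'},
    {'name': 'gender', 'type': 'STRING', 'mode': 'NULLABLE'},
    {'name': 'year', 'type': 'DATE', 'mode': 'NULLABLE'},
    {'name': 'name', 'type': 'STRING', 'mode': 'NULLABLE'},
    {'name': 'number', 'type': 'STRING', 'mode': 'NULLABLE'},
    {'name': 'created_date', 'type': 'STRING', 'mode': 'NULLABLE'},
]})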
Example #8
def run(argv=None):
    """The main function which creates the pipeline and runs it."""
    parser = argparse.ArgumentParser()
    # Here we add some specific command line arguments we expect.   Specifically
    # we have the input file to load and the output table to write to.
    parser.add_argument(
        '--input', dest='input', required=False,
        help='Input file to read.  This can be a local file or '
             'a file in a Google Storage Bucket.',
        # This example file contains a total of only 10 lines.
        # It is useful for developing on a small set of data
        default='gs://python-dataflow-example/data_files/head_usa_names.csv')
    # This defaults to the temp dataset in your BigQuery project.  You'll have
    # to create the temp dataset yourself using bq mk temp
    parser.add_argument('--output', dest='output', required=False,
                        help='Output BQ table to write results to.',
                        default='lake.usa_names_transformed')

    # Parse arguments from the command line.
    known_args, pipeline_args = parser.parse_known_args(argv)
    # DataTransformation is a class we built in this script to hold the logic for
    # transforming the file into a BigQuery table.
    data_ingestion = DataTransformation()

    # Initiate the pipeline using the pipeline arguments passed in from the
    # command line.  This includes information like where Dataflow should
    # store temp files, and what the project id is.
    p = beam.Pipeline(options=PipelineOptions(pipeline_args))
    schema = parse_table_schema_from_json(data_ingestion.schema_str)

    (p
     # Read the file.  This is the source of the pipeline.  All further
     # processing starts with lines read from the file.  We use the input
     # argument from the command line.  We also skip the first line which is a
     # header row.
     | 'Read From Text' >> beam.io.ReadFromText(known_args.input,
                                                skip_header_lines=1)
     # This stage of the pipeline translates from a CSV file single row
     # input as a string, to a dictionary object consumable by BigQuery.
     # It refers to a function we have written.  This function will
     # be run in parallel on different workers using input from the
     # previous stage of the pipeline.
     | 'String to BigQuery Row' >> beam.Map(lambda s:
                                            data_ingestion.parse_method(s))
     | 'Write to BigQuery' >> beam.io.Write(
        beam.io.BigQuerySink(
            # The table name is a required argument for the BigQuery sink.
            # In this case we use the value passed in from the command line.
            known_args.output,
            # Here we use the JSON schema read in from a JSON file.
            # Specifying the schema allows the API to create the table correctly if it does not yet exist.
            schema=schema,
            # Creates the table in BigQuery if it does not yet exist.
            create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
            # Deletes all data in the BigQuery table before writing.
            write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE)))
    p.run().wait_until_finish()
Example #9
    def parse_method(self, string_input):
        """This method translates a single line of comma separated values to a
        dictionary which can be loaded into BigQuery.

        Args:
            string_input: A comma separated list of values in the form
        Returns:
            A dict mapping BigQuery column names as keys to the corresponding value
            parsed from string_input. The data is not transformed and remains in
            the same format as the CSV file.

        output form:
              {
              'field name_1': 'value_1',
              'field name_2': 'value_2',
              ...
              }
        """

        # Strip the return and quote characters.
        schema = parse_table_schema_from_json(self.schema_str)
        schema_2 = parse_table_schema_from_json(self.schema_str_2)
        # field mapping
        field_map = [f for f in schema.fields]

        # Force UTF-8 as the default encoding (Python 2 only).
        reload(sys)
        sys.setdefaultencoding("utf-8")

        # use csv Reader that deal with quoted strings.
        reader = csv.reader(string_input.split('\n'))
        for csv_row in reader:
            values = [x.decode('utf8') for x in csv_row]

            # store in dictionary. iterate over the values from the csv file
            row = {}
            i = 0
            for value in values:
                # dictionary to get the value
                row[field_map[i].name] = value
                i += 1

            return row
Example #10
def run(argv=None, save_main_session=True):
    parser = argparse.ArgumentParser()
    
    # 1. Replace the project id in the default topic with your own project id
    parser.add_argument('--input_topic',
                      dest='input_topic',
                      default='projects/gft-app-294621/topics/audio_topic',
                      help='Input file to process.')
    
    # 2. Replace the project id in the default subscription with your own project id
    parser.add_argument('--input_subscription',
                      dest='input_subscription',
                      default='projects/gft-app-294621/subscriptions/audio_sub',
                      help='Input Subscription')
    
    parser.add_argument('--output_table',
                        dest='output_table',
                        default='gft-app-294621:audios_dataset.classified_data',
                        help=('Output BigQuery table for results specified as: '
                              'PROJECT:DATASET.TABLE or DATASET.TABLE.'))
    
    known_args, pipeline_args = parser.parse_known_args(argv)
    
    pipeline_options = PipelineOptions(pipeline_args)
    
    google_cloud_options = pipeline_options.view_as(GoogleCloudOptions)
    
    google_cloud_options.project = 'gft-app-294621'
    google_cloud_options.job_name = 'gft_app'
    # Set your staging bucket, temp location and region to execute on Dataflow.
    google_cloud_options.staging_location = 'gs://audio_app/binaries'
    google_cloud_options.temp_location = 'gs://audio_app/temp'
    google_cloud_options.region ='europe-west1'
    
    pipeline_options.view_as(StandardOptions).runner = 'DataflowRunner'
    pipeline_options.view_as(StandardOptions).streaming = True
    
    pipeline_options.view_as(SetupOptions).save_main_session = save_main_session
    
    p = beam.Pipeline(options=pipeline_options)
    
    data = (p | 'Read from PubSub' >> beam.io.ReadFromPubSub(subscription=known_args.input_subscription))
    
    processed_data = (data | beam.ParDo(preprocessAudio()))
    
    classified_data = (processed_data | beam.ParDo(audioClassifier()))
    
    table_schema = parse_table_schema_from_json(json.dumps(json.load(open("schema.json"))["schema"]))
    
    classified_data | 'Write to Big Query' >> beam.io.WriteToBigQuery(
        known_args.output_table,
        schema=table_schema,
        create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
        write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND)
    
    result = p.run()
    result.wait_until_finish()
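The json.load(...)["schema"] lookup above implies a local schema.json with a top-level "schema" key wrapping the usual fields list. A hypothetical layout, with placeholder field names:

# Hypothetical contents of schema.json assumed by the snippet above
# (field names are placeholders):
schema_json = {
    "schema": {
        "fields": [
            {"name": "audio_id", "type": "STRING", "mode": "NULLABLE"},
            {"name": "label", "type": "STRING", "mode": "NULLABLE"},
            {"name": "score", "type": "FLOAT", "mode": "NULLABLE"},
        ]
    }
}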
Example #11
 def generate_wc_schema():
     """BigQuery schema for the word count table."""
     json_str = json.dumps({
         'fields': [{
             'name': 'word',
             'type': 'STRING',
             'mode': 'NULLABLE'
         }, {
             'name': 'percent',
             'type': 'FLOAT',
             'mode': 'NULLABLE'
         }, {
             'name': 'ts',
             'type': 'TIMESTAMP',
             'mode': 'NULLABLE'
         }]
     })
     return parse_table_schema_from_json(json_str)
Example #12
 def test_parse_table_schema_from_json(self):
   string_field = bigquery.TableFieldSchema(
       name='s', type='STRING', mode='NULLABLE', description='s description')
   number_field = bigquery.TableFieldSchema(
       name='n', type='INTEGER', mode='REQUIRED', description='n description')
   record_field = bigquery.TableFieldSchema(
       name='r', type='RECORD', mode='REQUIRED', description='r description',
       fields=[string_field, number_field])
   expected_schema = bigquery.TableSchema(fields=[record_field])
   json_str = json.dumps({'fields': [
       {'name': 'r', 'type': 'RECORD', 'mode': 'REQUIRED',
        'description': 'r description', 'fields': [
            {'name': 's', 'type': 'STRING', 'mode': 'NULLABLE',
             'description': 's description'},
            {'name': 'n', 'type': 'INTEGER', 'mode': 'REQUIRED',
             'description': 'n description'}]}]})
   self.assertEqual(parse_table_schema_from_json(json_str),
                    expected_schema)
Example #14
 def generate_url_schema():
     """BigQuery schema for the urls count table."""
     json_str = json.dumps({
         'fields': [{
             'name': 'url',
             'type': 'STRING',
             'mode': 'NULLABLE'
         }, {
             'name': 'count',
             'type': 'INTEGER',
             'mode': 'NULLABLE'
         }, {
             'name': 'ts',
             'type': 'TIMESTAMP',
             'mode': 'NULLABLE'
         }]
     })
     return parse_table_schema_from_json(json_str)
Example #15
    def parse_line(self, element):
        import re
        import csv
        from apache_beam.io.gcp.bigquery import parse_table_schema_from_json

        schema = parse_table_schema_from_json(self.schema_str)
        field_map = [f for f in schema.fields]

        for line in csv.reader(element.split('\n'),
                               quotechar='"',
                               delimiter='\t',
                               quoting=csv.QUOTE_ALL,
                               skipinitialspace=True):
            row = {}
            i = 0
            values = [v for v in line]
            for value in values:
                row[field_map[i].name.encode('utf-8')] = value
                i += 1
        return row
    def get_bq_schema(self, env, schema_file):

        if env == 'dev':
            with open(schema_file) as f:
                data = f.read()
                schema_str = '{"fields": ' + data + '}'
                #print(schema_str)
        elif env == 'test':
            from google.cloud import storage
            client = storage.Client()
            bucket = client.get_bucket(self.gcs_bucket)  # ('saledata-deba')
            blob = bucket.get_blob(
                self.schema_folder_gs + '/' +
                schema_file)  #('schema/BreadBasketSchema.json')
            schema_str = '{"fields": ' + blob.download_as_string().decode(
                "utf-8") + '}'
            # print(schema_str)

        schema = parse_table_schema_from_json(schema_str)
        return schema
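Because this helper wraps the file contents in '{"fields": ...}' itself, the schema file is assumed to hold a bare JSON array of field objects. A hypothetical example of such a file, plus a local call:

# Hypothetical schema-file contents: a bare JSON array of field objects, which
# the helper then wraps in '{"fields": ...}'.
example_schema_file_contents = '''[
    {"name": "Transaction", "type": "STRING", "mode": "NULLABLE"},
    {"name": "Item", "type": "STRING", "mode": "NULLABLE"}
]'''
# Local usage sketch (assumes an instance of the class defining the method above):
# schema = ingestion.get_bq_schema('dev', 'BreadBasketSchema.json')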
def run_bq_pipeline(argv=None):
    """Run the sample BigQuery pipeline.

  Args:
    argv: Arguments to the run function.
  """
    parser = argparse.ArgumentParser()
    parser.add_argument('--query',
                        required=True,
                        help='Query to process for the table.')
    parser.add_argument('--output',
                        required=True,
                        help='Output BQ table to write results to.')
    parser.add_argument('--output_schema',
                        dest='output_schema',
                        required=True,
                        help='Schema for output BQ table.')
    parser.add_argument('--use_standard_sql',
                        action='store_true',
                        dest='use_standard_sql',
                        help='Use BigQuery Standard SQL for the input query.')
    known_args, pipeline_args = parser.parse_known_args(argv)

    table_schema = parse_table_schema_from_json(known_args.output_schema)

    p = TestPipeline(options=PipelineOptions(pipeline_args))

    # pylint: disable=expression-not-assigned
    # pylint: disable=bad-continuation
    (p | 'read' >> beam.io.Read(
        beam.io.BigQuerySource(query=known_args.query,
                               use_standard_sql=known_args.use_standard_sql))
     | 'write' >> beam.io.Write(
         beam.io.BigQuerySink(
             known_args.output,
             schema=table_schema,
             create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
             write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE)))

    result = p.run()
    result.wait_until_finish()
    def parse_method(self, string_input):
        schema = parse_table_schema_from_json(self.schema_str)

        field_map = [f for f in schema.fields]
        reader = csv.reader(string_input.split('\n'))
        for csv_row in reader:
            values = [x for x in csv_row]
            month = u'01'
            day = u'01'
            year = values[2]

            row = {}
            i = 0
            for value in values:
                if field_map[i].type == 'DATE':
                    value = u'-'.join((year, month, day))

                row[field_map[i].name] = value
                i += 1

            return row
    def parse_method(self, string_input):
        schema = parse_table_schema_from_json(self.schema_str)
        field_map = [f for f in schema.fields]
        reader = csv.reader(string_input.split('\n'))
        for csv_row in reader:
            values = [x for x in csv_row]
            # Our source data only contains year, so default January 1st as the
            # month and day.
            month = u'01'
            day = u'01'
            # The year comes from our source data.
            year = values[2]

            row = {}
            i = 0
            for value in values:
                if field_map[i].type == 'DATE':
                    value = u'-'.join((year, month, day))
                row[field_map[i].name] = value
                i += 1
            return row
def run(argv=None):
    user_options = PipelineOptions().view_as(UserOptions)
    pipeline_options = PipelineOptions()

    pipeline_options.view_as(SetupOptions).save_main_session = True

    data_ingestion = DataTransformation()
    p = beam.Pipeline(options=pipeline_options)
    schema = parse_table_schema_from_json(data_ingestion.schema_str)

    (p
     | 'Read From Text' >> beam.io.ReadFromText(user_options.input,
                                                skip_header_lines=1)
     | 'String to BigQuery Row' >>
     beam.Map(lambda s: data_ingestion.parse_method(s))
     | 'Write to BigQuery' >> beam.io.Write(
         beam.io.WriteToBigQuery(
             user_options.output,
             schema=schema,
             create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
             write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE)))
    p.run().wait_until_finish()
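The UserOptions class referenced above is not shown in this snippet. A minimal sketch of how such a custom PipelineOptions subclass is usually declared; the defaults are borrowed from the earlier examples in this listing, not from this pipeline's source:

from apache_beam.options.pipeline_options import PipelineOptions

class UserOptions(PipelineOptions):
    """Custom pipeline options exposing --input and --output (sketch only)."""

    @classmethod
    def _add_argparse_args(cls, parser):
        parser.add_argument(
            '--input',
            help='Input file to read.',
            default='gs://python-dataflow-example/data_files/head_usa_names.csv')
        parser.add_argument(
            '--output',
            help='Output BQ table to write results to.',
            default='lake.usa_names_transformed')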
Example #21
def run(argv=None):
    print('running')
    p = beam.Pipeline(options=PipelineOptions())
    lines = (p
             | beam.Create([{
                 "id": "some random name",
                 "value": "i dont know"
             }, {
                 "id": "id2",
                 "value": "whatever man"
             }]))

    schema_str = '{"fields": ' + json.dumps(SCHEMA_OBJ) + '}'
    schema = parse_table_schema_from_json(schema_str)
    output_destination = '%s.%s' % (BQ_DATASET, BQ_TABLE)
    (lines
     | 'Write lines to BigQuery' >> beam.io.WriteToBigQuery(
         output_destination,
         schema=schema,
         create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
         write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND))

    p.run().wait_until_finish()
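SCHEMA_OBJ, BQ_DATASET and BQ_TABLE are module-level constants not shown above. A hypothetical definition consistent with the 'id'/'value' records being created:

# Hypothetical constants assumed by the snippet above. SCHEMA_OBJ is the bare
# list of fields that gets wrapped in '{"fields": ...}'.
BQ_DATASET = 'my_dataset'
BQ_TABLE = 'my_table'
SCHEMA_OBJ = [
    {'name': 'id', 'type': 'STRING', 'mode': 'NULLABLE'},
    {'name': 'value', 'type': 'STRING', 'mode': 'NULLABLE'},
]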
Example #22
    def parse_method(self, string_input):
        """This method translates a single line of comma separated values to a
    dictionary which can be loaded into BigQuery.
        Args:
            string_input: A comma separated list of values in the form
        Returns:
            A dict mapping BigQuery column names as keys to the corresponding value
            parsed from string_input.

        """
        # Strip out return characters and quote characters.
        schema = parse_table_schema_from_json(self.schema_str)

        field_map = [f for f in schema.fields]

        # Force UTF-8 as the default encoding (Python 2 only).
        import sys
        reload(sys)
        sys.setdefaultencoding("utf-8")

        # Use a CSV Reader which can handle quoted strings etc.
        # reader = csv.reader(('"{}"'.format(string_input)), quotechar='"', delimiter=' ')
        reader = csv.reader(string_input.split('\n'))

        for csv_row in reader:
            values = [x.decode('utf8') for x in csv_row]

            row = {}
            i = 0
            for value in values:
                row[field_map[i].name] = value

                i += 1

            return row
def run(argv=None):
    """The main function which creates the pipeline and runs it."""
    parser = argparse.ArgumentParser()
    # Here we add some specific command line arguments we expect.
    # This defaults the output table in your BigQuery project.  You'll have
    # to create the lake dataset yourself using this command: bq mk lake
    parser.add_argument('--output',
                        dest='output',
                        required=False,
                        help='Output BQ table to write results to.',
                        default='lake.orders_denormalized_cogroupbykey')

    # Parse arguments from the command line.
    known_args, pipeline_args = parser.parse_known_args(argv)

    # DataLakeToDataMartCGBK is a class we built in this script to hold the logic for
    # transforming the file into a BigQuery table.  It also contains an example of
    # using CoGroupByKey
    data_lake_to_data_mart = DataLakeToDataMartCGBK()

    schema = parse_table_schema_from_json(data_lake_to_data_mart.schema_str)
    pipeline = beam.Pipeline(options=PipelineOptions(pipeline_args))

    # This query returns details about the account, normalized into a
    # different table.  We will be joining the data in to the main orders dataset in order
    # to create a denormalized table.
    account_details_source = (
        pipeline
        | 'Read Account Details from BigQuery ' >> beam.io.Read(
            beam.io.BigQuerySource(query="""
                SELECT
                  acct_number,
                  acct_company_name,
                  acct_group_name,
                  acct_name,
                  acct_org_name,
                  address,
                  city,
                  state,
                  zip_code,
                  country
                FROM
                  `python-dataflow-example.example_data.account`
            """,
                                   use_standard_sql=True))
        # This next stage of the pipeline maps the acct_number to a single row of
        # results from BigQuery.  Mapping this way helps Dataflow move your data around
        # to different workers.  When later stages of the pipeline run, all results from
        # a given account number will run on one worker.
        | 'Map Account to Order Details' >>
        beam.Map(lambda row: (row['acct_number'], row)))

    orders_query = data_lake_to_data_mart.get_orders_query()
    # Read the orders from BigQuery.  This is the source of the pipeline.  All further
    # processing starts with rows read from the query results here.
    orders = (
        pipeline
        | 'Read Orders from BigQuery ' >> beam.io.Read(
            beam.io.BigQuerySource(query=orders_query, use_standard_sql=True))
        |
        # This next stage of the pipeline maps the acct_number to a single row of
        # results from BigQuery.  Mapping this way helps Dataflow move your data around
        # to different workers.  When later stages of the pipeline run, all results from
        # a given account number will run on one worker.
        'Map Account to Account Details' >>
        beam.Map(lambda row: (row['acct_number'], row)))

    # CoGroupByKey allows us to arrange the results together by key
    # Both "orders" and "account_details" are maps of
    # acct_number -> "Row of results from BigQuery".
    # The mapping is done in the above code using Beam.Map()
    result = {'orders': orders, 'account_details': account_details_source} | \
             beam.CoGroupByKey()
    # The add_account_details function is responsible for defining how to
    # join the two datasets.  It passes the results of CoGroupByKey, which
    # groups the data from the same key in each dataset together in the same
    # worker.
    joined = result | beam.FlatMap(data_lake_to_data_mart.add_account_details)
    joined | 'Write Data to BigQuery' >> beam.io.Write(
        beam.io.BigQuerySink(
            # The table name is a required argument for the BigQuery sink.
            # In this case we use the value passed in from the command line.
            known_args.output,
            # Here we use the JSON schema read in from a JSON file.
            # Specifying the schema allows the API to create the table correctly if it does not yet exist.
            schema=schema,
            # Creates the table in BigQuery if it does not yet exist.
            create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
            # Deletes all data in the BigQuery table before writing.
            write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE))

    pipeline.run().wait_until_finish()
Example #24
def run(argv=None):
  
  """The main function which creates the pipeline and runs it."""
  parser = argparse.ArgumentParser()
  parser.add_argument('--model_identifier', required=True,
                        help='Used to identify LTV model and append to table and file names',
                        default='by_sample_id')		#filter-campaign	
  parser.add_argument('--stage-bucket', dest='stage_bucket', required=False, help='Staging bucket to use', default='ltv-dataflow')
  parser.add_argument('--data-bucket', dest='data_bucket', required=False, help='Data bucket to use', default='telemetry-to-gcp')

  parser.add_argument('--load_bq', required=False,
                        help='True/False to load summary/details to bq',
                        default=False)			
  parser.add_argument('--estimate_model', required=False,
                        help='True/False to estimate model params',
                        default=False)
  parser.add_argument('--calculate_model', required=False,
                        help='True/False to calculate ltv',
                        default=False)
  parser.add_argument('--calculate_stats', required=False,
                        help='True/False to calculate stats',
                        default=False)
  parser.add_argument('--upload_stats', required=False,
                        help='True/False to upload stats',
                        default=False) # to be merged in calc stats once i figure out wild card stuff
  parser.add_argument('--send_output', required=False,
                        help='True/False to send ltv and aggr files to Marketing GCP',
                        default=False)
  parser.add_argument('--delete_data', required=False,
                        help='True/False to delete input data and BQ data',
                        default=False)
  
  start = time.clock()
  
  # Parse arguments from the command line.
  known_args, pipeline_args = parser.parse_known_args(argv)
  logging.info('running beam_calc for model: ' + known_args.model_identifier)

  # Read in the output BQ table schema.
  bg_out_schema = ''
  #schema_file =  "gs://ltv-dataflow-dev/templates/input/calc_output_schema.json"
  schema_file = 'gs://{}/templates/input/calc_output_schema.json'.format(known_args.stage_bucket)
  with gcs.open(schema_file) as f:
    data = f.read()
    # Wrapping the schema in fields is required for the BigQuery API.
    bg_out_schema = '{"fields": ' + data + '}'
  schema = parse_table_schema_from_json(bg_out_schema)
  #schema = bigquery_tools.parse_table_schema_from_json(bg_out_schema)
  #logging.info(schema)
  #ltv_beam.py:306: BeamDeprecationWarning: parse_table_schema_from_json is deprecated since 2.11.0. Use bigquery_tools.parse_table_schema_from_json instead.
  
  # estimate LTV model
  min_sample_size = calculate_min_sample_size()
  #estimate_model(min_sample_size)
  
  pipeline_options = PipelineOptions(pipeline_args)
  #pipeline_options.view_as(SetupOptions).save_main_session = True
  p = beam.Pipeline(options=PipelineOptions(pipeline_args))  
  
  # to be replaced with direct telem bq table to dataflow input?
  input_file_dir = 'gs://' + known_args.data_bucket + '/clv/' + known_args.model_identifier.replace('_','-') + '/{0}/*.parquet'

  # load parquet files into bq, overwrite (may not need if we can join datasets in Telemetry? hmmm, no, we need to get random selection)
  # use dummy singleton DoFn if no Parquet reader
  # much faster to directly load summary and details to bq - need in bq for quick join (not sure how fast join is as a pcollection)
  if known_args.load_bq:
    logging.info('loading bq')
    load_data_bq('ltv', 'summary_' + known_args.model_identifier, input_file_dir.format('summary'))
    load_data_bq('ltv', 'details_' + known_args.model_identifier, input_file_dir.format('details'))

  # run this to estimate LTV model parameters
  if known_args.estimate_model:
    logging.info('estimate model parameters: ' + str(known_args.estimate_model))
    serial_dummy = p | 'Read' >> beam.Create( ['serial_dummy'] ) | 'Estimate Lifetimes Model' >> beam.ParDo(ltv_calculate.EstimateLTVFn(min_sample_size,output_bucket='gs://ltv-dataflow-dev/tmp/',model_tag=known_args.model_identifier))
    p.run().wait_until_finish() # run estimation first before moving on to calculation

  # this has to be run on dataflow or will not upload to bq
  if known_args.calculate_model:
    logging.info('calculate ltv')
    data_query = ("SELECT * FROM ltv.summary_" + known_args.model_identifier) # + " ORDER BY RAND() LIMIT {}").format(100)
    ##data_query = ("SELECT * FROM ltv.summary_" + known_args.model_identifier + " ORDER BY RAND() LIMIT {}").format(100)
    (p
    | 'Read Orders from BigQuery ' >> beam.io.Read(beam.io.BigQuerySource(query=data_query, use_standard_sql=True))
    | 'Apply Lifetimes Model' >> beam.ParDo( ltv_calculate.CalcLTVFn( dill.load(gcs.open("gs://" + known_args.stage_bucket + "/tmp/bgf_" + known_args.model_identifier + ".pkl", 'rb')), dill.load(gcs.open("gs://" + known_args.stage_bucket + "/tmp/ggf_" + known_args.model_identifier + ".pkl", 'rb')) ) )
    | 'Write Data to BigQuery' >> beam.io.WriteToBigQuery("ltv.calc_" + known_args.model_identifier, schema=schema,create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE)
    )
    p.run().wait_until_finish()
    
  # calc aggregate statistics
  if known_args.calculate_stats:
    logging.info('aggregate ltv')
    qry_outlier_stats = ("""SELECT count(distinct(client_id)) ct, avg(historical_searches) avg, stddev(historical_searches) std 
  						    FROM ltv.summary_{0} 
  						    WHERE client_id not in (SELECT client_id FROM ltv.details_{0} WHERE default_search_engine='google-nocodes')""").format(known_args.model_identifier)
    query_job = bq_client.query(qry_outlier_stats)
    outlier_stats = query_job.to_dataframe() # no need to go through query_job.result()
    #logging.info(outlier_stats.head(1))

    ct = outlier_stats['ct'][0]

    if ct > 0: 
      mu = outlier_stats['avg'][0]
      sigma = outlier_stats['std'][0]
      outliers_upper = str(mu + 2.5 * sigma)
      outliers_lower = str(mu - 2.5 * sigma)
    
    # Any way to specify a compute/memory-intensive machine type here?
    if known_args.model_identifier=='filter_campaign':
      sid_list = ['']
    else:
      sid_list = ['10','33','53','89']      
      
    for sid in sid_list:
      if known_args.model_identifier=='filter_campaign':
        sid_qry = ''
        assert sid == '' 
      else:
        sid_qry = 'AND l.sample_id='+sid
      
      (p | 'Read_Age'+sid >> beam.Create( ['another_serial_dummy_age'+sid] )  | 'AggrCustomerAgeFn'+sid >> beam.ParDo(ltv_aggregate.AggrCustomerAgeFn(model_tag=known_args.model_identifier, outliers_lower=outliers_lower, outliers_upper=outliers_upper, sample_id_qry=sid_qry, sid=sid, output_folder=known_args.stage_bucket)))
      (p | 'Read_E10'+sid >> beam.Create( ['another_serial_dummy_e10'+sid] )  | 'AggrE10ActivityGroupLocaleFn'+sid >> beam.ParDo(ltv_aggregate.AggrE10ActivityGroupLocaleFn(model_tag=known_args.model_identifier, outliers_lower=outliers_lower, outliers_upper=outliers_upper, sample_id_qry=sid_qry, sid=sid, output_folder=known_args.stage_bucket)))   
      (p | 'Read_Geo'+sid >> beam.Create( ['another_serial_dummy_geo'+sid] )  | 'AggrGlobalGeoUserStatusFn'+sid >> beam.ParDo(ltv_aggregate.AggrGlobalGeoUserStatusFn(model_tag=known_args.model_identifier, outliers_lower=outliers_lower, outliers_upper=outliers_upper, sample_id_qry=sid_qry, sid=sid, output_folder=known_args.stage_bucket)))
      (p | 'Read_City'+sid >> beam.Create( ['another_serial_dummy_city'+sid] )  | 'AggrCityFn'+sid >> beam.ParDo(ltv_aggregate.AggrCityFn(model_tag=known_args.model_identifier, outliers_lower=outliers_lower, outliers_upper=outliers_upper, sample_id_qry=sid_qry, sid=sid, output_folder=known_args.stage_bucket)))
      (p | 'Read_ECB'+sid >> beam.Create( ['another_serial_dummy_ecb'+sid] )  | 'AggrEngineChannelBrowserFn'+sid >> beam.ParDo(ltv_aggregate.AggrEngineChannelBrowserFn(model_tag=known_args.model_identifier, outliers_lower=outliers_lower, outliers_upper=outliers_upper, sample_id_qry=sid_qry, sid=sid, output_folder=known_args.stage_bucket)))
      (p | 'Read_Mem'+sid >> beam.Create( ['another_serial_dummy_mem'+sid] )  | 'AggrMemoryFn'+sid >> beam.ParDo(ltv_aggregate.AggrMemoryFn(model_tag=known_args.model_identifier, outliers_lower=outliers_lower, outliers_upper=outliers_upper, sample_id_qry=sid_qry, sid=sid, output_folder=known_args.stage_bucket)))
      (p | 'Read_Attr1'+sid >> beam.Create( ['another_serial_dummy_attr1'+sid] )  | 'AggrAttributes1Fn'+sid >> beam.ParDo(ltv_aggregate.AggrAttributes1Fn(model_tag=known_args.model_identifier, outliers_lower=outliers_lower, outliers_upper=outliers_upper, sample_id_qry=sid_qry, sid=sid, output_folder=known_args.stage_bucket)))
      (p | 'Read_Attr2'+sid >> beam.Create( ['another_serial_dummy_attr2'+sid] )  | 'AggrAttributes2Fn'+sid >> beam.ParDo(ltv_aggregate.AggrAttributes2Fn(model_tag=known_args.model_identifier, outliers_lower=outliers_lower, outliers_upper=outliers_upper, sample_id_qry=sid_qry, sid=sid, output_folder=known_args.stage_bucket)))
      (p | 'Read_Attr3'+sid >> beam.Create( ['another_serial_dummy_attr3'+sid] )  | 'AggrAttributes3Fn'+sid >> beam.ParDo(ltv_aggregate.AggrAttributes3Fn(model_tag=known_args.model_identifier, outliers_lower=outliers_lower, outliers_upper=outliers_upper, sample_id_qry=sid_qry, sid=sid, output_folder=known_args.stage_bucket)))
      (p | 'Read_OS1'+sid >> beam.Create( ['another_serial_dummy_os1'+sid] )  | 'AggrOS1Fn'+sid >> beam.ParDo(ltv_aggregate.AggrOS1Fn(model_tag=known_args.model_identifier, outliers_lower=outliers_lower, outliers_upper=outliers_upper, sample_id_qry=sid_qry, sid=sid, output_folder=known_args.stage_bucket)))
      (p | 'Read_OS2'+sid >> beam.Create( ['another_serial_dummy_os2'+sid] )  | 'AggrOS2Fn'+sid >> beam.ParDo(ltv_aggregate.AggrOS2Fn(model_tag=known_args.model_identifier, outliers_lower=outliers_lower, outliers_upper=outliers_upper, sample_id_qry=sid_qry, sid=sid, output_folder=known_args.stage_bucket)))
      (p | 'Read_SCD'+sid >> beam.Create( ['another_serial_dummy_scd'+sid] )  | 'AggrSyncConfiguredDesktopFn'+sid >> beam.ParDo(ltv_aggregate.AggrSyncConfiguredDesktopFn(model_tag=known_args.model_identifier, outliers_lower=outliers_lower, outliers_upper=outliers_upper, sample_id_qry=sid_qry, sid=sid, output_folder=known_args.stage_bucket)))
      (p | 'Read_SCM'+sid >> beam.Create( ['another_serial_dummy_scm'+sid] )  | 'AggrSyncConfiguredMobileFn'+sid >> beam.ParDo(ltv_aggregate.AggrSyncConfiguredMobileFn(model_tag=known_args.model_identifier, outliers_lower=outliers_lower, outliers_upper=outliers_upper, sample_id_qry=sid_qry, sid=sid, output_folder=known_args.stage_bucket)))
      (p | 'Read_BC'+sid >> beam.Create( ['another_serial_dummy_bc'+sid] )  | 'AggrBookmarksCountFn'+sid >> beam.ParDo(ltv_aggregate.AggrBookmarksCountFn(model_tag=known_args.model_identifier, outliers_lower=outliers_lower, outliers_upper=outliers_upper, sample_id_qry=sid_qry, sid=sid, output_folder=known_args.stage_bucket)))

    # compute ltv statistics
    # quantile compute https://stackoverflow.com/questions/46827512/efficient-algorithm-for-computing-quantiles-in-terabytes-dataset
    p.run().wait_until_finish()

  if known_args.upload_stats:
    # delete anything from bq aggr table before running aggr stats
    qry_truncate = ("DELETE from ltv.aggr_{} WHERE True").format(known_args.model_identifier)
    bq_client.query(qry_truncate)
    # load any files in ltv-dataflow/tmp containing self.model_tag; and delete them
    dataset_ref = bq_client.dataset('ltv')
    job_config = bigquery.LoadJobConfig()
    job_config.autodetect = True
    job_config.skip_leading_rows = 1
    # The source format defaults to CSV, so the line below is optional.
    job_config.source_format = bigquery.SourceFormat.CSV
    uri = "gs://" + known_args.stage_bucket + "/tmp/" + known_args.model_identifier + '' + "/aggr_*.csv"
    try:
      load_job = bq_client.load_table_from_uri(uri, dataset_ref.table('aggr_'+known_args.model_identifier), job_config=job_config)  # API request
      print("Starting job {}".format(load_job.job_id))
      load_job.result()  # Waits for table load to complete.
      print("Job finished.")
      destination_table = bq_client.get_table(dataset_ref.table('aggr_'+known_args.model_identifier))
      print("Loaded {} rows.".format(destination_table.num_rows))
    except Exception:
      logging.info('uri: ' + uri)
    
  # push files to Marketing GCP in a GCF?
  # output parquet files
  if known_args.send_output:
    # test dataflow write parquet files --- SUCCESSFUL! we can output files in parquet format
    calc_query = ("SELECT client_id, sample_id FROM ltv.summary_{} LIMIT 10").format(known_args.model_identifier)
    #data_query = ("SELECT summ.*, det.* EXCEPT (client_id) FROM ltv.summary summ LEFT JOIN ltv.details det ON summ.client_id=det.client_id WHERE summ.client_id in ('3691929b0e07e22c86c1167c83ded58f481caf89','64ba414c3820805b1f64021cf3e082b091dec4f4')")
    # ugh need to differentiate by sample id
    sid = ''
    calc_fn = 'gs://' + known_args.stage_bucket + '/output/ltv_calc_' + known_args.model_identifier + sid
    (p
     | 'Read Orders from BigQuery ' >> beam.io.Read(beam.io.BigQuerySource(query=calc_query, use_standard_sql=True))  
     | 'Write Data to Parquet' >> beam.io.WriteToParquet(calc_fn, pa.schema([('client_id', pa.string()), ('sample_id', pa.int32())]))
    )
    
    # https://arrow.apache.org/docs/python/data.html#type-metadata
    p.run().wait_until_finish()
    
    # if file exists, push(move) to marketing, else throw error
    
  
  # if all files are in output dir, then clean up bq and put _SUCCESS in dir which should trigger gcf
  # delete intermediary data from bq (and gcs?)
  if known_args.send_output:
    # delete anything in bq ltv tables
    qry_truncate = ("DELETE from ltv.summary_{} WHERE True").format(known_args.model_identifier)
    bq_client.query(qry_truncate)
    qry_truncate = ("DELETE from ltv.details_{} WHERE True").format(known_args.model_identifier)
    bq_client.query(qry_truncate)
    qry_truncate = ("DELETE from ltv.calc_{} WHERE True").format(known_args.model_identifier)
    bq_client.query(qry_truncate)
    qry_truncate = ("DELETE from ltv.aggr_{} WHERE True").format(known_args.model_identifier)
    bq_client.query(qry_truncate)
    # delete gcs data files?
  
  logging.info('ltv.run() runtime: ' + str(time.clock() - start))
Example #25
def run(argv=None):
    """The main function which creates the pipeline and runs it."""

    parser = argparse.ArgumentParser()

    parser.add_argument('--input',
                        dest='input',
                        required=False,
                        help='Input file to read. This can be a local file or '
                        'a file in a Google Storage Bucket.',
                        default='./data/AB_NYC_2019.csv')

    parser.add_argument('--output',
                        dest='output',
                        required=False,
                        help='Output file prefix to write results to.',
                        default='./direct_run_output/result')

    # Parse arguments from the command line.
    known_args, pipeline_args = parser.parse_known_args(argv)

    data_ingestion = DataIngestion()

    # Parse the BigQuery schema from the JSON schema string (not used by the text sinks below).
    schema = parse_table_schema_from_json(data_ingestion.schema_str)

    with beam.Pipeline(options=PipelineOptions(pipeline_args)) as p:

        lines = p | 'Read from a File' >> beam.io.ReadFromText(
            known_args.input, skip_header_lines=1)

        csv_lines = (lines
                     | 'String To BigQuery Row' >>
                     beam.Map(lambda s: data_ingestion.parse_method(s)))

        original_output = (
            csv_lines
            | 'Write to Text' >> beam.io.WriteToText(
                known_args.output + '1', file_name_suffix='.csv'))

        # Count the listings in each neighbourhood.
        def count_listings(neighbourhood_listings):
            (neighbourhood, listings) = neighbourhood_listings
            n = str(sum(listings))
            return {
                u'neighbourhood': neighbourhood,
                u'count_listings': unicode(n)
            }

        # Project, key and group the rows, then count listings per neighbourhood.
        transformed_lines = (
            csv_lines
            | 'Projected' >> beam.Map(
                lambda row: {
                    f: row[f]
                    for f in
                    ['neighbourhood', 'calculated_host_listings_count']
                })
            | 'Key-Value' >> beam.Map(lambda row: (row[
                'neighbourhood'], int(row['calculated_host_listings_count'])))
            | 'Group' >> beam.GroupByKey()
            | 'Count' >> beam.Map(count_listings))
        # | 'Format' >>
        # beam.Map(formatting))

        transformed_output = (
            transformed_lines
            | 'Write' >> beam.io.WriteToText(known_args.output + '2',
                                             file_name_suffix='.csv'))
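The count_listings helper above relies on unicode(), which only exists on Python 2. A Python 3-compatible sketch of the same helper:

# Python 3 variant of the helper above (unicode() is Python 2 only).
def count_listings(neighbourhood_listings):
    (neighbourhood, listings) = neighbourhood_listings
    return {
        'neighbourhood': neighbourhood,
        'count_listings': str(sum(listings)),
    }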
Example #26
def run(argv=None):
    """The main function that builds the pipeline and runs a batch processing.
    """
    # add specific command line arguments we expect
    parser = argparse.ArgumentParser()

    # Final stage of the pipeline. Define the destination of the data.
    # Define the input file to read and the output table to import in BigQuery.
    parser.add_argument('--input',
                        dest='input',
                        required=False,
                        help='Input file to read. A local file or '
                        'a file in a Google Storage Bucket.',
                        default='gs://takehom-data-k/AB_NYC_2019.csv')

    # first, we create the dataset from the command: bq mk AB_NYC_2019
    # This defaults to the project dataset in the BigQuery.
    # Form: dataset.table
    parser.add_argument('--output',
                        dest='output',
                        required=False,
                        help='Output BigQuery table to import results to.',
                        default='AB_NYC_2019.AB_NYC_2019')

    # parse args from the command line
    known_args, pipeline_args = parser.parse_known_args(argv)

    # use the class DataProcess to process the logic
    # and transform the file into a BigQuery table
    data_process = DataProcess()

    # Define output schema from json file
    schema = parse_table_schema_from_json(
        data_process.schema_str)  # output schema for raw data
    schema_2 = parse_table_schema_from_json(
        data_process.schema_str_2)  # output schema for transformed data

    """Initiate the pipeline:
     using the pipeline arguments passed in from the command line.
     example:
         python dataflow_pipeline.py --project=$PROJECT\
         --runner=DataflowRunner\
         --staging_location=gs://$PROJECT/temp
         --temp_location gs://$PROJECT/temp\
         --input gs://$PROJECT/AB_NYC_2019.csv\
         --save_main_session
     """
    with beam.Pipeline(options=PipelineOptions(pipeline_args)) as p:

        # read the source file of the pipeline
        # skip the first row (header) and start with the lines read from the file
        lines = p | 'Read from a csv File' >> beam.io.ReadFromText(
            known_args.input, skip_header_lines=1)
        """1. First pipeline flow for raw data input and output.
        """
        # the pipeline translates a single row of the csv file, input as a
        # string, into a dictionary object that is acceptable by BigQuery
        # this is done by the defined helper function, which runs in parallel
        # using input from the previous stage
        csv_lines = (lines
                     | 'String To BigQuery Row' >>
                     beam.Map(lambda s: data_process.parse_method(s)))

        # the output stage imports into a BigQuery table, taking the previous
        # stage of the pipeline
        original_output = (
            csv_lines
            | 'Write to BigQuery - Raw Data' >> beam.io.Write(
                beam.io.BigQuerySink(
                    # The table name is a required argument for the BigQuery sink
                    # this is the value passed in from the command line
                    known_args.output + '_raw',
                    # The schema is from our schema file:
                    # with the form fieldName:fieldType
                    schema=schema,
                    # creates the table in BigQuery
                    create_disposition=beam.io.BigQueryDisposition.
                    CREATE_IF_NEEDED,
                    # before writing in, deletes all data in the BigQuery table
                    write_disposition=beam.io.BigQueryDisposition.
                    WRITE_TRUNCATE)))
        """2. Scenod pipeline flow that performs group by and counts
              with two fields information. input and output.
        """

        # helper function to count the number of listings in each neighbourhood
        # returns a dictionary object with values in unicode string format
        # our schema file handles the assigned data types when importing into BigQuery
        def count_listings(neighbourhood_listings):
            (neighbourhood, listings) = neighbourhood_listings
            n = str(sum(listings))
            return {
                u'neighbourhood': neighbourhood,
                u'count_listings': unicode(n)
            }

        # this stage of the pipeline takes the dictionary objects produced by the
        # previous stage and projects them to the fields we are interested in,
        # then creates a key-value pair for further aggregation,
        # then performs a group by on the 'neighbourhood' field with the field
        # 'calculated_host_listings_count'; after the group by, we leverage the
        # helper function to count the host listings by summing up the values
        # within each 'neighbourhood' group
        transformed_lines = (
            csv_lines
            | 'Projected to desired fields' >> beam.Map(
                lambda row: {
                    f: row[f]
                    for f in
                    ['neighbourhood', 'calculated_host_listings_count']
                })
            | 'Key-Value pair' >> beam.Map(lambda row: (row[
                'neighbourhood'], int(row['calculated_host_listings_count'])))
            | 'Group' >> beam.GroupByKey()
            | 'Count' >> beam.Map(count_listings))

        # the second output stage imports into a BigQuery table, taking the
        # aggregated and transformed output from the previous stage of the pipeline
        transformed_output = (
            transformed_lines
            | 'Write to BigQuery - Transformed Data' >> beam.io.Write(
                beam.io.BigQuerySink(
                    # The table name is a required argument for the BigQuery sink
                    # this is the value passed in from the command line
                    known_args.output + '_transform',
                    # The schema is from our schema file:
                    # with the form fieldName:fieldType
                    schema=schema_2,
                    # creates the table in BigQuery
                    create_disposition=beam.io.BigQueryDisposition.
                    CREATE_IF_NEEDED,
                    # before writing in, deletes all data in the BigQuery table
                    write_disposition=beam.io.BigQueryDisposition.
                    WRITE_TRUNCATE)))
def run(argv=None):
    """The main function which creates the pipeline and runs it."""
    parser = argparse.ArgumentParser()
    # Here we add some specific command line arguments we expect.   Specifically
    # we have the input file to load and the output table to write to.
    parser.add_argument(
        '--input',
        dest='input',
        required=False,
        help='Input file to read.  This can be a local file or '
        'a file in a Google Storage Bucket.',
        # This example file contains a total of only 10 lines.
        # Useful for quickly debugging on a small set of data
        default='gs://python-dataflow-example/data_files/head_usa_names.csv')
    # The output defaults to the lake dataset in your BigQuery project.  You'll have
    # to create the lake dataset yourself using this command:
    # bq mk lake
    parser.add_argument('--output',
                        dest='output',
                        required=False,
                        help='Output BQ table to write results to.',
                        default='lake.usa_names_enriched')

    # Parse arguments from the command line.
    known_args, pipeline_args = parser.parse_known_args(argv)

    # DataIngestion is a class we built in this script to hold the logic for
    # transforming the file into a BigQuery table.
    data_ingestion = DataIngestion()

    # Initiate the pipeline using the pipeline arguments passed in from the
    # command line.  This includes information like where Dataflow should store
    #  temp files, and what the project id is
    p = beam.Pipeline(options=PipelineOptions(pipeline_args))
    schema = parse_table_schema_from_json(data_ingestion.schema_str)

    # This function adds in a full state name by looking up the
    # full name in the short_to_long_name_map.  The short_to_long_name_map
    # comes from a read from BigQuery in the next few lines
    def add_full_state_name(row, short_to_long_name_map):
        row['state_full_name'] = short_to_long_name_map[row['state']]
        return row

    # This is a second source of data.  The source is from BigQuery.
    # This will come into our pipeline as a side input.

    read_query = """
    SELECT
    state_name,
    state_abbreviation
    FROM
    `python-dataflow-example.example_data.state_abbreviations`"""

    state_abbreviations = (
        p
        | 'Read from BigQuery' >> beam.io.Read(
            beam.io.BigQuerySource(query=read_query, use_standard_sql=True))
        # We must create a python tuple of key to value pairs here in order to
        # use the data as a side input.  Dataflow will use the keys to distribute the
        # work to the correct worker.
        | 'Abbreviation to Full Name' >>
        beam.Map(lambda row: (row['state_abbreviation'], row['state_name'])))

    (p
     # Read the file.  This is the source of the pipeline.  All further
     # processing starts with lines read from the file.  We use the input
     # argument from the command line.  We also skip the first line which is
     # a header row.
     | 'Read From Text' >> beam.io.ReadFromText(known_args.input,
                                                skip_header_lines=1)
     # Translates from the raw string data in the CSV to a dictionary.
     # The dictionary is keyed by column name, with the values being the
     # values we want to store in BigQuery.
     | 'String to BigQuery Row' >>
     beam.Map(lambda s: data_ingestion.parse_method(s))
     # Here we pass in a side input, which is data that comes from outside our
     # CSV source.  The side input contains a map of states to their full name.
     |
     'Join Data' >> beam.Map(add_full_state_name, AsDict(state_abbreviations))
     # This is the final stage of the pipeline, where we define the destination
     #  of the data.  In this case we are writing to BigQuery.
     | 'Write to BigQuery' >> beam.io.Write(
         beam.io.BigQuerySink(
             # The table name is a required argument for the BigQuery sink.
             # In this case we use the value passed in from the command line.
             known_args.output,
             # Here we use the JSON schema read in from a JSON file.
             # Specifying the schema allows the API to create the table correctly if it does not yet exist.
             schema=schema,
             # Creates the table in BigQuery if it does not yet exist.
             create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
             # Deletes all data in the BigQuery table before writing.
             write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE)))
    p.run().wait_until_finish()
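# A quick, self-contained illustration of the AsDict side-input pattern used
# above, on toy in-memory data with the default runner -- nothing here touches
# BigQuery, and the state rows are made up for the sketch:
import apache_beam as beam
from apache_beam.pvalue import AsDict


def add_full_state_name(row, short_to_long_name_map):
    row['state_full_name'] = short_to_long_name_map[row['state']]
    return row


with beam.Pipeline() as toy_pipeline:
    abbreviations = toy_pipeline | 'Abbreviations' >> beam.Create(
        [('KS', 'Kansas'), ('NY', 'New York')])
    rows = toy_pipeline | 'Rows' >> beam.Create(
        [{'state': 'KS', 'name': 'Dorothy'}, {'state': 'NY', 'name': 'Frank'}])
    (rows
     | 'Join Data' >> beam.Map(add_full_state_name, AsDict(abbreviations))
     | 'Print' >> beam.Map(print))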
Example #28
0
def track_events():
    import logging
    import ujson
    from datetime import datetime
    from uuid import uuid4
    from conf import conf
    from settings.settings import keys, appsflier_fields, events_schema, send_event_schema
    from utils.functions import validate_version, set_properties, validate_appsflyer, send_appsflyer_event
    from utils.redis_client import RedisClient

    project = conf.PROJECT
    redis_client = RedisClient()

    def transform(message):
        event = ujson.loads(message)
        # if event.get("platform") in ("Android", "iOS"):
        #     appsflyer_version = validate_version(event.get("appVersion"), keys.get("appsflyer_version_{}".
        #                                                                            format(event.get("platform"))))
        # else:
        #     appsflyer_version = False

        # event["send_appsflyer"] = validate_appsflyer(event, keys.get("appsflyer_events"), appsflyer_version)
        event["send_appsflyer"] = False
        return [event]

    def send_appsflyer(event):
        event_af = ujson.loads(ujson.dumps(event))
        if (not event_af.get("send_appsflyer")) or project == "merlin-qa":
            return []

        try:
            send_appsflyer_event(event_af, appsflier_fields)
            return [{
                "type": "apps",
                "idEvent": event_af.get("idEvent"),
                "createdAt": "{:%Y-%m-%d %H:%M:%S.%f}".format(datetime.now()),
                "sent": True
            }]
        except Exception as ex:
            logging.error(str(ex))
            return [{
                "type": "apps",
                "idEvent": event_af.get("idEvent"),
                "createdAt": "{:%Y-%m-%d %H:%M:%S.%f}".format(datetime.now()),
                "sent": False,
                "error": str(ex)
            }]

    def transform_bigquery(event):
        from google.cloud import datastore

        event_bq = ujson.loads(ujson.dumps(event))
        event_bq = set_properties(event_bq)
        if "idJob" in event_bq:
            try:
                job_key = datastore.Key.from_legacy_urlsafe(
                    urlsafe=event_bq.get("idJob"))
                event_bq["idJob"] = job_key.name
            except Exception:
                pass
        event_bq.pop("appsflyer", None)
        event_bq.pop("send_appsflyer", None)
        return [event_bq]

    def select_table(bq_event):
        if bq_event.get("eventName") not in keys.get("expensive_events"):
            return [bq_event]
        return []

    def select_table_expensive(bq_event):
        if bq_event.get("eventName") in keys.get("expensive_events"):
            return [bq_event]
        return []

    # Define pipeline
    storage_bucket = "gs://{}/dataflow".format(conf.BUCKET_NAME)
    pipeline = beam.Pipeline(
        runner=conf.RUNNER,
        argv=[
            "--project", project, "--subnetwork",
            "regions/us-central1/subnetworks/default", "--staging_location",
            "{}/staging_location".format(storage_bucket), "--temp_location",
            "{}/temp".format(storage_bucket), "--output",
            "{}/output".format(storage_bucket), "--setup_file", "./setup.py",
            "--job_name", "events-sync-{}".format(str(uuid4())), "--batch",
            "--worker_machine_type", "n1-standard-2"
        ])

    # PTransforms
    # Read messages from the Redis buffer into the pipeline as a bounded
    # in-memory source (despite the step name, this is not a streaming
    # Pub/Sub read).
    events = pipeline | "Read pubsub messages" >> beam.Create(
        redis_client.receive_messages())
    events = events | "Transform message" >> beam.FlatMap(
        lambda message: transform(message))
    # Send event to appsflyer
    # apps_events = events | "Send to appsflyer" >> beam.FlatMap(lambda event: send_appsflyer(event))
    bigquery_events = events | "Transform to bigquery" >> beam.FlatMap(
        lambda event: transform_bigquery(event))
    bigquery_events_normal = bigquery_events | "Select event to bigquery" >> beam.FlatMap(
        lambda event: select_table(event))
    bigquery_events_expensive = bigquery_events | "Select event expensive to bigquery" >> beam.FlatMap(
        lambda event: select_table_expensive(event))

    from apache_beam.io.gcp.bigquery import parse_table_schema_from_json
    bigquery_events_schema = parse_table_schema_from_json(
        ujson.dumps(events_schema))
    bigquery_path = "{}:merlin_events.Event".format(project)
    bigquery_events_normal | "Write events on bigquery" >> beam.io.WriteToBigQuery(
        bigquery_path,
        schema=bigquery_events_schema,
        write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND,
        create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED)

    bigquery_path = "{}:merlin_events.Event_expensive".format(project)
    bigquery_events_expensive | "Write events expensive on bigquery" >> beam.io.WriteToBigQuery(
        bigquery_path,
        schema=bigquery_events_schema,
        write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND,
        create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED)

    bigquery_events_schema = parse_table_schema_from_json(
        ujson.dumps(send_event_schema))
    bigquery_path = "{}:merlin_events.SendEvent".format(project)
    # apps_events | "Write sent apps events on bigquery" >> beam.io.WriteToBigQuery(
    #     bigquery_path,
    #     schema=bigquery_events_schema,
    #     write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND,
    #     create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED
    # )
    result = pipeline.run()
    if conf.ENV == "LOCAL":
        result.wait_until_finish()
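# The select_table / select_table_expensive helpers above implement a two-way
# split by returning either [event] or [] from a FlatMap.  The same routing
# can also be expressed with beam.Filter; a small self-contained sketch on toy
# data, where EXPENSIVE_EVENTS stands in for keys.get("expensive_events"):
import apache_beam as beam

EXPENSIVE_EVENTS = {'purchase', 'refund'}  # placeholder event names

with beam.Pipeline() as toy_pipeline:
    events = toy_pipeline | beam.Create([{'eventName': 'login'},
                                         {'eventName': 'purchase'}])
    normal = events | 'keep normal' >> beam.Filter(
        lambda e: e.get('eventName') not in EXPENSIVE_EVENTS)
    expensive = events | 'keep expensive' >> beam.Filter(
        lambda e: e.get('eventName') in EXPENSIVE_EVENTS)
    normal | 'print normal' >> beam.Map(lambda e: print('normal', e))
    expensive | 'print expensive' >> beam.Map(lambda e: print('expensive', e))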
Example #29
0
    return instances

"""
def run(argv=None):
    # Main pipeline run def

    # Make explicit BQ schema for output tables
    # Note: the "sentiment" FLOAT fields are left out of both schemas here.
    # A comment line cannot sit inside a backslash-continued string, so the
    # disabled fields are noted up front instead of inline.
    bigqueryschema_json = '{"fields": [' \
                          '{"name":"id","type":"STRING"},' \
                          '{"name":"text","type":"NUMERIC"},' \
                          '{"name":"user_id","type":"STRING"},' \
                          '{"name":"posted_at","type":"TIMESTAMP"}' \
                          ']}'
    bigqueryschema = parse_table_schema_from_json(bigqueryschema_json)

    bigqueryschema_mean_json = '{"fields": [' \
                               '{"name":"posted_at","type":"TIMESTAMP"}' \
                               ']}'
    bigqueryschema_mean = parse_table_schema_from_json(bigqueryschema_mean_json)

    """Build and run the pipeline."""
    parser = argparse.ArgumentParser()
    group = parser.add_mutually_exclusive_group(required=False)
    group.add_argument(
        '--input_subscription',
        help=('Input PubSub subscription of the form '
              '"projects/<PROJECT>/subscriptions/<SUBSCRIPTION>."'),
        default="projects/YOUR_PROJECT/subscriptions/YOUR_SUBSCRIPTION"
Example #30
0
            element, orient='index').transpose().fillna(0)
        weight = self._model.predict(new_x.iloc[:, 1:8])[0]
        return [{
            'guid': element['guid'],
            'weight': weight,
            'time': str(element['time'])
        }]


schema = parse_table_schema_from_json(
    json.dumps({
        'fields': [{
            'name': 'guid',
            'type': 'STRING'
        }, {
            'name': 'weight',
            'type': 'FLOAT64'
        }, {
            'name': 'time',
            'type': 'STRING'
        }]
    }))

parser = argparse.ArgumentParser()
known_args, pipeline_args = parser.parse_known_args(None)
pipeline_options = PipelineOptions(pipeline_args)

p = beam.Pipeline(options=pipeline_options)
data = p | 'Read from Bigquery' >> beam.io.Read(
    beam.io.BigQuerySource(query=query, use_standard_sql=True))
scored = data | 'Apply Model' >> beam.ParDo(ApplyDoFn())
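# The snippet above stops after the scoring step; no write or run() call is
# shown.  A plausible completion -- the destination table name here is
# hypothetical, not part of the original snippet -- reuses the schema parsed
# above:
scored | 'Save to BigQuery' >> beam.io.WriteToBigQuery(
    'my_dataset.predictions',  # hypothetical output table
    schema=schema,
    create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
    write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND)

result = p.run()
result.wait_until_finish()
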
def run(argv=None):
    """The main function which creates the pipeline and runs it."""
    parser = argparse.ArgumentParser()
    # Here we add some specific command line arguments we expect.
    # This defaults the output table in your BigQuery project; you'll have
    # to create the lake dataset yourself, e.g. with: bq mk lake
    parser.add_argument('--output', dest='output', required=False,
                        help='Output BQ table to write results to.',
                        default='lake.orders_denormalized_sideinput')

    # Parse arguments from the command line.
    known_args, pipeline_args = parser.parse_known_args(argv)

    # DataLakeToDataMart is a class we built in this script to hold the logic for
    # transforming the file into a BigQuery table.
    data_lake_to_data_mart = DataLakeToDataMart()

    p = beam.Pipeline(options=PipelineOptions(pipeline_args))
    schema = parse_table_schema_from_json(data_lake_to_data_mart.schema_str)

    # This query returns details about the account, normalized into a
    # different table.  We will be joining this data into the main orders
    # dataset in order to create a denormalized table.
    account_details_source = (
        p
        | 'Read Account Details from BigQuery ' >> beam.io.Read(
            beam.io.BigQuerySource(query="""
                SELECT
                  acct_number,
                  acct_company_name,
                  acct_group_name,
                  acct_name,
                  acct_org_name,
                  address,
                  city,
                  state,
                  zip_code,
                  country
                FROM
                  `qwiklabs-resources.python_dataflow_example.account`""",
                                   use_standard_sql=True))
        # This next stage of the pipeline maps the acct_number to a single row
        # of results from BigQuery.  Mapping this way helps Dataflow move your
        # data around to different workers.  When later stages of the pipeline
        # run, all results for a given account number land on one worker.
        | 'Account Details' >> beam.Map(
            lambda row: (row['acct_number'], row)))

    orders_query = data_lake_to_data_mart.get_orders_query()
    (p
     # Read the orders from BigQuery.  This is the source of the pipeline.  All further
     # processing starts with rows read from the query results here.
     | 'Read Orders from BigQuery ' >> beam.io.Read(
        beam.io.BigQuerySource(query=orders_query, use_standard_sql=True))
     # Here we pass in a side input, which is data that comes from outside our
     # main source.  The side input contains a map of account numbers to their
     # account details.
     | 'Join Data with sideInput' >> beam.Map(data_lake_to_data_mart.add_account_details, AsDict(
        account_details_source))
     # This is the final stage of the pipeline, where we define the destination
     # of the data.  In this case we are writing to BigQuery.
     | 'Write Data to BigQuery' >> beam.io.Write(
        beam.io.BigQuerySink(
            # The table name is a required argument for the BigQuery sink.
            # In this case we use the value passed in from the command line.
            known_args.output,
            # Here we use the JSON schema read in from a JSON file.
            # Specifying the schema allows the API to create the table correctly if it does not yet exist.
            schema=schema,
            # Creates the table in BigQuery if it does not yet exist.
            create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
            # Deletes all data in the BigQuery table before writing.
            write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE)))
    p.run().wait_until_finish()
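# DataLakeToDataMart.add_account_details is called above but its body is not
# included in this snippet.  A minimal sketch of what such a join helper might
# look like, assuming the orders rows carry an acct_number column and the side
# input is the {acct_number: account_row} dict built in the 'Account Details'
# step:
def add_account_details(self, row, account_details):
    """Copies the matching account columns onto the order row."""
    account = account_details.get(row['acct_number'])
    if account:
        for key, value in account.items():
            # Keep any field the order row already defines.
            row.setdefault(key, value)
    return row
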
def run(argv=None):
    # Main pipeline run def

    # Make explicit BQ schema for output tables
    bigqueryschema_json = '{"fields": [' \
                          '{"name":"id","type":"STRING"},' \
                          '{"name":"text","type":"NUMERIC"},' \
                          '{"name":"user_id","type":"STRING"},' \
                          '{"name":"sentiment","type":"FLOAT"},' \
                          '{"name":"posted_at","type":"TIMESTAMP"}' \
                          ']}'
    bigqueryschema = parse_table_schema_from_json(bigqueryschema_json)

    bigqueryschema_mean_json = '{"fields": [' \
                               '{"name":"posted_at","type":"TIMESTAMP"},' \
                               '{"name":"sentiment","type":"FLOAT"}' \
                               ']}'
    bigqueryschema_mean = parse_table_schema_from_json(
        bigqueryschema_mean_json)
    """Build and run the pipeline."""
    parser = argparse.ArgumentParser()
    group = parser.add_mutually_exclusive_group(required=False)
    group.add_argument(
        '--input_subscription',
        help=('Input PubSub subscription of the form '
              '"projects/<PROJECT>/subscriptions/<SUBSCRIPTION>."'),
        default="projects/YOUR_PROJECT/subscriptions/YOUR_SUBSCRIPTION")
    group.add_argument('--input_topic',
                       help=('Input PubSub topic of the form '
                             '"projects/<PROJECT>/topics/<TOPIC>."'),
                       default="projects/YOUR_PROJECT/topics/YOUR_TOPIC")

    known_args, pipeline_args = parser.parse_known_args(argv)

    # We use the save_main_session option because one or more DoFn's in this
    # workflow rely on global context (e.g., a module imported at module level).
    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(SetupOptions).save_main_session = True
    pipeline_options.view_as(StandardOptions).streaming = True
    # Run on Cloud Dataflow by default
    pipeline_options.view_as(StandardOptions).runner = 'DataflowRunner'

    google_cloud_options = pipeline_options.view_as(GoogleCloudOptions)
    google_cloud_options.project = 'YOUR_PROJECT'
    google_cloud_options.staging_location = 'gs://YOUR_BEAM_STAGING_BUCKET/staging'
    google_cloud_options.temp_location = 'gs://YOUR_BEAM_STAGING_BUCKET/temp'
    google_cloud_options.region = 'europe-west1'

    p = beam.Pipeline(options=pipeline_options)

    # Read from PubSub into a PCollection.
    if known_args.input_subscription:
        lines = p | "read in tweets" >> beam.io.ReadFromPubSub(
            subscription=known_args.input_subscription,
            with_attributes=False,
            id_label="tweet_id")
    else:
        lines = p | "read in tweets" >> beam.io.ReadFromPubSub(
            topic=known_args.input_topic,
            with_attributes=False,
            id_label="tweet_id")

    # Window them, and batch them into batches of 50 (not too large)
    output_tweets = (
        lines
        | 'assign window key' >> beam.WindowInto(window.FixedWindows(10))
        | 'batch into n batches' >> BatchElements(min_batch_size=49,
                                                  max_batch_size=50)
        | 'predict sentiment' >>
        beam.FlatMap(lambda messages: estimate(messages)))

    # Write to Bigquery
    output_tweets | 'store twitter posts' >> beam.io.WriteToBigQuery(
        table="YOUR_TABLE",
        dataset="YOUR_DATASET",
        schema=bigqueryschema,
        write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND,
        create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
        project="YOUR_PROJECT")

    # Average out and log the mean value
    (output_tweets
     | 'pair with key' >> beam.Map(lambda x: (1, x))
     | 'group by key' >> beam.GroupByKey()
     | 'aggregate and format' >> beam.Map(aggregate_format)
     | 'store aggregated sentiment' >> beam.io.WriteToBigQuery(
         table="YOUR_TABLE",
         dataset="YOUR_DATASET",
         schema=bigqueryschema_mean,
         write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND,
         create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
         project="YOUR_PROJECT"))

    result = p.run()
    result.wait_until_finish()
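# Neither estimate() nor aggregate_format() is defined in this snippet.  As a
# hedged sketch only, aggregate_format could average the per-tweet sentiment
# scores into a single row matching bigqueryschema_mean (field names assumed
# from that schema):
from datetime import datetime


def aggregate_format(keyed_tweets):
    # keyed_tweets is (1, iterable_of_tweet_dicts) coming out of GroupByKey.
    _, tweets = keyed_tweets
    sentiments = [t['sentiment'] for t in tweets if 'sentiment' in t]
    mean = sum(sentiments) / len(sentiments) if sentiments else 0.0
    return {
        'posted_at': datetime.utcnow().strftime('%Y-%m-%d %H:%M:%S'),
        'sentiment': mean,
    }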