Example #1
def run(argv=None):

    parser = argparse.ArgumentParser()
    parser.add_argument('--input',
                        required=True,
                        help='Input file to process.')
    parser.add_argument('--output',
                        required=False,
                        help='Output file to write results to.')
    parser.add_argument('--project',
                        required=False,
                        help='Project ID for datastore')
    known_args, pipeline_args = parser.parse_known_args(argv)
    # We use the save_main_session option because one or more DoFn's in this
    # workflow rely on global context (e.g., a module imported at module level).
    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(SetupOptions).save_main_session = True
    with beam.Pipeline(options=pipeline_options) as p:

        results = (
            p  # pylint: disable=expression-not-assigned
            | 'read' >> ReadFromText(known_args.input)
            | 'split' >> beam.FlatMap(lambda x: re.findall(r'[A-Za-z\']+', x))
            | 'TopPerPrefix' >> TopPerPrefix(5, "")
            # | 'format' >> beam.MapTuple(
            #     lambda prefix, candidates: '%s: %s' % (prefix, candidates))
            | 'create entity' >> beam.MapTuple(
                lambda prefix, candidates: EntityWrapper().make_entity(
                    prefix, candidates))
            | 'write to datastore' >> WriteToDatastore(known_args.project)
            #   | 'write' >> WriteToText(known_args.output)
        )
Example #2
def run(argv=None):
    """This function contains the pipeline logic."""

    parser = argparse.ArgumentParser()
    known_args, pipeline_args = parser.parse_known_args(argv)
    # It is preferable to change the job name between runs.
    pipeline_args.extend([
        '--project=' + project_id,
        '--job_name=datatransfer' + datetime.now().strftime('%Y%m%d%H%M%S%f'),
    ])

    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(SetupOptions).save_main_session = True

    # Form an aggregating query.
    query = """
    SELECT
    CURRENT_DATE() AS date,
    EXTRACT(HOUR FROM CURRENT_TIME()) AS hour,
    location,
    SUM(spend) AS total_spend
    FROM `my_dataset.stream_data`
    WHERE EXTRACT(HOUR FROM timestamp) = EXTRACT(HOUR FROM CURRENT_TIME())
    GROUP BY date, hour, location
    """

    # Datastore kind of the entities resulting from the query.
    kind = 'Hourly spend'

    with beam.Pipeline(options=pipeline_options) as p:
        (p | 'Read from BigQuery' >> Read(BigQuerySource(project=project_id,
                                                         query=query,
                                                         use_standard_sql=True))
           | 'Create entity' >> beam.Map(EntityWrapper(kind).make_entity)
           | 'Write to Datastore' >> WriteToDatastore(project_id))
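Note: Read(BigQuerySource(...)) is the older source idiom. A minimal sketch of the same stage using beam.io.ReadFromBigQuery, assuming a recent Beam SDK and that a GCS temp_location is supplied in pipeline_args (not shown in the original), might look like:

    with beam.Pipeline(options=pipeline_options) as p:
        (p
         | 'Read from BigQuery' >> beam.io.ReadFromBigQuery(
             query=query, use_standard_sql=True, project=project_id)
         | 'Create entity' >> beam.Map(EntityWrapper(kind).make_entity)
         | 'Write to Datastore' >> WriteToDatastore(project_id))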
Example #3
def read_from_datastore(user_options, pipeline_options):
    """Creates a pipeline that reads entities from Cloud Datastore."""
    p = beam.Pipeline(options=pipeline_options)
    # Create a query to read entities from datastore.
    query = make_ancestor_query(user_options.inputKind, user_options.namespace,
                                user_options.ancestor)

    # Read entities from Cloud Datastore into a PCollection.
    lines = p | 'read from datastore' >> ReadFromDatastore(
        user_options.project, query, user_options.namespace)

    # Count the occurrences of each word.
    def count_ones(word_ones):
        (word, ones) = word_ones
        return (word, sum(ones))

    processedTweets = (
        lines
        | 'split' >> (beam.ParDo(processTweet()))
        | 'create entity' >> beam.Map(
            EntityWrapper(user_options.namespace, user_options.outputKind,
                          user_options.ancestor).make_entity)
        | 'write to datastore' >> WriteToDatastore(user_options.project))
    result = p.run()
    # Wait until completion; the main thread can then access the job results.
    result.wait_until_finish()
    return result
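Example #3 builds its query with a make_ancestor_query helper that is not shown. A sketch of such a helper, modeled on the Beam datastore_wordcount sample this code resembles (the exact imports and the datastore_helper alias are assumptions), could be:

    from google.cloud.proto.datastore.v1 import entity_pb2
    from google.cloud.proto.datastore.v1 import query_pb2
    from google.cloud.proto.datastore.v1.query_pb2 import PropertyFilter
    from googledatastore import helper as datastore_helper

    def make_ancestor_query(kind, namespace, ancestor):
        """Creates a Cloud Datastore query filtered to a single ancestor key."""
        ancestor_key = entity_pb2.Key()
        datastore_helper.add_key_path(ancestor_key, kind, ancestor)
        if namespace is not None:
            ancestor_key.partition_id.namespace_id = namespace

        query = query_pb2.Query()
        query.kind.add().name = kind
        datastore_helper.set_property_filter(query.filter, '__key__',
                                             PropertyFilter.HAS_ANCESTOR,
                                             ancestor_key)
        return query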
Example #4
def dataflow(run_local):
    if run_local:
        input_file_path = 'sample.csv'
    else:
        input_file_path = 'gs://' + BUCKET + '/' + INPUT_FILENAME

    JOB_NAME = 'datastore-upload-{}'.format(
        datetime.datetime.now().strftime('%Y-%m-%d-%H%M%S'))

    pipeline_options = {
        'project': PROJECT,
        'staging_location': 'gs://' + BUCKET + '/staging',
        'runner': 'DataflowRunner',
        'job_name': JOB_NAME,
        'disk_size_gb': 100,
        'temp_location': 'gs://' + BUCKET + '/temp',
        'save_main_session': True
    }

    if run_local:
        pipeline_options['runner'] = 'DirectRunner'

    options = PipelineOptions.from_dictionary(pipeline_options)
    with beam.Pipeline(options=options) as p:

        (p | 'Reading input file' >> beam.io.ReadFromText(input_file_path)
         | 'Converting from csv to dict' >> beam.ParDo(CSVtoDict(), [
             'sku', 'name', 'regularPrice', 'salePrice', 'type', 'url',
             'image', 'inStoreAvailability'
         ])
         | 'Create entities' >> beam.ParDo(CreateEntities())
         | 'Write entities into Datastore' >> WriteToDatastore(PROJECT))
Example #5
def model_datastoreio():
  """Using a Read and Write transform to read/write to Cloud Datastore."""

  import uuid
  from google.cloud.proto.datastore.v1 import entity_pb2
  from google.cloud.proto.datastore.v1 import query_pb2
  import googledatastore
  import apache_beam as beam
  from apache_beam.options.pipeline_options import PipelineOptions
  from apache_beam.io.gcp.datastore.v1.datastoreio import ReadFromDatastore
  from apache_beam.io.gcp.datastore.v1.datastoreio import WriteToDatastore

  project = 'my_project'
  kind = 'my_kind'
  query = query_pb2.Query()
  query.kind.add().name = kind

  # [START model_datastoreio_read]
  p = beam.Pipeline(options=PipelineOptions())
  entities = p | 'Read From Datastore' >> ReadFromDatastore(project, query)
  # [END model_datastoreio_read]

  # [START model_datastoreio_write]
  p = beam.Pipeline(options=PipelineOptions())
  musicians = p | 'Musicians' >> beam.Create(
      ['Mozart', 'Chopin', 'Beethoven', 'Vivaldi'])

  def to_entity(content):
    entity = entity_pb2.Entity()
    googledatastore.helper.add_key_path(entity.key, kind, str(uuid.uuid4()))
    googledatastore.helper.add_properties(entity, {'content': unicode(content)})
    return entity

  entities = musicians | 'To Entity' >> beam.Map(to_entity)
  entities | 'Write To Datastore' >> WriteToDatastore(project)
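Example #5 targets the Python 2 era v1 datastoreio (note the unicode() call and the googledatastore helpers). A rough Python 3 sketch of the same write using the client-based v1new API, assuming a Beam release that ships apache_beam.io.gcp.datastore.v1new, could be:

    import uuid
    import apache_beam as beam
    from apache_beam.options.pipeline_options import PipelineOptions
    from apache_beam.io.gcp.datastore.v1new.datastoreio import WriteToDatastore
    from apache_beam.io.gcp.datastore.v1new.types import Entity, Key

    project = 'my_project'  # placeholder, as in the original example
    kind = 'my_kind'

    def to_entity(content):
        # Build a client-style Entity keyed by a random UUID.
        key = Key([kind, str(uuid.uuid4())], project=project)
        entity = Entity(key)
        entity.set_properties({'content': content})
        return entity

    with beam.Pipeline(options=PipelineOptions()) as p:
        (p
         | 'Musicians' >> beam.Create(['Mozart', 'Chopin', 'Beethoven', 'Vivaldi'])
         | 'To Entity' >> beam.Map(to_entity)
         | 'Write To Datastore' >> WriteToDatastore(project))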
Example #6
def main():
    args, pipe_args = process_pipe_options()
    pipe_args.view_as(SetupOptions).save_main_session = True
    with beam.Pipeline(options=pipe_args) as p:
        (p | 'Read Similarities' >> beam.io.ReadFromText(args.input)
           | "Create Entities" >> beam.Map(
               EntityWrapper(args.kind, args.sim_cap).make_entity)
           | "Write to DS" >> WriteToDatastore(args.project))
Example #7
def write_to_datastore(user_options, pipeline_options):
    """Creates a pipeline that writes entities to Cloud Datastore."""
    with beam.Pipeline(options=pipeline_options) as p:

        # pylint: disable=expression-not-assigned
        (p
         | 'read' >> ReadFromText(user_options.input)
         | 'create entity' >> beam.Map(
             EntityWrapper(user_options.namespace, user_options.kind,
                           user_options.ancestor).make_entity)
         | 'write to datastore' >> WriteToDatastore(user_options.dataset))
Example #8
def write_to_datastore(project, user_options, pipeline_options):
  """Creates a pipeline that writes entities to Cloud Datastore."""
  p = beam.Pipeline(options=pipeline_options)

  # pylint: disable=expression-not-assigned
  (p
   | 'read' >> ReadFromText(user_options.input)
   | 'create entity' >> beam.Map(
       EntityWrapper(user_options.namespace, user_options.kind,
                     user_options.ancestor).make_entity)
   | 'write to datastore' >> WriteToDatastore(project))

  # Actually run the pipeline (all operations above are deferred).
  p.run().wait_until_finish()
Example #9
def dataflow(argv=None):
    process_options = PipelineOptions().view_as(ProcessOptions)
    p = beam.Pipeline(options=process_options)

    (p
     | 'Read From Text' >> beam.io.ReadFromText(process_options.input,
                                                skip_header_lines=0)
     | 'Process CSV' >> beam.ParDo(ProcessCSV(), ['text', 'label'])
     | 'Build entities' >>
     beam.ParDo(BuildEntities(), process_options.entity, process_options.user,
                process_options.dataset)
     | 'Write entities into Datastore' >> WriteToDatastore('io-annotator-api'))

    p.run().wait_until_finish()
Example #10
  def test_DatastoreWriteLargeEntities(self):
    """100*100kB entities gets split over two Commit RPCs."""
    with patch.object(helper, 'get_datastore',
                      return_value=self._mock_datastore):
      entities = [e.entity for e in fake_datastore.create_entities(100)]

      datastore_write_fn = _Mutate.DatastoreWriteFn(self._PROJECT)
      datastore_write_fn.start_bundle()
      for entity in entities:
        datastore_helper.add_properties(
            entity, {'large': u'A' * 100000}, exclude_from_indexes=True)
        datastore_write_fn.process(WriteToDatastore.to_upsert_mutation(entity))
      datastore_write_fn.finish_bundle()

      self.assertEqual(2, self._mock_datastore.commit.call_count)
Example #11
  def test_DatastoreWriteLargeEntities(self):
    """100*100kB entities gets split over two Commit RPCs."""
    with patch.object(helper, 'get_datastore',
                      return_value=self._mock_datastore):
      entities = [e.entity for e in fake_datastore.create_entities(100)]

      datastore_write_fn = _Mutate.DatastoreWriteFn(
          self._PROJECT, fixed_batch_size=_Mutate._WRITE_BATCH_INITIAL_SIZE)
      datastore_write_fn.start_bundle()
      for entity in entities:
        datastore_helper.add_properties(
            entity, {'large': u'A' * 100000}, exclude_from_indexes=True)
        datastore_write_fn.process(WriteToDatastore.to_upsert_mutation(entity))
      datastore_write_fn.finish_bundle()

      self.assertEqual(2, self._mock_datastore.commit.call_count)
Example #12
def run(pipeline_options, known_args):

    pipeline = beam.Pipeline(options=pipeline_options)
    gcp_project = pipeline_options.get_all_options()['project']

    with impl.Context(known_args.transform_temp_dir):
        articles = (pipeline
                    | 'Read articles from BigQuery' >> beam.io.Read(
                        beam.io.BigQuerySource(project=gcp_project,
                                               query=get_source_query(
                                                   known_args.limit),
                                               use_standard_sql=True)))

        articles_dataset = (articles, get_metadata())
        embeddings_dataset, _ = (
            articles_dataset
            | 'Extract embeddings' >>
            impl.AnalyzeAndTransformDataset(preprocess_fn))

        embeddings, transformed_metadata = embeddings_dataset

        embeddings | 'Write embeddings to TFRecords' >> beam.io.tfrecordio.WriteToTFRecord(
            file_path_prefix='{0}'.format(known_args.output_dir),
            file_name_suffix='.tfrecords',
            coder=tft_coders.example_proto_coder.ExampleProtoCoder(
                transformed_metadata.schema),
            num_shards=int(known_args.limit / 25000))

        (articles
         | 'Convert to entity' >> beam.Map(
             lambda input_features: create_entity(input_features, known_args.kind))
         | 'Write to Datastore' >> WriteToDatastore(project=gcp_project))

        if known_args.enable_debug:
            embeddings | 'Debug Output' >> beam.io.textio.WriteToText(
                file_path_prefix=known_args.debug_output_prefix,
                file_name_suffix='.txt')

    job = pipeline.run()

    if pipeline_options.get_all_options()['runner'] == 'DirectRunner':
        job.wait_until_finish()
Example #13
def dataflow(run_local):
    JOB_NAME = 'firestore-upload-{}'.format(
        datetime.datetime.now().strftime('%Y-%m-%d-%H%M%S'))

    pipeline_options = {
        'project': PROJECT,
        'staging_location': 'gs://' + BUCKET + '/staging',
        'runner': 'DataflowRunner',
        'job_name': JOB_NAME,
        'disk_size_gb': 100,
        'temp_location': 'gs://' + BUCKET + '/staging',
        'save_main_session': True,
        'requirements_file': 'requirements.txt'
    }

    if run_local:
        pipeline_options['runner'] = 'DirectRunner'

    options = PipelineOptions.from_dictionary(pipeline_options)

    with beam.Pipeline(options=options) as p:
        (p | 'Reading input file' >> beam.Create([1])
         | 'Create entities' >> beam.ParDo(CreateEntities())
         | 'Write entities into Datastore' >> WriteToDatastore(PROJECT))
Example #14
    # Columns: id, president, startYear, endYear, party, homeState, dateOfBirth
    fields = line.split(',')
    id = fields[0]
    president = fields[1]
    names = president.split(' ')
    firstName = names[0]
    lastName = names[1]
    startYear = fields[2]
    endYear = fields[3]
    party = fields[4]
    homeState = fields[5]
    dateOfBirth = fields[6]
    googledatastore.helper.add_key_path(entity.key, kind, str(id))
    googledatastore.helper.add_properties(
        entity, {
            'firstName': unicode(firstName),
            'lastName': unicode(lastName),
            'startYear': int(startYear),
            'endYear': int(endYear),
            'party': unicode(party),
            'homeState': unicode(homeState),
            'dateOfBirth': datetime.strptime(dateOfBirth, '%Y-%m-%d')
        })
    return entity


entities = lines | 'To Entity' >> beam.Map(to_entity)
entities | 'Write To Datastore' >> WriteToDatastore(project)
# lines | 'Write to Cloud Storage' >> beam.io.WriteToText('gs://[GCLOUD_BUCKET]/out')

p.run().wait_until_finish()
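The snippet above starts mid-function and references p, lines, kind and project without showing where they come from. A hypothetical reconstruction of the assumed scaffolding, consistent with the visible lines (the project id, kind name and CSV path are placeholders), might be:

    from datetime import datetime
    import apache_beam as beam
    import googledatastore
    from apache_beam.options.pipeline_options import PipelineOptions
    from apache_beam.io.gcp.datastore.v1.datastoreio import WriteToDatastore
    from google.cloud.proto.datastore.v1 import entity_pb2

    project = 'my-project'  # assumed project id
    kind = 'President'      # assumed Datastore kind

    p = beam.Pipeline(options=PipelineOptions(['--project', project]))
    # CSV of presidents; the path is a placeholder.
    lines = p | 'Read CSV' >> beam.io.ReadFromText('gs://my-bucket/presidents.csv',
                                                   skip_header_lines=1)

    def to_entity(line):
        entity = entity_pb2.Entity()
        # ... the field parsing and add_key_path/add_properties shown above ...
        return entity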
Example #15
    key = element['name'].upper()
    while key != '':
        for i in range(2, len(key) + 1):
            result.append(new_elm(element, key[0:i]))
        key = re.sub('\\S+\\s*\\W*', '', key, count=1)
    return result


class EntityWrapper(object):
    def __init__(self, namespace, kind):
        self._namespace = namespace
        self._kind = kind

    def make_entity(self, content):
        entity = entity_pb2.Entity()
        if self._namespace is not None:
            entity.key.partition_id.namespace_id = self._namespace

        helper.add_key_path(entity.key, self._kind, str(uuid.uuid4()))
        helper.add_properties(entity, content)
        return entity


p = beam.Pipeline(options=options)
(p | 'query from bq' >> beam.io.Read(
    beam.io.BigQuerySource(query="select * from bestbuy.products"))
 | 'generate key' >> beam.FlatMap(gen_key)
 | 'make entry' >> beam.Map(EntityWrapper(None, 'products3').make_entity)
 | WriteToDatastore("sample-datalab"))
p.run()
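This snippet also opens mid-function: the pipeline pipes beam.FlatMap(gen_key), so the visible loop is presumably the body of gen_key. A hypothetical wrapper consistent with the visible code (the new_elm helper is not shown, and its behavior here is a guess) would be:

    import re

    def new_elm(element, key_prefix):
        # Hypothetical: copy the BigQuery row and attach the generated prefix.
        out = dict(element)
        out['key_prefix'] = key_prefix
        return out

    def gen_key(element):
        """Emit one record per prefix of each word in element['name']."""
        result = []
        key = element['name'].upper()
        while key != '':
            for i in range(2, len(key) + 1):
                result.append(new_elm(element, key[0:i]))
            # Drop the leading word and continue with the rest of the name.
            key = re.sub(r'\S+\s*\W*', '', key, count=1)
        return result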
Example #16
def process_datastore_tweets(project, pipeline_options):
    """Creates a pipeline that reads tweets from Cloud Datastore from the last
  N days. The pipeline finds the top most-used words, the top most-tweeted
  URLs, ranks word co-occurrences by an 'interestingness' metric (similar to
  on tf* idf).
  """

    user_options = pipeline_options.view_as(UserOptions)
    hours = 20
    p = beam.Pipeline(options=pipeline_options)

    # Read entities from Cloud Datastore into a PCollection, then filter to get
    # only the entities from the most recent `hours` hours.
    lines = (p | QueryDatastore(project, hours)
             | beam.ParDo(FilterDate(user_options, hours)))

    # Process the tweet.
    processedTweets = (lines
                       | 'processTweets' >>
                       (beam.ParDo(processTweet(user_options))))

    # Define some inline helper functions.

    def join_cinfo(cooccur, percents):
        """Calculate a co-occurence ranking."""
        import math

        word1 = cooccur[0][0]
        word2 = cooccur[0][1]
        try:
            word1_percent = percents[word1]
            weight1 = 1 / word1_percent
            word2_percent = percents[word2]
            weight2 = 1 / word2_percent
            return (cooccur[0], cooccur[1],
                    cooccur[1] * math.log(min(weight1, weight2)))
        except:
            return 0

    def generate_cooccur_schema():
        """BigQuery schema for the word co-occurrence table."""
        json_str = json.dumps({
            'fields': [{
                'name': 'w1',
                'type': 'STRING',
                'mode': 'NULLABLE'
            }, {
                'name': 'w2',
                'type': 'STRING',
                'mode': 'NULLABLE'
            }, {
                'name': 'count',
                'type': 'INTEGER',
                'mode': 'NULLABLE'
            }, {
                'name': 'log_weight',
                'type': 'FLOAT',
                'mode': 'NULLABLE'
            }, {
                'name': 'ts',
                'type': 'TIMESTAMP',
                'mode': 'NULLABLE'
            }]
        })
        # {'name': 'ts', 'type': 'STRING', 'mode': 'NULLABLE'}]})
        return parse_table_schema_from_json(json_str)

    def generate_url_schema():
        """BigQuery schema for the urls count table."""
        json_str = json.dumps({
            'fields': [{
                'name': 'url',
                'type': 'STRING',
                'mode': 'NULLABLE'
            }, {
                'name': 'count',
                'type': 'INTEGER',
                'mode': 'NULLABLE'
            }, {
                'name': 'ts',
                'type': 'TIMESTAMP',
                'mode': 'NULLABLE'
            }]
        })
        # {'name': 'ts', 'type': 'STRING', 'mode': 'NULLABLE'}]})
        return parse_table_schema_from_json(json_str)

    def generate_wc_schema():
        """BigQuery schema for the word count table."""
        json_str = json.dumps({
            'fields': [{
                'name': 'word',
                'type': 'STRING',
                'mode': 'NULLABLE'
            }, {
                'name': 'percent',
                'type': 'FLOAT',
                'mode': 'NULLABLE'
            }, {
                'name': 'ts',
                'type': 'TIMESTAMP',
                'mode': 'NULLABLE'
            }]
        })
        # {'name': 'ts', 'type': 'STRING', 'mode': 'NULLABLE'}]})
        return parse_table_schema_from_json(json_str)

    # Write the results to three BigQuery tables.
    (processedTweets
     | 'create entity' >> beam.Map(
         EntityWrapper("", "processedTweets", "root").make_entity)
     | 'processed tweet write' >> WriteToDatastore(project))

    # Actually run the pipeline.
    return p.run()
Example #17
def run():
    argv = [
        '--project={0}'.format(PROJECT), '--job_name=shq-demo-data-{}'.format(
            datetime.now().strftime('%Y%m%d%H%M%S')), '--save_main_session',
        '--requirements_file=requirements.txt',
        '--staging_location=gs://{0}/staging/'.format(BUCKET),
        '--temp_location=gs://{0}/staging/'.format(BUCKET),
        '--runner=DataflowRunner'
    ]

    # create the pipeline
    p = beam.Pipeline(argv=argv)

    # get pcollection of users
    # read rows (dicts) from BQ
    # convert offset into actual date relative to today
    users = (
        p
        | 'read users from BQ' >> beam.io.Read(
            beam.io.BigQuerySource(
                query=
                'SELECT * FROM [success-hq:datastore.user] order by email {}'.
                format(USER_LIMIT)))
        | 'get users with reg dates' >> beam.Map(get_user_with_regdate))

    # create list of companies and reg dates based on earliest user reg_date
    companies = (
        users
        | 'get company and reg date from user' >> beam.Map(get_company_and_regdate)
        | 'find first reg_date for company' >> beam.CombinePerKey(min))

    # convert rows into datastore entities
    # write entities into datastore
    (users
     | 'build user entity' >> beam.Map(build_user_entity)
     | 'write user to Datastore' >> WriteToDatastore(PROJECT))

    # convert into datastore entities
    # write entities into datastore
    (companies
     | 'build company entity' >> beam.Map(build_company_entity)
     | 'write company to Datastore' >> WriteToDatastore(PROJECT))

    # create projects in datastore
    (companies
     | 'create project for company' >> beam.Map(build_project_entities)
     | 'write project to Datastore' >> WriteToDatastore(PROJECT))

    # create trending in datastore
    (companies
     | 'create trending for company' >> beam.Map(build_trending_entities)
     | 'write trending to Datastore' >> WriteToDatastore(PROJECT))

    # create events for company
    company_events = (
        companies
        | 'build company events' >> beam.FlatMap(build_company_events)
        | 'expand company events' >> beam.FlatMap(expand_events))

    # write company events into BQ
    (company_events
     | 'write to BQ table' >> beam.io.Write(
         beam.io.BigQuerySink(
             project=PROJECT,
             dataset=DATASET,
             table='company_events',
             write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE)))

    # find purchases for all companies
    purchases = (
        company_events
        | 'get purchased amounts' >> beam.FlatMap(get_purchased_amounts)
        | 'sum purchased amounts' >> beam.CombinePerKey(sum))

    # find provisions for all companies
    provisions = (
        company_events
        | 'get provisioned amounts' >> beam.FlatMap(get_provisioned_amounts)
        | 'sum provisioned amounts' >> beam.CombinePerKey(sum))

    # combine purchase and provision pcollections
    company_updates = {
        'purchased': purchases,
        'provisioned': provisions
    } | beam.CoGroupByKey()

    # write renewal records to datastore
    (company_updates
     | 'create renewal for company' >> beam.Map(build_renewal_entities)
     | 'write renewals to Datastore' >> WriteToDatastore(PROJECT))

    # create registration events for users
    reg_events = users | 'build reg events' >> beam.Map(build_reg_event)

    # create ticket events for users
    ticket_events = (
        users | 'build ticket events' >> beam.FlatMap(build_ticket_events))

    # create call events for users
    call_events = (users
                   | 'build call events' >> beam.FlatMap(build_call_events)
                   | 'expand call events' >> beam.FlatMap(expand_events))

    # combine the pcollections
    events = (reg_events, ticket_events, call_events) | beam.Flatten()

    # take daily collections and write them into bq
    (events
     | 'write to bq' >> beam.io.Write(
         beam.io.BigQuerySink(
             '{}:{}.{}'.format(PROJECT, DATASET, TEMP),
             write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND)))

    # run the pipeline
    print('waiting for pipeline to finish, bq partition still to come')
    print('do not close cloud shell window')
    status = p.run().wait_until_finish()

    # copy stuff from temp into partitions
    print('starting bq partition work')
    today = date.today()
    days_past = 182
    bq_client = bigquery.Client(project=PROJECT)
    bq_dataset = bq_client.dataset(DATASET)
    for index in range(0, days_past):
        query_day = (datetime.now() + timedelta(days=1 - index)).date()
        query_start = query_day.strftime('%Y-%m-%d 00:00:00')
        query_end = query_day.strftime('%Y-%m-%d 23:59:59')
        part_string = query_day.strftime('%Y%m%d')
        query = 'SELECT * FROM {}.{} where date >= "{}" and date <= "{}"'.format(
            DATASET, TEMP, query_start, query_end)
        bq_target = bq_dataset.table('user_events${}'.format(part_string))
        job = bq_client.run_async_query(
            'bq_load_{}'.format(datetime.now().strftime('%Y%m%d%H%M%S%f')),
            query)
        job.destination = bq_target
        job.write_disposition = 'WRITE_TRUNCATE'
        job.begin()
    print('Done! You can close the Cloud Shell window')
Example #18
def run(argv=None):
    """Main entry point."""

    parser = argparse.ArgumentParser()

    parser.add_argument('--kind',
                        dest='kind',
                        default='writereadtest',
                        help='Datastore Kind')
    parser.add_argument('--num_entities',
                        dest='num_entities',
                        type=int,
                        required=True,
                        help='Number of entities to write')
    parser.add_argument('--limit',
                        dest='limit',
                        type=int,
                        help='Limit of number of entities to write')

    known_args, pipeline_args = parser.parse_known_args(argv)
    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(SetupOptions).save_main_session = True
    gcloud_options = pipeline_options.view_as(GoogleCloudOptions)
    job_name = gcloud_options.job_name
    kind = known_args.kind
    num_entities = known_args.num_entities
    project = gcloud_options.project
    # A random ancestor key.
    ancestor = str(uuid.uuid4())
    query = make_ancestor_query(kind, None, ancestor)

    # Pipeline 1: Create and write the specified number of Entities to the
    # Cloud Datastore.
    logging.info('Writing %s entities to %s', num_entities, project)
    p = new_pipeline_with_job_name(pipeline_options, job_name, '-write')

    # pylint: disable=expression-not-assigned
    (p
     | 'Input' >> beam.Create(list(range(known_args.num_entities)))
     | 'To String' >> beam.Map(str)
     | 'To Entity' >> beam.Map(EntityWrapper(kind, None, ancestor).make_entity)
     | 'Write to Datastore' >> WriteToDatastore(project))

    p.run()

    # Optional Pipeline 2: If a read limit was provided, read it and confirm
    # that the expected entities were read.
    if known_args.limit is not None:
        logging.info(
            'Querying a limited set of %s entities and verifying count.',
            known_args.limit)
        p = new_pipeline_with_job_name(pipeline_options, job_name,
                                       '-verify-limit')
        query_with_limit = query_pb2.Query()
        query_with_limit.CopyFrom(query)
        query_with_limit.limit.value = known_args.limit
        entities = p | 'read from datastore' >> ReadFromDatastore(
            project, query_with_limit)
        assert_that(entities | beam.combiners.Count.Globally(),
                    equal_to([known_args.limit]))

        p.run()

    # Pipeline 3: Query the written Entities and verify result.
    logging.info('Querying entities, asserting they match.')
    p = new_pipeline_with_job_name(pipeline_options, job_name, '-verify')
    entities = p | 'read from datastore' >> ReadFromDatastore(project, query)

    assert_that(entities | beam.combiners.Count.Globally(),
                equal_to([num_entities]))

    p.run()

    # Pipeline 4: Delete Entities.
    logging.info('Deleting entities.')
    p = new_pipeline_with_job_name(pipeline_options, job_name, '-delete')
    entities = p | 'read from datastore' >> ReadFromDatastore(project, query)
    # pylint: disable=expression-not-assigned
    (entities
     | 'To Keys' >> beam.Map(lambda entity: entity.key)
     | 'Delete keys' >> DeleteFromDatastore(project))

    p.run()

    # Pipeline 5: Query the written Entities, verify no results.
    logging.info(
        'Querying for the entities to make sure there are none present.')
    p = new_pipeline_with_job_name(pipeline_options, job_name,
                                   '-verify-deleted')
    entities = p | 'read from datastore' >> ReadFromDatastore(project, query)

    assert_that(entities | beam.combiners.Count.Globally(), equal_to([0]))

    p.run()
Example #19
def run(argv=None):

    pipeline_args = [
        '--project={0}'.format(PROJECT), '--job_name=majesticmillion1',
        '--save_main_session',
        '--staging_location=gs://{0}/staging/'.format(BUCKET),
        '--temp_location=gs://{0}/temp/'.format(BUCKET), '--num_workers=4',
        '--runner=DataflowRunner',
        '--inputFile=gs://{0}/Sample_Data/majestic_million.csv'.format(BUCKET),
        '--template_location=gs://{0}/templates/majestic_million_template'.
        format(BUCKET), '--zone=australia-southeast1-a'
        #  '--region=australia-southeast1',
    ]
    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(SetupOptions).save_main_session = True
    inbound_options = pipeline_options.view_as(FileLoader)
    input = inbound_options.inputFile

    with beam.Pipeline(options=pipeline_options) as p:
        TLD_Desc = (
            p
            | 'Read TLD Description File' >> beam.io.ReadFromText(TLDFile)
            | 'Parse Descriptions' >> beam.ParDo(combine_TLD())
            | 'Combine Descriptions to Dictionary' >>
            beam.CombineGlobally(combine_pdict))

        excludedTLDs = (
            p
            | 'Read excluded TLD file' >> beam.io.ReadFromText(excludedTLDFile)
            | 'Get list of excluded TLD' >> beam.ParDo(lambda x: x.split(',')))

        # Extract records as dictionaries
        records = (
            p
            | 'Read File' >> beam.io.ReadFromText(input, skip_header_lines=1)
            | 'Parse CSV' >> beam.ParDo(Split(), SCHEMA)
            | 'Add Descriptions' >> beam.ParDo(
                AddDTLDDesc(), beam.pvalue.AsSingleton(TLD_Desc)))

        # Write TLD aggregations to BigQuery
        (records | 'Aggregate TLDS' >> CountTLDs(excludedTLDs)
         | 'Write TLDs to BigQuery' >> beam.io.WriteToBigQuery(
             '{0}:{1}.TLDCounts'.format(PROJECT,
                                        DATASET),  # Enter your table name
             schema=TLD_SCHEMA,
             create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
             write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE))

        # Write all records to BigQuery
        (records
         | 'Write Items BQ' >> beam.io.WriteToBigQuery(
             '{0}:{1}.TopSites'.format(PROJECT,
                                       DATASET),  # Enter your table name
             schema=SCHEMA + "," + DESCRIPTIONSCHEMA,
             create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
             write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE))

        # Write metadata to Datastore
        (records
         | 'Get Record Count' >> beam.combiners.Count.Globally()
         | 'Create Metadata' >> beam.ParDo(
             GetMetaData(inbound_options.inputFile))
         | 'Create DS Entity' >> beam.Map(create_ds_entity)
         | 'Write To DS' >> WriteToDatastore(PROJECT))

    p.run()