Example #1
def delete_from_datastore(project, pipeline_options, run_locally):
    """Creates a pipeline that reads entities from Cloud Datastore."""
    p = beam.Pipeline(options=pipeline_options)
    # Create a query to read entities from datastore.
    client = datastore.Client()

    if run_locally:
        pass
        #q.add_filter('category', '=', 'BEBOP')

    q = client.query(kind='PRDebugAttendee')
    q.order = ['-created_date']
    results = list(q.fetch(1))
    if not results:
        logging.error('No PRDebugAttendee objects found')
        return

    newest_date = results[0]['created_date']
    logging.info('Deleting elements older than %s', newest_date)

    q1 = client.query(kind='PRDebugAttendee')
    q2 = client.query(kind='PRCityCategory')
    datastore_1 = p | 'read PRDebugAttendee from datastore' >> ReadFromDatastore(
        project, query._pb_from_query(q1), num_splits=400)
    datastore_2 = p | 'read PRCityCategory from datastore' >> ReadFromDatastore(
        project, query._pb_from_query(q2), num_splits=400)
    # Set up our map/reduce pipeline
    output = (
        (datastore_1, datastore_2) | beam.Flatten()
        | 'convert to entity' >> beam.Map(ConvertToEntity)
        # Find the events we want to count, and expand all the admins/attendees
        | 'find old rankings' >> beam.FlatMap(OldPRRecord, newest_date)
        # And save it all back to the database
    )
    if not run_locally:
        output | 'delete from datastore' >> beam.ParDo(DeleteFromDatastore())
        """
        (output
            | 'convert from entity' >> beam.Map(ConvertFromEntity)
            | 'write to datastore' >> WriteToDatastore(client.project)
        )
        """

    # Actually run the pipeline (all operations above are deferred).
    result = p.run()
    # Wait until completion; the main thread can then access post-completion job results.
    result.wait_until_finish()
    return result
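This fragment omits its imports and the project-specific DoFns it uses (ConvertToEntity, OldPRRecord, DeleteFromDatastore). A minimal sketch of the imports it appears to assume, based only on the APIs called above; the original project's module layout may differ:

import logging

import apache_beam as beam
from apache_beam.io.gcp.datastore.v1.datastoreio import ReadFromDatastore

# google-cloud-datastore client library (pre-2.0 layout); the module-level
# query._pb_from_query helper converts a client Query into its protobuf form.
from google.cloud import datastore
from google.cloud.datastore import query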
Example #2
  def check_estimated_size_bytes(self, entity_bytes, timestamp, namespace=None):
    """A helper method to test get_estimated_size_bytes"""

    timestamp_req = helper.make_request(
        self._PROJECT, namespace, helper.make_latest_timestamp_query(namespace))
    timestamp_resp = self.make_stats_response(
        {'timestamp': datastore_helper.from_timestamp(timestamp)})
    kind_stat_req = helper.make_request(
        self._PROJECT, namespace, helper.make_kind_stats_query(
            namespace, self._query.kind[0].name,
            datastore_helper.micros_from_timestamp(timestamp)))
    kind_stat_resp = self.make_stats_response(
        {'entity_bytes': entity_bytes})

    def fake_run_query(req):
      if req == timestamp_req:
        return timestamp_resp
      elif req == kind_stat_req:
        return kind_stat_resp
      else:
        print(kind_stat_req)
        raise ValueError("Unknown req: %s" % req)

    self._mock_datastore.run_query.side_effect = fake_run_query
    self.assertEqual(entity_bytes, ReadFromDatastore.get_estimated_size_bytes(
        self._PROJECT, namespace, self._query, self._mock_datastore))
    self.assertEqual(self._mock_datastore.run_query.call_args_list,
                     [call(timestamp_req), call(kind_stat_req)])
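The make_stats_response helper is defined on the test class but not shown here. A plausible sketch, assuming the google.cloud.proto.datastore.v1 protos and the googledatastore helper module used elsewhere in these examples; the actual helper may differ:

from google.cloud.proto.datastore.v1 import datastore_pb2
from googledatastore import helper as datastore_helper

def make_stats_response(self, property_map):
  # Sketch of a method on the test class: wrap one synthetic stats entity in a
  # RunQueryResponse so the mocked run_query call has something to return.
  resp = datastore_pb2.RunQueryResponse()
  entity_result = resp.batch.entity_results.add()
  datastore_helper.add_properties(entity_result.entity, property_map)
  return resp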
Example #3
    def test_SplitQueryFn_without_num_splits(self):
        with patch.object(helper,
                          'get_datastore',
                          return_value=self._mock_datastore):
            # Force SplitQueryFn to compute the number of query splits
            num_splits = 0
            expected_num_splits = 23
            entity_bytes = (expected_num_splits *
                            ReadFromDatastore._DEFAULT_BUNDLE_SIZE_BYTES)
            with patch.object(ReadFromDatastore,
                              'get_estimated_size_bytes',
                              return_value=entity_bytes):

                def fake_get_splits(datastore,
                                    query,
                                    num_splits,
                                    partition=None):
                    return self.split_query(query, num_splits)

                with patch.object(query_splitter,
                                  'get_splits',
                                  side_effect=fake_get_splits):
                    split_query_fn = ReadFromDatastore.SplitQueryFn(
                        self._PROJECT, self._query, None, num_splits)
                    split_query_fn.start_bundle()
                    returned_split_queries = []
                    for split_query in split_query_fn.process(self._query):
                        returned_split_queries.append(split_query)

                    self.assertEqual(len(returned_split_queries),
                                     expected_num_splits)
                    self.assertEqual(
                        0, len(self._mock_datastore.run_query.call_args_list))
                    self.verify_unique_keys(returned_split_queries)
Example #4
    def test_SplitQueryFn_with_exception(self):
        """A test that verifies that no split is performed when failures occur."""
        with patch.object(helper,
                          'get_datastore',
                          return_value=self._mock_datastore):
            # Force SplitQueryFn to compute the number of query splits
            num_splits = 0
            expected_num_splits = 1
            entity_bytes = (expected_num_splits *
                            ReadFromDatastore._DEFAULT_BUNDLE_SIZE_BYTES)
            with patch.object(ReadFromDatastore,
                              'get_estimated_size_bytes',
                              return_value=entity_bytes):

                with patch.object(
                        query_splitter,
                        'get_splits',
                        side_effect=ValueError("Testing query split error")):
                    split_query_fn = ReadFromDatastore.SplitQueryFn(
                        self._PROJECT, self._query, None, num_splits)
                    split_query_fn.start_bundle()
                    returned_split_queries = []
                    for split_query in split_query_fn.process(self._query):
                        returned_split_queries.append(split_query)

                    self.assertEqual(len(returned_split_queries),
                                     expected_num_splits)
                    self.assertEqual(returned_split_queries[0][1], self._query)
                    self.assertEqual(
                        0, len(self._mock_datastore.run_query.call_args_list))
                    self.verify_unique_keys(returned_split_queries)
Example #5
def read_from_datastore(project, user_options, pipeline_options):
    """Creates a pipeline that reads entities from Cloud Datastore."""
    p = beam.Pipeline(options=pipeline_options)
    # Create a query to read entities from datastore.
    query = make_ancestor_query(user_options.kind, user_options.namespace,
                                user_options.ancestor)

    # Read entities from Cloud Datastore into a PCollection.
    lines = p | 'read from datastore' >> ReadFromDatastore(
        project, query, user_options.namespace)

    # Count the occurrences of each word.
    counts = (lines
              | 'split' >>
              (beam.ParDo(WordExtractingDoFn()).with_output_types(unicode))
              | 'pair_with_one' >> beam.Map(lambda x: (x, 1))
              | 'group' >> beam.GroupByKey()
              | 'count' >> beam.Map(lambda (word, ones): (word, sum(ones))))

    # Format the counts into a PCollection of strings.
    output = counts | 'format' >> beam.Map(lambda (word, c): '%s: %s' %
                                           (word, c))

    # Write the output using a "Write" transform that has side effects.
    # pylint: disable=expression-not-assigned
    output | 'write' >> beam.io.WriteToText(
        file_path_prefix=user_options.output,
        num_shards=user_options.num_shards)

    result = p.run()
    # Wait until completion; the main thread can then access post-completion job results.
    result.wait_until_finish()
    return result
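The make_ancestor_query helper is not shown. A sketch of how such an ancestor query can be built with the Datastore v1 protos this connector consumes; names and details are illustrative and may not match the original helper exactly:

from google.cloud.proto.datastore.v1 import entity_pb2
from google.cloud.proto.datastore.v1 import query_pb2
from google.cloud.proto.datastore.v1.query_pb2 import PropertyFilter
from googledatastore import helper as datastore_helper

def make_ancestor_query(kind, namespace, ancestor):
    # Build a Query proto restricted to descendants of the given ancestor key.
    ancestor_key = entity_pb2.Key()
    datastore_helper.add_key_path(ancestor_key, kind, ancestor)
    if namespace is not None:
        ancestor_key.partition_id.namespace_id = namespace

    query = query_pb2.Query()
    query.kind.add().name = kind
    datastore_helper.set_property_filter(
        query.filter, '__key__', PropertyFilter.HAS_ANCESTOR, ancestor_key)
    return query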
Example #6
def model_datastoreio():
  """Using a Read and Write transform to read/write to Cloud Datastore."""

  import uuid
  from google.cloud.proto.datastore.v1 import entity_pb2
  from google.cloud.proto.datastore.v1 import query_pb2
  import googledatastore
  import apache_beam as beam
  from apache_beam.options.pipeline_options import PipelineOptions
  from apache_beam.io.gcp.datastore.v1.datastoreio import ReadFromDatastore
  from apache_beam.io.gcp.datastore.v1.datastoreio import WriteToDatastore

  project = 'my_project'
  kind = 'my_kind'
  query = query_pb2.Query()
  query.kind.add().name = kind

  # [START model_datastoreio_read]
  p = beam.Pipeline(options=PipelineOptions())
  entities = p | 'Read From Datastore' >> ReadFromDatastore(project, query)
  # [END model_datastoreio_read]

  # [START model_datastoreio_write]
  p = beam.Pipeline(options=PipelineOptions())
  musicians = p | 'Musicians' >> beam.Create(
      ['Mozart', 'Chopin', 'Beethoven', 'Vivaldi'])

  def to_entity(content):
    entity = entity_pb2.Entity()
    googledatastore.helper.add_key_path(entity.key, kind, str(uuid.uuid4()))
    googledatastore.helper.add_properties(entity, {'content': unicode(content)})
    return entity

  entities = musicians | 'To Entity' >> beam.Map(to_entity)
  entities | 'Write To Datastore' >> WriteToDatastore(project)
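Note that this snippet only builds the read and write graphs between the [START]/[END] markers; nothing executes until each pipeline is run explicitly, for example:

result = p.run()
result.wait_until_finish()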
Example #7
  def check_estimated_size_bytes(self, entity_bytes, timestamp, namespace=None):
    """A helper method to test get_estimated_size_bytes"""

    timestamp_req = helper.make_request(
        self._PROJECT, namespace, helper.make_latest_timestamp_query(namespace))
    timestamp_resp = self.make_stats_response(
        {'timestamp': datastore_helper.from_timestamp(timestamp)})
    kind_stat_req = helper.make_request(
        self._PROJECT, namespace, helper.make_kind_stats_query(
            namespace, self._query.kind[0].name,
            datastore_helper.micros_from_timestamp(timestamp)))
    kind_stat_resp = self.make_stats_response(
        {'entity_bytes': entity_bytes})

    def fake_run_query(req):
      if req == timestamp_req:
        return timestamp_resp
      elif req == kind_stat_req:
        return kind_stat_resp
      else:
        print(kind_stat_req)
        raise ValueError("Unknown req: %s" % req)

    self._mock_datastore.run_query.side_effect = fake_run_query
    self.assertEqual(entity_bytes, ReadFromDatastore.get_estimated_size_bytes(
        self._PROJECT, namespace, self._query, self._mock_datastore))
    self.assertEqual(self._mock_datastore.run_query.call_args_list,
                     [call(timestamp_req), call(kind_stat_req)])
Example #8
def read_from_datastore(user_options, pipeline_options):
    """Creates a pipeline that reads entities from Cloud Datastore."""
    p = beam.Pipeline(options=pipeline_options)
    # Create a query to read entities from datastore.
    query = make_ancestor_query(user_options.inputKind, user_options.namespace,
                                user_options.ancestor)

    # Read entities from Cloud Datastore into a PCollection.
    lines = p | 'read from datastore' >> ReadFromDatastore(
        user_options.project, query, user_options.namespace)

    # Count the occurrences of each word.
    def count_ones(word_ones):
        (word, ones) = word_ones
        return (word, sum(ones))

    processedTweets = (
        lines
        | 'split' >> (beam.ParDo(processTweet()))
        | 'create entity' >> beam.Map(
            EntityWrapper(user_options.namespace, user_options.outputKind,
                          user_options.ancestor).make_entity)
        | 'write to datastore' >> WriteToDatastore(user_options.project))
    result = p.run()
    # Wait until completion; the main thread can then access post-completion job results.
    result.wait_until_finish()
    return result
Example #9
  def test_SplitQueryFn_with_query_limit(self):
    """A test that verifies no split is performed when the query has a limit."""
    with patch.object(helper, 'get_datastore',
                      return_value=self._mock_datastore):
      self._query.limit.value = 3
      split_query_fn = ReadFromDatastore.SplitQueryFn(
          self._PROJECT, self._query, None, 4)
      split_query_fn.start_bundle()
      returned_split_queries = []
      for split_query in split_query_fn.process(self._query):
        returned_split_queries.append(split_query)

      self.assertEqual(1, len(returned_split_queries))
      self.assertEqual(0, len(self._mock_datastore.method_calls))
Example #10
    def expand(self, pcoll):
        query = query_pb2.Query()
        query.kind.add().name = 'Tweet'
        now = datetime.datetime.now()
        # The 'earlier' var will be set to a static value on template creation.
        # That is, because of the way that templates work, the value is defined
        # at template compile time, not runtime.
        # But defining a filter based on this value will still serve to make the
        # query more efficient than if we didn't filter at all.
        earlier = now - datetime.timedelta(days=self.days)
        datastore_helper.set_property_filter(query.filter, 'created_at',
                                             PropertyFilter.GREATER_THAN,
                                             earlier)

        return (pcoll
                | 'read from datastore' >> ReadFromDatastore(
                    self.project, query, None))
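This expand method belongs to a composite PTransform whose class definition is not shown. A minimal sketch of what the enclosing wrapper might look like; the class name and constructor are assumptions:

class ReadRecentTweets(beam.PTransform):
    # Hypothetical wrapper: reads 'Tweet' entities created in the last `days` days.
    def __init__(self, project, days):
        super(ReadRecentTweets, self).__init__()
        self.project = project
        self.days = days

    # The expand(self, pcoll) method shown above completes the transform.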
Example #11
  def test_SplitQueryFn_with_num_splits(self):
    with patch.object(helper, 'get_datastore',
                      return_value=self._mock_datastore):
      num_splits = 23

      def fake_get_splits(datastore, query, num_splits, partition=None):
        return self.split_query(query, num_splits)

      with patch.object(query_splitter, 'get_splits',
                        side_effect=fake_get_splits):

        split_query_fn = ReadFromDatastore.SplitQueryFn(
            self._PROJECT, self._query, None, num_splits)
        split_query_fn.start_bundle()
        returned_split_queries = []
        for split_query in split_query_fn.process(self._query):
          returned_split_queries.append(split_query)

        self.assertEqual(len(returned_split_queries), num_splits)
        self.assertEqual(0, len(self._mock_datastore.run_query.call_args_list))
        self.verify_unique_keys(returned_split_queries)
Example #12
def run(argv=None):
    """Main entry point."""

    parser = argparse.ArgumentParser()

    parser.add_argument('--kind',
                        dest='kind',
                        default='writereadtest',
                        help='Datastore Kind')
    parser.add_argument('--num_entities',
                        dest='num_entities',
                        type=int,
                        required=True,
                        help='Number of entities to write')
    parser.add_argument('--limit',
                        dest='limit',
                        type=int,
                        help='Limit on the number of entities to read back for verification')

    known_args, pipeline_args = parser.parse_known_args(argv)
    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(SetupOptions).save_main_session = True
    gcloud_options = pipeline_options.view_as(GoogleCloudOptions)
    job_name = gcloud_options.job_name
    kind = known_args.kind
    num_entities = known_args.num_entities
    project = gcloud_options.project
    # A random ancestor key.
    ancestor = str(uuid.uuid4())
    query = make_ancestor_query(kind, None, ancestor)

    # Pipeline 1: Create and write the specified number of Entities to the
    # Cloud Datastore.
    logging.info('Writing %s entities to %s', num_entities, project)
    p = new_pipeline_with_job_name(pipeline_options, job_name, '-write')

    # pylint: disable=expression-not-assigned
    (p
     | 'Input' >> beam.Create(list(range(known_args.num_entities)))
     | 'To String' >> beam.Map(str)
     |
     'To Entity' >> beam.Map(EntityWrapper(kind, None, ancestor).make_entity)
     | 'Write to Datastore' >> WriteToDatastore(project))

    p.run()

    # Optional Pipeline 2: If a read limit was provided, read with that limit
    # and confirm that the expected number of entities was read.
    if known_args.limit is not None:
        logging.info(
            'Querying a limited set of %s entities and verifying count.',
            known_args.limit)
        p = new_pipeline_with_job_name(pipeline_options, job_name,
                                       '-verify-limit')
        query_with_limit = query_pb2.Query()
        query_with_limit.CopyFrom(query)
        query_with_limit.limit.value = known_args.limit
        entities = p | 'read from datastore' >> ReadFromDatastore(
            project, query_with_limit)
        assert_that(entities | beam.combiners.Count.Globally(),
                    equal_to([known_args.limit]))

        p.run()

    # Pipeline 3: Query the written Entities and verify result.
    logging.info('Querying entities, asserting they match.')
    p = new_pipeline_with_job_name(pipeline_options, job_name, '-verify')
    entities = p | 'read from datastore' >> ReadFromDatastore(project, query)

    assert_that(entities | beam.combiners.Count.Globally(),
                equal_to([num_entities]))

    p.run()

    # Pipeline 4: Delete Entities.
    logging.info('Deleting entities.')
    p = new_pipeline_with_job_name(pipeline_options, job_name, '-delete')
    entities = p | 'read from datastore' >> ReadFromDatastore(project, query)
    # pylint: disable=expression-not-assigned
    (entities
     | 'To Keys' >> beam.Map(lambda entity: entity.key)
     | 'Delete keys' >> DeleteFromDatastore(project))

    p.run()

    # Pipeline 5: Query the written Entities, verify no results.
    logging.info(
        'Querying for the entities to make sure there are none present.')
    p = new_pipeline_with_job_name(pipeline_options, job_name,
                                   '-verify-deleted')
    entities = p | 'read from datastore' >> ReadFromDatastore(project, query)

    assert_that(entities | beam.combiners.Count.Globally(), equal_to([0]))

    p.run()
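The new_pipeline_with_job_name helper is not shown; it gives each of the five pipelines a distinct job name. A sketch, assuming it only rewrites job_name on the shared options before constructing a fresh pipeline:

def new_pipeline_with_job_name(pipeline_options, job_name, suffix):
    # Give each sub-pipeline its own job name, e.g. 'myjob-write', 'myjob-verify'.
    gcloud_options = pipeline_options.view_as(GoogleCloudOptions)
    gcloud_options.job_name = job_name + suffix
    return beam.Pipeline(options=pipeline_options)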
Example #13
def run():
    import pickle
    import sys

    import math

    import numpy as np

    reload(sys)
    sys.setdefaultencoding('utf8')

    from gensim.models import KeyedVectors
    import apache_beam as beam
    from apache_beam.options.pipeline_options import PipelineOptions, GoogleCloudOptions, StandardOptions, SetupOptions
    from apache_beam.io.gcp.datastore.v1.datastoreio import ReadFromDatastore
    from google.cloud.proto.datastore.v1 import query_pb2
    from apache_beam.io.textio import WriteToText
    import nltk.data
    import re
    import uuid
    import perceptron

    _sentence_tokenizer = nltk.data.load("./tokenizer/punkt_turkish.pickle")
    abbreviations = set()
    with open("./tokenizer/abbreviations-long.txt") as f:
        for l in f:
            abbreviations.add(l.split(':')[0])

    _sentence_tokenizer._params.abbrev_types = abbreviations

    model_file = "perceptron_word2vec_stemmed_normalized.pickle"
    with open(model_file, 'rb') as model:
        w, b = pickle.load(model)

    def sentences_from_text(text):
        return _sentence_tokenizer.tokenize(text.strip())

    def tokens_from_sentence(sentence):
        return sentence.split(" ") # nltk.word_tokenize(sentence)

    def ngrams(obj, n):
        tokens = []
        sentences = (
            sentences_from_text(obj["title"]) +
            sentences_from_text(obj["description"]) +
            sentences_from_text(obj["content"])
        )

        for sentence in sentences:
            tokens += tokens_from_sentence(sentence)

        pairs = nltk.ngrams(tokens, n)
        return [" ".join(pair) for pair in pairs]


    def convertToObject(jsonObj):
        x = jsonObj

        link = x.properties.get('link', None)
        link = link.string_value if link else ""

        title = x.properties.get('title', None)
        title = title.string_value if title else ""

        description = x.properties.get("description", None)
        description = description.string_value if description else ""

        content = x.properties.get("text", "")
        content = content.string_value if content else ""

        published = x.properties.get("published")
        published = published.string_value if published else ""

        obj = {
            "link": link,
            "title": title,
            "description": description,
            "content": content,
            "published": published
        }

        obj["key"] = obj["link"] if obj["link"] else str(uuid.uuid4())

        return obj

    # https://stackoverflow.com/questions/9662346/python-code-to-remove-html-tags-from-a-string
    def cleanhtml(raw_html):
        cleanr = re.compile('<.*?>')
        cleantext = re.sub(cleanr, '', raw_html)
        return cleantext

    def removeHTMLFromStrings(obj):
        for key in obj.keys():
            obj[key] = cleanhtml(obj[key])

        return obj

    def tokenize_to_sentences(obj):

        obj["sentences"] = (
            sentences_from_text(obj["title"]) +
            sentences_from_text(obj["description"]) +
            sentences_from_text(obj["content"])
        )

        return obj

    def tokenize_to_words(obj):

        obj["tokens"] = []

        for sentence in obj["sentences"]:
            obj["tokens"] += tokens_from_sentence(sentence)

        for token in obj["tokens"]:
            yield (obj["key"], token)

    options = PipelineOptions()
    google_cloud_options = options.view_as(GoogleCloudOptions)
    google_cloud_options.project = 'news-197916'
    google_cloud_options.job_name = 'sentiment-analysis'
    google_cloud_options.staging_location = 'gs://news-197916.appspot.com/word_count/'
    google_cloud_options.temp_location = 'gs://news-197916.appspot.com/df_tmp'
    options.view_as(StandardOptions).runner = 'DataflowRunner'

    setup_options = options.view_as(SetupOptions)
    setup_options.requirements_file = "requirements.txt"
    setup_options.save_main_session = True

    p = beam.Pipeline(options=options)
    query = query_pb2.Query()
    query.kind.add().name = "News_Entry"

    pairs = (p
            | 'Read From Datastore' >> ReadFromDatastore(project = google_cloud_options.project, query=query)
        #     | "Read From Text" >> ReadFromText("news.json", coder=beam.coders.coders.StrUtf8Coder()) # line by line
        #     | "Convert to Json Object" >> beam.Map(convertToJsonObj)
             | "Convert to Python Object" >> beam.Map(convertToObject)
             | "Remove HTML Tags From Strings (Normalization 1)" >> beam.Map(removeHTMLFromStrings)
    )

    tokens_1gram = (pairs
                    | 'Sentence Tokenization' >> beam.Map(tokenize_to_sentences)
                    | 'Word Tokenization' >> beam.FlatMap(tokenize_to_words)  # also convert to key value pairs
                    )
    """
    tokens_2gram = (pairs
            | "Create 2-grams" >> beam.FlatMap(lambda obj: [(obj["key"], token) for token in ngrams(obj, 2)])
        )
    """

    tokens = tokens_1gram

    """
    vocabulary = (tokens
                  | "Get words only" >> beam.Values()
                  | "Remove duplicate words" >> beam.RemoveDuplicates()
                  )
    vocabulary_size = (vocabulary
            | "Count Vocabulary elements" >> beam.combiners.Count.Globally()
        )

    doc_total_words = (tokens
            | "Count Words of Doc" >> beam.combiners.Count.PerKey()
    )
    """

    tokens_paired_with_1 = (tokens
                            | "Pair with 1" >> beam.Map(lambda (doc, token): ((doc, token), 1))
                            )
    """
    token_counts_per_doc = (tokens_paired_with_1
            | "Group by Doc,Word" >> beam.GroupByKey()
            | "Count ones" >> beam.Map(lambda ((doc, token), counts): (doc, (token, sum(counts))))
            | "Group by Doc" >> beam.GroupByKey()
        )



    num_docs = (token_counts_per_doc
            | "Get Docs" >> beam.Keys()
            | "Count Docs" >> beam.combiners.Count.Globally()
    )


    word_tf_pre = (
        { 'total_tokens': doc_total_words, 'token_counts_per_doc': token_counts_per_doc }
        | "CoGroup By Document" >> beam.CoGroupByKey()
    )

    def calc_tf((doc, count)):
        [token_count] = count['token_counts_per_doc']

        [tokens_total] = count['total_tokens']

        for token, cnt in token_count:
            yield token, (doc, float(cnt) / tokens_total)


    doc_word_tf = (word_tf_pre
        | "Compute Term Frequencies" >> beam.FlatMap(calc_tf)
        )

    word_occurrences = (tokens
        | "Remove Multiple occurrences per doc" >> beam.RemoveDuplicates()
        | "Pair with 1s" >> beam.Map(lambda (doc, word): (word, 1))
        | "Group by Word" >> beam.GroupByKey()
        | "Sum 1s" >> beam.Map(lambda (word, counts): (word, sum(counts)))
    )

    token_df = (
        word_occurrences
        | "Compute Document Frequency">> beam.Map(lambda (token, count), total: (token, float(count) / total), AsSingleton(num_docs)))

    token_tf_df = (
        { 'term_frequency': doc_word_tf, 'document_frequency': token_df}
        | "CoGroup By Token" >> beam.CoGroupByKey())

    def calc_tfidf((token, tfdf)):
      [df] = tfdf['document_frequency']
      for doc, tf in tfdf['term_frequency']:
        yield (doc, token), tf * math.log(1.0 / df)

    token_tf_idf = (token_tf_df
        | "Calculate TF-IDF Scores" >> beam.FlatMap(calc_tfidf)
    )
    """

    word2vec = KeyedVectors.load_word2vec_format('tr_word2vec', binary=True)

    def get_vec(word2vec, token):
        if word2vec is None:
            word2vec = KeyedVectors.load_word2vec_format('tr_word2vec', binary=True)

        try:
            x = word2vec.get_vector(token)
            x = x.reshape(400)
        except:
            x = np.zeros(400)

        return x

    def analyze_sentiment(x):

        res = perceptron.f(x, w, b)

        return res

    doc_sentiment = (tokens_paired_with_1
                     | "Create Word2Vec Vector" >> beam.Map(lambda ((doc, token), cnt): (doc, get_vec(word2vec, token)))
                     | "Group Word2Vec Vectors By Document" >> beam.GroupByKey()
                     | "Sum Word2Vec Vectors" >> beam.Map(
        lambda (doc, vecs): (doc, analyze_sentiment(np.sum(vecs, axis=0))[0]))
                     )

    result = (doc_sentiment |
              "Format  Results" >> beam.Map(lambda (doc, tokens): '%s %s' % (doc, tokens))
              )

    (result
     | "Write Results" >> WriteToText("sentiments")
     )

    p.run()
Example #14
def process_datastore_tweets(project, dataset, pipeline_options):
    """Creates a pipeline that reads tweets from Cloud Datastore from the last
  N days. The pipeline finds the top most-used words, the top most-tweeted
  URLs, ranks word co-occurrences by an 'interestingness' metric (similar to
  on tf* idf).
  """
    ts = str(datetime.datetime.utcnow())
    p = beam.Pipeline(options=pipeline_options)
    # Create a query to read entities from datastore.
    query = make_query('Tweet')

    # Read entities from Cloud Datastore into a PCollection.
    lines = (p
             |
             'read from datastore' >> ReadFromDatastore(project, query, None))

    global_count = AsSingleton(
        lines
        | 'global count' >> beam.combiners.Count.Globally())

    # Count the occurrences of each word.
    percents = (lines
                | 'split' >>
                (beam.ParDo(WordExtractingDoFn()).with_output_types(unicode))
                | 'pair_with_one' >> beam.Map(lambda x: (x, 1))
                | 'group' >> beam.GroupByKey()
                | 'count' >> beam.Map(lambda (word, ones): (word, sum(ones)))
                | 'in tweets percent' >> beam.Map(
                    lambda (word, wsum), gc:
                    (word, float(wsum) / gc), global_count))
    top_percents = (
        percents
        | 'top 500' >> combiners.Top.Of(500, lambda x, y: x[1] < y[1]))
    # Count the occurrences of each expanded url in the tweets
    url_counts = (
        lines
        | 'geturls' >>
        (beam.ParDo(URLExtractingDoFn()).with_output_types(unicode))
        | 'urls_pair_with_one' >> beam.Map(lambda x: (x, 1))
        | 'urls_group' >> beam.GroupByKey()
        | 'urls_count' >> beam.Map(lambda (word, ones): (word, sum(ones)))
        | 'urls top 300' >> combiners.Top.Of(300, lambda x, y: x[1] < y[1]))

    # Define some inline helper functions.

    def join_cinfo(cooccur, percents):
        """Calculate a co-occurence ranking."""
        import math

        word1 = cooccur[0][0]
        word2 = cooccur[0][1]
        try:
            word1_percent = percents[word1]
            weight1 = 1 / word1_percent
            word2_percent = percents[word2]
            weight2 = 1 / word2_percent
            return (cooccur[0], cooccur[1],
                    cooccur[1] * math.log(min(weight1, weight2)))
        except:
            return 0

    def generate_cooccur_schema():
        """BigQuery schema for the word co-occurrence table."""
        json_str = json.dumps({
            'fields': [{
                'name': 'w1',
                'type': 'STRING',
                'mode': 'NULLABLE'
            }, {
                'name': 'w2',
                'type': 'STRING',
                'mode': 'NULLABLE'
            }, {
                'name': 'count',
                'type': 'INTEGER',
                'mode': 'NULLABLE'
            }, {
                'name': 'log_weight',
                'type': 'FLOAT',
                'mode': 'NULLABLE'
            }, {
                'name': 'ts',
                'type': 'TIMESTAMP',
                'mode': 'NULLABLE'
            }]
        })
        return parse_table_schema_from_json(json_str)

    def generate_url_schema():
        """BigQuery schema for the urls count table."""
        json_str = json.dumps({
            'fields': [{
                'name': 'url',
                'type': 'STRING',
                'mode': 'NULLABLE'
            }, {
                'name': 'count',
                'type': 'INTEGER',
                'mode': 'NULLABLE'
            }, {
                'name': 'ts',
                'type': 'TIMESTAMP',
                'mode': 'NULLABLE'
            }]
        })
        return parse_table_schema_from_json(json_str)

    def generate_wc_schema():
        """BigQuery schema for the word count table."""
        json_str = json.dumps({
            'fields': [{
                'name': 'word',
                'type': 'STRING',
                'mode': 'NULLABLE'
            }, {
                'name': 'percent',
                'type': 'FLOAT',
                'mode': 'NULLABLE'
            }, {
                'name': 'ts',
                'type': 'TIMESTAMP',
                'mode': 'NULLABLE'
            }]
        })
        return parse_table_schema_from_json(json_str)

    # Now build the rest of the pipeline.
    # Calculate the word co-occurrence scores.
    cooccur_rankings = (
        lines
        | 'getcooccur' >> (beam.ParDo(CoOccurExtractingDoFn()))
        | 'co_pair_with_one' >> beam.Map(lambda x: (x, 1))
        | 'co_group' >> beam.GroupByKey()
        | 'co_count' >> beam.Map(lambda (wordts, ones): (wordts, sum(ones)))
        | 'weights' >> beam.Map(join_cinfo, AsDict(percents))
        | 'co top 300' >> combiners.Top.Of(300, lambda x, y: x[2] < y[2]))

    # Format the counts into a PCollection of strings.
    wc_records = top_percents | 'format' >> beam.FlatMap(
        lambda x: [{
            'word': xx[0],
            'percent': xx[1],
            'ts': ts
        } for xx in x])

    url_records = url_counts | 'urls_format' >> beam.FlatMap(
        lambda x: [{
            'url': xx[0],
            'count': xx[1],
            'ts': ts
        } for xx in x])

    co_records = cooccur_rankings | 'co_format' >> beam.FlatMap(
        lambda x: [{
            'w1': xx[0][0],
            'w2': xx[0][1],
            'count': xx[1],
            'log_weight': xx[2],
            'ts': ts
        } for xx in x])

    # Write the results to three BigQuery tables.
    wc_records | 'wc_write_bq' >> beam.io.Write(
        beam.io.BigQuerySink(
            '%s:%s.word_counts' % (project, dataset),
            schema=generate_wc_schema(),
            create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
            write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND))

    url_records | 'urls_write_bq' >> beam.io.Write(
        beam.io.BigQuerySink(
            '%s:%s.urls' % (project, dataset),
            schema=generate_url_schema(),
            create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
            write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND))

    co_records | 'co_write_bq' >> beam.io.Write(
        beam.io.BigQuerySink(
            '%s:%s.word_cooccur' % (project, dataset),
            schema=generate_cooccur_schema(),
            create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
            write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND))

    # Actually run the pipeline.
    return p.run()
Example #15
def run_pipeline(project, pipeline_options, run_locally, debug_attendees):
    """Creates a pipeline that reads entities from Cloud Datastore."""
    p = beam.Pipeline(options=pipeline_options)
    # Create a query to read entities from datastore.
    client = datastore.Client()
    q = client.query(kind='DBEvent')

    if run_locally:
        q.key_filter(client.key('DBEvent', '999'), '>')
        q.key_filter(client.key('DBEvent', 'A'), '<')

    # Let's build a timestamp to save all our objects with
    timestamp = datetime.datetime.now()

    # Set up our map/reduce pipeline
    produce_attendees = (
        p | 'read from datastore' >> ReadFromDatastore(project, query._pb_from_query(q), num_splits=400) |
        'convert to entity' >> beam.Map(ConvertToEntity)
        # Find the events we want to count, and expand all the admins/attendees
        | 'filter events' >> beam.FlatMap(CountableEvent) | 'load fb attending' >> beam.ParDo(GetEventAndAttending()) |
        'export attendees' >> beam.FlatMap(ExportPeople)
    )

    top_attendee_lists = (
        produce_attendees | 'map category -> person' >> beam.FlatMap(GroupPeopleByCategory) | 'group by category' >> beam.GroupByKey() |
        'build top-people lists' >> beam.FlatMap(CountPeopleInfos)
    )

    if debug_attendees:
        attendee_event_debugging = (
            produce_attendees | 'map city-attendee -> event' >> beam.FlatMap(DebugExportEventPeopleForGrouping) |
            'group by city-attendee' >> beam.GroupByKey() |
            'within city-attendee, group event_ids by admin_hash' >> beam.FlatMap(DebugGroupEventIds)
        )

        exploded_top_attendees = (
            top_attendee_lists |
            'explode the top attendees into a mapping: category-attendee -> YES' >> beam.FlatMap(DebugExplodeAttendeeList)
            # We don't deal with duplicates, since it requires the objects (ie our dicts) to be hashable
            # Instead, we rely on DebugFilterForTopAttendee to filter out duplicates created by the above
            # | 'remove duplicates from multiple overlapping attendee-lists' >> beam.RemoveDuplicates()
        )

        (
            # These both have the same keys:
            # key contains {person_type, city, category, person_id}
            (attendee_event_debugging, exploded_top_attendees) | beam.Flatten()
            # keys are {city, person_id}
            | 'group the attendee-debug info with the is-it-a-top-attendee info' >> beam.GroupByKey() |
            'filter for TOP_ATTENDEE' >> beam.FlatMap(DebugFilterForTopAttendee) |
            'build PRDebugAttendee' >> beam.ParDo(DebugBuildPRDebugAttendee(), timestamp) |
            'write PRDebugAttendee to datastore (unbatched)' >> beam.ParDo(WriteToDatastoreSingle(), actually_save=not run_locally)
        )

    (
        top_attendee_lists |
        'generate PRCityCategory database record' >> beam.ParDo(BuildPRCityCategory(), timestamp, 'PRCityCategory', TOP_ALL_N) |
        'write PRCityCategory to datastore (unbatched)' >> beam.ParDo(WriteToDatastoreSingle(), actually_save=not run_locally)
    )
    """
    (output
        | 'convert from entity' >> beam.Map(ConvertFromEntity)
        | 'write to datastore' >> WriteToDatastore(client.project)
    )
    """

    # Actually run the pipeline (all operations above are deferred).
    result = p.run()
    # Wait until completion; the main thread can then access post-completion job results.
    result.wait_until_finish()
    return result