Example #1
 def expand(self, pcoll):
     do_once = pcoll.pipeline | 'DoOnce' >> core.Create([None])
     init_result_coll = do_once | 'InitializeWrite' >> core.Map(
         lambda _, sink: sink.initialize_write(), self.sink)
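     # With a configured shard count, elements are keyed (round-robin when
     # more than one shard) and grouped so that each key becomes one written
     # bundle; otherwise bundles are written directly and regrouped afterwards.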
     if getattr(self.sink, 'num_shards', 0):
         min_shards = self.sink.num_shards
         if min_shards == 1:
             keyed_pcoll = pcoll | core.Map(lambda x: (None, x))
         else:
             keyed_pcoll = pcoll | core.ParDo(_RoundRobinKeyFn(min_shards))
         write_result_coll = (
             keyed_pcoll
             | core.WindowInto(window.GlobalWindows())
             | core.GroupByKey()
             |
             'WriteBundles' >> core.ParDo(_WriteKeyedBundleDoFn(self.sink),
                                          AsSingleton(init_result_coll)))
     else:
         min_shards = 1
         write_result_coll = (
             pcoll
             | 'WriteBundles' >> core.ParDo(_WriteBundleDoFn(self.sink),
                                            AsSingleton(init_result_coll))
             | 'Pair' >> core.Map(lambda x: (None, x))
             | core.WindowInto(window.GlobalWindows())
             | core.GroupByKey()
             | 'Extract' >> core.FlatMap(lambda x: x[1]))
     return do_once | 'FinalizeWrite' >> core.FlatMap(
         _finalize_write, self.sink, AsSingleton(init_result_coll),
         AsIter(write_result_coll), min_shards)
Example #2
def create_groups(group_ids, corpus, word, ignore_corpus, ignore_word):
    """Generate groups given the input PCollections."""
    def attach_corpus_fn(group, corpus, ignore):
        selected = None
        len_corpus = len(corpus)
        while not selected:
            c = list(corpus[randrange(0, len_corpus)].values())[0]
            if c != ignore:
                selected = c

        yield (group, selected)

    def attach_word_fn(group, words, ignore):
        selected = None
        len_words = len(words)
        while not selected:
            c = list(words[randrange(0, len_words)].values())[0]
            if c != ignore:
                selected = c

        yield group + (selected, )

    return (group_ids
            | 'attach corpus' >> beam.FlatMap(attach_corpus_fn, AsList(corpus),
                                              AsSingleton(ignore_corpus))
            | 'attach word' >> beam.FlatMap(attach_word_fn, AsList(word),
                                            AsSingleton(ignore_word)))
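The FlatMaps above receive AsList(corpus) as a plain Python list and AsSingleton(ignore_corpus) as a single value inside the callable. A minimal, self-contained sketch of that calling convention (the in-memory data and labels are invented for illustration and are not part of the original example):

import apache_beam as beam
from apache_beam.pvalue import AsList, AsSingleton
from apache_beam.testing.test_pipeline import TestPipeline
from apache_beam.testing.util import assert_that, equal_to


def attach_first_allowed(group, corpus, ignore):
    # 'corpus' arrives as a Python list, 'ignore' as a single string.
    yield (group, [c for c in corpus if c != ignore][0])


with TestPipeline() as p:
    groups = p | 'Groups' >> beam.Create(['g1', 'g2'])
    corpus = p | 'Corpus' >> beam.Create(['kinglear', 'hamlet'])
    ignore = p | 'Ignore' >> beam.Create(['hamlet'])
    result = groups | 'AttachCorpus' >> beam.FlatMap(
        attach_first_allowed, AsList(corpus), AsSingleton(ignore))
    assert_that(result, equal_to([('g1', 'kinglear'), ('g2', 'kinglear')]))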
Example #3
    def test_ptransform_override_replacement_inputs(self):
        class MyParDoOverride(PTransformOverride):
            def matches(self, applied_ptransform):
                return (isinstance(applied_ptransform.transform, ParDo)
                        and isinstance(applied_ptransform.transform.fn,
                                       AddWithProductDoFn))

            def get_replacement_transform(self, transform):
                return AddThenMultiply()

            def get_replacement_inputs(self, applied_ptransform):
                assert len(applied_ptransform.inputs) == 1
                assert len(applied_ptransform.side_inputs) == 2
                # Swap the order of the two side inputs
                return (applied_ptransform.inputs[0],
                        applied_ptransform.side_inputs[1].pvalue,
                        applied_ptransform.side_inputs[0].pvalue)

        p = Pipeline()
        pcoll1 = p | 'pc1' >> beam.Create([2])
        pcoll2 = p | 'pc2' >> beam.Create([3])
        pcoll3 = p | 'pc3' >> beam.Create([4, 5, 6])
        result = pcoll3 | 'Operate' >> beam.ParDo(
            AddWithProductDoFn(), AsSingleton(pcoll1), AsSingleton(pcoll2))
        assert_that(result, equal_to([14, 16, 18]))

        p.replace_all([MyParDoOverride()])
        p.run()
Example #4
 def test_assingleton_multi_element(self):
     with self.assertRaisesRegex(
             ValueError,
             'PCollection of size 2 with more than one element accessed as a '
             'singleton view. First two elements encountered are \"1\", \"2\".'
     ):
         AsSingleton._from_runtime_iterable([1, 2], {})
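The error above fires when more than one element reaches a singleton view. For the opposite edge case, an empty PCollection, AsSingleton accepts a default_value; a minimal sketch (hypothetical data, not taken from the original test suite):

import apache_beam as beam
from apache_beam.pvalue import AsSingleton
from apache_beam.testing.test_pipeline import TestPipeline
from apache_beam.testing.util import assert_that, equal_to

with TestPipeline() as p:
    main = p | 'Main' >> beam.Create([10, 20])
    # An empty PCollection, produced by filtering everything out.
    empty = (p
             | 'Seed' >> beam.Create([0])
             | 'DropAll' >> beam.Filter(lambda _: False))
    result = main | 'AddOffset' >> beam.Map(
        lambda x, offset: x + offset, AsSingleton(empty, default_value=1))
    assert_that(result, equal_to([11, 21]))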
Example #5
  def expand(self, pbegin):
    if self._read_operations is not None and isinstance(pbegin, PBegin):
      pcoll = pbegin.pipeline | Create(self._read_operations)
    elif not isinstance(pbegin, PBegin):
      if self._read_operations is not None:
        raise ValueError(
            "Read operation in the constructor only works with "
            "the root of the pipeline.")
      pcoll = pbegin
    else:
      raise ValueError(
          "Spanner required read operation, sql or table "
          "with columns.")

    if self._transaction is None:
      # Batch read: use the Spanner partitioning query to split the read into
      # batches.
      p = (
          pcoll
          | 'Generate Partitions' >> ParDo(
              _CreateReadPartitions(spanner_configuration=self._configuration))
          | 'Reshuffle' >> Reshuffle()
          | 'Read From Partitions' >> ParDo(
              _ReadFromPartitionFn(spanner_configuration=self._configuration)))
    else:
      # Naive read: no partitioning into batches; the query is executed as a
      # single read.
      p = (
          pcoll
          | 'Reshuffle' >> Reshuffle().with_input_types(ReadOperation)
          | 'Perform Read' >> ParDo(
              _NaiveSpannerReadDoFn(spanner_configuration=self._configuration),
              AsSingleton(self._transaction)))
    return p
Example #6
def variants_to_examples(input_data, samples_metadata, feature_encoder):
    """Converts variants to TensorFlow Example protos.

  Args:
    input_data: variant call dictionary objects with keys from
      DATA_QUERY_REPLACEMENTS
    samples_metadata: metadata dictionary objects with keys from
      METADATA_QUERY_REPLACEMENTS
    feature_encoder: the feature encoder instance to use to convert the source
      data into TensorFlow Example protos.

  Returns:
    TensorFlow Example protos.
  """
    variant_kvs = input_data | 'bucketVariants' >> beam.Map(
        lambda row: (row[FeatureEncoder.KEY_COLUMN], row))

    sample_variant_kvs = variant_kvs | 'groupBySample' >> beam.GroupByKey()

    examples = (sample_variant_kvs
                | 'samplesToExamples' >> beam.Map(
                    lambda kv, samples_metadata: feature_encoder.
                    sample_variants_to_example(kv[0], kv[1], samples_metadata),
                    AsSingleton(samples_metadata)))

    return examples
Example #7
def filter_network_usage_by_mean(pipeline, file_path):
    with pipeline as p:
        fields = (p
                  | 'ReadInputText' >> beam.io.ReadFromText(file_path)
                  | 'ParseLogs' >> beam.ParDo(ParseNetworkLogs()))

        # AsSingleton turns the single-element PCollection below (the global
        # mean) into a side input that arrives in the Filter lambda as a
        # plain Python float.
        global_mean = AsSingleton(
            fields
            |
            'GetAllBytes' >> beam.Map(lambda elem: elem['http.response.bytes'])
            | 'CalculateGlobalMean' >> beam.combiners.Mean.Globally())

        return (
            fields
            | 'MapIPBytePairs' >> beam.Map(
                lambda elem: (elem['source.ip'], elem['http.response.bytes']))
            | 'FilterBelowMean' >> beam.Filter(
                lambda entry, mean: entry[1] > mean, global_mean)
            | 'CountPacketsOverMean' >> beam.combiners.Count.PerKey()
            # Distinct would also work here, but counting per key shows how
            # many packets per source IP were above the mean.
            | 'SortByCount' >> beam.combiners.Top.Of(
                10, key=(lambda entry: entry[::-1]))
            | 'ConvertToFlatMap' >> beam.FlatMap(lambda entry: entry))
Example #8
    def test_ptransform_override_side_inputs(self):
        class MyParDoOverride(PTransformOverride):
            def matches(self, applied_ptransform):
                return (isinstance(applied_ptransform.transform, ParDo)
                        and isinstance(applied_ptransform.transform.fn,
                                       AddWithProductDoFn))

            def get_replacement_transform(self, transform):
                return AddThenMultiply()

        p = Pipeline()
        pcoll1 = p | 'pc1' >> beam.Create([2])
        pcoll2 = p | 'pc2' >> beam.Create([3])
        pcoll3 = p | 'pc3' >> beam.Create([4, 5, 6])
        result = pcoll3 | 'Operate' >> beam.ParDo(
            AddWithProductDoFn(), AsSingleton(pcoll1), AsSingleton(pcoll2))
        assert_that(result, equal_to([18, 21, 24]))

        p.replace_all([MyParDoOverride()])
        p.run()
Example #9
  def test_side_input_tagged(self):
    class TestDoFn(DoFn):
      def process(self, element, prefix, suffix=DoFn.SideInputParam):
        return ['%s-%s-%s' % (prefix, element, suffix)]

    with TestPipeline() as pipeline:
      words_list = ['aa', 'bb', 'cc']
      words = pipeline | 'SomeWords' >> Create(words_list)
      prefix = 'zyx'
      suffix = pipeline | 'SomeString' >> Create(['xyz'])  # side input
      result = words | 'DecorateWordsDoFnNoTag' >> ParDo(
          TestDoFn(), prefix, suffix=AsSingleton(suffix))
      assert_that(result, equal_to(['zyx-%s-xyz' % x for x in words_list]))
Example #10
def filter_cold_days(p, input_data, month_filter):
    """Workflow computing rows in a specific month with low temperatures.
    Args:
      input_data: a PCollection of dictionaries representing table rows. Each
        dictionary must have the keys ['year', 'month', 'day', and 'mean_temp'].
      month_filter: an int representing the month for which colder-than-average
        days should be returned.
    Returns:
      A PCollection of dictionaries with the same keys described above. Each
        row represents a day in the specified month where temperatures were
        colder than the global mean temperature in the entire dataset.
    """

    # Project to only the desired fields from a complete input row.
    # E.g., SELECT f1, f2, f3, ... FROM InputTable.
    projection_fields = ['month', 'mean_temp']
    fields_of_interest = (
        input_data
        | 'Projected' >>
        beam.Map(lambda row: {f: row[f]
                              for f in projection_fields}))

    # Compute the global mean temperature.
    global_mean = AsSingleton(
        fields_of_interest
        | 'ExtractMean' >> beam.Map(lambda row: row['mean_temp'])
        | 'GlobalMean' >> beam.combiners.Mean.Globally())
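    # NOTE: the next line overrides the computed mean with a hard-coded
    # constant singleton (1000), apparently for testing; drop it to filter
    # against the actual global mean computed above.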
    global_mean = AsSingleton(p | beam.Create([1000]))

    # Filter to the rows representing days in the month of interest
    # in which the mean daily temperature is below the global mean.
    return (
        fields_of_interest
        |
        'DesiredMonth' >> beam.Filter(lambda row: row['month'] == month_filter)
        | 'BelowMean' >> beam.Filter(lambda row, mean: row['mean_temp'] < mean,
                                     global_mean))
Example #11
    def test_sdf_with_side_inputs(self):
        with TestPipeline() as p:
            side1 = p | 'Create1' >> Create(['1', '2'])
            side2 = p | 'Create2' >> Create(['3', '4'])
            side3 = p | 'Create3' >> Create(['5'])
            result = (p
                      | 'create_main' >> beam.Create(['a', 'b', 'c'])
                      | beam.ParDo(ExpandStrings(), AsList(side1),
                                   AsList(side2), AsSingleton(side3)))

            expected_result = []
            for c in ['a', 'b', 'c']:
                for i in range(5):
                    expected_result.append(c + ':' + str(i + 1))
            assert_that(result, equal_to(expected_result))
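The ExpandStrings DoFn used above is not shown on this page (in the original test it is a splittable DoFn); a plain, non-splittable stand-in consistent with the expected output would look roughly like this sketch:

import apache_beam as beam


class ExpandStrings(beam.DoFn):
    def process(self, element, side1, side2, side3):
        # side1 and side2 arrive as Python lists, side3 as a single string.
        for value in list(side1) + list(side2) + [side3]:
            yield element + ':' + value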
Example #12
    def test_get_sample_ids(self):
        hash_dict = {'N01': 1, 'N02': 2, 'N03': 3, 'N04': 4}
        sample_names = ['N01', 'N02', 'N03', 'N04']
        expected_sample_ids = [1, 2, 3, 4]

        pipeline = TestPipeline()
        hash_dict_pc = (pipeline
                        | 'CreateHashDict' >> Create(hash_dict)
                        | combiners.ToDict())
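        # hash_dict_pc holds a single element (the complete dict), so the
        # AsSingleton below hands the whole mapping to GetSampleIds as one
        # side-input value.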
        sample_ids = (
            pipeline
            | Create(sample_names)
            | 'GetSampleNames' >> GetSampleIds(AsSingleton(hash_dict_pc)))

        assert_that(sample_ids, asserts.items_equal(expected_sample_ids))
        pipeline.run()
Example #13
    def test_pcollectionview_not_recreated(self):
        pipeline = Pipeline('DirectRunner')
        value = pipeline | 'create1' >> Create([1, 2, 3])
        value2 = pipeline | 'create2' >> Create([(1, 1), (2, 2), (3, 3)])
        value3 = pipeline | 'create3' >> Create([(1, 1), (2, 2), (3, 3)])
        self.assertEqual(AsSingleton(value), AsSingleton(value))
        self.assertEqual(AsSingleton('new', value, default_value=1),
                         AsSingleton('new', value, default_value=1))
        self.assertNotEqual(AsSingleton(value),
                            AsSingleton('new', value, default_value=1))
        self.assertEqual(AsIter(value), AsIter(value))
        self.assertEqual(AsList(value), AsList(value))
        self.assertEqual(AsDict(value2), AsDict(value2))

        self.assertNotEqual(AsSingleton(value), AsSingleton(value2))
        self.assertNotEqual(AsIter(value), AsIter(value2))
        self.assertNotEqual(AsList(value), AsList(value2))
        self.assertNotEqual(AsDict(value2), AsDict(value3))
Example #14
def process_datastore_tweets(project, dataset, pipeline_options):
    """Creates a pipeline that reads tweets from Cloud Datastore from the last
  N days. The pipeline finds the top most-used words, the top most-tweeted
  URLs, ranks word co-occurrences by an 'interestingness' metric (similar to
  on tf* idf).
  """
    ts = str(datetime.datetime.utcnow())
    p = beam.Pipeline(options=pipeline_options)
    # Create a query to read entities from datastore.
    query = make_query('Tweet')

    # Read entities from Cloud Datastore into a PCollection.
    lines = (p
             |
             'read from datastore' >> ReadFromDatastore(project, query, None))

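    # Count.Globally() emits exactly one element (the total number of tweets),
    # so AsSingleton exposes it to later transforms as a plain integer.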
    global_count = AsSingleton(
        lines
        | 'global count' >> beam.combiners.Count.Globally())

    # Count the occurrences of each word.
    percents = (lines
                | 'split' >>
                (beam.ParDo(WordExtractingDoFn()).with_output_types(str))
                | 'pair_with_one' >> beam.Map(lambda x: (x, 1))
                | 'group' >> beam.GroupByKey()
                | 'count' >> beam.Map(lambda kv: (kv[0], sum(kv[1])))
                | 'in tweets percent' >> beam.Map(
                    lambda kv, gc: (kv[0], float(kv[1]) / gc), global_count))
    top_percents = (
        percents
        | 'top 500' >> combiners.Top.Of(500, lambda x, y: x[1] < y[1]))
    # Count the occurrences of each expanded url in the tweets
    url_counts = (
        lines
        | 'geturls' >>
        (beam.ParDo(URLExtractingDoFn()).with_output_types(str))
        | 'urls_pair_with_one' >> beam.Map(lambda x: (x, 1))
        | 'urls_group' >> beam.GroupByKey()
        | 'urls_count' >> beam.Map(lambda kv: (kv[0], sum(kv[1])))
        | 'urls top 300' >> combiners.Top.Of(300, lambda x, y: x[1] < y[1]))

    # Define some inline helper functions.

    def join_cinfo(cooccur, percents):
        """Calculate a co-occurence ranking."""
        import math

        word1 = cooccur[0][0]
        word2 = cooccur[0][1]
        try:
            word1_percent = percents[word1]
            weight1 = 1 / word1_percent
            word2_percent = percents[word2]
            weight2 = 1 / word2_percent
            return (cooccur[0], cooccur[1],
                    cooccur[1] * math.log(min(weight1, weight2)))
        except (KeyError, ZeroDivisionError):
            return 0

    def generate_cooccur_schema():
        """BigQuery schema for the word co-occurrence table."""
        json_str = json.dumps({
            'fields': [{
                'name': 'w1',
                'type': 'STRING',
                'mode': 'NULLABLE'
            }, {
                'name': 'w2',
                'type': 'STRING',
                'mode': 'NULLABLE'
            }, {
                'name': 'count',
                'type': 'INTEGER',
                'mode': 'NULLABLE'
            }, {
                'name': 'log_weight',
                'type': 'FLOAT',
                'mode': 'NULLABLE'
            }, {
                'name': 'ts',
                'type': 'TIMESTAMP',
                'mode': 'NULLABLE'
            }]
        })
        return parse_table_schema_from_json(json_str)

    def generate_url_schema():
        """BigQuery schema for the urls count table."""
        json_str = json.dumps({
            'fields': [{
                'name': 'url',
                'type': 'STRING',
                'mode': 'NULLABLE'
            }, {
                'name': 'count',
                'type': 'INTEGER',
                'mode': 'NULLABLE'
            }, {
                'name': 'ts',
                'type': 'TIMESTAMP',
                'mode': 'NULLABLE'
            }]
        })
        return parse_table_schema_from_json(json_str)

    def generate_wc_schema():
        """BigQuery schema for the word count table."""
        json_str = json.dumps({
            'fields': [{
                'name': 'word',
                'type': 'STRING',
                'mode': 'NULLABLE'
            }, {
                'name': 'percent',
                'type': 'FLOAT',
                'mode': 'NULLABLE'
            }, {
                'name': 'ts',
                'type': 'TIMESTAMP',
                'mode': 'NULLABLE'
            }]
        })
        return parse_table_schema_from_json(json_str)

    # Now build the rest of the pipeline.
    # Calculate the word co-occurrence scores.
    cooccur_rankings = (
        lines
        | 'getcooccur' >> (beam.ParDo(CoOccurExtractingDoFn()))
        | 'co_pair_with_one' >> beam.Map(lambda x: (x, 1))
        | 'co_group' >> beam.GroupByKey()
        | 'co_count' >> beam.Map(lambda kv: (kv[0], sum(kv[1])))
        | 'weights' >> beam.Map(join_cinfo, AsDict(percents))
        | 'co top 300' >> combiners.Top.Of(300, lambda x, y: x[2] < y[2]))

    # Format the counts into a PCollection of strings.
    wc_records = top_percents | 'format' >> beam.FlatMap(
        lambda x: [{
            'word': xx[0],
            'percent': xx[1],
            'ts': ts
        } for xx in x])

    url_records = url_counts | 'urls_format' >> beam.FlatMap(
        lambda x: [{
            'url': xx[0],
            'count': xx[1],
            'ts': ts
        } for xx in x])

    co_records = cooccur_rankings | 'co_format' >> beam.FlatMap(
        lambda x: [{
            'w1': xx[0][0],
            'w2': xx[0][1],
            'count': xx[1],
            'log_weight': xx[2],
            'ts': ts
        } for xx in x])

    # Write the results to three BigQuery tables.
    wc_records | 'wc_write_bq' >> beam.io.Write(
        beam.io.BigQuerySink(
            '%s:%s.word_counts' % (project, dataset),
            schema=generate_wc_schema(),
            create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
            write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND))

    url_records | 'urls_write_bq' >> beam.io.Write(
        beam.io.BigQuerySink(
            '%s:%s.urls' % (project, dataset),
            schema=generate_url_schema(),
            create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
            write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND))

    co_records | 'co_write_bq' >> beam.io.Write(
        beam.io.BigQuerySink(
            '%s:%s.word_cooccur' % (project, dataset),
            schema=generate_cooccur_schema(),
            create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
            write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND))

    # Actually run the pipeline.
    return p.run()
Example #15
 def test_assingleton_multi_element(self):
   with self.assertRaisesRegexp(
       ValueError,
       'PCollection of size 2 with more than one element accessed as a '
       'singleton view. First two elements encountered are \"1\", \"2\".'):
     AsSingleton._from_runtime_iterable([1, 2], {})
Example #16
 def expand(self, pvalues):
     return pvalues[0] | beam.ParDo(AddThenMultiplyDoFn(),
                                    AsSingleton(pvalues[1]),
                                    AsSingleton(pvalues[2]))
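Example #16 shows only the expand() of the composite. A self-contained reconstruction that can run as-is (the DoFn body is an assumption, chosen to be consistent with the expected outputs in Examples #3 and #8, i.e. (element + add) * multiply):

import apache_beam as beam
from apache_beam.pvalue import AsSingleton
from apache_beam.testing.test_pipeline import TestPipeline
from apache_beam.testing.util import assert_that, equal_to


class AddThenMultiplyDoFn(beam.DoFn):
    def process(self, element, addend, multiplier):
        yield (element + addend) * multiplier


class AddThenMultiply(beam.PTransform):
    def expand(self, pvalues):
        return pvalues[0] | beam.ParDo(AddThenMultiplyDoFn(),
                                       AsSingleton(pvalues[1]),
                                       AsSingleton(pvalues[2]))


with TestPipeline() as p:
    main = p | 'Main' >> beam.Create([4, 5, 6])
    add = p | 'Add' >> beam.Create([2])
    mul = p | 'Mul' >> beam.Create([3])
    # A tuple of PCollections can be piped into a PTransform; expand()
    # receives the tuple as 'pvalues'.
    result = (main, add, mul) | AddThenMultiply()
    assert_that(result, equal_to([18, 21, 24]))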
Example #17
        # Compute a mapping from each word to its document frequency.
        # A word's document frequency in a corpus is the number of
        # documents in which the word appears divided by the total
        # number of documents in the corpus.
        #
        # This calculation uses a side input, a Dataflow-computed auxiliary value
        # presented to each invocation of our MapFn lambda. The second argument to
        # the lambda (called total---the first argument is the (word, count)
        # pair) receives the value we listed after the lambda in Map().
        # Additional side inputs (and ordinary Python values, too) can be
        # provided to MapFns and DoFns in this way.
        word_to_df = (
            word_to_doc_count
            | 'ComputeDocFrequencies' >> beam.Map(
                lambda word_count, total:
                (word_count[0], float(word_count[1]) / total),
                AsSingleton(total_documents)))

        # Join the term frequency and document frequency collections,
        # each keyed on the word.
        word_to_uri_and_tf_and_df = (
            {
                'tf': word_to_uri_and_tf,
                'df': word_to_df
            }
            | 'CoGroupWordsByTf-df' >> beam.CoGroupByKey())

        # Compute a mapping from each word to a (URI, TF-IDF) score for each URI.
        # There are a variety of definitions of TF-IDF
        # ("term frequency - inverse document frequency") score; here we use a
        # basic version that is the term frequency divided by the log of the
        # document frequency.
Example #18
def side_input_factory(p, item, label_name):
    return AsSingleton(
        p
        | label_name + ', creating collection' >> beam.Create([item])
        | label_name + ', combining globally' >> beam.CombineGlobally(
            beam.combiners.ToListCombineFn()))
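A minimal usage sketch for the factory above (the data and labels are invented for illustration; side_input_factory is the function just shown): the resulting side input arrives inside the callable as the one-element list [item].

import apache_beam as beam
from apache_beam.testing.test_pipeline import TestPipeline
from apache_beam.testing.util import assert_that, equal_to

with TestPipeline() as p:
    side = side_input_factory(p, 'config-value', 'Config')
    result = (p
              | 'Main' >> beam.Create([1, 2])
              | 'Tag' >> beam.Map(lambda x, cfg: (x, cfg[0]), side))
    assert_that(result, equal_to([(1, 'config-value'), (2, 'config-value')]))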
Example #19
    def expand(self, uri_to_content):

        # Compute the total number of documents, and prepare a singleton
        # PCollection to use as side input.
        total_documents = (uri_to_content
                           | 'GetUris 1' >> beam.Keys()
                           | 'GetUniqueUris' >> beam.Distinct()
                           | 'CountUris' >> beam.combiners.Count.Globally())

        # Create a collection of pairs mapping a URI to each of the words
        # in the document associated with that URI.

        def split_into_words(uri_line):
            (uri, line) = uri_line
            return [(uri, w.lower()) for w in re.findall(r'[A-Za-z\']+', line)]

        uri_to_words = (uri_to_content
                        | 'SplitWords' >> beam.FlatMap(split_into_words))

        # Compute a mapping from each word to the total number of documents
        # in which it appears.
        word_to_doc_count = (
            uri_to_words
            | 'GetUniqueWordsPerDoc' >> beam.Distinct()
            | 'GetWords' >> beam.Values()
            | 'CountDocsPerWord' >> beam.combiners.Count.PerElement())

        # Compute a mapping from each URI to the total number of words in the
        # document associated with that URI.
        uri_to_word_total = (
            uri_to_words
            | 'GetUris 2' >> beam.Keys()
            | 'CountWordsInDoc' >> beam.combiners.Count.PerElement())

        # Count, for each (URI, word) pair, the number of occurrences of that word
        # in the document associated with the URI.
        uri_and_word_to_count = (
            uri_to_words
            | 'CountWord-DocPairs' >> beam.combiners.Count.PerElement())

        # Adjust the above collection to a mapping from (URI, word) pairs to counts
        # into an isomorphic mapping from URI to (word, count) pairs, to prepare
        # for a join by the URI key.
        def shift_keys(uri_word_count):
            return (uri_word_count[0][0], (uri_word_count[0][1],
                                           uri_word_count[1]))

        uri_to_word_and_count = (uri_and_word_to_count
                                 | 'ShiftKeys' >> beam.Map(shift_keys))

        # Perform a CoGroupByKey (a sort of pre-join) on the prepared
        # uri_to_word_total and uri_to_word_and_count tagged by 'word totals' and
        # 'word counts' strings. This yields a mapping from URI to a dictionary
        # that maps each of those tag strings to an iterable: the word total for
        # that URI and the (word, count) pairs, respectively.
        #
        # A diagram (in which '[]' just means 'iterable'):
        #
        #   URI: {'word totals': [count],  # Total words within this URI's document.
        #         'word counts': [(word, count),  # Counts of specific words
        #                         (word, count),  # within this URI's document.
        #                         ... ]}
        uri_to_word_and_count_and_total = (
            {
                'word totals': uri_to_word_total,
                'word counts': uri_to_word_and_count
            }
            | 'CoGroupByUri' >> beam.CoGroupByKey())

        # Compute a mapping from each word to a (URI, term frequency) pair for each
        # URI. A word's term frequency for a document is simply the number of times
        # that word occurs in the document divided by the total number of words in
        # the document.

        def compute_term_frequency(uri_count_and_total):
            (uri, count_and_total) = uri_count_and_total
            word_and_count = count_and_total['word counts']
            # We have an iterable for one element that we want extracted.
            [word_total] = count_and_total['word totals']
            for word, count in word_and_count:
                yield word, (uri, float(count) / word_total)

        word_to_uri_and_tf = (
            uri_to_word_and_count_and_total
            | 'ComputeTermFrequencies' >> beam.FlatMap(compute_term_frequency))

        # Compute a mapping from each word to its document frequency.
        # A word's document frequency in a corpus is the number of
        # documents in which the word appears divided by the total
        # number of documents in the corpus.
        #
        # This calculation uses a side input, a Dataflow-computed auxiliary value
        # presented to each invocation of our MapFn lambda. The second argument to
        # the function (called total---note that the first argument is a tuple)
        # receives the value we listed after the lambda in Map(). Additional side
        # inputs (and ordinary Python values, too) can be provided to MapFns and
        # DoFns in this way.
        def div_word_count_by_total(word_count, total):
            (word, count) = word_count
            return (word, float(count) / total)

        word_to_df = (
            word_to_doc_count
            | 'ComputeDocFrequencies' >> beam.Map(
                div_word_count_by_total, AsSingleton(total_documents)))

        # Join the term frequency and document frequency collections,
        # each keyed on the word.
        word_to_uri_and_tf_and_df = (
            {
                'tf': word_to_uri_and_tf,
                'df': word_to_df
            }
            | 'CoGroupWordsByTf-df' >> beam.CoGroupByKey())

        # Compute a mapping from each word to a (URI, TF-IDF) score for each URI.
        # There are a variety of definitions of TF-IDF
        # ("term frequency - inverse document frequency") score; here we use a
        # basic version that is the term frequency divided by the log of the
        # document frequency.

        def compute_tf_idf(word_tf_and_df):
            (word, tf_and_df) = word_tf_and_df
            [docf] = tf_and_df['df']
            for uri, tf in tf_and_df['tf']:
                yield word, (uri, tf * math.log(1 / docf))

        word_to_uri_and_tfidf = (
            word_to_uri_and_tf_and_df
            | 'ComputeTf-idf' >> beam.FlatMap(compute_tf_idf))

        return word_to_uri_and_tfidf
Example #20
    # Compute a mapping from each word to its document frequency.
    # A word's document frequency in a corpus is the number of
    # documents in which the word appears divided by the total
    # number of documents in the corpus.
    #
    # This calculation uses a side input, a Dataflow-computed auxiliary value
    # presented to each invocation of our MapFn lambda. The second argument to
    # the lambda (called total---the first argument is the (word, count) pair)
    # receives the value we listed after the lambda in Map(). Additional side
    # inputs (and ordinary Python values, too) can be provided to MapFns and
    # DoFns in this way.
    word_to_df = (
        word_to_doc_count
        | 'ComputeDocFrequencies' >> beam.Map(
            lambda word_count, total: (word_count[0], float(word_count[1]) / total),
            AsSingleton(total_documents)))

    # Join the term frequency and document frequency collections,
    # each keyed on the word.
    word_to_uri_and_tf_and_df = (
        {'tf': word_to_uri_and_tf, 'df': word_to_df}
        | 'CoGroupWordsByTf-df' >> beam.CoGroupByKey())

    # Compute a mapping from each word to a (URI, TF-IDF) score for each URI.
    # There are a variety of definitions of TF-IDF
    # ("term frequency - inverse document frequency") score; here we use a
    # basic version that is the term frequency divided by the log of the
    # document frequency.

    def compute_tf_idf(word_tf_and_df):
      (word, tf_and_df) = word_tf_and_df
      [docf] = tf_and_df['df']