def expand(self, pcoll):
  do_once = pcoll.pipeline | 'DoOnce' >> core.Create([None])
  init_result_coll = do_once | 'InitializeWrite' >> core.Map(
      lambda _, sink: sink.initialize_write(), self.sink)
  if getattr(self.sink, 'num_shards', 0):
    min_shards = self.sink.num_shards
    if min_shards == 1:
      keyed_pcoll = pcoll | core.Map(lambda x: (None, x))
    else:
      keyed_pcoll = pcoll | core.ParDo(_RoundRobinKeyFn(min_shards))
    write_result_coll = (
        keyed_pcoll
        | core.WindowInto(window.GlobalWindows())
        | core.GroupByKey()
        | 'WriteBundles' >> core.ParDo(
            _WriteKeyedBundleDoFn(self.sink), AsSingleton(init_result_coll)))
  else:
    min_shards = 1
    write_result_coll = (
        pcoll
        | 'WriteBundles' >> core.ParDo(
            _WriteBundleDoFn(self.sink), AsSingleton(init_result_coll))
        | 'Pair' >> core.Map(lambda x: (None, x))
        | core.WindowInto(window.GlobalWindows())
        | core.GroupByKey()
        | 'Extract' >> core.FlatMap(lambda x: x[1]))
  return do_once | 'FinalizeWrite' >> core.FlatMap(
      _finalize_write,
      self.sink,
      AsSingleton(init_result_coll),
      AsIter(write_result_coll),
      min_shards)
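# _RoundRobinKeyFn is not shown in this snippet; a minimal sketch of a
# round-robin keying DoFn consistent with its use here (an assumption, not
# necessarily Beam's exact implementation):
import random

import apache_beam as beam


class _RoundRobinKeyFn(beam.DoFn):
  """Cycles through keys 0..count-1 so the following GroupByKey yields at
  most `count` bundles (shards)."""
  def __init__(self, count):
    self._count = count

  def start_bundle(self):
    # Start each bundle at a random offset to spread load across keys.
    self._counter = random.randint(0, self._count - 1)

  def process(self, element):
    self._counter = (self._counter + 1) % self._count
    yield self._counter, element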
def create_groups(group_ids, corpus, word, ignore_corpus, ignore_word):
  """Generate groups given the input PCollections."""
  def attach_corpus_fn(group, corpus, ignore):
    selected = None
    len_corpus = len(corpus)
    while not selected:
      c = list(corpus[randrange(0, len_corpus)].values())[0]
      if c != ignore:
        selected = c
    yield (group, selected)

  def attach_word_fn(group, words, ignore):
    selected = None
    len_words = len(words)
    while not selected:
      c = list(words[randrange(0, len_words)].values())[0]
      if c != ignore:
        selected = c
    yield group + (selected, )

  return (group_ids
          | 'attach corpus' >> beam.FlatMap(
              attach_corpus_fn, AsList(corpus), AsSingleton(ignore_corpus))
          | 'attach word' >> beam.FlatMap(
              attach_word_fn, AsList(word), AsSingleton(ignore_word)))
def test_ptransform_override_replacement_inputs(self):
  class MyParDoOverride(PTransformOverride):
    def matches(self, applied_ptransform):
      return (
          isinstance(applied_ptransform.transform, ParDo) and
          isinstance(applied_ptransform.transform.fn, AddWithProductDoFn))

    def get_replacement_transform(self, transform):
      return AddThenMultiply()

    def get_replacement_inputs(self, applied_ptransform):
      assert len(applied_ptransform.inputs) == 1
      assert len(applied_ptransform.side_inputs) == 2
      # Swap the order of the two side inputs
      return (
          applied_ptransform.inputs[0],
          applied_ptransform.side_inputs[1].pvalue,
          applied_ptransform.side_inputs[0].pvalue)

  p = Pipeline()
  pcoll1 = p | 'pc1' >> beam.Create([2])
  pcoll2 = p | 'pc2' >> beam.Create([3])
  pcoll3 = p | 'pc3' >> beam.Create([4, 5, 6])
  result = pcoll3 | 'Operate' >> beam.ParDo(
      AddWithProductDoFn(), AsSingleton(pcoll1), AsSingleton(pcoll2))
  assert_that(result, equal_to([14, 16, 18]))

  p.replace_all([MyParDoOverride()])
  p.run()
def test_assingleton_multi_element(self):
  with self.assertRaisesRegex(
      ValueError,
      'PCollection of size 2 with more than one element accessed as a '
      'singleton view. First two elements encountered are \"1\", \"2\".'):
    AsSingleton._from_runtime_iterable([1, 2], {})
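# The internal hook exercised above is how a runner materializes a side
# input; in user code the same ValueError surfaces when a multi-element
# PCollection is consumed via AsSingleton. A minimal sketch (pipeline and
# values hypothetical, not part of the original test):
import apache_beam as beam
from apache_beam.pvalue import AsSingleton

with beam.Pipeline() as p:
  main = p | 'Main' >> beam.Create(['x'])
  side = p | 'Side' >> beam.Create([1, 2])  # two elements: not a singleton
  # Raises the "PCollection ... accessed as a singleton view" ValueError
  # at pipeline execution time.
  main | beam.Map(lambda e, s: (e, s), AsSingleton(side))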
def expand(self, pbegin):
  if self._read_operations is not None and isinstance(pbegin, PBegin):
    pcoll = pbegin.pipeline | Create(self._read_operations)
  elif not isinstance(pbegin, PBegin):
    if self._read_operations is not None:
      raise ValueError(
          "Read operations in the constructor only work with "
          "the root of the pipeline.")
    pcoll = pbegin
  else:
    raise ValueError(
        "Spanner requires a read operation: sql or table "
        "with columns.")

  if self._transaction is None:
    # Batch read: use the Spanner partitioning query to create batches.
    p = (
        pcoll
        | 'Generate Partitions' >> ParDo(
            _CreateReadPartitions(spanner_configuration=self._configuration))
        | 'Reshuffle' >> Reshuffle()
        | 'Read From Partitions' >> ParDo(
            _ReadFromPartitionFn(spanner_configuration=self._configuration)))
  else:
    # Naive read: no batching; execute the queries as a single read.
    p = (
        pcoll
        | 'Reshuffle' >> Reshuffle().with_input_types(ReadOperation)
        | 'Perform Read' >> ParDo(
            _NaiveSpannerReadDoFn(spanner_configuration=self._configuration),
            AsSingleton(self._transaction)))
  return p
def variants_to_examples(input_data, samples_metadata, feature_encoder):
  """Converts variants to TensorFlow Example protos.

  Args:
    input_data: variant call dictionary objects with keys from
      DATA_QUERY_REPLACEMENTS
    samples_metadata: metadata dictionary objects with keys from
      METADATA_QUERY_REPLACEMENTS
    feature_encoder: the feature encoder instance to use to convert the
      source data into TensorFlow Example protos.

  Returns:
    TensorFlow Example protos.
  """
  variant_kvs = input_data | 'bucketVariants' >> beam.Map(
      lambda row: (row[FeatureEncoder.KEY_COLUMN], row))

  sample_variant_kvs = variant_kvs | 'groupBySample' >> beam.GroupByKey()

  # The original used a Python 2 tuple-unpacking lambda
  # (`lambda (key, vals), ...`), which is a syntax error in Python 3;
  # unpack inside a named function instead.
  def to_example(key_vals, metadata):
    key, vals = key_vals
    return feature_encoder.sample_variants_to_example(key, vals, metadata)

  examples = (
      sample_variant_kvs
      | 'samplesToExamples' >> beam.Map(
          to_example, AsSingleton(samples_metadata)))

  return examples
def filter_network_usage_by_mean(pipeline, file_path):
  with pipeline as p:
    fields = (
        p
        | 'ReadInputText' >> beam.io.ReadFromText(file_path)
        | 'ParseLogs' >> beam.ParDo(ParseNetworkLogs()))

    # AsSingleton turns this one-element PCollection (the global mean)
    # into a plain value passed to each Filter invocation below.
    global_mean = AsSingleton(
        fields
        | 'GetAllBytes' >> beam.Map(lambda elem: elem['http.response.bytes'])
        | 'CalculateGlobalMean' >> beam.combiners.Mean.Globally())

    return (
        fields
        | 'MapIPBytePairs' >> beam.Map(
            lambda elem: (elem['source.ip'], elem['http.response.bytes']))
        | 'FilterBelowMean' >> beam.Filter(
            lambda entry, mean: entry[1] > mean, global_mean)
        | 'CountPacketsOverMean' >> beam.combiners.Count.PerKey()
        # Distinct would also work here, but counting shows how many
        # responses per IP were above the mean.
        | 'SortByCount' >> beam.combiners.Top.Of(
            10, key=lambda entry: entry[::-1])  # rank by (count, ip)
        | 'ConvertToFlatMap' >> beam.FlatMap(lambda entry: entry))
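# Since the snippet's original comment asked what AsSingleton does, here
# is a minimal standalone sketch of its semantics (values hypothetical):
# AsSingleton adapts a one-element PCollection so that each DoFn/lambda
# call receives the bare element instead of an iterable.
import apache_beam as beam
from apache_beam.pvalue import AsSingleton

with beam.Pipeline() as p:
  mean = p | 'Mean' >> beam.Create([42.0])  # exactly one element
  nums = p | 'Nums' >> beam.Create([10.0, 50.0, 99.0])
  # Each Filter call receives the plain float 42.0, not a PCollection.
  above = nums | beam.Filter(lambda x, m: x > m, AsSingleton(mean))
  above | beam.Map(print)  # prints 50.0 and 99.0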
def test_ptransform_override_side_inputs(self):
  class MyParDoOverride(PTransformOverride):
    def matches(self, applied_ptransform):
      return (
          isinstance(applied_ptransform.transform, ParDo) and
          isinstance(applied_ptransform.transform.fn, AddWithProductDoFn))

    def get_replacement_transform(self, transform):
      return AddThenMultiply()

  p = Pipeline()
  pcoll1 = p | 'pc1' >> beam.Create([2])
  pcoll2 = p | 'pc2' >> beam.Create([3])
  pcoll3 = p | 'pc3' >> beam.Create([4, 5, 6])
  result = pcoll3 | 'Operate' >> beam.ParDo(
      AddWithProductDoFn(), AsSingleton(pcoll1), AsSingleton(pcoll2))
  assert_that(result, equal_to([18, 21, 24]))

  p.replace_all([MyParDoOverride()])
  p.run()
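# AddWithProductDoFn and AddThenMultiply are referenced but not defined in
# these snippets. A sketch consistent with the asserted outputs (an
# assumption about the test helpers, not their verbatim source): with main
# input [4, 5, 6] and singleton sides 2 and 3, the replacement computes
# (x + 2) * 3 == [18, 21, 24].
import apache_beam as beam


class AddWithProductDoFn(beam.DoFn):
  # Sketch: element + addend * multiplier; the override above swaps this
  # for AddThenMultiply's (element + addend) * multiplier.
  def process(self, element, addend, multiplier):
    yield element + addend * multiplier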
def test_side_input_tagged(self):
  class TestDoFn(DoFn):
    def process(self, element, prefix, suffix=DoFn.SideInputParam):
      return ['%s-%s-%s' % (prefix, element, suffix)]

  with TestPipeline() as pipeline:
    words_list = ['aa', 'bb', 'cc']
    words = pipeline | 'SomeWords' >> Create(words_list)
    prefix = 'zyx'
    suffix = pipeline | 'SomeString' >> Create(['xyz'])  # side input
    result = words | 'DecorateWordsDoFnNoTag' >> ParDo(
        TestDoFn(), prefix, suffix=AsSingleton(suffix))
    assert_that(result, equal_to(['zyx-%s-xyz' % x for x in words_list]))
def filter_cold_days(input_data, month_filter):
  """Workflow computing rows in a specific month with low temperatures.

  Args:
    input_data: a PCollection of dictionaries representing table rows. Each
      dictionary must have the keys ['year', 'month', 'day', 'mean_temp'].
    month_filter: an int representing the month for which colder-than-average
      days should be returned.

  Returns:
    A PCollection of dictionaries with the same keys described above. Each
      row represents a day in the specified month where temperatures were
      colder than the global mean temperature in the entire dataset.
  """
  # Project to only the desired fields from a complete input row.
  # E.g., SELECT f1, f2, f3, ... FROM InputTable.
  projection_fields = ['month', 'mean_temp']
  fields_of_interest = (
      input_data
      | 'Projected' >> beam.Map(
          lambda row: {f: row[f] for f in projection_fields}))

  # Compute the global mean temperature. (The original snippet immediately
  # overwrote this with a hard-coded Create([1000]) side input, apparently
  # a debugging leftover that contradicted the docstring; it is dropped
  # here so the filter uses the computed mean.)
  global_mean = AsSingleton(
      fields_of_interest
      | 'ExtractMean' >> beam.Map(lambda row: row['mean_temp'])
      | 'GlobalMean' >> beam.combiners.Mean.Globally())

  # Filter to the rows representing days in the month of interest
  # in which the mean daily temperature is below the global mean.
  return (
      fields_of_interest
      | 'DesiredMonth' >> beam.Filter(lambda row: row['month'] == month_filter)
      | 'BelowMean' >> beam.Filter(
          lambda row, mean: row['mean_temp'] < mean, global_mean))
def test_sdf_with_side_inputs(self):
  with TestPipeline() as p:
    side1 = p | 'Create1' >> Create(['1', '2'])
    side2 = p | 'Create2' >> Create(['3', '4'])
    side3 = p | 'Create3' >> Create(['5'])
    result = (
        p
        | 'create_main' >> beam.Create(['a', 'b', 'c'])
        | beam.ParDo(
            ExpandStrings(), AsList(side1), AsList(side2),
            AsSingleton(side3)))

    expected_result = []
    for c in ['a', 'b', 'c']:
      for i in range(5):
        expected_result.append(c + ':' + str(i + 1))
    assert_that(result, equal_to(expected_result))
def test_get_sample_ids(self):
  hash_dict = {'N01': 1, 'N02': 2, 'N03': 3, 'N04': 4}
  sample_names = ['N01', 'N02', 'N03', 'N04']
  expected_sample_ids = [1, 2, 3, 4]
  pipeline = TestPipeline()
  hash_dict_pc = (
      pipeline
      | 'CreateHashDict' >> Create(hash_dict)
      | combiners.ToDict())
  sample_ids = (
      pipeline
      | Create(sample_names)
      | 'GetSampleNames' >> GetSampleIds(AsSingleton(hash_dict_pc)))
  assert_that(sample_ids, asserts.items_equal(expected_sample_ids))
  pipeline.run()
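# GetSampleIds is not defined in this snippet; a hypothetical
# reconstruction consistent with the test above (not the library's actual
# implementation):
import apache_beam as beam


class GetSampleIds(beam.PTransform):
  """Sketch: looks up each sample name in the hash-dict side input and
  emits the corresponding sample id."""
  def __init__(self, hash_dict_side):
    self._hash_dict_side = hash_dict_side

  def expand(self, sample_names):
    return sample_names | 'LookUpIds' >> beam.Map(
        lambda name, hash_dict: hash_dict[name], self._hash_dict_side)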
def test_pcollectionview_not_recreated(self):
  pipeline = Pipeline('DirectRunner')
  value = pipeline | 'create1' >> Create([1, 2, 3])
  value2 = pipeline | 'create2' >> Create([(1, 1), (2, 2), (3, 3)])
  value3 = pipeline | 'create3' >> Create([(1, 1), (2, 2), (3, 3)])
  self.assertEqual(AsSingleton(value), AsSingleton(value))
  self.assertEqual(
      AsSingleton('new', value, default_value=1),
      AsSingleton('new', value, default_value=1))
  self.assertNotEqual(
      AsSingleton(value), AsSingleton('new', value, default_value=1))
  self.assertEqual(AsIter(value), AsIter(value))
  self.assertEqual(AsList(value), AsList(value))
  self.assertEqual(AsDict(value2), AsDict(value2))

  self.assertNotEqual(AsSingleton(value), AsSingleton(value2))
  self.assertNotEqual(AsIter(value), AsIter(value2))
  self.assertNotEqual(AsList(value), AsList(value2))
  self.assertNotEqual(AsDict(value2), AsDict(value3))
def process_datastore_tweets(project, dataset, pipeline_options):
  """Creates a pipeline that reads tweets from Cloud Datastore from the
  last N days. The pipeline finds the top most-used words, the top
  most-tweeted URLs, and ranks word co-occurrences by an 'interestingness'
  metric (similar to tf*idf).
  """
  ts = str(datetime.datetime.utcnow())
  p = beam.Pipeline(options=pipeline_options)

  # Create a query to read entities from datastore.
  query = make_query('Tweet')

  # Read entities from Cloud Datastore into a PCollection.
  lines = (
      p | 'read from datastore' >> ReadFromDatastore(project, query, None))

  global_count = AsSingleton(
      lines | 'global count' >> beam.combiners.Count.Globally())

  # Count the occurrences of each word. (The original Python 2
  # tuple-unpacking lambdas are rewritten with indexing for Python 3,
  # and with_output_types(unicode) becomes with_output_types(str).)
  percents = (
      lines
      | 'split' >> (beam.ParDo(WordExtractingDoFn()).with_output_types(str))
      | 'pair_with_one' >> beam.Map(lambda x: (x, 1))
      | 'group' >> beam.GroupByKey()
      | 'count' >> beam.Map(
          lambda word_ones: (word_ones[0], sum(word_ones[1])))
      | 'in tweets percent' >> beam.Map(
          lambda word_sum, gc: (word_sum[0], float(word_sum[1]) / gc),
          global_count))
  top_percents = (
      percents
      | 'top 500' >> combiners.Top.Of(500, lambda x, y: x[1] < y[1]))

  # Count the occurrences of each expanded url in the tweets.
  url_counts = (
      lines
      | 'geturls' >> (beam.ParDo(URLExtractingDoFn()).with_output_types(str))
      | 'urls_pair_with_one' >> beam.Map(lambda x: (x, 1))
      | 'urls_group' >> beam.GroupByKey()
      | 'urls_count' >> beam.Map(
          lambda url_ones: (url_ones[0], sum(url_ones[1])))
      | 'urls top 300' >> combiners.Top.Of(300, lambda x, y: x[1] < y[1]))

  # Define some inline helper functions.

  def join_cinfo(cooccur, percents):
    """Calculate a co-occurrence ranking."""
    import math

    word1 = cooccur[0][0]
    word2 = cooccur[0][1]
    try:
      word1_percent = percents[word1]
      weight1 = 1 / word1_percent
      word2_percent = percents[word2]
      weight2 = 1 / word2_percent
      return (cooccur[0], cooccur[1],
              cooccur[1] * math.log(min(weight1, weight2)))
    except (KeyError, ZeroDivisionError):
      # Keep the tuple shape so the Top.Of comparator below can index [2];
      # the original returned a bare 0 here, which would have raised a
      # TypeError during comparison.
      return (cooccur[0], cooccur[1], 0)

  def generate_cooccur_schema():
    """BigQuery schema for the word co-occurrence table."""
    json_str = json.dumps({
        'fields': [{
            'name': 'w1', 'type': 'STRING', 'mode': 'NULLABLE'
        }, {
            'name': 'w2', 'type': 'STRING', 'mode': 'NULLABLE'
        }, {
            'name': 'count', 'type': 'INTEGER', 'mode': 'NULLABLE'
        }, {
            'name': 'log_weight', 'type': 'FLOAT', 'mode': 'NULLABLE'
        }, {
            'name': 'ts', 'type': 'TIMESTAMP', 'mode': 'NULLABLE'
        }]
    })
    return parse_table_schema_from_json(json_str)

  def generate_url_schema():
    """BigQuery schema for the urls count table."""
    json_str = json.dumps({
        'fields': [{
            'name': 'url', 'type': 'STRING', 'mode': 'NULLABLE'
        }, {
            'name': 'count', 'type': 'INTEGER', 'mode': 'NULLABLE'
        }, {
            'name': 'ts', 'type': 'TIMESTAMP', 'mode': 'NULLABLE'
        }]
    })
    return parse_table_schema_from_json(json_str)

  def generate_wc_schema():
    """BigQuery schema for the word count table."""
    json_str = json.dumps({
        'fields': [{
            'name': 'word', 'type': 'STRING', 'mode': 'NULLABLE'
        }, {
            'name': 'percent', 'type': 'FLOAT', 'mode': 'NULLABLE'
        }, {
            'name': 'ts', 'type': 'TIMESTAMP', 'mode': 'NULLABLE'
        }]
    })
    return parse_table_schema_from_json(json_str)

  # Now build the rest of the pipeline.
  # Calculate the word co-occurrence scores.
  cooccur_rankings = (
      lines
      | 'getcooccur' >> (beam.ParDo(CoOccurExtractingDoFn()))
      | 'co_pair_with_one' >> beam.Map(lambda x: (x, 1))
      | 'co_group' >> beam.GroupByKey()
      | 'co_count' >> beam.Map(
          lambda wordts_ones: (wordts_ones[0], sum(wordts_ones[1])))
      | 'weights' >> beam.Map(join_cinfo, AsDict(percents))
      | 'co top 300' >> combiners.Top.Of(300, lambda x, y: x[2] < y[2]))

  # Format the counts into PCollections of BigQuery row dictionaries.
  wc_records = top_percents | 'format' >> beam.FlatMap(
      lambda x: [{'word': xx[0], 'percent': xx[1], 'ts': ts} for xx in x])

  url_records = url_counts | 'urls_format' >> beam.FlatMap(
      lambda x: [{'url': xx[0], 'count': xx[1], 'ts': ts} for xx in x])

  co_records = cooccur_rankings | 'co_format' >> beam.FlatMap(
      lambda x: [{
          'w1': xx[0][0], 'w2': xx[0][1], 'count': xx[1],
          'log_weight': xx[2], 'ts': ts
      } for xx in x])

  # Write the results to three BigQuery tables.
  wc_records | 'wc_write_bq' >> beam.io.Write(
      beam.io.BigQuerySink(
          '%s:%s.word_counts' % (project, dataset),
          schema=generate_wc_schema(),
          create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
          write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND))
  url_records | 'urls_write_bq' >> beam.io.Write(
      beam.io.BigQuerySink(
          '%s:%s.urls' % (project, dataset),
          schema=generate_url_schema(),
          create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
          write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND))
  co_records | 'co_write_bq' >> beam.io.Write(
      beam.io.BigQuerySink(
          '%s:%s.word_cooccur' % (project, dataset),
          schema=generate_cooccur_schema(),
          create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
          write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND))

  # Actually run the pipeline.
  return p.run()
def expand(self, pvalues):
  return pvalues[0] | beam.ParDo(
      AddThenMultiplyDoFn(), AsSingleton(pvalues[1]), AsSingleton(pvalues[2]))
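# AddThenMultiplyDoFn is not shown here; a sketch consistent with its name
# and the surrounding tests (an assumption, not the verbatim helper):
import apache_beam as beam


class AddThenMultiplyDoFn(beam.DoFn):
  # Add the first side input, then multiply by the second.
  def process(self, element, addend, multiplier):
    yield (element + addend) * multiplier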
# Compute a mapping from each word to its document frequency.
# A word's document frequency in a corpus is the number of
# documents in which the word appears divided by the total
# number of documents in the corpus.
#
# This calculation uses a side input, a Dataflow-computed auxiliary value
# presented to each invocation of our MapFn lambda. The second argument to
# the lambda (called total---note that the first argument is the
# (word, count) tuple) receives the value we listed after the lambda in
# Map(). Additional side inputs (and ordinary Python values, too) can be
# provided to MapFns and DoFns in this way. (The original Python 2
# tuple-unpacking lambda is rewritten with indexing for Python 3.)
word_to_df = (
    word_to_doc_count
    | 'ComputeDocFrequencies' >> beam.Map(
        lambda word_count, total: (word_count[0],
                                   float(word_count[1]) / total),
        AsSingleton(total_documents)))

# Join the term frequency and document frequency collections,
# each keyed on the word.
word_to_uri_and_tf_and_df = (
    {'tf': word_to_uri_and_tf, 'df': word_to_df}
    | 'CoGroupWordsByTf-df' >> beam.CoGroupByKey())

# Compute a mapping from each word to a (URI, TF-IDF) score for each URI.
# There are a variety of definitions of TF-IDF ("term frequency -
# inverse document frequency"); here we use a basic version that
# multiplies the term frequency by the log of the inverse document
# frequency.
def side_input_factory(p, item, label_name):
  return AsSingleton(
      p
      | label_name + ', creating collection' >> beam.Create([item])
      | label_name + ', combining globally' >> beam.CombineGlobally(
          beam.combiners.ToListCombineFn()))
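# A hypothetical usage sketch for the factory above (pipeline and values
# assumed). Note that the singleton value delivered to consumers is the
# one-element list produced by ToListCombineFn, so the wrapped item sits
# at index 0.
import apache_beam as beam

with beam.Pipeline() as p:
  # Wrap a plain Python object as a singleton side input.
  config_side = side_input_factory(p, {'threshold': 10}, 'config')
  (p
   | beam.Create([5, 15, 25])
   | beam.Filter(lambda x, cfg: x > cfg[0]['threshold'], config_side)
   | beam.Map(print))  # prints 15 and 25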
def expand(self, uri_to_content):
  # Compute the total number of documents, and prepare a singleton
  # PCollection to use as side input.
  total_documents = (
      uri_to_content
      | 'GetUris 1' >> beam.Keys()
      | 'GetUniqueUris' >> beam.Distinct()
      | 'CountUris' >> beam.combiners.Count.Globally())

  # Create a collection of pairs mapping a URI to each of the words
  # in the document associated with that URI.
  def split_into_words(uri_line):
    (uri, line) = uri_line
    return [(uri, w.lower()) for w in re.findall(r'[A-Za-z\']+', line)]

  uri_to_words = (
      uri_to_content | 'SplitWords' >> beam.FlatMap(split_into_words))

  # Compute a mapping from each word to the total number of documents
  # in which it appears.
  word_to_doc_count = (
      uri_to_words
      | 'GetUniqueWordsPerDoc' >> beam.Distinct()
      | 'GetWords' >> beam.Values()
      | 'CountDocsPerWord' >> beam.combiners.Count.PerElement())

  # Compute a mapping from each URI to the total number of words in the
  # document associated with that URI.
  uri_to_word_total = (
      uri_to_words
      | 'GetUris 2' >> beam.Keys()
      | 'CountWordsInDoc' >> beam.combiners.Count.PerElement())

  # Count, for each (URI, word) pair, the number of occurrences of that
  # word in the document associated with the URI.
  uri_and_word_to_count = (
      uri_to_words
      | 'CountWord-DocPairs' >> beam.combiners.Count.PerElement())

  # Adjust the above collection from a mapping from (URI, word) pairs to
  # counts into an isomorphic mapping from URI to (word, count) pairs, to
  # prepare for a join by the URI key.
  def shift_keys(uri_word_count):
    return (uri_word_count[0][0], (uri_word_count[0][1], uri_word_count[1]))

  uri_to_word_and_count = (
      uri_and_word_to_count | 'ShiftKeys' >> beam.Map(shift_keys))

  # Perform a CoGroupByKey (a sort of pre-join) on the prepared
  # uri_to_word_total and uri_to_word_and_count, tagged by the 'word totals'
  # and 'word counts' strings. This yields a mapping from URI to a
  # dictionary that maps those tag strings to an iterable containing the
  # word total for that URI and the (word, count) pairs, respectively.
  #
  # A diagram (in which '[]' just means 'iterable'):
  #
  #   URI: {'word totals': [count],  # Total words in this URI's document.
  #         'word counts': [(word, count),  # Counts of specific words
  #                         (word, count),  # within this URI's document.
  #                         ...]}
  uri_to_word_and_count_and_total = (
      {'word totals': uri_to_word_total, 'word counts': uri_to_word_and_count}
      | 'CoGroupByUri' >> beam.CoGroupByKey())

  # Compute a mapping from each word to a (URI, term frequency) pair for
  # each URI. A word's term frequency for a document is simply the number
  # of times that word occurs in the document divided by the total number
  # of words in the document.
  def compute_term_frequency(uri_count_and_total):
    (uri, count_and_total) = uri_count_and_total
    word_and_count = count_and_total['word counts']
    # We have an iterable for one element that we want extracted.
    [word_total] = count_and_total['word totals']
    for word, count in word_and_count:
      yield word, (uri, float(count) / word_total)

  word_to_uri_and_tf = (
      uri_to_word_and_count_and_total
      | 'ComputeTermFrequencies' >> beam.FlatMap(compute_term_frequency))

  # Compute a mapping from each word to its document frequency.
  # A word's document frequency in a corpus is the number of documents in
  # which the word appears divided by the total number of documents in the
  # corpus.
  #
  # This calculation uses a side input, a Dataflow-computed auxiliary value
  # presented to each invocation of our MapFn. The second argument to the
  # function (called total---note that the first argument is a tuple)
  # receives the value we listed after the function in Map(). Additional
  # side inputs (and ordinary Python values, too) can be provided to MapFns
  # and DoFns in this way.
  def div_word_count_by_total(word_count, total):
    (word, count) = word_count
    return (word, float(count) / total)

  word_to_df = (
      word_to_doc_count
      | 'ComputeDocFrequencies' >> beam.Map(
          div_word_count_by_total, AsSingleton(total_documents)))

  # Join the term frequency and document frequency collections,
  # each keyed on the word.
  word_to_uri_and_tf_and_df = (
      {'tf': word_to_uri_and_tf, 'df': word_to_df}
      | 'CoGroupWordsByTf-df' >> beam.CoGroupByKey())

  # Compute a mapping from each word to a (URI, TF-IDF) score for each URI.
  # There are a variety of definitions of TF-IDF ("term frequency -
  # inverse document frequency"); here we use a basic version that
  # multiplies the term frequency by the log of the inverse document
  # frequency.
  def compute_tf_idf(word_tf_and_df):
    (word, tf_and_df) = word_tf_and_df
    [docf] = tf_and_df['df']
    for uri, tf in tf_and_df['tf']:
      yield word, (uri, tf * math.log(1 / docf))

  word_to_uri_and_tfidf = (
      word_to_uri_and_tf_and_df
      | 'ComputeTf-idf' >> beam.FlatMap(compute_tf_idf))

  return word_to_uri_and_tfidf
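# A hedged usage sketch for the transform containing the expand() above
# (in the Beam examples this PTransform is named TfIdf; the inline data
# here is hypothetical):
import apache_beam as beam

with beam.Pipeline() as p:
  uri_to_content = p | beam.Create([
      ('doc1.txt', 'the quick brown fox'),
      ('doc2.txt', 'the lazy dog'),
  ])
  (uri_to_content
   | TfIdf()  # the PTransform whose expand() is defined above
   | beam.Map(print))  # emits (word, (uri, tfidf_score)) pairs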
# Compute a mapping from each word to its document frequency.
# A word's document frequency in a corpus is the number of
# documents in which the word appears divided by the total
# number of documents in the corpus.
#
# This calculation uses a side input, a Dataflow-computed auxiliary value
# presented to each invocation of our MapFn lambda. The second argument to
# the lambda (called total---note that the first argument is the
# (word, count) tuple) receives the value we listed after the lambda in
# Map(). Additional side inputs (and ordinary Python values, too) can be
# provided to MapFns and DoFns in this way. (The original Python 2
# tuple-unpacking lambda is rewritten with indexing for Python 3.)
word_to_df = (
    word_to_doc_count
    | 'ComputeDocFrequencies' >> beam.Map(
        lambda word_count, total: (word_count[0],
                                   float(word_count[1]) / total),
        AsSingleton(total_documents)))

# Join the term frequency and document frequency collections,
# each keyed on the word.
word_to_uri_and_tf_and_df = (
    {'tf': word_to_uri_and_tf, 'df': word_to_df}
    | 'CoGroupWordsByTf-df' >> beam.CoGroupByKey())

# Compute a mapping from each word to a (URI, TF-IDF) score for each URI.
# There are a variety of definitions of TF-IDF ("term frequency -
# inverse document frequency"); here we use a basic version that
# multiplies the term frequency by the log of the inverse document
# frequency. (The snippet was cut off mid-function; the body is completed
# from the full version of this example above.)
def compute_tf_idf(word_tf_and_df):
  (word, tf_and_df) = word_tf_and_df
  [docf] = tf_and_df['df']
  for uri, tf in tf_and_df['tf']:
    yield word, (uri, tf * math.log(1 / docf))