forked from cs327e-fall2019/snippets
/
oscars_3.py
40 lines (31 loc) · 1.29 KB
/
oscars_3.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
import apache_beam as beam
from apache_beam.io import ReadFromText
from apache_beam.io import WriteToText
# DoFn: parse a line of the file and return (actor name, 1) for acting categories
class ActorCountFn(beam.DoFn):
    """Emit ``(entity, 1)`` for each input row in an acting category.

    Every element is one tab-separated text line. Only the second field
    (category) and the fourth field (entity) are used.
    """

    def process(self, element):
        """Parse one line and return its nomination record, if any.

        Args:
            element: A tab-separated text line.

        Returns:
            A one-item list ``[(entity, 1)]`` when the category field
            contains 'ACTOR' or 'ACTRESS'. ``None`` (no output) for any
            other category and for lines with fewer than four fields.
        """
        values = element.strip().split('\t')

        # A blank or truncated line has no category/entity to index; skip it
        # instead of letting an IndexError abort the whole pipeline.
        if len(values) < 4:
            return None

        category = values[1]
        entity = values[3]

        if 'ACTOR' in category or 'ACTRESS' in category:
            return [(entity, 1)]

        # Non-acting categories produce no output.
        return None
# DoFn: count the nominations for a given actor/actress
class ActorSumFn(beam.DoFn):
    """Collapse a grouped ``(actor, counts)`` pair into ``(actor, total)``."""

    def process(self, element):
        """Count how many values were grouped under one key.

        Args:
            element: A two-item ``(actor, counts)`` pair, where ``counts``
                is an iterable holding one entry per grouped record.

        Returns:
            A one-item list ``[(actor, total_count)]`` where ``total_count``
            is the number of entries in ``counts``.
        """
        actor, counts = element

        # Count by iterating rather than calling len(): the grouped values
        # are only relied on to be iterable, and an iterable wrapper may not
        # implement __len__. For a list the result is identical.
        total_count = sum(1 for _ in counts)

        return [(actor, total_count)]
# Build and run the pipeline with the local DirectRunner; leaving the
# ``with`` block executes it.
with beam.Pipeline(runner='DirectRunner') as p:
    # Step 1: load the input file's contents into a PCollection.
    in_pcoll = (
        p
        | 'Read File' >> ReadFromText('oscars_input.tsv')
    )

    # Step 2: run every element through ActorCountFn.
    actor_pcoll = (
        in_pcoll
        | 'Extract Actor' >> beam.ParDo(ActorCountFn())
    )

    # Step 3: gather the emitted pairs by key.
    group_pcoll = (
        actor_pcoll
        | 'Group by Actor' >> beam.GroupByKey()
    )

    # Step 4: reduce each group to a single (actor, total) pair.
    out_pcoll = (
        group_pcoll
        | 'Sum up Counts' >> beam.ParDo(ActorSumFn())
    )

    # Step 5: write the results out as text.
    (
        out_pcoll
        | 'Write File' >> WriteToText('oscars_output.txt')
    )