/
study_groups_mapper.py
executable file
·58 lines (43 loc) · 1.76 KB
/
study_groups_mapper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
#!/usr/bin/env python
# encoding: utf-8
from __future__ import print_function
import sys
import csv
from common import getField, isValidNodeLine
# To make our reducers' lives easier, we want questions before the rest. {{{
# These one-letter tags are written by mapper() as the second output column.
# 'A' compares lower than 'B'; presumably the reducer relies on sorted input so
# that the QUESTION record of a thread arrives before the other records --
# confirm against the reducer.
QUESTION = 'A'
WHATEVER = 'B'
# }}}
def mapper():
    """Mapper function.

    Reads tab-separated forum node rows from ``sys.stdin`` and, for every
    usable row, prints one tab-separated record of three columns:

    * questions:        node id, ``QUESTION`` tag, author id
    * everything else:  absolute parent id, ``WHATEVER`` tag, author id

    Rows rejected by ``isValidNodeLine`` and rows for which ``getField``
    returns ``None`` for any of the four fields used here are skipped.

    Both streams are looked up on ``sys`` when the function runs, so they can
    be overwritten if needed (e.g. by rebinding ``sys.stdin`` / ``sys.stdout``
    before calling).

    :returns: Nothing. Writes to standard output.
    """
    # The input file is saved as a tab-separated file. The data itself comes
    # from http://content.udacity-data.com/course/hadoop/forum_data.tar.gz --
    # file "forum_nodes.tsv".
    reader = csv.reader(sys.stdin, delimiter='\t')

    for line in reader:
        # Basic data sanity check
        if not isValidNodeLine(line):
            continue

        # The fields we're interested in. Questions represent new threads;
        # comments & answers are posts to that thread. Hence, we need the node
        # id for questions and the parent id for answers / comments. We
        # obviously need the author id as well, so we can group on it.
        node = getField(line, 'id')
        nodeType = getField(line, 'node_type')
        parent = getField(line, 'abs_parent_id')
        author = getField(line, 'author_id')

        # If any of the fields we're interested in is None, then it is no good
        # for us. Drop this line altogether. Identity comparison is the
        # correct test for the None singleton.
        if any(field is None for field in (node, nodeType, parent, author)):
            continue

        # Key every record by the thread it belongs to: a question is keyed by
        # its own id, any other node by its absolute parent id.
        if nodeType == 'question':
            key, tag = node, QUESTION
        else:
            key, tag = parent, WHATEVER

        print('{0}\t{1}\t{2}'.format(key, tag, author))
# Script entry point: run the mapper only when this file is executed directly,
# not when it is imported as a module.
if __name__ == '__main__':
    mapper()