def mapper():
    """Emit an ``author<TAB>hour`` record for each usable input row.

    Rows are read as tab-separated values from sys.stdin and the records are
    printed to sys.stdout. Either stream may be replaced by the caller.

    :returns: Nothing. Writes to standard output.
    """
    # The input is the tab-separated "forum_nodes.tsv" file shipped inside
    # http://content.udacity-data.com/course/hadoop/forum_data.tar.gz
    for row in csv.reader(sys.stdin, delimiter="\t"):
        # Only rows that pass the basic sanity check are considered.
        if isValidNodeLine(row):
            author_id = getField(row, "author_id")
            posted_at = parseDate(getField(row, "added_at"))

            # A missing author or an unparseable date means something went
            # wrong with this row, so nothing is emitted for it.
            if posted_at is not None and author_id is not None:
                print("{0}\t{1}".format(author_id, posted_at.hour))
def mapper():
    """Emit the body length of every question and answer, keyed by thread.

    Input is read from sys.stdin and written to sys.stdout. Both streams can
    be overwritten if needed.

    Each output line has the form ``<key><TAB><marker><TAB><body length>``:

    * questions are keyed by their own ``id`` and marked with ``QUESTION``;
    * answers are keyed by their ``abs_parent_id`` and marked with ``ANSWER``,
      so they group together with the question that originated them;
    * any other node type (e.g. comments) is ignored.

    :returns: Nothing. Writes to standard output.
    """
    # The input file is saved as a tab-separated file. The data itself comes
    # from http://content.udacity-data.com/course/hadoop/forum_data.tar.gz --
    # file "forum_nodes.tsv".
    reader = csv.reader(sys.stdin, delimiter='\t')

    for line in reader:
        # Basic data sanity check
        if not isValidNodeLine(line):
            continue

        # The fields we're interested in. The node_type decides whether the
        # row is a question or an answer; the id / parent id provide the
        # grouping key and the body provides the length to report.
        node = getField(line, 'id')
        nodeType = getField(line, 'node_type')
        parent = getField(line, 'abs_parent_id')
        body = getField(line, 'body')

        # If any of the fields we're interested in is missing, the row is no
        # good for us. Identity comparison is the correct test for None.
        if any(field is None for field in (node, nodeType, parent, body)):
            continue

        # NOTE: We're assuming neither questions nor answers can be empty,
        # hoping the forum software prevented that. From the database dump it
        # looks like empty values get filled with the "\N" string, which has
        # a length of two. This could be improved, but it seems unlikely to
        # happen in practice.
        if nodeType == 'question':
            print('{0}\t{1}\t{2}'.format(node, QUESTION, len(body)))
        elif nodeType == 'answer':
            print('{0}\t{1}\t{2}'.format(parent, ANSWER, len(body)))
        # Anything else is of no interest and produces no output.
def mapper():
    """Emit the author of every post, keyed by the thread it belongs to.

    Input is read from sys.stdin and written to sys.stdout. Both streams can
    be overwritten if needed.

    Each output line has the form ``<thread id><TAB><marker><TAB><author id>``:

    * questions start a thread, so they are keyed by their own ``id`` and
      marked with ``QUESTION``;
    * every other node type is keyed by its ``abs_parent_id`` and marked with
      ``WHATEVER``.

    :returns: Nothing. Writes to standard output.
    """
    # The input file is saved as a tab-separated file. The data itself comes
    # from http://content.udacity-data.com/course/hadoop/forum_data.tar.gz --
    # file "forum_nodes.tsv".
    reader = csv.reader(sys.stdin, delimiter='\t')

    for line in reader:
        # Basic data sanity check
        if not isValidNodeLine(line):
            continue

        # The fields we're interested in. Questions represent new threads;
        # comments and answers are posts to that thread. Hence we need the
        # node id for questions and the parent id for answers / comments,
        # plus the author id so authors can be grouped per thread.
        node = getField(line, 'id')
        nodeType = getField(line, 'node_type')
        parent = getField(line, 'abs_parent_id')
        author = getField(line, 'author_id')

        # If any of the fields we're interested in is missing, the row is no
        # good for us. Identity comparison is the correct test for None.
        if any(field is None for field in (node, nodeType, parent, author)):
            continue

        # Data output, as announced by the comments above
        if nodeType == 'question':
            print('{0}\t{1}\t{2}'.format(node, QUESTION, author))
        else:
            print('{0}\t{1}\t{2}'.format(parent, WHATEVER, author))
def mapper():
    """Count tag occurrences over the whole input and emit every count.

    Rows are read as tab-separated values from sys.stdin. Once the input is
    exhausted, one ``tag<TAB>count`` line per distinct tag is printed to
    sys.stdout. Either stream may be replaced by the caller.

    :returns: Nothing. Writes to standard output.
    """
    # The input is the tab-separated "forum_nodes.tsv" file shipped inside
    # http://content.udacity-data.com/course/hadoop/forum_data.tar.gz
    rows = csv.reader(sys.stdin, delimiter='\t')

    # Running number of occurrences seen by this mapper, per tag.
    tag_counts = {}

    for row in rows:
        # Rows failing the basic sanity check are skipped.
        if not isValidNodeLine(row):
            continue

        tagnames = getField(row, 'tagnames')
        if tagnames is None:
            continue

        # Tags are whitespace-separated. A tag seen for the first time starts
        # at 1; a known tag has its count incremented.
        for name in tagnames.split():
            if name in tag_counts:
                tag_counts[name] += 1
            else:
                tag_counts[name] = 1

    # NOTE: The top-N pattern explained in class *cannot* be applied in the
    # mapper. Example: we want the top 1 tag, there are two mappers and only
    # two tags, tag1 and tag2.
    #
    #   Mapper 1 sees every instance of tag1 (1000) plus 400 of tag2:
    #       tag1 = 1000
    #       tag2 = 400
    #
    #   Mapper 2 sees only tag2, 900 times:
    #       tag2 = 900
    #
    # If each mapper emitted only its local top 1 (the top-N pattern as
    # presented in class), the reducer would receive
    #       tag1 = 1000
    #       tag2 = 900
    # and would wrongly report tag1, with 1000 occurrences, as the winner.
    #
    # When the mappers emit every tag they saw, the reducer receives
    #       tag1 = 1000
    #       tag2 = 400
    #       tag2 = 900
    # and can reduce correctly, reporting tag2 as the top 1 tag.

    # Everything is printed. Order does not matter: Hadoop sorts it for us.
    for name, count in tag_counts.items():
        print('%s\t%s' % (name, count))