def mapper(what):
    """Emit token counts for the fifth column of each tab-separated row.

    Rows are read from ``get_stream()`` through ``csv.reader`` with a tab
    delimiter.  For each row with more than four fields, tag-like markup is
    stripped from field 4, the text is split with ``prepare()``, and the
    non-empty parts are counted.  The counts and the row's first field are
    then passed to one of the ``print_*`` helpers.

    Args:
        what: Output selector.  ``"number"`` calls ``print_no``,
            ``"source"`` calls ``print_source``, and any other value calls
            ``print_all``.
    """
    stream = get_stream()
    reader = csv.reader(stream, delimiter='\t')
    for line in reader:
        if len(line) <= 4:
            continue
        try:
            # remove html markup
            body = re.sub('<[^<]+?>', '', line[4])
            index = {}
            for part in prepare(body):
                if len(part) > 0:
                    index[part] = index.get(part, 0) + 1
            if what == "number":
                print_no(index, line[0])
            elif what == "source":
                print_source(index, line[0])
            else:
                print_all(index, line[0])
        except Exception:
            # Best effort: a row that fails is dropped.  Catching Exception
            # (not a bare except) lets KeyboardInterrupt/SystemExit through.
            continue
    close_stream(stream)
def reducer():
    """Report the largest value seen for each key.

    Reads ``key<TAB>value`` lines from ``get_stream()``; lines that do not
    split into exactly two fields are ignored.  A key change is detected
    only between consecutive lines, so equal keys must be adjacent.  For
    each run of equal keys, ``print_result(max_value, key)`` is called once.

    The maximum starts from the first value of the run rather than from 0,
    so a key whose values are all negative reports its real maximum.
    Nothing is emitted when no valid line was read.

    Raises:
        ValueError: If a value field cannot be converted with ``float``.
    """
    max_value = None
    old_key = None
    stream = get_stream()
    for line in stream:
        data_mapped = line.strip().split("\t")
        if len(data_mapped) != 2:
            # Skip this line.
            continue
        this_key, this_sale = data_mapped

        if old_key is not None and old_key != this_key:
            # Key changed: flush the finished run and start a new one.
            print_result(max_value, old_key)
            max_value = None
        old_key = this_key

        value = float(this_sale)
        if max_value is None or max_value < value:
            max_value = value

    # Flush the last run, if there was any input at all.
    if old_key is not None:
        print_result(max_value, old_key)
    close_stream(stream)
def reducer():
    """Build a 24-slot histogram of hour values for each key.

    Reads ``key<TAB>hour`` lines from ``get_stream()``; lines that do not
    split into exactly two fields are ignored.  A key change is detected
    only between consecutive lines, so equal keys must be adjacent.  Hours
    outside ``0..23`` are not counted.  For each run of equal keys,
    ``print_result(hours, key)`` is called once with the list of counts.

    Nothing is emitted when no valid line was read.

    Raises:
        ValueError: If an hour field cannot be converted with ``int``.
    """
    stream = get_stream()
    old_key = None
    hours = [0] * 24
    for line in stream:
        data_mapped = line.strip().split("\t")
        if len(data_mapped) != 2:
            # Skip this line.
            continue
        this_key, this_hour = data_mapped

        if old_key is not None and old_key != this_key:
            # Key changed: flush the finished run and start a fresh histogram.
            print_result(hours, old_key)
            hours = [0] * 24
        old_key = this_key

        hour = int(this_hour)
        if 0 <= hour < len(hours):
            hours[hour] += 1

    # Flush the last run, if there was any input at all.
    if old_key is not None:
        print_result(hours, old_key)
    close_stream(stream)
def mapper():
    """Tag rows of two different shapes with a type marker and re-emit them.

    Rows are read from ``get_stream()`` through ``csv.reader`` with a tab
    delimiter and handled according to their field count:

    * exactly 5 fields: emitted as
      ``user_ptr_id, <user marker>, reputation, gold, silver, bronze``;
    * more than 9 fields: the first nine are unpacked and emitted as
      ``author_id, <node marker>, post_id, title, tagnames, node_type,
      parent_id, abs_parent_id, added_at`` (``body`` is not emitted).

    A row whose id field is not an integer is skipped.  The "Skipping line"
    notice is written to stderr so that it does not appear among the
    tab-separated records on stdout.
    """
    import sys  # local import: only needed for the diagnostic channel

    type_forum_user = "******"
    type_forum_node = "B"
    stream = get_stream()
    reader = csv.reader(stream, delimiter='\t')
    for line in reader:
        if len(line) == 5:
            user_ptr_id, reputation, gold, silver, bronze = line
            try:
                int(user_ptr_id)  # validation only; the text form is emitted
            except ValueError:
                sys.stderr.write("Skipping line\n")
                continue
            print("{0}\t{1}\t{2}\t{3}\t{4}\t{5}".format(
                user_ptr_id, type_forum_user, reputation, gold, silver,
                bronze))
        elif len(line) > 9:
            # Only the first nine fields are used.
            (post_id, title, tagnames, author_id, body, node_type,
             parent_id, abs_parent_id, added_at) = line[:9]
            try:
                int(post_id)  # validation only; the text form is emitted
            except ValueError:
                sys.stderr.write("Skipping line\n")
                continue
            print("{0}\t{1}\t{2}\t{3}\t{4}\t{5}\t{6}\t{7}\t{8}".format(
                author_id, type_forum_node, post_id, title, tagnames,
                node_type, parent_id, abs_parent_id, added_at))
    close_stream(stream)
def reducer():
    """Group tagged records by id and hand each group to ``print_result``.

    Reads tab-separated lines from ``get_stream()``; only lines with
    exactly 5 or exactly 9 fields are used.  Field 0 is the id and field 1
    the record type.  Records of type ``"A"`` are collected in a list; a
    record of any other type replaces the single "user" record.  A change
    of id is detected only between consecutive lines, so equal ids must be
    adjacent.  For each run, ``print_result(forum_data, user_data)`` is
    called once.

    Nothing is emitted when no valid line was read.
    """
    user_data = []
    forum_data = []
    old_id = None
    stream = get_stream()
    for line in stream:
        data_mapped = line.strip().split("\t")
        if len(data_mapped) not in (5, 9):
            # Skip this line.
            continue
        this_id = data_mapped[0]
        this_type = data_mapped[1]

        if old_id is not None and old_id != this_id:
            # Id changed: flush the finished group and reset both buffers.
            print_result(forum_data, user_data)
            user_data = []
            forum_data = []
        old_id = this_id

        if this_type == "A":
            forum_data.append(data_mapped)
        else:
            user_data = data_mapped

    # Flush the last group, if there was any input at all.
    if old_id is not None:
        print_result(forum_data, user_data)
    close_stream(stream)
def reducer():
    """Count the number of lines seen for each key.

    Reads two-field tab-separated lines from ``get_stream()``; lines that
    do not split into exactly two fields are ignored.  Only the first field
    (the key) is used.  A key change is detected only between consecutive
    lines, so equal keys must be adjacent.  For each run of equal keys,
    ``print_result(key, count)`` is called once.

    Nothing is emitted when no valid line was read.
    """
    stream = get_stream()
    total_hit = 0
    old_key = None
    for line in stream:
        data_mapped = line.strip().split("\t")
        if len(data_mapped) != 2:
            # Skip this line.
            continue
        this_key = data_mapped[0]

        if old_key is not None and old_key != this_key:
            # Key changed: flush the finished run and restart the counter.
            print_result(old_key, total_hit)
            total_hit = 0
        old_key = this_key

        total_hit += 1

    # Flush the last run, if there was any input at all.
    if old_key is not None:
        print_result(old_key, total_hit)
    close_stream(stream)
def reducer():
    """Sum the values seen for each key.

    Reads ``key<TAB>value`` lines from ``get_stream()``; lines that do not
    split into exactly two fields are ignored.  A key change is detected
    only between consecutive lines, so equal keys must be adjacent.  For
    each run of equal keys, ``print_result(key, total)`` is called once.

    Nothing is emitted when no valid line was read.

    Raises:
        ValueError: If a value field cannot be converted with ``float``.
    """
    value_total = 0
    old_key = None
    stream = get_stream()
    for line in stream:
        data_mapped = line.strip().split("\t")
        if len(data_mapped) != 2:
            # Skip this line.
            continue
        this_key, this_value = data_mapped

        if old_key is not None and old_key != this_key:
            # Key changed: flush the finished run and restart the sum.
            print_result(old_key, value_total)
            value_total = 0
        old_key = this_key

        value_total += float(this_value)

    # Flush the last run, if there was any input at all.
    if old_key is not None:
        print_result(old_key, value_total)
    close_stream(stream)
def mapper():
    """Emit ``store<TAB>cost`` for every six-field input line.

    Lines from ``get_stream()`` are stripped and split on tabs.  Only lines
    with exactly six fields are used; the third and fifth fields are
    printed, separated by a tab.
    """
    inf = get_stream()
    for record in inf:
        fields = record.strip().split("\t")
        if len(fields) != 6:
            continue
        # Field order: date, time, store, item, cost, payment.
        store = fields[2]
        cost = fields[4]
        print("{0}\t{1}".format(store, cost))
    close_stream(inf)
def mapper():
    """Emit ``part<TAB>1`` for every non-empty part of every input line.

    Each line from ``get_stream()`` is split with ``prepare()``.  A line
    that fails during processing is dropped and the next one is handled.
    """
    stream = get_stream()
    for line in stream:
        try:
            for part in prepare(line):
                if len(part) > 0:
                    print("{0}\t{1}".format(part, 1))
        except Exception:
            # Best effort: skip the offending line.  Catching Exception
            # (not a bare except) lets KeyboardInterrupt/SystemExit through.
            continue
    close_stream(stream)
def mapper():
    """Copy to stdout the rows whose fifth field passes the body check.

    Rows are parsed from ``get_stream()`` as tab-separated CSV.  A row with
    more than four fields is written back, fully quoted and tab-separated,
    when ``match_none_or_single_in_last_position`` returns a true value for
    its field 4.
    """
    # debug by passing test_text as argument to get_stream()
    stream = get_stream()
    rows = csv.reader(stream, delimiter='\t')
    out = csv.writer(sys.stdout, delimiter='\t', quotechar='"',
                     quoting=csv.QUOTE_ALL)
    for row in rows:
        if len(row) <= 4:
            continue
        if match_none_or_single_in_last_position(row[4]):
            out.writerow(row)
    close_stream(stream)
def mapper():
    """Emit the first two captured groups of each matching log line.

    Every line from ``get_stream()`` is matched against the pattern
    returned by ``get_common_log_format()``.  Lines that do not match are
    skipped.  When the match has exactly seven groups, the first two
    (address and identity) are printed separated by a tab.
    """
    pattern = get_common_log_format()
    inf = get_stream()
    for line in inf:
        match = pattern.match(line)
        if match is None:
            # Not a parsable log line; checked explicitly rather than
            # relying on a swallowed AttributeError.
            continue
        data = match.groups()
        if len(data) == 7:
            # Group order: address, identity, username, timestamp,
            # request, statuscode, size.
            address, identity = data[0], data[1]
            print("{0}\t{1}".format(address, identity))
    close_stream(inf)
def mapper():
    """Emit ``author_id<TAB>hour`` for each row with more than nine fields.

    Rows are parsed from ``get_stream()`` as tab-separated CSV.  The fourth
    field is the author id and the ninth the timestamp, which is converted
    with ``get_date_time()``; the ``hour`` attribute of the result is
    emitted.

    A row for which a ``ValueError`` is raised is skipped.  The "Skipping
    line" notice is written to stderr so that it does not appear among the
    tab-separated records on stdout.
    """
    import sys  # local import: only needed for the diagnostic channel

    stream = get_stream()
    reader = csv.reader(stream, delimiter='\t')
    for line in reader:
        if len(line) <= 9:
            continue
        # Only two of the first nine fields are used.
        author_id = line[3]
        added_at = line[8]
        try:
            date_time = get_date_time(added_at)
            print("{0}\t{1}".format(author_id, date_time.hour))
        except ValueError:
            sys.stderr.write("Skipping line\n")
    close_stream(stream)
def mapper():
    """Emit ``weekday-name<TAB>cost`` for every six-field input line.

    Lines from ``get_stream()`` are stripped and split on tabs.  For lines
    with exactly six fields, the first field is parsed as a ``%Y-%m-%d``
    date and the fifth as a float.  A line whose date or cost cannot be
    parsed is skipped on its own; the remaining input is still processed.
    """
    stream = get_stream()
    # Indexed with Sunday == 0.
    days = [
        "Sunday",
        "Monday",
        "Tuesday",
        "Wednesday",
        "Thursday",
        "Friday",
        "Saturday",
    ]
    for line in stream:
        data = line.strip().split("\t")
        if len(data) != 6:
            continue
        # Field order: date, time, store, item, cost, payment.
        date, cost = data[0], data[4]
        try:
            parsed = datetime.strptime(date, "%Y-%m-%d")
            amount = float(cost)
        except ValueError:
            # Malformed date or cost: drop only this line.
            continue
        # weekday() counts from Monday == 0, which would shift every name
        # in the Sunday-first table by one day.  isoweekday() is
        # Monday == 1 .. Sunday == 7, so "% 7" gives Sunday == 0.
        weekday = days[parsed.isoweekday() % 7]
        print("{0}\t{1}".format(weekday, amount))
    close_stream(stream)
def mapper():
    """Emit ``id<TAB>node_type<TAB>body-length`` for questions and answers.

    Rows are parsed from ``get_stream()`` as tab-separated CSV; only rows
    with more than nine fields are considered.  A ``"question"`` row is
    keyed by its own post id (field 0) and an ``"answer"`` row by its
    parent id (field 6).  Rows of any other node type produce no output.
    """
    stream = get_stream()
    for row in csv.reader(stream, delimiter='\t'):
        if len(row) <= 9:
            continue
        # just pick needed elements
        post_id, body, node_type, parent_id = row[0], row[4], row[5], row[6]
        if node_type == "question":
            key = post_id
        elif node_type == "answer":
            key = parent_id
        else:
            continue
        try:
            print("{0}\t{1}\t{2}".format(key, node_type, len(body)))
        except ValueError:
            print("Skipping line")
    close_stream(stream)
def reducer():
    """Print the number of records and the sum of their values.

    Reads two-field tab-separated lines from ``get_stream()``; lines that
    do not split into exactly two fields are ignored.  The second field of
    each remaining line is converted with ``float`` and added to a running
    total.  A single line with the count and the total is printed after
    all input has been consumed.
    """
    stream = get_stream()
    count = 0
    total = 0
    for raw in stream:
        fields = raw.strip().split("\t")
        if len(fields) != 2:
            # Skip this line.
            continue
        total += float(fields[1])
        count += 1
    # Same bytes as the Python 2 statement `print count, "\t", total`:
    # a space precedes the tab, none follows it.
    print(str(count) + " \t" + str(total))
    close_stream(stream)
def mapper():
    """Write the ten rows with the longest fifth field, shortest first.

    Rows are parsed from ``get_stream()`` as tab-separated CSV.  Rows with
    fewer than five fields have no fifth field to measure and are ignored
    instead of raising ``IndexError``.  The selected rows are written to
    stdout, fully quoted and tab-separated, in ascending order of field-4
    length.
    """
    import heapq  # local import: selection helper used only here

    # debug by passing test_text as argument to get_stream()
    stream = get_stream()
    reader = csv.reader(stream, delimiter='\t')
    writer = csv.writer(sys.stdout, delimiter='\t', quotechar='"',
                        quoting=csv.QUOTE_ALL)

    candidates = (row for row in reader if len(row) > 4)
    # nlargest is documented as equivalent to
    # sorted(iterable, key=key, reverse=True)[:n], which is what the
    # previous sort-and-truncate on every row produced.
    longest = heapq.nlargest(10, candidates, key=lambda entry: len(entry[4]))

    for row in reversed(longest):
        writer.writerow(row)
    close_stream(stream)
def reducer():
    """Collect all second-field values per key and print them as a list.

    Reads two-field tab-separated lines from ``get_stream()``; lines that
    do not split into exactly two fields are ignored.  All values are
    gathered in a dictionary keyed by the first field.  After the input is
    exhausted, one ``key<TAB>list`` line is printed per key, in the
    dictionary's iteration order.
    """
    stream = get_stream()
    index = {}
    for raw in stream:
        fields = raw.strip().split("\t")
        if len(fields) != 2:
            # Skip this line.
            continue
        key, source = fields
        index.setdefault(key, []).append(source)
    for key in index:
        print("{0}\t{1}".format(key, index[key]))
    close_stream(stream)
def reducer():
    """Aggregate question and answer lengths for each id.

    Reads ``id<TAB>type<TAB>length`` lines from ``get_stream()``; lines
    that do not split into exactly three fields are ignored.  For rows of
    type ``"answer"`` the length is added to a running total and the answer
    count is incremented; for any other type the length is stored as the
    question length.  A change of id is detected only between consecutive
    lines, so equal ids must be adjacent.  For each run,
    ``print_result(id, question_length, total_answers_length,
    no_of_answers)`` is called once.

    Nothing is emitted when no valid line was read.

    Raises:
        ValueError: If a length field cannot be converted with ``float``.
    """
    question_length = 0
    total_answers_length = 0
    no_of_answers = 0
    old_id = None
    stream = get_stream()
    for line in stream:
        data_mapped = line.strip().split("\t")
        if len(data_mapped) != 3:
            # Skip this line.
            continue
        this_id, this_type, this_length = data_mapped

        if old_id is not None and old_id != this_id:
            # Id changed: flush the finished group and reset accumulators.
            print_result(old_id, question_length, total_answers_length,
                         no_of_answers)
            question_length = 0
            total_answers_length = 0
            no_of_answers = 0
        old_id = this_id

        if this_type == "answer":
            total_answers_length += float(this_length)
            no_of_answers += 1
        else:
            question_length = float(this_length)

    # Flush the last group, if there was any input at all.
    if old_id is not None:
        print_result(old_id, question_length, total_answers_length,
                     no_of_answers)
    close_stream(stream)