def mapper(what):
    """Build a per-record term-frequency index and hand it to a printer.

    Reads tab-delimited records from ``get_stream()``. For every record with
    more than four fields, the fifth field (``line[4]``) has HTML-like tags
    removed, is split into parts by ``prepare()``, and the non-empty parts are
    counted into a ``{part: count}`` dict. That dict and the record's first
    field are passed to one of the printer helpers.

    Args:
        what: Output selector. ``"number"`` dispatches to ``print_no``,
            ``"source"`` to ``print_source``; any other value falls back to
            ``print_all``.
    """
    stream = get_stream()
    try:
        reader = csv.reader(stream, delimiter='\t')

        for line in reader:
            if len(line) <= 4:
                continue

            try:
                # remove html markup
                body = re.sub(r'<[^<]+?>', '', line[4])

                index = {}
                for part in prepare(body):
                    if part:
                        # dict.get works on Python 2 and 3 (has_key does not).
                        index[part] = index.get(part, 0) + 1

                if what == "number":
                    print_no(index, line[0])
                elif what == "source":
                    print_source(index, line[0])
                else:
                    print_all(index, line[0])
            except Exception:
                # Best effort: a record that cannot be processed is dropped.
                # Unlike a bare ``except``, this lets KeyboardInterrupt and
                # SystemExit propagate.
                continue
    finally:
        # Release the stream even if reading the input fails.
        close_stream(stream)
def reducer():
    """Emit the largest value seen for each key.

    Reads ``key<TAB>value`` lines from ``get_stream()``; lines that do not
    split into exactly two tab-separated fields are skipped. Consecutive
    lines with the same key form one group, so the input is expected to
    arrive grouped by key (presumably sorted by the streaming framework --
    confirm). ``print_result(max_value, key)`` is called once per group.

    Nothing is emitted when the input contains no usable line.

    Raises:
        ValueError: If a value field cannot be parsed as a float.
    """
    # None (not 0) marks "no value yet", so groups whose values are all
    # negative still report their true maximum.
    max_value = None
    old_key = None
    stream = get_stream()

    try:
        for line in stream:
            data_mapped = line.strip().split("\t")
            if len(data_mapped) != 2:
                # Skip this line.
                continue

            this_key, this_sale = data_mapped

            # Key changed: flush the finished group and start a new one.
            if old_key is not None and old_key != this_key:
                print_result(max_value, old_key)
                max_value = None
            old_key = this_key

            value = float(this_sale)
            if max_value is None or max_value < value:
                max_value = value

        # special case - last key (only if at least one line was consumed)
        if old_key is not None:
            print_result(max_value, old_key)
    finally:
        close_stream(stream)
# Example no. 3
0
def reducer():
    """Build a 24-slot hour histogram for each key.

    Reads ``key<TAB>hour`` lines from ``get_stream()``; lines that do not
    split into exactly two tab-separated fields are skipped. Hours outside
    ``0..23`` are ignored. Consecutive lines with the same key form one
    group (input presumably arrives sorted by key -- confirm), and
    ``print_result(hours, key)`` is called once per group with a list of 24
    counters.

    Nothing is emitted when the input contains no usable line.

    Raises:
        ValueError: If an hour field cannot be parsed as an integer.
    """
    stream = get_stream()
    old_key = None
    hours = [0] * 24

    try:
        for line in stream:
            data_mapped = line.strip().split("\t")
            if len(data_mapped) != 2:
                # Skip this line.
                continue

            this_key, this_hour = data_mapped

            # Key changed: flush the finished histogram, start a fresh one.
            if old_key is not None and old_key != this_key:
                print_result(hours, old_key)
                hours = [0] * 24
            old_key = this_key

            hour = int(this_hour)
            if 0 <= hour < len(hours):
                hours[hour] += 1

        # Flush the last group, but only if any line was consumed.
        if old_key is not None:
            print_result(hours, old_key)
    finally:
        close_stream(stream)
def mapper():
    """Tag user rows and forum-node rows so they can be joined on user id.

    Reads tab-delimited rows from ``get_stream()``:

    * rows with exactly five fields are treated as user rows and re-emitted
      as ``user_id, user tag, reputation, gold, silver, bronze``;
    * rows with more than nine fields are treated as forum-node rows and
      re-emitted keyed by author id as ``author_id, node tag, post_id,
      title, tagnames, node_type, parent_id, abs_parent_id, added_at``
      (the body field is not emitted).

    Rows whose id field is not an integer print ``Skipping line`` instead.
    """
    # NOTE(review): the user tag looks like a masked placeholder -- confirm
    # the intended value against whatever consumes this output.
    user_tag = "******"
    node_tag = "B"

    user_row = "{0}\t{1}\t{2}\t{3}\t{4}\t{5}"
    node_row = "{0}\t{1}\t{2}\t{3}\t{4}\t{5}\t{6}\t{7}\t{8}"

    stream = get_stream()

    for row in csv.reader(stream, delimiter='\t'):
        width = len(row)
        if width == 5:
            user_ptr_id, reputation, gold, silver, bronze = row
            try:
                # Validation only: non-numeric ids raise ValueError.
                int(user_ptr_id)
                print(user_row.format(user_ptr_id, user_tag, reputation,
                                      gold, silver, bronze))
            except ValueError:
                print("Skipping line")
        elif width > 9:
            # Only the first nine fields are used; anything after is ignored.
            (post_id, title, tagnames, author_id, _body, node_type,
             parent_id, abs_parent_id, added_at) = row[:9]
            try:
                # Validation only: non-numeric ids raise ValueError.
                int(post_id)
                print(node_row.format(author_id, node_tag, post_id, title,
                                      tagnames, node_type, parent_id,
                                      abs_parent_id, added_at))
            except ValueError:
                print("Skipping line")

    close_stream(stream)
def reducer():
    """Group tagged rows by id and hand each group to ``print_result``.

    Reads tab-separated lines from ``get_stream()`` and keeps only those
    with exactly five or nine fields. The first field is the id and the
    second the record type. Rows of type ``"A"`` are collected in a list;
    a row of any other type replaces the single "user" row for the current
    id. Whenever the id changes, and once more after the last line,
    ``print_result(forum_rows, user_row)`` is called.
    """
    stream = get_stream()
    current_id = None
    forum_rows = []
    user_row = []

    for raw in stream:
        fields = raw.strip().split("\t")
        if len(fields) not in (5, 9):
            # Unexpected field count: ignore the line.
            continue

        record_id, record_type = fields[0], fields[1]

        if current_id is None:
            # Very first usable line: just remember its id.
            current_id = record_id
        elif current_id != record_id:
            # Id changed: flush the finished group and start a new one.
            print_result(forum_rows, user_row)
            current_id = record_id
            user_row = []
            forum_rows = []

        if record_type == "A":
            forum_rows.append(fields)
        else:
            user_row = fields

    # Flush whatever was collected for the final id.
    print_result(forum_rows, user_row)

    close_stream(stream)
def reducer():
    """Count how many records were received for each key.

    Reads ``key<TAB>value`` lines from ``get_stream()``; lines that do not
    split into exactly two tab-separated fields are skipped and the value
    field itself is ignored. Consecutive lines with the same key form one
    group (input presumably arrives sorted by key -- confirm), and
    ``print_result(key, count)`` is called once per group.

    Nothing is emitted when the input contains no usable line.
    """
    stream = get_stream()
    total_hit = 0
    old_key = None

    try:
        for line in stream:
            data_mapped = line.strip().split("\t")
            if len(data_mapped) != 2:
                # Skip this line.
                continue

            # Only the key matters for counting.
            this_key = data_mapped[0]

            # Key changed: flush the finished group and restart the count.
            if old_key is not None and old_key != this_key:
                print_result(old_key, total_hit)
                total_hit = 0
            old_key = this_key

            total_hit += 1

        # special case - last key (only if at least one line was consumed)
        if old_key is not None:
            print_result(old_key, total_hit)
    finally:
        close_stream(stream)
def reducer():
    """Sum the values received for each key.

    Reads ``key<TAB>value`` lines from ``get_stream()``; lines that do not
    split into exactly two tab-separated fields are skipped. Consecutive
    lines with the same key form one group (input presumably arrives sorted
    by key -- confirm), and ``print_result(key, total)`` is called once per
    group.

    Nothing is emitted when the input contains no usable line.

    Raises:
        ValueError: If a value field cannot be parsed as a float.
    """
    value_total = 0
    old_key = None

    stream = get_stream()
    try:
        for line in stream:
            data_mapped = line.strip().split("\t")
            if len(data_mapped) != 2:
                # Skip this line.
                continue

            this_key, this_value = data_mapped

            # Key changed: flush the finished group and restart the sum.
            if old_key is not None and old_key != this_key:
                print_result(old_key, value_total)
                value_total = 0
            old_key = this_key

            value_total += float(this_value)

        # special case - last key (only if at least one line was consumed)
        if old_key is not None:
            print_result(old_key, value_total)
    finally:
        close_stream(stream)
def mapper():
    """Emit ``store<TAB>cost`` for every well-formed six-field record.

    Reads tab-separated lines from ``get_stream()``. Lines that do not have
    exactly six fields are ignored; for the rest, the third field (store)
    and fifth field (cost) are printed unchanged.
    """
    inf = get_stream()

    for raw in inf:
        fields = raw.strip().split("\t")
        if len(fields) != 6:
            continue
        # Field layout: date, time, store, item, cost, payment.
        store, cost = fields[2], fields[4]
        print("{0}\t{1}".format(store, cost))

    close_stream(inf)
# Example no. 9
0
def mapper():
    """Emit ``part<TAB>1`` for every non-empty part of every input line.

    Each line read from ``get_stream()`` is split into parts by
    ``prepare()``; every non-empty part is printed with a count of 1.
    A line that cannot be processed is dropped (best effort).
    """
    stream = get_stream()

    try:
        for line in stream:
            try:
                for part in prepare(line):
                    if part:
                        print("{0}\t{1}".format(part, 1))
            except Exception:
                # Best effort: skip the rest of this line. Unlike a bare
                # ``except``, KeyboardInterrupt/SystemExit still propagate.
                continue
    finally:
        # Release the stream even if reading the input fails.
        close_stream(stream)
# Example no. 10
0
def mapper():
    """Copy to stdout the rows whose body passes the punctuation filter.

    Reads tab-delimited rows from ``get_stream()``. A row with more than
    four fields is written back to ``sys.stdout`` (tab-delimited, every
    field quoted) when ``match_none_or_single_in_last_position`` accepts
    its fifth field.
    """
    # debug by passing test_text as argument to get_stream()
    stream = get_stream()
    out = csv.writer(sys.stdout,
                     delimiter='\t',
                     quotechar='"',
                     quoting=csv.QUOTE_ALL)

    for row in csv.reader(stream, delimiter='\t'):
        if len(row) <= 4:
            # No body field to inspect.
            continue
        if match_none_or_single_in_last_position(row[4]):
            out.writerow(row)

    close_stream(stream)
# Example no. 11
0
def mapper():
    """Emit the first two captured groups of every matching log line.

    Each line read from ``get_stream()`` is matched against the pattern
    returned by ``get_common_log_format()``. Lines that do not match, or
    whose match does not yield exactly seven groups, are skipped. For the
    rest, the first two groups (address and identity) are printed,
    tab-separated.
    """
    pattern = get_common_log_format()
    inf = get_stream()

    try:
        for line in inf:
            match = pattern.match(line)
            if match is None:
                # Not a recognised log line; handled explicitly instead of
                # relying on a swallowed AttributeError.
                continue

            data = match.groups()
            if len(data) == 7:
                # Group layout: address, identity, username, timestamp,
                # request, statuscode, size.
                address, identity = data[0], data[1]
                print("{0}\t{1}".format(address, identity))
    finally:
        close_stream(inf)
def mapper():
    """Emit ``author_id<TAB>hour`` for every forum-node row.

    Reads tab-delimited rows from ``get_stream()``. For rows with more than
    nine fields, the ninth field (``added_at``) is parsed by
    ``get_date_time()`` and the hour of the result is printed next to the
    fourth field (``author_id``). If a ``ValueError`` is raised while doing
    so, ``Skipping line`` is printed instead.
    """
    stream = get_stream()

    for row in csv.reader(stream, delimiter='\t'):
        if len(row) <= 9:
            continue

        # Field layout of the first nine columns: post_id, title, tagnames,
        # author_id, body, node_type, parent_id, abs_parent_id, added_at.
        author_id, added_at = row[3], row[8]
        try:
            date_time = get_date_time(added_at)

            print("{0}\t{1}".format(author_id, date_time.hour))
        except ValueError:
            print("Skipping line")

    close_stream(stream)
def mapper():
    """Emit ``weekday<TAB>cost`` for every well-formed six-field record.

    Reads tab-separated lines from ``get_stream()``. For lines with exactly
    six fields, the first field is parsed as a ``YYYY-MM-DD`` date and the
    fifth as a float; the English weekday name and the cost are printed.
    Lines whose date or cost cannot be parsed are skipped individually, so
    one bad line no longer discards the rest of the input.
    """
    stream = get_stream()
    # datetime.weekday() numbers the days Monday=0 ... Sunday=6, so the
    # lookup table must start with Monday.
    days = (
        "Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday",
        "Sunday",
    )

    try:
        for line in stream:
            data = line.strip().split("\t")
            if len(data) != 6:
                continue

            # Field layout: date, time, store, item, cost, payment.
            date, cost = data[0], data[4]
            try:
                weekday = days[datetime.strptime(date, "%Y-%m-%d").weekday()]
                amount = float(cost)
            except ValueError:
                # Malformed date or cost: drop only this line.
                continue

            print("{0}\t{1}".format(weekday, amount))
    finally:
        close_stream(stream)
def mapper():
    """Emit body lengths of questions and answers keyed by question id.

    Reads tab-delimited rows from ``get_stream()``. For rows with more than
    nine fields:

    * a ``question`` row prints ``post_id, node_type, len(body)``;
    * an ``answer`` row prints ``parent_id, node_type, len(body)``;
    * any other node type prints nothing.
    """
    row_format = "{0}\t{1}\t{2}"
    stream = get_stream()

    for row in csv.reader(stream, delimiter='\t'):
        if len(row) <= 9:
            continue

        # just pick needed elements
        post_id, body, node_type, parent_id = row[0], row[4], row[5], row[6]
        try:
            if node_type == "question":
                print(row_format.format(post_id, node_type, len(body)))

            elif node_type == "answer":
                print(row_format.format(parent_id, node_type, len(body)))

        except ValueError:
            print("Skipping line")

    close_stream(stream)
def reducer():
    """Count all input records and total their values.

    Reads lines from ``get_stream()``, keeps those that split into exactly
    two tab-separated fields, and adds the second field (parsed as a float)
    to a running total. The key field is ignored, so one overall count and
    one overall total are printed after the whole stream has been read.
    """
    sales_total = 0
    no_of_sales = 0

    stream = get_stream()
    for line in stream:
        data_mapped = line.strip().split("\t")
        if len(data_mapped) != 2:
            # Skip this line.
            continue

        # this_key is unpacked but never used: every key shares one total.
        this_key, this_sale = data_mapped

        # float() raises ValueError on a non-numeric value; it is not caught.
        sales_total += float(this_sale)
        no_of_sales += 1

    # Python 2 print statement with three items (count, a tab string, total).
    # NOTE(review): the print statement adds its own separator space between
    # items, so the line is not a bare "count<TAB>total" -- confirm that
    # whatever parses this output tolerates the extra whitespace.
    print no_of_sales, "\t", sales_total

    close_stream(stream)
def mapper():
    """Write the ten rows with the longest body, shortest of them first.

    Reads tab-delimited rows from ``get_stream()`` and ranks them by the
    length of their fifth field (``row[4]``). The ten longest rows are
    written to ``sys.stdout`` (tab-delimited, every field quoted) in
    ascending order of body length. Rows with fewer than five fields have
    no body to measure and are ignored.
    """
    # Local import keeps this block self-contained.
    import heapq

    # debug by passing test_text as argument to get_stream()
    stream = get_stream()
    try:
        reader = csv.reader(stream, delimiter='\t')
        writer = csv.writer(sys.stdout,
                            delimiter='\t',
                            quotechar='"',
                            quoting=csv.QUOTE_ALL)

        # Guard against short rows, which would raise IndexError on row[4].
        candidates = (row for row in reader if len(row) > 4)

        # nlargest is equivalent to sorted(..., reverse=True)[:10] but does
        # not re-sort the running list for every input row.
        longest = heapq.nlargest(10, candidates,
                                 key=lambda entry: len(entry[4]))

        for row in reversed(longest):
            writer.writerow(row)
    finally:
        close_stream(stream)
# Example no. 17
0
def reducer():
    """Collect every source seen for each key and print the mapping.

    Reads ``key<TAB>source`` lines from ``get_stream()``; lines that do not
    split into exactly two tab-separated fields are skipped. All sources are
    gathered per key in memory, and after the whole stream has been read one
    ``key<TAB>list-of-sources`` line is printed per key.
    """
    stream = get_stream()
    sources_by_key = {}

    for raw in stream:
        fields = raw.strip().split("\t")
        if len(fields) != 2:
            # Unexpected field count: ignore the line.
            continue

        key, source = fields
        # Create the per-key list on first sight, then append to it.
        sources_by_key.setdefault(key, []).append(source)

    for key, sources in sources_by_key.items():
        print("{0}\t{1}".format(key, sources))

    close_stream(stream)
# Example no. 18
0
def reducer():
    """Aggregate question length and answer lengths per id.

    Reads ``id<TAB>type<TAB>length`` lines from ``get_stream()``; lines that
    do not split into exactly three tab-separated fields are skipped. For
    each id, rows of type ``"answer"`` add to a running answer-length total
    and answer count; a row of any other type sets the question length.
    Consecutive lines with the same id form one group (input presumably
    arrives sorted by id -- confirm), and ``print_result(id,
    question_length, total_answers_length, no_of_answers)`` is called once
    per group.

    Nothing is emitted when the input contains no usable line.

    Raises:
        ValueError: If a length field cannot be parsed as a float.
    """
    question_length = 0
    total_answers_length = 0
    no_of_answers = 0
    old_id = None
    stream = get_stream()

    try:
        for line in stream:
            data_mapped = line.strip().split("\t")
            if len(data_mapped) != 3:
                # Skip this line.
                continue

            this_id, this_type, this_length = data_mapped

            # Id changed: flush the finished group and reset the counters.
            if old_id is not None and old_id != this_id:
                print_result(old_id, question_length, total_answers_length,
                             no_of_answers)
                question_length = 0
                total_answers_length = 0
                no_of_answers = 0
            old_id = this_id

            if this_type == "answer":
                total_answers_length += float(this_length)
                no_of_answers += 1
            else:
                question_length = float(this_length)

        # special case - last key (only if at least one line was consumed)
        if old_id is not None:
            print_result(old_id, question_length, total_answers_length,
                         no_of_answers)
    finally:
        close_stream(stream)