Esempio n. 1
0
def emit_word_count(key, collected):
    """Emit a (token, count) pair for one key.

    key -- the token the values were collected for.
    collected -- the values gathered for that token; only its length is used.

    If emitting raises ValueError, the error and the collected data are
    written to stderr and the exception is re-raised.
    """
    try:
        mr.emit((key, len(collected)))

    # "except ValueError, err" is Python 2-only syntax; the "as" form is
    # accepted by Python 2.6+ and Python 3 alike.
    except ValueError as err:
        sys.stderr.write("emit_word_count ValueError: %(err)s\n%(data)s\n" % {"err": str(err), "data": str(collected)})
        raise
Esempio n. 2
0
def run(k, normalize=0):
    """Emit the top K values for each key.  If NORMALIZE > 0, divide counts
    by the total for each key and round to NORMALIZE digits.

    k -- number of most common values to keep per key (converted with int()).
    normalize -- when truthy, converted with int() and used as the number of
        digits passed to round().

    Each key is emitted with a tuple of (value, count) pairs, or
    (value, rounded fraction) pairs when normalizing.
    """
    k = int(k)
    for key, value_iterator in values_by_key(sys.stdin):
        counts = Counter(value_iterator)
        top = counts.most_common(k)
        if normalize:
            digits = int(normalize)
            # float() keeps the division fractional on Python 2 as well,
            # where int / int would truncate every fraction to 0.
            total = float(sum(counts.values()))
            top = [(value, round(count / total, digits))
                   for value, count in top]
        emit(key, tuple(top))
Esempio n. 3
0
#!/usr/bin/env python

### Marco Vivero, MISK

import sys
import os.path

### Load mr library.

sys.path.append(os.path.dirname(__file__))
from mr import emit

### Input at this stage is of the form:
### Token1|Token2 \t AdID,Token-Set1,Token-Set2

for record in sys.stdin:
    ### Drop single quotes and the newline, then split key from payload.
    columns = record.replace("'", '').replace('\n', '').split('\t')
    ### Match key (Token1|Token2) and the comma-separated payload.
    match_key = columns[0]
    payload = columns[1].split(',')
    ad_id = payload[0]
    ### Both token sets are pipe-separated strings.
    first_tokens = set(payload[1].split('|'))
    second_tokens = set(payload[2].split('|'))
    ### Local similarity: size of the intersection over size of the union.
    shared = first_tokens & second_tokens
    combined = first_tokens | second_tokens
    similarity = float(len(shared)) / float(len(combined))
    ### Output (Python readable): Token1|Token2,AdID \t Similarity.
    emit(match_key + ',' + ad_id, similarity)

Esempio n. 4
0
import os.path

### Load mr library.

sys.path.append(os.path.dirname(__file__))
from mr import emit

### Create dictionary of token classes and corresponding file indices.
### Each value is the position of that class's ID column in a tab-split input line.

token_dict = {'Description' : 12, 'Keyword' : 10, 'Query' : 9, 'Title' : 11}
### Position of the AdID column in a tab-split input line.
adID = 5

### Creates generator object of all possible combinations of two items in a given list.

def combinations(lst):
    """Yield every unordered pair of items from lst, in input order.

    lst -- any iterable (a list, a dict key view, ...).  It is copied first,
        so the caller's object is never modified and objects without a
        pop() method, such as Python 3 dict views, are accepted.

    Yields (earlier_item, later_item) tuples.
    """
    items = list(lst)
    for position, key in enumerate(items):
        for elem in items[position + 1:]:
            yield key, elem


for record in sys.stdin:
    fields = record.split('\t')
    ### Output for each line should be
    ### Token1|ID1, Token2|ID2, Token1|Token2, AdID, respectively.
    for first, second in combinations(token_dict.keys()):
        first_id = fields[token_dict[first]]
        second_id = fields[token_dict[second]]
        ### The key is the first class with its ID; the rest goes in the value.
        pair_key = '{0}|{1}'.format(first, first_id)
        pair_value = ','.join([
            '{0}|{1}'.format(second, second_id),
            '{0}|{1}'.format(first, second),
            '{0}'.format(fields[adID]),
        ])
        emit(pair_key, pair_value)
Esempio n. 5
0
    for token in text.split(" "):
        token = token.strip().strip(".").strip("-").strip(":").lower()

        if (len(token) > 1) and (token not in stop_words) and (not token.isdigit()):
            yield [token, 1]


def emit_word_count(key, collected):
    """Emit a (token, count) pair for one key.

    key -- the token the values were collected for.
    collected -- the values gathered for that token; only its length is used.

    If emitting raises ValueError, the error and the collected data are
    written to stderr and the exception is re-raised.
    """
    try:
        mr.emit((key, len(collected)))

    # "except ValueError, err" is Python 2-only syntax; the "as" form is
    # accepted by Python 2.6+ and Python 3 alike.
    except ValueError as err:
        sys.stderr.write("emit_word_count ValueError: %(err)s\n%(data)s\n" % {"err": str(err), "data": str(collected)})
        raise


if __name__ == "__main__":
    # The first command-line argument selects which phase to run.
    mode = sys.argv[1]

    if mode == "red1":
        # Reduce phase: the literal ("token", "count") pair is emitted first
        # (presumably a header row), then the reducer runs.
        mr.emit(("token", "count"))
        mr.reducer(emit_word_count)

    elif mode == "map1":
        # Map phase: the second argument is passed to read_stop_words before
        # the ("token", "count") pair is emitted and the mapper runs.
        read_stop_words(sys.argv[2])
        mr.emit(("token", "count"))
        mr.mapper(load_text)
def count_vowels(line):
    """Map function: emit (vowel, occurrences) for each of 'aeiou' in line.

    Vowels that do not occur in the line are not emitted.
    """
    for letter in 'aeiou':
        occurrences = line.count(letter)
        if occurrences <= 0:
            continue
        emit(letter, occurrences)
#!/usr/bin/env python3
"""Sum values for each key."""

import sys
from mr import values_by_key, emit

# One output pair per key: the key together with the total of its values.
for group_key, group_values in values_by_key(sys.stdin):
    total = sum(group_values)
    emit(group_key, total)
Esempio n. 8
0
#!/usr/bin/env python

### Marco Vivero, MISK

import sys
import os.path

### Import mr library.

sys.path.append(os.path.dirname(__file__))
from mr import emit

### Put UserID as key for each training instance.

for record in sys.stdin:
    try:
        fields = record.replace('\n', '').split('\t')
        ### Field 13 becomes the key; a line without it raises IndexError.
        user_key = fields[13]
        ### Fields 2..12 are re-joined with tabs by editing the list's string
        ### form: quotes removed, ', ' turned into tabs, brackets sliced off.
        ### NOTE(review): fields containing quotes or ', ' are altered by
        ### this round trip -- confirm the input never contains them.
        flattened = str(fields[2 : 13]).replace("'", '').replace(', ', '\t')
        emit(user_key, flattened[1 : -1])
    except IndexError:
        ### Lines that are too short are skipped silently.
        pass

Esempio n. 9
0
def run():
    """Emit, for every key read from stdin, the set of its grouped values."""
    for group_key, group_values in values_by_key(sys.stdin):
        distinct = set(group_values)
        emit(group_key, distinct)
Esempio n. 10
0
def run():
    """Reduce step: pair each key with the set built from its values."""
    for current_key, values in values_by_key(sys.stdin):
        unique_values = set(values)
        emit(current_key, unique_values)
Esempio n. 11
0
#!/usr/bin/env python

### Marco Vivero, MISK

import sys
import os.path

### Load mr library.

sys.path.append(os.path.dirname(__file__))
from mr import emit, values_by_key

### Define a helper function to define a binary operation on a list of sets
### (component-wise union).

def cross_union(x, y):
    """Return [x[0] united with y[0], x[1] united with y[1]] as a new list.

    x -- sequence whose first two items are sets.
    y -- sequence whose first two items are iterables to merge into them.
    """
    first = x[0].union(y[0])
    second = x[1].union(y[1])
    return [first, second]

### reduce() is a builtin only on Python 2; functools.reduce is available on
### Python 2.6+ and Python 3, so importing it keeps the script portable.
from functools import reduce

for key, value_iterator in values_by_key(sys.stdin):
    ### Take component-wise unions over all values grouped under this key.
    merged = reduce(cross_union, value_iterator)
    ### Calculate Jaccard Similarity: |intersection| / |union|.
    value = float(len(merged[0].intersection(merged[1])))
    value /= float(len(merged[0].union(merged[1])))
    ### The key has the form Token1|Token2,AdID.
    key = key.split(',')
    ### Output AdID \t Token1|Token2,Ad-Similarity
    emit(key[1], key[0] + ',' + str(value))
Esempio n. 12
0
def run():
    """Emit (lowercased word, tag) for every word/tag pair of every tree.

    Trees come from read_trees(sys.stdin); words(tree) supplies the pairs.
    """
    for parsed_tree in read_trees(sys.stdin):
        for token, label in words(parsed_tree):
            lowered = token.lower()
            mr.emit(lowered, label)
Esempio n. 13
0
def count_vowels(line):
    """A map function that counts the vowels in a line.

    For each of the five vowels in 'aeiou' that occurs at least once, a
    (vowel, count) pair is emitted.
    """
    for letter in 'aeiou':
        # How many times this vowel occurs in the line.
        tally = line.count(letter)
        if not tally > 0:
            # Vowels that are absent produce no output.
            continue
        emit(letter, tally)
Esempio n. 14
0
#!/usr/bin/env python

import sys
import os.path

sys.path.append(os.path.dirname(__file__))
from mr import emit

for record in sys.stdin:
    fields = [int(cell) for cell in record.replace('\n', '').split('\t')]
    # 1. Click-Binom, 2. Total, 3. Clicks, 4. Impressions
    clicked = 1 if fields[0] > 0 else 0
    features = [clicked, 1]
    features.extend(fields[ : 2])
    # 5. AdvertiserID
    features.append(fields[4])
    # 6-8. Depth, 9-11. Position: a value v sets slot v - 1 to 1.
    depth = [0, 0, 0]
    position = [0, 0, 0]
    depth[fields[5] - 1] = 1
    position[fields[6] - 1] = 1
    features.extend(depth)
    features.extend(position)
    # 12. Relative Depth
    features.append(float(fields[5] - fields[6]) / float(fields[6]))
    # 13. Distinct Users
    # NOTE(review): field 11 is stored here as a string and is also used
    # below as the gender index -- confirm which column each should read.
    features.append(str(fields[11]))
    # 14-16. Gender, 17-23. Age: the field value is used directly as the slot.
    gender = [0] * 3
    age = [0] * 7
    gender[fields[11]] = 1
    age[fields[12]] = 1
    features.extend(gender)
    features.extend(age)
    # Field 3 is the output key.
    emit(fields[3], features)
    

Esempio n. 15
0
#!/usr/bin/env python

### Marco Vivero, MISK

import sys
import os.path

### Import mr library.

sys.path.append(os.path.dirname(__file__))
from mr import emit

### Read command line argument (token class, i.e. Description, Keyword, etc.)

name = sys.argv[1]

### For each line in the token file, emit "<name>|<first column>" as the key
### and the second column prefixed with &&&, which flags token sets for
### identification in Process 2.

for record in sys.stdin:
    columns = record.replace('\n', '').split('\t')
    tagged_key = '{0}|{1}'.format(name, columns[0])
    flagged_tokens = '&&&' + columns[1]
    emit(tagged_key, flagged_tokens)
Esempio n. 16
0
def count_pairs(line):
    """A map function that counts all pairs of adjacent letters.

    line -- a string; it is lowercased and split on whitespace into words.

    Emits (pair, 1) for every two-character slice of every word.
    """
    for word in line.lower().split():
        # A word of n letters has n - 1 adjacent pairs, starting at indices
        # 0 .. n - 2; range(len(word) - 2) would skip the final pair.
        for i in range(len(word) - 1):
            emit(word[i:i + 2], 1)
Esempio n. 17
0
def run():
    """Emit every (tag, children) pair that rules() yields for each tree.

    Trees are obtained from read_trees(sys.stdin).
    """
    for parsed_tree in read_trees(sys.stdin):
        for parent, kids in rules(parsed_tree):
            mr.emit(parent, kids)
Esempio n. 18
0
#!/usr/bin/env python

import sys
from mr import values_by_key, emit  # MapReduce module.

# values_by_key groups the input into (key, iterator of values) pairs; each
# group is collapsed into a single (key, sum of values) output pair.
for group_key, group_values in values_by_key(sys.stdin):
    group_total = sum(group_values)
    emit(group_key, group_total)
Esempio n. 19
0
def run():
    """Emit (word, 1) for each tracked word and ('other words', 1) otherwise.

    Input lines are read from stdin and split on whitespace; matching is
    exact and case-sensitive.
    """
    tracked = ("the", "he", "she", "it", "thee")
    for line in sys.stdin:
        for word in line.split():
            label = word if word in tracked else 'other words'
            emit(label, 1)
Esempio n. 20
0
def run():
    """Parse each stdin line as a key/value pair and emit its binarized rules.

    Every item produced by binarize(tag, children) is unpacked into the
    arguments of mr.emit.
    """
    for raw_pair in sys.stdin:
        parent, kids = mr.parse_key_value_pair(raw_pair)
        for binary_rule in binarize(parent, kids):
            mr.emit(*binary_rule)
Esempio n. 21
0
def count_vowels(line):
    """A map function that counts the vowels in a line.

    Emits a (vowel, count) pair for each vowel in 'aeiou' that occurs at
    least once in line.
    """
    for vowel in 'aeiou':
        # str has no "cout" method; the intended call is count().
        count = line.count(vowel)
        if count > 0:
            emit(vowel, count)
Esempio n. 22
0
def count_pairs(line):
    """A map function that counts all pairs of adjacent letters.

    line -- a string; it is lowercased and split on whitespace into words.

    Emits (pair, 1) for every two-character slice of every word.
    """
    for word in line.lower().split():
        # A word of n letters has n - 1 adjacent pairs, starting at indices
        # 0 .. n - 2; range(len(word) - 2) would skip the final pair.
        for i in range(len(word) - 1):
            emit(word[i:i + 2], 1)
Esempio n. 23
0
#!/usr/bin/env python

### Marco Vivero, MISK

import sys
import os.path

### Import mr library.

sys.path.append(os.path.dirname(__file__))
from mr import emit, values_by_key

for group_key, group_values in values_by_key(sys.stdin):
    ### Materialize the values so they can be both summed and counted.
    similarities = list(group_values)
    key_parts = group_key.split(',')
    ad_id = key_parts[1]
    token_pair = key_parts[0]
    mean_similarity = sum(similarities) / len(similarities)
    ### Emit AdID \t Token1|Token2,Ad-Similarity
    emit(ad_id, token_pair + ',' + str(mean_similarity))
Esempio n. 24
0
def count_vowels(line):
    """Map function: report how often each vowel of 'aeiou' occurs in line.

    One (vowel, count) pair is emitted per vowel with a positive count.
    """
    for candidate in 'aeiou':
        hits = line.count(candidate)
        if hits <= 0:
            continue
        emit(candidate, hits)
Esempio n. 25
0
def run():
    """Emit one ('line', 1) pair for every line read from standard input."""
    for _ in sys.stdin:
        emit('line', 1)