def emit_word_count(key, collected):
    """Emit (token, count), where count is the number of values collected for KEY.

    On a ValueError from the emit layer, log the error and the offending
    data to stderr, then re-raise so the failure is not silently lost.
    """
    token = key
    try:
        mr.emit((token, len(collected)))
    except ValueError as err:  # `except E as err` — works on Py2.6+ AND Py3 (old `except E, err` is Py2-only)
        sys.stderr.write("emit_word_count ValueError: %(err)s\n%(data)s\n"
                         % {"err": str(err), "data": str(collected)})
        raise
def run(k, normalize=0):
    """Emit the top K values for each key.

    If NORMALIZE > 0, divide counts by the total for each key and round
    to NORMALIZE digits.
    """
    k = int(k)
    for key, value_iterator in values_by_key(sys.stdin):
        counts = Counter(value_iterator)
        # Identity by default; renamed from `format` to avoid shadowing the builtin.
        fmt = lambda pair: pair
        if normalize:
            digits = int(normalize)
            # float() guards against Py2 integer division (counts and total are ints,
            # so `pair[1] / total` would truncate to 0 under Python 2).
            total = float(sum(counts.values()))
            def fmt(pair):
                return (pair[0], round(pair[1] / total, digits))
        emit(key, tuple(map(fmt, counts.most_common(k))))
#!/usr/bin/env python
### Marco Vivero, MISK
import sys
import os.path
### Load mr library.
sys.path.append(os.path.dirname(__file__))
from mr import emit

### Input at this stage is of the form:
###   Token1|Token2 \t AdID,Token-Set1,Token-Set2
for raw in sys.stdin:
    fields = raw.replace("'", '').replace('\n', '').split('\t')
    ### Match key is Token1|Token2; payload is AdID plus the two token sets.
    match_key = fields[0]
    payload = fields[1].split(',')
    ad_id = payload[0]
    ### Token sets as sets of strings.
    left = set(payload[1].split('|'))
    right = set(payload[2].split('|'))
    ### Jaccard similarity of the two token sets.
    similarity = float(len(left & right)) / float(len(left | right))
    ### Output (Python readable): AdID,Token1|Token2 \t Similarity.
    emit(match_key + ',' + ad_id, similarity)
import sys  # BUG FIX: `sys` is used below (sys.path, sys.stdin) but was never imported here.
import os.path
### Load mr library.
sys.path.append(os.path.dirname(__file__))
from mr import emit

### Dictionary of token classes and corresponding file (column) indices.
token_dict = {'Description' : 12, 'Keyword' : 10, 'Query' : 9, 'Title' : 11}
adID = 5  # column holding the ad identifier

def combinations(lst):
    """Yield every unordered pair of items from LST.

    Works from a private copy so callers' lists are not consumed and
    Py3 dict views (e.g. dict.keys()) are accepted.
    """
    lst = list(lst)
    while lst:
        key = lst.pop(0)
        for elem in lst:
            yield key, elem

for line in sys.stdin:
    line = line.split('\t')
    ### Output for each line should be
    ### Token1|ID1 \t Token2|ID2,Token1|Token2,AdID, respectively.
    for elem in combinations(token_dict.keys()):
        emit('{0}|{1}'.format(elem[0], line[ token_dict[elem[0]] ]),
             '{0}|{1}'.format(elem[1], line[ token_dict[elem[1]] ])
             + ',' + '{0}|{1}'.format(elem[0], elem[1])
             + ',' + '{0}'.format(line[adID]))
# NOTE(review): this chunk begins mid-definition — the `for token ...` loop
# below is the tail of a tokenizer generator whose `def` line is outside
# this view; it yields [token, 1] pairs for a word-count map step.
for token in text.split(" "):
    # Normalize: strip surrounding '.', '-', ':' and lowercase.
    token = token.strip().strip(".").strip("-").strip(":").lower()
    # Keep only tokens longer than one char that are not stop words or digits.
    if (len(token) > 1) and (token not in stop_words) and (not token.isdigit()):
        yield [token, 1]

def emit_word_count(key, collected):
    """Emit (token, count), where count is the number of values collected for KEY."""
    token = key
    try:
        mr.emit((token, len(collected)))
    except ValueError, err:  # Python 2 except syntax (kept byte-identical)
        # Log the error and the data that triggered it, then re-raise.
        sys.stderr.write("emit_word_count ValueError: %(err)s\n%(data)s\n" % {"err": str(err), "data": str(collected)})
        raise

if __name__ == "__main__":
    # Dispatch on the command-line mode: "map1" tokenizes, "red1" counts.
    mode = sys.argv[1]
    if mode == "map1":
        read_stop_words(sys.argv[2])
        mr.emit(("token", "count"))  # header row
        mr.mapper(load_text)
    elif mode == "red1":
        mr.emit(("token", "count"))  # header row
        mr.reducer(emit_word_count)
def count_vowels(line):
    """A map function that counts vowels."""
    vowels = 'aeiou'
    for v in vowels:
        occurrences = line.count(v)
        if occurrences:
            emit(v, occurrences)
#!/usr/bin/env python3
"""Sum values for each key."""
import sys
from mr import values_by_key, emit

# Each (key, values) group read from stdin collapses to (key, total).
for key, values in values_by_key(sys.stdin):
    total = sum(values)
    emit(key, total)
#!/usr/bin/env python ### Marco Vivero, MISK import sys import os.path ### Import mr library. sys.path.append(os.path.dirname(__file__)) from mr import emit ### Put UserID as key for each traiining instance. for line in sys.stdin: try: line = line.replace('\n', '').split('\t') emit(line[13], str(line[2 : 13]).replace("'",'').replace(', ', '\t')[1 : -1]) except IndexError: pass
def run():
    """Collapse each key's values into a set and emit (key, set)."""
    grouped = values_by_key(sys.stdin)
    for key, vals in grouped:
        unique_values = set(vals)
        emit(key, unique_values)
#!/usr/bin/env python
### Marco Vivero, MISK
import sys
import os.path
### `reduce` is a bare builtin only on Python 2; functools.reduce exists on
### both 2.6+ and 3.x, so import it explicitly for cross-version safety.
from functools import reduce
### Load mr library.
sys.path.append(os.path.dirname(__file__))
from mr import emit, values_by_key

### Helper: binary operation on [set, set] pairs (component-wise union).
def cross_union(x, y):
    return [x[0].union(y[0]), x[1].union(y[1])]

for key, value_iterator in values_by_key(sys.stdin):
    ### Take component-wise unions aggregated by AdID.
    value_iterator = reduce(cross_union, value_iterator)
    ### Calculate Jaccard similarity of the two aggregated token sets.
    value = float(len(value_iterator[0].intersection(value_iterator[1])))
    value /= float(len(value_iterator[0].union(value_iterator[1])))
    key = key.split(',')
    ### Output: AdID \t Token1|Token2,Ad-Similarity
    emit(key[1], key[0] + ',' + str(value))
def run():
    """Emit (lowercased word, tag) for every tagged word in every input tree."""
    for parse_tree in read_trees(sys.stdin):
        for token, pos_tag in words(parse_tree):
            mr.emit(token.lower(), pos_tag)
def count_vowels(line):
    """A map function that counts the vowels in a line."""
    for v in 'aeiou':
        # How many times this vowel occurs in the line.
        occurrences = line.count(v)
        if occurrences > 0:
            # Output a (vowel, count) pair; vowels absent from the line are skipped.
            emit(v, occurrences)
#!/usr/bin/env python import sys import os.path sys.path.append(os.path.dirname(__file__)) from mr import emit for line in sys.stdin: line = [int(elem) for elem in line.replace('\n', '').split('\t')] # 1. Click-Binom, 2. Total, 3. Clicks, 4. Impressions out = [1 if line[0] > 0 else 0, 1] + line[ : 2] # 5. AdvertiserID out.append(line[4]) depth, position = [0]*3, [0]*3 depth[line[5] - 1], position[line[6] - 1] = 1, 1 # 6-8. Depth, 9-11. Position out += depth + position # 12. Relative Depth out.append(float(line[5] - line[6]) / float(line[6])) # 13. Distinct Users out.append(str(line[11])) # 14-16. Gender, 17-23. Age gender, age = [0]*3, [0]*7 gender[line[11]], age[line[12]] = 1, 1 out += gender + age emit(line[3], out)
#!/usr/bin/env python
### Marco Vivero, MISK
import sys
import os.path
### Import mr library.
sys.path.append(os.path.dirname(__file__))
from mr import emit

### The token class (Description, Keyword, etc.) comes from the command line.
name = sys.argv[1]

### Emit Python-readable output for each line of the token file; token sets
### are flagged with &&& so Process 2 can identify them.
for raw in sys.stdin:
    fields = raw.replace('\n', '').split('\t')
    emit('{0}|{1}'.format(name, fields[0]), '&&&' + fields[1])
def count_pairs(line):
    """A map function that emits (pair, 1) for every adjacent letter pair.

    BUG FIX: the loop bound was `len(word) - 2`, which silently dropped the
    final pair of every word (e.g. "abc" yielded only "ab", never "bc").
    A word of length n has n - 1 adjacent pairs, so iterate to len(word) - 1.
    """
    for word in line.lower().split():
        for i in range(len(word) - 1):
            emit(word[i:i + 2], 1)
def run():
    """Emit (tag, children) for every rule of every tree read from stdin."""
    for parsed in read_trees(sys.stdin):
        for lhs, rhs in rules(parsed):
            mr.emit(lhs, rhs)
#!/usr/bin/env python
import sys
from mr import values_by_key, emit  # MapReduce module.

# values_by_key groups stdin into (key, iterator) pairs; for each unique key,
# emit the key together with the sum of its values.
for k, vals in values_by_key(sys.stdin):
    emit(k, sum(vals))
def run():
    """Emit (word, 1) for each tracked word; all others count as 'other words'."""
    tracked = ("the", "he", "she", "it", "thee")
    for line in sys.stdin:
        for word in line.split():
            if word in tracked:
                emit(word, 1)
            else:
                emit('other words', 1)
def run():
    """Binarize each (tag, children) rule read from stdin and emit the results."""
    for raw in sys.stdin:
        tag, children = mr.parse_key_value_pair(raw)
        for binarized in binarize(tag, children):
            mr.emit(*binarized)
def count_vowels(line):
    """A map function that counts the vowels in a line.

    BUG FIX: `line.cout(vowel)` was a typo — str has no `cout` method, so
    every call raised AttributeError; the intended method is `count`.
    """
    for vowel in 'aeiou':
        count = line.count(vowel)
        if count > 0:
            emit(vowel, count)
def count_pairs(line):
    """A map function that emits (pair, 1) for every adjacent letter pair.

    BUG FIX: `range(len(word)-2)` skipped the last pair of each word; a word
    of length n has n - 1 adjacent pairs, so the bound must be len(word) - 1.
    """
    for word in line.lower().split():
        for i in range(len(word) - 1):
            emit(word[i:i + 2], 1)
#!/usr/bin/env python
### Marco Vivero, MISK
import sys
import os.path
### Import mr library.
sys.path.append(os.path.dirname(__file__))
from mr import emit, values_by_key

### Average the per-pair similarities grouped under each key.
for group_key, similarities in values_by_key(sys.stdin):
    sims = list(similarities)
    key_parts = group_key.split(',')
    average = sum(sims) / len(sims)
    ### Emit AdID \t Token1|Token2,Ad-Similarity
    emit(key_parts[1], key_parts[0] + ',' + str(average))
def count_vowels(line):
    """A map function that counts the vowels in a line."""
    tallies = ((v, line.count(v)) for v in 'aeiou')
    for vowel, total in tallies:
        if total > 0:
            emit(vowel, total)
def run():
    """Emit ('line', 1) once per line of stdin (i.e., count input lines)."""
    for _ in sys.stdin:
        emit('line', 1)