import sys
from math import log

# project-local helpers for reading/writing JSON key-value records;
# the module name here is assumed from the 'mru' alias used throughout
import map_reduce_utils as mru


def map_tf_idf(corpus_size, input=sys.stdin, output=sys.stdout):
    """
    (word file_name) (n N m) --> (word file_name) (tfidf)

    computes the tf-idf metric for each word in each file in the corpus,
    which is defined as the term frequency multiplied by the inverse
    document frequency. The term frequency is the proportion of the words
    in a document that are a given word. The inverse document frequency
    is the number of documents in the corpus divided by the number of
    documents in which the word appears.
    """
    for in_key, in_value in mru.json_loader(input):
        n = in_value['word_freq']
        N = in_value['doc_size']
        m = in_value['corp_freq']
        D = corpus_size
        tf = float(n) / float(N)
        idf = float(D) / float(m)
        log_idf = log(idf, 10)
        tfidf = tf * idf
        tf_log_idf = tf * log_idf
        # in_key == out_key
        out_value = {'tfidf': tfidf,
                     'tf log idf': tf_log_idf,
                     'log idf': log_idf,
                     'idf': idf,
                     'tf': tf,
                     'word frequency': n,
                     'document length': N,
                     'corpus frequency': m,
                     'corpus size': D}
        mru.reducer_emit(in_key, out_value, output)
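
# A sketch of one record flowing through map_tf_idf, assuming
# mru.json_loader yields (key, value) dict pairs decoded from JSON lines;
# the concrete words, filenames, and counts are hypothetical:
#
#   in:  {'word': 'cat', 'filename': 'a.txt'}
#        {'word_freq': 2, 'doc_size': 10, 'corp_freq': 3}
#   with corpus_size = 4: tf = 2/10 = 0.2, idf = 4/3, tfidf = 0.2 * 4/3 ~= 0.267
#   out: {'word': 'cat', 'filename': 'a.txt'}
#        {'tfidf': 0.267, 'tf': 0.2, 'idf': 1.333, ...}
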
def normalize_mapper(input=sys.stdin, output=sys.stdout):
    """
    (word file_name) (v) --> (uid) (v ngram)

    re-keys each record by document uid alone, moving the ngram into the
    value so that a reducer sees every ngram of a document together
    (e.g. to normalize that document's scores).
    """
    for in_key, in_value in mru.json_loader(input):
        ngram = in_key['word']
        uid = in_key['filename']
        out_key = {'uid': uid}
        in_value['ngram'] = ngram
        out_value = in_value
        mru.mapper_emit(out_key, out_value, output)
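
# A sketch of normalize_mapper re-keying a record by document uid alone,
# assuming the same JSON key/value convention; field values hypothetical:
#
#   in:  {'word': 'cat', 'filename': 'a.txt'}  {'tfidf': 0.267}
#   out: {'uid': 'a.txt'}  {'tfidf': 0.267, 'ngram': 'cat'}
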
def map_word_join(input=sys.stdin, output=sys.stdout):
    """
    (word file_name) (tfidf) --> (word) (file_name tfidf)

    emits a line for each word in each file with the word as the key and
    the filename and tfidf score as the value
    """
    for in_key, in_value in mru.json_loader(input):
        out_key = {'word': in_key['word']}
        out_value = {'filename': in_key['filename'],
                     'tfidf': in_value['tfidf']}
        mru.mapper_emit(out_key, out_value, output)
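
# A sketch of map_word_join; after this mapper all (filename, tfidf) pairs
# for the same word reach one reducer, which can then pair up documents
# sharing that word (values hypothetical):
#
#   in:  {'word': 'cat', 'filename': 'a.txt'}  {'tfidf': 0.267}
#   out: {'word': 'cat'}  {'filename': 'a.txt', 'tfidf': 0.267}
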
def map_word_count(input=sys.stdin, output=sys.stdout):
    """
    (word filename) (n) --> (filename) (word n)

    for each word in each document, emits the document name as the key
    and the word and its number of occurrences in that file as the value
    """
    for in_key, in_value in mru.json_loader(input):
        filename = in_key['filename']
        word = in_key['word']
        word_frequency = in_value['word_freq']
        out_key = {'filename': filename}
        out_value = {'word': word, 'word_freq': word_frequency}
        mru.mapper_emit(out_key, out_value, output)
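
# A sketch of map_word_count; keying by filename lets a reducer total the
# counts into per-document figures such as the document length N (values
# hypothetical):
#
#   in:  {'word': 'cat', 'filename': 'a.txt'}  {'word_freq': 2}
#   out: {'filename': 'a.txt'}  {'word': 'cat', 'word_freq': 2}
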
def map_corpus_frequency(input=sys.stdin, output=sys.stdout):
    """
    (word filename) (n N) --> (word) (filename n N 1)

    emits a line for each unique word in each file to be consumed by
    corp_freq_red, which counts the number of documents in the corpus in
    which each unique word occurs.
    """
    for in_key, in_value in mru.json_loader(input):
        out_key = {'word': in_key['word']}
        out_value = {'filename': in_key['filename'],
                     'word_freq': in_value['word_freq'],
                     'doc_size': in_value['doc_size'],
                     'count': 1}
        mru.mapper_emit(out_key, out_value, output)
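
# A sketch of map_corpus_frequency; the trailing count of 1 is what
# corp_freq_red sums to obtain m, the number of documents containing the
# word (values hypothetical):
#
#   in:  {'word': 'cat', 'filename': 'a.txt'}  {'word_freq': 2, 'doc_size': 10}
#   out: {'word': 'cat'}
#        {'filename': 'a.txt', 'word_freq': 2, 'doc_size': 10, 'count': 1}
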
def map_cosine_similarity(input=sys.stdin, output=sys.stdout):
    """
    (word) (file1 file2 tfidf1*tfidf2) --> (file1 file2) (tfidf1*tfidf2)

    for each word common to two documents, removes the word from the
    key/value pair and replaces it with the two filenames so that we can
    sum up the values for each pair of documents in the reducer.
    """
    for in_key, in_value in mru.json_loader(input):
        file1 = in_value['file1']
        file2 = in_value['file2']
        # we want to ensure that (file1 file2) and (file2 file1) get
        # sent to the same reducer, so we order them alphabetically
        if file1 > file2:
            file1, file2 = file2, file1
        out_key = {'file1': file1, 'file2': file2}
        out_value = {'product': in_value['product']}
        mru.mapper_emit(out_key, out_value, output)
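
# A sketch of map_cosine_similarity; summing the per-word products emitted
# for one file pair yields the dot product of the two documents' tf-idf
# vectors, the numerator of their cosine similarity (values hypothetical):
#
#   in:  {'word': 'cat'}  {'file1': 'b.txt', 'file2': 'a.txt', 'product': 0.04}
#   out: {'file1': 'a.txt', 'file2': 'b.txt'}  {'product': 0.04}
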
def map_word_frequency(input=sys.stdin, output=sys.stdout, gram_size=1):
    """
    (file_name) (file_contents) --> (word file_name) (1)

    maps file contents to ngrams for use in a word count reducer. For
    each ngram in the document, a new key-value pair is emitted with a
    value of 1.
    """
    for in_key, in_value in mru.json_loader(input):
        filename = in_key['filename']
        words = in_value['words']
        out_value = {'count': 1}
        # clamp the gram size so short documents still emit one ngram
        n = min(gram_size, len(words))
        ngrams = [' '.join(words[i:i + n])
                  for i in range(len(words) - n + 1)]
        for ngram in ngrams:
            out_key = {'word': ngram, 'filename': filename}
            mru.mapper_emit(out_key, out_value, output)
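
# A sketch of map_word_frequency with gram_size=2 (bigrams); the input
# words are hypothetical:
#
#   in:  {'filename': 'a.txt'}  {'words': ['the', 'fat', 'cat']}
#   out: {'word': 'the fat', 'filename': 'a.txt'}  {'count': 1}
#        {'word': 'fat cat', 'filename': 'a.txt'}  {'count': 1}
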
def map_corpus_size(input=sys.stdin, output=sys.stdout):
    """
    (key) (value) --> (1) (1)

    emits a count of 1 under a single shared key for every input record
    so that a reducer can sum them, giving the number of records (the
    corpus size D when each record is one document).
    """
    for in_key, in_value in mru.json_loader(input):
        out_key = {'count': 1}
        out_value = {'count': 1}
        mru.mapper_emit(out_key, out_value, output)
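
# A sketch of map_corpus_size; every record lands under the same key, so a
# reducer that sums the counts recovers D, the corpus size passed to
# map_tf_idf (input values hypothetical):
#
#   in:  {'filename': 'a.txt'}  {'words': ['the', 'fat', 'cat']}
#   out: {'count': 1}  {'count': 1}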