Example #1
0
def map_tf_idf(corpus_size, input=sys.stdin, output=sys.stdout):
    """
    (word file_name) (n N m) --> (word file_name) (tfidf)

    Computes tf-idf statistics for every (word, file) record read from
    `input` and emits them under the unchanged key.

    For each record:
      tf  = word_freq / doc_size     -- the proportion of the document's
                                        words that are the given word
      idf = corpus_size / corp_freq  -- corp_freq is presumably the number
                                        of documents containing the word

    The emitted value holds 'tfidf' (tf * idf), 'tf log idf'
    (tf * log base 10 of idf), 'log idf', 'idf' and 'tf', followed by the
    raw inputs the statistics were computed from.
    """

    for record_key, stats in mru.json_loader(input):
        word_count = stats['word_freq']
        doc_length = stats['doc_size']
        corpus_count = stats['corp_freq']

        term_freq = float(word_count) / float(doc_length)
        inv_doc_freq = float(corpus_size) / float(corpus_count)
        log_inv_doc_freq = log(inv_doc_freq, 10)

        # the key is passed through untouched; only the value is rebuilt.
        # Keys are added in the same order the emitted record lists them.
        metrics = {}
        metrics['tfidf'] = term_freq * inv_doc_freq
        metrics['tf log idf'] = term_freq * log_inv_doc_freq
        metrics['log idf'] = log_inv_doc_freq
        metrics['idf'] = inv_doc_freq
        metrics['tf'] = term_freq
        metrics['word frequency'] = word_count
        metrics['document length'] = doc_length
        metrics['corpus frequency'] = corpus_count
        metrics['corpus size'] = corpus_size
        mru.reducer_emit(record_key, metrics, output)
Example #2
0
def map_tf_idf(corpus_size, input=sys.stdin, output=sys.stdout):
    """
    (word file_name) (n N m) --> (word file_name) (tfidf)

    For each word in each file of the corpus, derives the term frequency
    (word_freq / doc_size: the proportion of the document's words that are
    the given word) and the inverse document frequency
    (corpus_size / corp_freq), then emits their product along with the
    base-10 log variant and every intermediate quantity.

    The output key is the input key, unchanged.
    """

    total_docs = corpus_size
    for key, counts in mru.json_loader(input):
        occurrences = counts['word_freq']
        words_in_doc = counts['doc_size']
        corpus_hits = counts['corp_freq']

        tf_score = float(occurrences) / float(words_in_doc)
        idf_score = float(total_docs) / float(corpus_hits)
        idf_log10 = log(idf_score, 10)

        result = {
            'tfidf': tf_score * idf_score,
            'tf log idf': tf_score * idf_log10,
            'log idf': idf_log10,
            'idf': idf_score,
            'tf': tf_score,
            'word frequency': occurrences,
            'document length': words_in_doc,
            'corpus frequency': corpus_hits,
            'corpus size': total_docs,
        }
        mru.reducer_emit(key, result, output)
Example #3
0
def normalize_mapper(input=sys.stdin, output=sys.stdout):
    """
    (word filename) (value) --> (uid) (value + ngram)

    Re-keys each record by its filename: the emitted key is
    {'uid': filename} and the emitted value is the input value with the
    record's word stored under 'ngram'.
    """
    for key, payload in mru.json_loader(input):
        word = key['word']
        doc_id = key['filename']
        # the incoming value is extended in place rather than copied
        payload['ngram'] = word
        mru.mapper_emit({'uid': doc_id}, payload, output)
Example #4
0
def map_word_join(input=sys.stdin, output=sys.stdout):
    """
    (word file_name) (tfidf) --> (word) (file_name tfidf)

    Re-keys every (word, file) record by the word alone, moving the
    filename into the value next to the record's tfidf score.
    """

    for key, value in mru.json_loader(input):
        word = key['word']
        filename = key['filename']
        score = value['tfidf']
        mru.mapper_emit({'word': word},
                        {'filename': filename, 'tfidf': score},
                        output)
Example #5
0
def map_word_count(input=sys.stdin, output=sys.stdout):
    """
    (word filename) (n) --> (filename) (word n)

    Re-keys every (word, file) record by the document name; the word and
    its number of occurrences in that file become the value.
    """

    for key, value in mru.json_loader(input):
        doc_name, term = key['filename'], key['word']
        occurrences = value['word_freq']
        mru.mapper_emit(
            {'filename': doc_name},
            {'word': term, 'word_freq': occurrences},
            output
        )
Example #6
0
def map_word_join(input=sys.stdin, output=sys.stdout):
    """
    (word file_name) (tfidf) --> (word) (file_name tfidf)

    For each word in each file, emits one record keyed by the word whose
    value carries the filename and that file's tfidf score for the word.
    """

    records = mru.json_loader(input)
    for word_and_file, scores in records:
        by_word = dict(word=word_and_file['word'])
        file_score = dict(filename=word_and_file['filename'],
                          tfidf=scores['tfidf'])
        mru.mapper_emit(by_word, file_score, output)
Example #7
0
def map_corpus_frequency(input=sys.stdin, output=sys.stdout):
    """
    (word filename) (n N) --> (word) (filename n N 1)

    Emits one record per unique word per file, keyed by the word, with a
    'count' of 1 attached. The records are meant to be consumed by
    corp_freq_red to find the number of occurrences of each unique word
    throughout the entire corpus.
    """
    for key, value in mru.json_loader(input):
        word = key['word']
        filename = key['filename']
        word_freq = value['word_freq']
        doc_size = value['doc_size']
        per_file = {'filename': filename,
                    'word_freq': word_freq,
                    'doc_size': doc_size,
                    'count': 1}
        mru.mapper_emit({'word': word}, per_file, output)
Example #8
0
def map_corpus_frequency(input=sys.stdin, output=sys.stdout):
    """
    (word filename) (n N) --> (word) (filename n N 1)

    Re-keys each (word, file) record by the word alone and tags it with a
    count of 1, so that corp_freq_red can find the number of occurrences
    of each unique word throughout the entire corpus.
    """
    for word_and_file, freqs in mru.json_loader(input):
        word_key = {'word': word_and_file['word']}

        # built up field by field, in the order the record lists them
        tagged = {}
        tagged['filename'] = word_and_file['filename']
        tagged['word_freq'] = freqs['word_freq']
        tagged['doc_size'] = freqs['doc_size']
        tagged['count'] = 1

        mru.mapper_emit(word_key, tagged, output)
Example #9
0
def map_cosine_similarity(input=sys.stdin, output=sys.stdout):
    """
    (word) (file1 file2 tfidf1*tfidf2) --> (file1 file2) (tfidf1*tfidf2)

    Drops the word from each record and keys it by the pair of filenames
    instead, so that the reducer can sum the products for each pair of
    documents.
    """
    for _word_key, pair in mru.json_loader(input):
        left = pair['file1']
        right = pair['file2']
        # (a b) and (b a) must land on the same reducer, so the pair is
        # always emitted with the smaller filename first
        if left > right:
            first, second = right, left
        else:
            first, second = left, right
        pair_key = {'file1': first, 'file2': second}
        mru.mapper_emit(pair_key, {'product': pair['product']}, output)
Example #10
0
def map_word_frequency(input=sys.stdin, output=sys.stdout, gram_size=1):
    """
    (file_name) (file_contents) --> (word file_name) (1)

    maps file contents to n-grams for use in a word count reducer. For each
    run of `gram_size` consecutive words in the document (joined with a
    single space), a new key-value pair is emitted with a value of 1.

    A document with fewer than `gram_size` words yields a single n-gram
    made of all of its words. A document with no words yields nothing.

    Raises ValueError if `gram_size` is less than 1.
    """
    if gram_size < 1:
        # a zero or negative size would only ever produce empty or
        # oddly sliced "n-grams", so reject it before reading any input
        raise ValueError(
            'gram_size must be at least 1, got %r' % (gram_size,))

    for in_key, in_value in mru.json_loader(input):
        filename = in_key['filename']
        words = in_value['words']
        if not words:
            # without this guard the clamp below would set n to 0 and an
            # empty-string "word" would be emitted for the empty document
            continue
        out_value = {'count': 1}
        # clamp so that a short document still produces one n-gram
        n = min(gram_size, len(words))
        for i in range(len(words) - n + 1):
            out_key = {'word': ' '.join(words[i:i + n]),
                       'filename': filename}
            mru.mapper_emit(out_key, out_value, output)
Example #11
0
def map_corpus_size(input=sys.stdin, output=sys.stdout):
    """
    (any key) (any value) --> (count) (1)

    Emits one constant record, with key {'count': 1} and value
    {'count': 1}, for every record read from `input`. The contents of the
    input records are ignored.
    """
    # NOTE(review): presumably a reducer sums these to get the corpus
    # size -- confirm against the matching reducer.
    for _key, _value in mru.json_loader(input):
        mru.mapper_emit({'count': 1}, {'count': 1}, output)