Example #1
import sys
from math import log

# 'mru' is the project's map-reduce utility module; the import name below
# is an assumption. These imports are likewise assumed by the later examples.
import map_reduce_utils as mru


def map_tf_idf(corpus_size, input=sys.stdin, output=sys.stdout):
    """
    (word file_name) (n N m) --> (word file_name) (tfidf)

    computes the tf-idf metric for each word in each file in the corpus,
    defined as the term frequency multiplied by the inverse document
    frequency. The term frequency is the proportion of the words in a
    document that are a given word. The inverse document frequency is the
    total number of documents in the corpus divided by the number of
    documents in which the word appears.
    """

    for in_key, in_value in mru.json_loader(input):
        n = in_value['word_freq']
        N = in_value['doc_size']
        m = in_value['corp_freq']
        D = corpus_size
        tf = float(n) / float(N)
        idf = float(D) / float(m)
        log_idf = log(idf, 10)
        tfidf = tf * idf
        tf_log_idf = tf * log_idf
        # in_key == out_key
        out_value = {'tfidf': tfidf, 'tf log idf': tf_log_idf,
                     'log idf': log_idf, 'idf': idf, 'tf': tf,
                     'word frequency': n, 'document length': N,
                     'corpus frequency': m, 'corpus size': D}
        mru.reducer_emit(in_key, out_value, output)
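
To see the arithmetic concretely, here is a self-contained sketch of the same computation on a single made-up record (the counts are invented, not pipeline output):

from math import log

# toy record: the word occurs n=3 times in an N=100 word document and
# appears in m=2 of the D=10 documents in the corpus
n, N, m, D = 3, 100, 2, 10

tf = float(n) / float(N)         # term frequency: 0.03
idf = float(D) / float(m)        # inverse document frequency: 5.0
print(tf * idf)                  # tfidf: 0.15
print(tf * log(idf, 10))         # tf log idf: ~0.021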
Example #2
def reduce_word_join(input=mru.reducer_stream(), output=sys.stdout):
    """
    (word) (file_name tfidf) --> (word) (file1 file2 tfidf1*tfidf2)

    for each word, if two distinct documents both contain that word,
    a line is emitted containing the product of the tfidf scores of that
    word in both documents.

    This is the first step in computing the pairwise dot product of the tf-idf
    vectors between all documents, where the corresponding elements for every
    pair of documents are multiplied together.
    """

    for in_key, key_stream in input:
        values = []
        for in_value in key_stream:
            values.append(in_value)
        # note: each unordered pair of documents is emitted twice, once
        # in each order
        for val1 in values:
            for val2 in values:
                if val1['filename'] != val2['filename']:
                    out_key = {'word': in_key['word']}
                    out_value = {
                        'file1': val1['filename'],
                        'file2': val2['filename'],
                        'product': val1['tfidf'] * val2['tfidf']
                    }
                    mru.reducer_emit(out_key, out_value, output)
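
As a quick illustration of the dot-product step, with invented tf-idf scores rather than real pipeline output:

# toy per-document tf-idf scores for one word, keyed by file name
scores = {'a.txt': 0.15, 'b.txt': 0.40, 'c.txt': 0.05}

# mirror the nested loops above: emit the product for every ordered
# pair of distinct documents
for f1, s1 in scores.items():
    for f2, s2 in scores.items():
        if f1 != f2:
            print(f1, f2, s1 * s2)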
Example #3
def reduce_corpus_size(input=mru.reducer_stream(), output=sys.stdout):
    # counts the incoming values; assuming upstream emits one value per
    # document, the total is the number of documents in the corpus
    corpus_size = 0
    for in_key, key_stream in input:
        for in_value in key_stream:
            corpus_size += 1
    out_key = 'corpus size'
    out_value = corpus_size
    mru.reducer_emit(out_key, out_value, output)
Example #4
import os

def map_contents(input=sys.stdin, output=sys.stdout, stop_words=None):
    for line in input:
        # Hadoop streaming exposes the current input file's name in this
        # environment variable
        docname = os.environ['mapreduce_map_input_file']
        if stop_words is None:
            contents = mru.clean_text(line)
        else:
            contents = mru.clean_text(line, stop_words)
        key = {'filename': docname}
        value = {'words': [word for word in contents]}
        # we emit as if we were a reducer since the contents don't get put
        # through a reducer
        mru.reducer_emit(key, value, output)
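
mru.clean_text itself does not appear on this page; a minimal stand-in that lowercases, strips punctuation, and drops stop words might look like the sketch below. This is an assumption about its behavior, not the project's actual implementation (the real function also supports stemming, as Example #5 shows):

import string

def clean_text_sketch(text, stop_words=()):
    # lowercase, strip punctuation, split on whitespace, drop stop words
    table = str.maketrans('', '', string.punctuation)
    words = text.lower().translate(table).split()
    return [w for w in words if w not in stop_words]

print(clean_text_sketch('The cat sat on the mat.', stop_words={'the', 'on'}))
# ['cat', 'sat', 'mat']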
Example #5
import json

def map_claims(input=sys.stdin, output=sys.stdout,
               kv_delim=INPUT_KV_DELIM, stop_words_file=None, stem=True):
    # load the stop words once rather than re-reading the file per line
    if stop_words_file is not None:
        with open(stop_words_file) as f:
            stop_words = json.load(f)
    else:
        stop_words = None
    for line in input:
        # split only on the first delimiter in case the claim text
        # contains the delimiter itself
        key, value = line.strip().split(kv_delim, 1)
        patent_id = key.strip()
        if stop_words is not None:
            contents = mru.clean_text(value, stop_words, stem)
        else:
            contents = mru.clean_text(value, stem=stem)
        key = {'filename': patent_id}
        contents = {'words': [word for word in contents]}
        mru.reducer_emit(key, contents, output)
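
INPUT_KV_DELIM is a project constant defined elsewhere; assuming it is a tab, an input line and its parsing would look like this (the patent id and claim text are made up):

INPUT_KV_DELIM = '\t'  # assumed value

line = 'US1234567\tWhat is claimed is: a method for parsing text...\n'
key, value = line.strip().split(INPUT_KV_DELIM, 1)
print(key.strip())  # US1234567
print(value[:19])   # What is claimed is: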
Example #6
from sys import stdout

def reduce_cosine_similarity(input=mru.reducer_stream(), output=stdout):
    """
    (file1 file2) (tfidf1*tfidf2) --> (file1 file2) (cosine_similarity(f1, f2))

    sums up the products of the tf-idf values of the words common to each
    pair of documents to produce the cosine similarity of the two
    documents (the sum equals the cosine when the tf-idf vectors have
    been normalized upstream)
    """
    for in_key, key_stream in input:
        sum_for_docs = 0
        for in_value in key_stream:
            sum_for_docs += in_value['product']
        out_key = {'file1': in_key['file1'], 'file2': in_key['file2']}
        out_value = {'cos_similarity': sum_for_docs}
        mru.reducer_emit(out_key, out_value, output)
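
The identity being used: once each document's tf-idf vector is scaled to unit length, cosine similarity reduces to a plain dot product, i.e. a sum of per-word products. A toy check with invented vectors:

import math

# toy tf-idf vectors over a shared four-word vocabulary
a = [0.1, 0.3, 0.0, 0.2]
b = [0.0, 0.2, 0.4, 0.1]

norm_a = math.sqrt(sum(x * x for x in a))
norm_b = math.sqrt(sum(x * x for x in b))
a = [x / norm_a for x in a]
b = [x / norm_b for x in b]

# after normalization, the dot product is the cosine similarity
print(sum(x * y for x, y in zip(a, b)))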
Example #7
def reduce_word_frequency(input=mru.reducer_stream(), output=sys.stdout):
    """
    (word filename) (1) --> (word filename) (n)

    sums up the number of occurrences of each word in each file and emits
    the result for each word/filename combination
    """

    for in_key, key_stream in input:
        word_frequency = 0
        for in_value in key_stream:
            word_frequency += in_value['count']
        out_key = {'word': in_key['word'], 'filename': in_key['filename']}
        out_value = {'word_freq': word_frequency}
        mru.reducer_emit(out_key, out_value, output)
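
mru.reducer_stream is not shown on this page; it evidently groups the sorted mapper output by key, much as itertools.groupby does. A self-contained imitation of this reducer's logic on made-up records:

from itertools import groupby

# sorted mapper output: one ((word, filename), 1) record per occurrence
records = [
    (('cat', 'a.txt'), 1),
    (('cat', 'a.txt'), 1),
    (('cat', 'b.txt'), 1),
    (('dog', 'a.txt'), 1),
]

for key, group in groupby(records, key=lambda r: r[0]):
    print(key, sum(count for _, count in group))
# ('cat', 'a.txt') 2
# ('cat', 'b.txt') 1
# ('dog', 'a.txt') 1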
Example #8
import math

# KEYS_TO_NORMALIZE is a project-level constant defined elsewhere
def normalize_reducer(input=mru.reducer_stream(), output=sys.stdout,
                      keys_to_normalize=KEYS_TO_NORMALIZE):
    for in_key, key_stream in input:
        normalize_factors = {to_factor: 0.0 for to_factor in keys_to_normalize}
        terms_to_normalize = []
        for in_value in key_stream:
            terms_to_normalize.append(in_value)
            # accumulate the sum of squares for each key to normalize
            for k in keys_to_normalize:
                normalize_factors[k] += in_value[k] ** 2
        for term in terms_to_normalize:
            out_key = {'uid': in_key['uid'], 'ngram': term['ngram']}
            out_value = term
            del out_value['ngram']
            for key in keys_to_normalize:
                out_value[key] /= math.sqrt(normalize_factors[key])
            mru.reducer_emit(out_key, out_value, output)
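
The normalization applied here is the usual L2 norm: each value in the group is divided by the square root of the group's sum of squares. With invented numbers:

import math

values = [3.0, 4.0]
factor = math.sqrt(sum(v * v for v in values))  # 5.0
print([v / factor for v in values])             # [0.6, 0.8]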
Example #9
def reduce_word_count(input=mru.reducer_stream(), output=sys.stdout):
    """
    (file_name) (word word_freq) --> (word file_name) (n N)

    sums up the total number of words in each document and emits
    that sum for each word along with the number of occurrences of that
    word in the given document
    """

    for in_key, key_stream in input:
        doc_size = 0
        values = []
        for in_value in key_stream:
            values.append(in_value)
            doc_size += in_value['word_freq']
        for value in values:
            out_key = {'word': value['word'], 'filename': in_key['filename']}
            out_value = {'word_freq': value['word_freq'], 'doc_size': doc_size}
            mru.reducer_emit(out_key, out_value, output)
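
Worked through on one tiny document with made-up counts: the reducer buffers the per-word frequencies, totals them to get the document length N, then re-emits each frequency alongside N:

values = [{'word': 'cat', 'word_freq': 3},
          {'word': 'mat', 'word_freq': 1}]

doc_size = sum(v['word_freq'] for v in values)  # N = 4
for v in values:
    print(v['word'], {'word_freq': v['word_freq'], 'doc_size': doc_size})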
Example #10
from sys import stdout

def reduce_corpus_frequency(input=mru.reducer_stream(), output=stdout):
    """
    (word) (filename n N 1) --> (word filename) (n N m)

    counts the number of documents in the corpus that contain each
    unique word and emits this count (m) for each document that the
    word occurs in.
    """
    for in_key, key_stream in input:
        corpus_frequency = 0
        values = []
        for in_value in key_stream:
            corpus_frequency += in_value['count']
            values.append(in_value)
        for value in values:
            out_key = {'word': in_key['word'], 'filename': value['filename']}
            out_value = {'word_freq': value['word_freq'],
                         'doc_size': value['doc_size'],
                         'corp_freq': corpus_frequency}
            mru.reducer_emit(out_key, out_value, output)
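
Since the '1' field is emitted once per (word, document) pair upstream, the sum here is simply the number of documents containing the word. A made-up group for one word:

values = [{'filename': 'a.txt', 'word_freq': 3, 'doc_size': 100, 'count': 1},
          {'filename': 'b.txt', 'word_freq': 1, 'doc_size': 50, 'count': 1}]

corpus_frequency = sum(v['count'] for v in values)  # m = 2 documents
for v in values:
    print(v['filename'], {'word_freq': v['word_freq'],
                          'doc_size': v['doc_size'],
                          'corp_freq': corpus_frequency})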