Example #1
def test_compare():
    actual = llr.llr_compare(Counter('abcabcabcababa'), Counter('cccccc'))
    ref = {
        'a': 2.3050260628857417,
        'c': -3.6024043433364215,
        'b': 2.060150982796662
    }
    assert actual == ref
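The reference values show how to read the output: llr_compare returns a dict keyed by item, with a positive score for items over-represented in the first counter and a negative score for items over-represented in the second. A minimal sketch of splitting the scores along that sign (not part of the original test):

from collections import Counter

import llr

scores = llr.llr_compare(Counter('abcabcabcababa'), Counter('cccccc'))
more_in_first = {k: v for k, v in scores.items() if v > 0}   # 'a' and 'b'
more_in_second = {k: v for k, v in scores.items() if v < 0}  # 'c'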
Example #2
    def reduce_features(self, llr_factor, label1, label2, min_x=0):
        x1 = [self._X_train[i] for i in range(0, len(self._X_train)) if self._y_train[i] == label1]
        x2 = [self._X_train[i] for i in range(0, len(self._X_train)) if self._y_train[i] == label2]
        x1_counter = Counter(' '.join(x1).split())
        x2_counter = Counter(' '.join(x2).split())

        cmp_results = llr.llr_compare(x1_counter, x2_counter)

        top_x1 = {k:v for k,v in sorted(cmp_results.items(), key=lambda x: (-x[1], x[0]))[:llr_factor]}
        top_x2 = {k:v for k,v in sorted(cmp_results.items(), key=lambda x: (x[1], x[0]))[:llr_factor]}

        X_train_new = self._reduce_helper(self._X_train, top_x1, top_x2, min_x)
        X_test_new = self._reduce_helper(self._X_test, top_x1, top_x2, min_x)

        return X_train_new, X_test_new
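The _reduce_helper method is not shown in this listing. As a purely illustrative stand-in (not the original implementation; the role of min_x is not visible here), a filtering step of this shape would keep only the terms selected by the LLR comparison:

def filter_to_selected_terms(docs, top_x1, top_x2):
    # Keep only tokens that the LLR comparison marked as characteristic
    # of either class; everything else is dropped from each document.
    keep = set(top_x1) | set(top_x2)
    return [' '.join(tok for tok in doc.split() if tok in keep) for doc in docs]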
Example #3
def calculate_llr(X, y, label1, label2, n=25):
    x1 = [X[i] for i in range(0, len(X)) if y[i] == label1]
    x2 = [X[i] for i in range(0, len(X)) if y[i] == label2]
    x1_counter = Counter(' '.join(x1).split())
    x2_counter = Counter(' '.join(x2).split())

    cmp_results = llr.llr_compare(x1_counter, x2_counter)

    top_x1 = {
        k: v
        for k, v in sorted(cmp_results.items(), key=lambda x: (-x[1], x[0]))[:n]
    }
    top_x2 = {
        k: v
        for k, v in sorted(cmp_results.items(), key=lambda x: (x[1], x[0]))[:n]
    }

    return top_x1, top_x2
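A hedged usage sketch: calculate_llr expects X to be raw text documents and y their labels, and returns the n terms most characteristic of each of the two labels. The toy corpus below is illustrative only:

X = [
    'the cat sat on the mat',
    'the cat chased the mouse',
    'stocks fell sharply on monday',
    'markets rallied as stocks rose',
]
y = ['pets', 'pets', 'finance', 'finance']

top_pets, top_finance = calculate_llr(X, y, 'pets', 'finance', n=5)
print(top_pets)     # terms leaning toward the 'pets' documents
print(top_finance)  # terms leaning toward the 'finance' documents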
Example #4
def main(focus_path, other_paths, output_path, num_terms=30):
    '''Finds the significant words in the text located at focus_path, compared
        with the rest of the corpus.

    focus_path - string path to .txt file to focus on
    other_paths - list of string paths to .txt files comprising the rest of the
        corpus
    output_path - string path to .txt file in which to write results
    num_terms - number of significant terms to show
    '''

    focus_text = count([focus_path])
    other_text = count(other_paths)

    diff = llr.llr_compare(focus_text, other_text)
    ranked = sorted(diff.items(), key=lambda x: x[1])

    with open(output_path, 'w') as output:
        for word, score in reversed(ranked[-num_terms:]):
            output.write('{:<20.10}   {}\n'.format(score, word))
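An illustrative invocation of main (the file paths below are hypothetical, not from the original):

main(
    focus_path='corpus/hamlet.txt',
    other_paths=['corpus/macbeth.txt', 'corpus/othello.txt'],
    output_path='results/hamlet_terms.txt',
    num_terms=30,
)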
Example #5
from collections import Counter
import re

import llr


def count(file):
    '''Counts the words contained in a file'''
    with open(file) as f:
        return Counter(re.findall(r'\w+', re.sub(r'[\r\n]', ' ', f.read())))


# Count words in Hamlet
hamlet = count('data/hamlet')
# and the Declaration of Independence
declaration = count('data/declaration')

# Find out which words are used more or less
diff = llr.llr_compare(hamlet, declaration)
ranked = sorted(diff.items(), key=lambda x: x[1])

print("\nMore in Declaration of Independence")
for k, v in ranked[:10]:
    print(k, v)

print("\nMore in Hamlet")
for k, v in ranked[-10:]:
    print(k, v)
Example #6
from collections import Counter 
import re

import llr

def count(file):
    with open(file) as f:
        return Counter(re.findall(r'\w+', re.sub(r'[\r\n]', ' ', f.read())))

hamlet = count('data/hamlet')
declaration = count('data/declaration') 

diff = llr.llr_compare(hamlet, declaration)

print("\nMore in Declaration of Independence")
for k,v in sorted(diff.items(), key=lambda x: x[1])[:10]:
    print(k, v)

print("\nMore in Hamlet")
for k,v in sorted(diff.items(), key=lambda x: x[1])[-10:]:
    print(k, v)