Example 1
def test_unstem():
    """
    Given a word stem, unstem() should return the most frequently occurring
    unstemmed variant in the text.
    """

    # cat > cats
    t = Text('cat cat cats')
    assert t.unstem('cat') == 'cat'

    # cats > cat
    t = Text('cat cat cats cats cats')
    assert t.unstem('cat') == 'cats'
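
A minimal sketch of the behavior this test describes, assuming (from Examples 6 and 8 below) that Text.terms maps a stemmed term to its token offsets and that each entry in Text.tokens records its original 'unstemmed' form; the library's own code may differ:

from collections import Counter

def unstem(text, term):
    """
    Return the most frequent unstemmed variant of a stemmed term.
    Assumes text.terms maps stem -> token offsets and each token dict
    carries its original 'unstemmed' form.
    """
    variants = Counter(
        text.tokens[offset]['unstemmed']
        for offset in text.terms[term]
    )
    return variants.most_common(1)[0][0]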
Example 2
def build_graph(path, term_depth=1000, skim_depth=10,
                d_weights=False, **kwargs):

    """
    Tokenize a text, index a term matrix, and build out a graph.

    Args:
        path (str): The file path.
        term_depth (int): Consider the N most frequent terms.
        skim_depth (int): Connect each word to the N closest siblings.
        d_weights (bool): If true, give "close" nodes low weights.

    Returns:
        Skimmer: The indexed graph.
    """

    # Tokenize text.
    click.echo('\nTokenizing text...')
    t = Text.from_file(path)
    click.echo('Extracted %d tokens' % len(t.tokens))

    m = Matrix()

    # Index the term matrix.
    click.echo('\nIndexing terms:')
    m.index(t, t.most_frequent_terms(term_depth), **kwargs)

    g = Skimmer()

    # Construct the network.
    click.echo('\nGenerating graph:')
    g.build(t, m, skim_depth, d_weights)

    return g
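
A call site might look like the following; the path and argument values are placeholders for illustration, not taken from the source:

# Hypothetical usage: build the indexed graph for a local text file.
g = build_graph('corpus/novel.txt', term_depth=500, skim_depth=5)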
Example 3
def test_term_counts():
    """
    term_counts() should return a map of term -> count.
    """

    t = Text('aa bb bb cc cc cc')

    assert t.term_counts() == OrderedDict([('cc', 3), ('bb', 2), ('aa', 1)])
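
A sketch consistent with this assertion, assuming (as Example 6 shows) that Text.terms maps each term to a list of offsets; the actual implementation may differ:

from collections import Counter, OrderedDict

def term_counts(text):
    """
    Map each term to its occurrence count, ordered from most to least
    frequent, matching the OrderedDict the test expects.
    """
    counts = Counter({term: len(offsets) for term, offsets in text.terms.items()})
    return OrderedDict(counts.most_common())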
Example 4
def test_most_frequent_terms():
    """
    most_frequent_terms() should return the N most frequent terms.
    """

    t = Text('aa bb bb cc cc cc')

    # Top 2 words are 'cc' and 'bb'
    assert t.most_frequent_terms(2) == set(['cc', 'bb'])
Example 5
def test_term_count_buckets():
    """
    term_count_buckets() should map integer counts to the list of terms in the
    text that appear that many times.
    """

    t = Text('aa bb bb cc cc dd dd dd')

    assert t.term_count_buckets() == {1: ['aa'], 2: ['bb', 'cc'], 3: ['dd']}
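
A minimal sketch of the grouping this test describes, built on term_counts() from Example 3; the real method may differ:

from collections import defaultdict

def term_count_buckets(text):
    """
    Group terms by how many times they occur: count -> list of terms
    that appear exactly that many times.
    """
    buckets = defaultdict(list)
    for term, count in text.term_counts().items():
        buckets[count].append(term)
    return dict(buckets)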
Example 6
def test_set_term_offsets():

    """
    During tokenization, store a map of term -> token offset positions.
    """

    t = Text('aa bb aa bb')

    assert t.terms['aa'] == [0, 2]
    assert t.terms['bb'] == [1, 3]
Example 7
def test_custom_file():
    """
    Load a custom file, when a path is passed.
    """

    path = os.path.join(os.path.dirname(__file__), 'fixtures/stopwords.txt')

    t = Text('test', stopwords=path)

    assert t.stopwords == set(['sa', 'sb', 'sc'])
Example 8
def test_set_tokens():

    """
    tokenize() should record individual tokens.
    """

    t = Text('aa bb cc')

    assert t.tokens[0]['unstemmed'] == 'aa'
    assert t.tokens[1]['unstemmed'] == 'bb'
    assert t.tokens[2]['unstemmed'] == 'cc'
    assert len(t.tokens) == 3
Example 9
def test_merge_smallest_bucket():
    """
    Say 1000 is passed as the depth, and the 1000th term in the term-counts
    dictionary has a count of 10, but 20 other terms also show up 10 times
    in the text. In this case, all of the terms in this smallest bucket
    should be included, so as not to arbitrarily omit words that appear
    with the same frequency as words that do get included.
    """

    t = Text('aa bb bb cc cc dd dd dd')

    # With a depth of 2, 'dd' makes the cut and the whole 2-count bucket
    # ('bb' and 'cc') is merged in alongside it.
    assert t.most_frequent_terms(2) == set(['dd', 'cc', 'bb'])
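
A sketch of most_frequent_terms() that produces this behavior, built on term_counts() and term_count_buckets() from the earlier examples; the library's own implementation may differ in detail:

def most_frequent_terms(text, depth):
    """
    Take the `depth` highest-count terms, then merge in every other term
    whose count ties with the last term included, so equally frequent
    words are never arbitrarily split.
    """
    counts = text.term_counts()          # term -> count, descending
    buckets = text.term_count_buckets()  # count -> [terms]

    # Terms strictly inside the requested depth.
    top = list(counts.keys())[:depth]

    # Count of the last term that made the cut.
    smallest_count = counts[top[-1]]

    # Include the whole bucket for that count.
    return set(top) | set(buckets[smallest_count])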
Example 10
def test_index():
    """
    index() should index the Bray-Curtis distances between terms.
    """

    t = Text('aa bb cc')
    m = Matrix()

    m.index(t)

    assert m.get_pair('aa', 'bb') == t.score_braycurtis('aa', 'bb')
    assert m.get_pair('aa', 'cc') == t.score_braycurtis('aa', 'cc')
    assert m.get_pair('bb', 'cc') == t.score_braycurtis('bb', 'cc')
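
The assertions only require a symmetric pairwise score store; a purely illustrative version keyed on an order-independent pair (textplot's Matrix is richer than this) could look like:

class PairStore:
    """
    Toy symmetric pair index: get_pair(a, b) and get_pair(b, a) return
    the same stored score.
    """

    def __init__(self):
        self.pairs = {}

    def set_pair(self, a, b, score):
        self.pairs[tuple(sorted((a, b)))] = score

    def get_pair(self, a, b):
        return self.pairs.get(tuple(sorted((a, b))))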
Example 11
def test_default_file():
    """
    When no path is passed to load_stopwords(), the default file in the
    textplot module should be loaded.
    """

    defaults = set(
        pkgutil.get_data('textplot',
                         'data/stopwords.txt').decode('utf8').splitlines())

    t = Text('test')

    assert t.stopwords == defaults
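
Taken together with Example 7, these tests pin down the loading behavior; a sketch of it (the real method may differ):

import pkgutil

def load_stopwords(path=None):
    """
    Read one stopword per line from a custom file when a path is given,
    otherwise fall back to the file bundled with the textplot package.
    """
    if path:
        with open(path) as f:
            return set(f.read().splitlines())
    return set(
        pkgutil.get_data('textplot', 'data/stopwords.txt')
        .decode('utf8')
        .splitlines()
    )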
Example 12
def test_ignore_stopwords():

    """
    Stopwords should be represented as None in the token list.
    """

    t = Text('aa the bb an cc')

    assert t.tokens[0]['unstemmed'] == 'aa'
    assert t.tokens[1] is None
    assert t.tokens[2]['unstemmed'] == 'bb'
    assert t.tokens[3] is None
    assert t.tokens[4]['unstemmed'] == 'cc'
    assert len(t.tokens) == 5
Example 13
def test_term_subset():
    """
    When a subset of terms is passed, just those terms should be indexed.
    """

    t = Text('aa bb cc')
    m = Matrix()

    m.index(t, ['aa', 'bb'])

    # Should index 'aa' and 'bb'.
    assert m.get_pair('aa', 'bb') == t.score_braycurtis('aa', 'bb')

    # Should ignore 'cc'.
    assert not m.get_pair('aa', 'cc')
    assert not m.get_pair('bb', 'cc')
Example 14
def test_anchored_pairs():

    """
    For a given anchor term, anchored_pairs() should return an ordered map of
    term -> distance for all other indexed terms.
    """

    t = Text('aa bb cc dd')
    m = Matrix()

    m.index(t)

    pairs = m.anchored_pairs('aa')

    assert list(pairs.keys()) == ['bb', 'cc', 'dd']
    assert pairs['bb'] > pairs['cc'] > pairs['dd']
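
A sketch of anchored_pairs() consistent with these assertions, assuming the matrix exposes its indexed terms (called `keys` here) and its stored scores via get_pair(); the real implementation may differ:

from collections import OrderedDict

def anchored_pairs(matrix, anchor):
    """
    Map every other indexed term to its score against the anchor term,
    ordered from closest to farthest.
    """
    pairs = {
        term: matrix.get_pair(anchor, term)
        for term in matrix.keys
        if term != anchor
    }
    return OrderedDict(
        sorted(pairs.items(), key=lambda kv: kv[1], reverse=True)
    )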
Example 15
def clumpy(path, term_depth=500, skim_depth=10, d_weights=False, **kwargs):
    """
    Use "clumpiest" terms.
    """

    t = Text.from_file(path)
    m = Matrix(t)

    print('Indexing terms:')
    # dict_keys can't be sliced directly; materialize the keys first.
    m.index(list(t.densities(**kwargs).keys())[:term_depth], **kwargs)

    g = Skimmer()

    print('Generating graph:')
    g.build(m, skim_depth, d_weights)

    return g
Example 16
def frequent(path, term_depth=500, skim_depth=10, d_weights=False, **kwargs):
    """
    Use most frequent terms.
    """

    t = Text.from_file(path)
    m = Matrix(t)

    print('Indexing terms:')
    m.index(t.most_frequent_terms(term_depth), **kwargs)

    g = Skimmer()

    print('Generating graph:')
    g.build(m, skim_depth, d_weights)

    return g