Example #1
def test_unstem():
    """
    Given a word stem, unstem() should return the most frequently occurring
    unstemmed variant in the text.
    """

    # cat > cats
    t = Text('cat cat cats')
    assert t.unstem('cat') == 'cat'

    # cats > cat
    t = Text('cat cat cats cats cats')
    assert t.unstem('cat') == 'cats'
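One plausible way to implement the behavior this test describes, shown only as an illustration. This is not textplot's actual code, and the 'stemmed' key on each token dict is an assumption of the sketch:

from collections import Counter

def unstem_sketch(tokens, stem):
    # Count the surface forms of every non-stopword token whose (assumed)
    # 'stemmed' value matches the query, then return the most common one.
    variants = [tok['unstemmed'] for tok in tokens
                if tok is not None and tok.get('stemmed') == stem]
    return Counter(variants).most_common(1)[0][0]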
Example #2
def build_graph(path, term_depth=1000, skim_depth=10,
                d_weights=False, **kwargs):

    """
    Tokenize a text, index a term matrix, and build out a graph.

    Args:
        path (str): The file path.
        term_depth (int): Consider the N most frequent terms.
        skim_depth (int): Connect each word to the N closest siblings.
        d_weights (bool): If true, give "close" nodes low weights.

    Returns:
        Skimmer: The indexed graph.
    """

    # Tokenize text.
    click.echo('\nTokenizing text...')
    t = Text.from_file(path)
    click.echo('Extracted %d tokens' % len(t.tokens))

    m = Matrix()

    # Index the term matrix.
    click.echo('\nIndexing terms:')
    m.index(t, t.most_frequent_terms(term_depth), **kwargs)

    g = Skimmer()

    # Construct the network.
    click.echo('\nGenerating graph:')
    g.build(t, m, skim_depth, d_weights)

    return g
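A minimal usage sketch for the pipeline above; the input path is a placeholder, and build_graph() is the function defined in this example:

if __name__ == '__main__':
    # Hypothetical driver: 'corpus.txt' stands in for any plain-text file.
    graph = build_graph('corpus.txt', term_depth=500, skim_depth=5)
    # graph is the Skimmer returned above, holding the indexed term network.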
Example #3
def test_term_counts():
    """
    term_counts() should return a map of term -> count.
    """

    t = Text('aa bb bb cc cc cc')

    assert t.term_counts() == OrderedDict([('cc', 3), ('bb', 2), ('aa', 1)])
def test_most_frequent_terms():
    """
    most_frequent_terms() should return the N most frequent terms.
    """

    t = Text('aa bb bb cc cc cc')

    # Top 2 words are 'cc' and 'bb'
    assert t.most_frequent_terms(2) == set(['cc', 'bb'])
def test_term_count_buckets():
    """
    term_count_buckets() should map integer counts to the list of terms in the
    text that appear that many times.
    """

    t = Text('aa bb bb cc cc dd dd dd')

    assert t.term_count_buckets() == {1: ['aa'], 2: ['bb', 'cc'], 3: ['dd']}
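The relationship between term_counts() and term_count_buckets() can be made concrete with a small sketch. This illustrates the behavior the tests assert; it is not textplot's own implementation:

from collections import defaultdict

def buckets_from_counts(counts):
    # Invert a term -> count mapping into count -> [terms].
    buckets = defaultdict(list)
    for term, count in counts.items():
        buckets[count].append(term)
    return dict(buckets)

# buckets_from_counts({'cc': 3, 'bb': 2, 'aa': 1})
# -> {3: ['cc'], 2: ['bb'], 1: ['aa']}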
Example #6
def test_set_term_offsets():

    """
    During tokenization, store a map of token -> offset positions.
    """

    t = Text('aa bb aa bb')

    assert t.terms['aa'] == [0, 2]
    assert t.terms['bb'] == [1, 3]
Example #7
def test_custom_file():
    """
    Load a custom file, when a path is passed.
    """

    path = os.path.join(os.path.dirname(__file__), 'fixtures/stopwords.txt')

    t = Text('test', stopwords=path)

    assert t.stopwords == set(['sa', 'sb', 'sc'])
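For reference, the stopword fixture this test loads is just a plain-text file with one word per line (the same line-per-word format the default-file test below relies on). A hypothetical snippet that would produce it:

# Hypothetical fixture setup, inferred from the assertion above.
with open('fixtures/stopwords.txt', 'w') as f:
    f.write('sa\nsb\nsc\n')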
Example #8
def test_set_tokens():

    """
    tokenize() should record individual tokens.
    """

    t = Text('aa bb cc')

    assert t.tokens[0]['unstemmed'] == 'aa'
    assert t.tokens[1]['unstemmed'] == 'bb'
    assert t.tokens[2]['unstemmed'] == 'cc'
    assert len(t.tokens) == 3
def test_merge_smallest_bucket():
    """
    Say 1000 gets passed as the depth, and the 1000th term in the term-counts
    dictionary has a count of 10. But, there are 20 other terms that also show
    up 10 times in the text. In this case, all of the terms in this smallest
    bucket should be included, so as not to arbitrarily omit words that appear
    with the same frequency as words that do get included.
    """

    t = Text('aa bb bb cc cc dd dd dd')

    # 'dd' is the most frequent term; 'bb' and 'cc' tie at the cutoff, so the
    # whole 2-count bucket is merged in rather than being split arbitrarily.
    assert t.most_frequent_terms(2) == set(['dd', 'cc', 'bb'])
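A sketch of the "merge the smallest bucket" rule described in the docstring, working from the count -> terms buckets of Example #3. Illustrative only, not textplot's actual code:

def most_frequent_terms_sketch(buckets, depth):
    # Walk buckets from the highest count down, keeping whole buckets, so a
    # term is never dropped while another term with the same count is kept.
    terms = set()
    for count in sorted(buckets, reverse=True):
        terms.update(buckets[count])
        if len(terms) >= depth:
            break
    return terms

# With buckets {1: ['aa'], 2: ['bb', 'cc'], 3: ['dd']} and depth=2, the whole
# 2-count bucket is kept, giving {'dd', 'cc', 'bb'} as the test asserts.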
Example #10
def test_index():
    """
    index() should index the Bray-Curtis distances between terms.
    """

    t = Text('aa bb cc')
    m = Matrix()

    m.index(t)

    assert m.get_pair('aa', 'bb') == t.score_braycurtis('aa', 'bb')
    assert m.get_pair('aa', 'cc') == t.score_braycurtis('aa', 'cc')
    assert m.get_pair('bb', 'cc') == t.score_braycurtis('bb', 'cc')
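As background for score_braycurtis(), here is the Bray-Curtis metric itself on two toy vectors via SciPy. This shows the distance formula only, not how Text builds the underlying term distributions it compares:

from scipy.spatial.distance import braycurtis

# Bray-Curtis distance: sum(|u - v|) / sum(|u + v|).
u = [1.0, 0.0, 2.0]
v = [0.0, 1.0, 2.0]
print(braycurtis(u, v))  # (1 + 1 + 0) / (1 + 1 + 4) = 2/6 ~ 0.333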
Example #11
def test_default_file():
    """
    When no path is passed to load_stopwords(), the default file in the
    textplot module should be loaded.
    """

    defaults = set(
        pkgutil.get_data('textplot',
                         'data/stopwords.txt').decode('utf8').splitlines())

    t = Text('test')

    assert t.stopwords == defaults
Example #12
def test_ignore_stopwords():

    """
    Stopwords should be represented as None in the token list.
    """

    t = Text('aa the bb an cc')

    assert t.tokens[0]['unstemmed'] == 'aa'
    assert t.tokens[1] is None
    assert t.tokens[2]['unstemmed'] == 'bb'
    assert t.tokens[3] is None
    assert t.tokens[4]['unstemmed'] == 'cc'
    assert len(t.tokens) == 5
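An illustrative sketch of the behavior this test checks: during tokenization, stopwords become None placeholders so the offsets of the surviving tokens are preserved. This is a simplification, not textplot's tokenizer:

def tokenize_sketch(text, stopwords):
    # Keep one slot per word; stopwords are recorded as None so that the
    # positional offsets of the remaining tokens stay intact.
    tokens = []
    for word in text.split():
        tokens.append(None if word in stopwords else {'unstemmed': word})
    return tokens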
Example #13
def test_term_subset():
    """
    When a subset of terms is passed, just those terms should be indexed.
    """

    t = Text('aa bb cc')
    m = Matrix()

    m.index(t, ['aa', 'bb'])

    # Should index 'aa' and 'bb'.
    assert m.get_pair('aa', 'bb') == t.score_braycurtis('aa', 'bb')

    # Should ignore 'cc'.
    assert not m.get_pair('aa', 'cc')
    assert not m.get_pair('bb', 'cc')
Example #14
def test_anchored_pairs():

    """
    For a given anchor term, anchored_pairs() should return an ordered map of
    term -> distance for all other indexed terms.
    """

    t = Text('aa bb cc dd')
    m = Matrix()

    m.index(t)

    pairs = m.anchored_pairs('aa')

    assert list(pairs.keys()) == ['bb', 'cc', 'dd']
    assert pairs['bb'] > pairs['cc'] > pairs['dd']
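A small usage sketch built on the ordering this test asserts: anchored_pairs() puts the most strongly linked terms first, so slicing off the head of the mapping yields the anchor's closest siblings (the same idea skim_depth controls in the graph-building examples). The slice size here is arbitrary:

from itertools import islice

# With the test text above, the top 2 siblings of 'aa' would be ['bb', 'cc'].
top_siblings = list(islice(pairs.keys(), 2))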
Example #15
def clumpy(path, term_depth=500, skim_depth=10, d_weights=False, **kwargs):
    """
    Use "clumpiest" terms.
    """

    t = Text.from_file(path)
    m = Matrix(t)

    print('Indexing terms:')
    m.index(list(t.densities(**kwargs).keys())[:term_depth], **kwargs)

    g = Skimmer()

    print('Generating graph:')
    g.build(m, skim_depth, d_weights)

    return g
Example #16
def frequent(path, term_depth=500, skim_depth=10, d_weights=False, **kwargs):
    """
    Use most frequent terms.
    """

    t = Text.from_file(path)
    m = Matrix(t)

    print('Indexing terms:')
    m.index(t.most_frequent_terms(term_depth), **kwargs)

    g = Skimmer()

    print('Generating graph:')
    g.build(m, skim_depth, d_weights)

    return g