Example #1
# Imports assumed from the textplot package layout (Text, Matrix, Skimmer),
# plus click for console output.
import click

from textplot.text import Text
from textplot.matrix import Matrix
from textplot.graphs import Skimmer


def build_graph(path, term_depth=1000, skim_depth=10,
                d_weights=False, **kwargs):

    """
    Tokenize a text, index a term matrix, and build out a graph.

    Args:
        path (str): The file path.
        term_depth (int): Consider the N most frequent terms.
        skim_depth (int): Connect each word to the N closest siblings.
        d_weights (bool): If true, give "close" nodes low weights.

    Returns:
        Skimmer: The indexed graph.
    """

    # Tokenize text.
    click.echo('\nTokenizing text...')
    t = Text.from_file(path)
    click.echo('Extracted %d tokens' % len(t.tokens))

    m = Matrix()

    # Index the term matrix.
    click.echo('\nIndexing terms:')
    m.index(t, t.most_frequent_terms(term_depth), **kwargs)

    g = Skimmer()

    # Construct the network.
    click.echo('\nGenerating graph:')
    g.build(t, m, skim_depth, d_weights)

    return g
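A minimal usage sketch for this helper (the file path is hypothetical, and
write_gml() is assumed from textplot's Graph base class):

g = build_graph('corpus/war-and-peace.txt', term_depth=500, skim_depth=5)

# Export the graph for downstream tools such as Gephi.
g.write_gml('war-and-peace.gml')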
Example #2
    def build_graph(self, term_depth=1000, skim_depth=10,
                    terms=None, **kwargs):

        """
        Construct a term graph.

        Args:
            term_depth (int): Consider the N most frequent terms.
            skim_depth (int): Connect each word to the N closest siblings.
            terms (list): Use a custom set of terms.

        Returns:
            textplot.graphs.Skimmer
        """

        # By default, use N most-frequent terms.
        terms = terms or self.most_frequent_terms(term_depth)

        # Index the term matrix.
        m = Matrix()
        m.index(self, terms, **kwargs)

        # Construct the network.
        g = Skimmer()
        g.build(self, m, skim_depth)

        return g
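Since this variant hangs the builder off the Text class itself, a call might
look like this (a sketch; the file path and term list are illustrative):

t = Text.from_file('corpus/essays.txt')

# Restrict the graph to a hand-picked vocabulary instead of the
# N most frequent terms.
g = t.build_graph(skim_depth=5, terms=['war', 'peace', 'love'])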
Example #3
def frequent(path,
             term_depth=500,
             skim_depth=10,
             d_weights=False,
             stopwordfile=None,
             postags=None,
             disambiguate=False,
             **kwargs):
    """
    Use most frequent terms.
    """

    t = Text.from_file(path,
                       stopwordfile=stopwordfile,
                       postags=postags,
                       disambiguate=disambiguate)
    m = Matrix(t)

    print('Indexing terms:')
    m.index(t.most_frequent_terms(term_depth), **kwargs)

    g = Skimmer()

    print('Generating graph:')
    g.build(m, skim_depth, d_weights)

    return g
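This fork adds preprocessing knobs to Text.from_file(); a call might look
like this (a sketch; the paths and POS tag set are hypothetical):

g = frequent('corpus/novel.txt',
             term_depth=300,
             stopwordfile='stopwords.txt',
             postags=['NN', 'NE'],
             disambiguate=True)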
Example #4
def test_missing_key():

    """
    If an unindexed key pair is passed, return None.
    """

    m = Matrix()
    m.set_pair('a', 'b', 1)

    assert m.get_pair('a', 'c') is None
Example #5
def test_set_pair():

    """
    set_pair() should set the value under an order-independent key.
    """

    m = Matrix()
    m.set_pair('a', 'b', 1)

    assert m.get_pair('a', 'b') == 1
    assert m.get_pair('b', 'a') == 1
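The order-independence asserted here is easy to picture: one plausible
implementation (a sketch, not necessarily textplot's actual code) keys each
pair on a sorted tuple, so ('a', 'b') and ('b', 'a') collapse to one slot:

class PairMatrix:

    """
    Sketch of an order-independent pair store (names are illustrative).
    """

    def __init__(self):
        self.keys = set()
        self.pairs = {}

    def key(self, term1, term2):
        # Sort the terms so ('a', 'b') and ('b', 'a') map to the same key.
        return tuple(sorted((term1, term2)))

    def set_pair(self, term1, term2, value):
        self.keys.update([term1, term2])
        self.pairs[self.key(term1, term2)] = value

    def get_pair(self, term1, term2):
        # dict.get() returns None for unindexed pairs, which also matches
        # the behavior exercised by test_missing_key() above.
        return self.pairs.get(self.key(term1, term2))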
Example #6
def test_update_key_set():

    """
    Keys should be added to a set of stored keys.
    """

    m = Matrix()
    m.set_pair('a', 'b', 1)
    m.set_pair('a', 'c', 2)

    assert m.keys == {'a', 'b', 'c'}
Example #7
def test_index():

    """
    index() should index the Bray-Curtis distances between terms.
    """

    t = Text('aa bb cc')
    m = Matrix()

    m.index(t)

    assert m.get_pair('aa', 'bb') == t.score_braycurtis('aa', 'bb')
    assert m.get_pair('aa', 'cc') == t.score_braycurtis('aa', 'cc')
    assert m.get_pair('bb', 'cc') == t.score_braycurtis('bb', 'cc')
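For reference, Bray-Curtis dissimilarity is available in scipy, and a
similarity score of the kind these assertions compare against is its
complement (a sketch; textplot derives the input vectors from smoothed
term-offset histograms):

import numpy as np
from scipy.spatial.distance import braycurtis

# Two hypothetical smoothed term-density signals sampled over a text.
u = np.array([0.1, 0.4, 0.3, 0.2])
v = np.array([0.2, 0.3, 0.3, 0.2])

# braycurtis() returns sum(|u - v|) / sum(|u + v|), a dissimilarity in
# [0, 1] for non-negative inputs; 1 minus that value is a similarity.
similarity = 1 - braycurtis(u, v)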
Example #8
def test_anchored_pairs():

    """
    For a given anchor term, anchored_pairs() should return an ordered map of
    term -> distance for all other indexed terms.
    """

    t = Text('aa bb cc dd')
    m = Matrix()

    m.index(t)

    pairs = m.anchored_pairs('aa')

    assert list(pairs.keys()) == ['bb', 'cc', 'dd']
    assert pairs['bb'] > pairs['cc'] > pairs['dd']
Example #9
def test_anchored_pairs():

    """
    For a given anchor term, anchored_pairs() should return an ordered map of
    term -> distance for all other indexed terms.
    """

    t = Text("aa bb cc dd")
    m = Matrix()

    m.index(t)

    pairs = m.anchored_pairs("aa")

    assert list(pairs.keys()) == ["bb", "cc", "dd"]
    assert pairs["bb"] > pairs["cc"] > pairs["dd"]
Example #10
def frequent(path, term_depth=500, skim_depth=10, d_weights=False, **kwargs):
    """
    Use most frequent terms.
    """

    t = Text.from_file(path)
    m = Matrix(t)

    print('Indexing terms:')
    m.index(t.most_frequent_terms(term_depth), **kwargs)

    g = Skimmer()

    print('Generating graph:')
    g.build(m, skim_depth, d_weights)

    return g
Example #11
def test_term_subset():

    """
    When a subset of terms is passed, just those terms should be indexed.
    """

    t = Text('aa bb cc')
    m = Matrix()

    m.index(t, ['aa', 'bb'])

    # Should index 'aa' and 'bb'.
    assert m.get_pair('aa', 'bb') == t.score_braycurtis('aa', 'bb')

    # Should ignore 'cc'.
    assert not m.get_pair('aa', 'cc')
    assert not m.get_pair('bb', 'cc')
Example #12
def clumpy(path, term_depth=500, skim_depth=10, d_weights=False, **kwargs):
    """
    Use "clumpiest" terms.
    """

    t = Text.from_file(path)
    m = Matrix(t)

    print('Indexing terms:')
    # Materialize the keys view before slicing (dict views are not
    # sliceable in Python 3).
    m.index(list(t.densities(**kwargs).keys())[:term_depth], **kwargs)

    g = Skimmer()

    print('Generating graph:')
    g.build(m, skim_depth, d_weights)

    return g
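"Clumpiness" here means how tightly a term's occurrences bunch together in
the text; the slice assumes densities() returns a term -> density mapping
already sorted in descending order. A toy sketch of that selection step
(the values are made up):

from collections import OrderedDict

densities = OrderedDict([('whale', 0.91), ('ahab', 0.88), ('sea', 0.40)])

# Take the N "clumpiest" terms off the top of the sorted mapping.
clumpiest = list(densities.keys())[:2]  # ['whale', 'ahab']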
Example #13
def clumpy(path, term_depth=500, skim_depth=10, d_weights=False, **kwargs):

    """
    Use "clumpiest" terms.
    """

    t = Text.from_file(path)
    m = Matrix(t)

    print('Indexing terms:')
    # Materialize the keys view before slicing (dict views are not
    # sliceable in Python 3).
    m.index(list(t.densities(**kwargs).keys())[:term_depth], **kwargs)

    g = Skimmer()

    print('Generating graph:')
    g.build(m, skim_depth, d_weights)

    return g
Example #14
def frequent(path, term_depth=500, skim_depth=10, d_weights=False, **kwargs):

    """
    Use most frequent terms.
    """

    t = Text.from_file(path)
    m = Matrix(t)

    print('Indexing terms:')
    m.index(t.most_frequent_terms(term_depth), **kwargs)

    g = Skimmer()

    print('Generating graph:')
    g.build(m, skim_depth, d_weights)

    return g
Example #15
def build_graph(path,
                term_depth=1000,
                skim_depth=10,
                d_weights=False,
                **kwargs):
    """
    Tokenize a text, index a term matrix, and build out a graph.

    Args:
        path (str): The file path.
        term_depth (int): Consider the N most frequent terms.
        skim_depth (int): Connect each word to the N closest siblings.
        d_weights (bool): If true, give "close" nodes low weights.

    Returns:
        Skimmer: The indexed graph.
    """

    # Tokenize text.
    click.echo('\nTokenizing text...')
    t = Text.from_file(path)
    click.echo('Extracted %d tokens' % len(t.tokens))

    m = Matrix()

    # Index the term matrix.
    click.echo('\nIndexing terms:')
    m.index(t, t.most_frequent_terms(term_depth), **kwargs)

    g = Skimmer()

    # Construct the network.
    click.echo('\nGenerating graph:')
    g.build(t, m, skim_depth, d_weights)

    return g
Example #16
def test_index():
    """
    index() should index the Bray-Curtis distances between terms.
    """

    t = Text('aa bb cc')
    m = Matrix()

    m.index(t)

    assert m.get_pair('aa', 'bb') == t.score_braycurtis('aa', 'bb')
    assert m.get_pair('aa', 'cc') == t.score_braycurtis('aa', 'cc')
    assert m.get_pair('bb', 'cc') == t.score_braycurtis('bb', 'cc')
Example #17
def test_term_subset():
    """
    When a subset of terms is passed, just those terms should be indexed.
    """

    t = Text('aa bb cc')
    m = Matrix()

    m.index(t, ['aa', 'bb'])

    # Should index 'aa' and 'bb'.
    assert m.get_pair('aa', 'bb') == t.score_braycurtis('aa', 'bb')

    # Should ignore 'cc'.
    assert not m.get_pair('aa', 'cc')
    assert not m.get_pair('bb', 'cc')