def frequent(path, term_depth=500, skim_depth=10, d_weights=False, stopwordfile=None, postags=None, disambiguate=False, **kwargs):
    """
    Build a skim graph from the most frequent terms in a text file.

    Args:
        path (str): Path to the source text.
        term_depth (int): Index the N most frequent terms.
        skim_depth (int): Connect each term to its N closest siblings.
        d_weights (bool): If true, give "close" nodes low weights.
        stopwordfile (str): Optional custom stopword list, passed to Text.
        postags: Optional POS-tag filter, passed to Text.
        disambiguate (bool): Whether to disambiguate tokens, passed to Text.

    Returns:
        Skimmer: The constructed graph.
    """
    text = Text.from_file(path, stopwordfile=stopwordfile, postags=postags, disambiguate=disambiguate)

    # Index pairwise distances for the top-N terms.
    matrix = Matrix(text)
    print('Indexing terms:')
    matrix.index(text.most_frequent_terms(term_depth), **kwargs)

    # Wire the indexed terms into a graph.
    graph = Skimmer()
    print('Generating graph:')
    graph.build(matrix, skim_depth, d_weights)

    return graph
def test_missing_key():
    """
    If an unindexed key pair is passed, return None.
    """
    m = Matrix()
    m.set_pair('a', 'b', 1)
    # PEP 8: compare against None with `is`, not `==`.
    assert m.get_pair('a', 'c') is None
def test_set_pair():
    """
    set_pair() should store the value under an order-independent key.
    """
    matrix = Matrix()
    matrix.set_pair('a', 'b', 1)
    # The stored value is retrievable with the keys in either order.
    for first, second in (('a', 'b'), ('b', 'a')):
        assert matrix.get_pair(first, second) == 1
def test_update_key_set():
    """
    Keys should be added to a set of stored keys.
    """
    m = Matrix()
    m.set_pair('a', 'b', 1)
    m.set_pair('a', 'c', 2)
    # Set literal instead of set([...]) — same value, idiomatic form.
    assert m.keys == {'a', 'b', 'c'}
def test_index():
    """
    index() should record the Bray-Curtis distance for every term pair.
    """
    text = Text('aa bb cc')
    matrix = Matrix()
    matrix.index(text)

    # Each of the three pairs matches the score computed directly.
    for w1, w2 in [('aa', 'bb'), ('aa', 'cc'), ('bb', 'cc')]:
        assert matrix.get_pair(w1, w2) == text.score_braycurtis(w1, w2)
def test_anchored_pairs():
    """
    anchored_pairs() should map every other indexed term to its distance
    from the anchor term, ordered from nearest to farthest.
    """
    text = Text('aa bb cc dd')
    matrix = Matrix()
    matrix.index(text)

    pairs = matrix.anchored_pairs('aa')

    # Keys come back in descending proximity to the anchor.
    assert list(pairs.keys()) == ['bb', 'cc', 'dd']
    assert pairs['bb'] > pairs['cc']
    assert pairs['cc'] > pairs['dd']
def test_term_subset():
    """
    When a subset of terms is passed, only those terms should be indexed.
    """
    text = Text('aa bb cc')
    matrix = Matrix()
    matrix.index(text, ['aa', 'bb'])

    # The listed pair is indexed with its Bray-Curtis score.
    assert matrix.get_pair('aa', 'bb') == text.score_braycurtis('aa', 'bb')

    # Pairs involving the excluded term 'cc' are absent.
    assert not matrix.get_pair('aa', 'cc')
    assert not matrix.get_pair('bb', 'cc')
def frequent(path, term_depth=500, skim_depth=10, d_weights=False, **kwargs):
    """
    Build a skim graph over the most frequent terms of a text.

    Args:
        path (str): Path to the source text.
        term_depth (int): Index the N most frequent terms.
        skim_depth (int): Connect each term to its N closest siblings.
        d_weights (bool): If true, give "close" nodes low weights.

    Returns:
        Skimmer: The constructed graph.
    """
    text = Text.from_file(path)

    # Index distances for the top-N terms.
    matrix = Matrix(text)
    print('Indexing terms:')
    matrix.index(text.most_frequent_terms(term_depth), **kwargs)

    # Assemble the graph from the matrix.
    graph = Skimmer()
    print('Generating graph:')
    graph.build(matrix, skim_depth, d_weights)

    return graph
def clumpy(path, term_depth=500, skim_depth=10, d_weights=False, **kwargs):
    """
    Build a skim graph over the "clumpiest" terms of a text.

    Args:
        path (str): Path to the source text.
        term_depth (int): Index the N clumpiest terms.
        skim_depth (int): Connect each term to its N closest siblings.
        d_weights (bool): If true, give "close" nodes low weights.

    Returns:
        Skimmer: The constructed graph.
    """
    t = Text.from_file(path)
    m = Matrix(t)

    print('Indexing terms:')
    # BUG FIX: dict views are not sliceable in Python 3 —
    # `.keys()[:term_depth]` raises TypeError. Materialize to a list first.
    terms = list(t.densities(**kwargs))[:term_depth]
    m.index(terms, **kwargs)

    g = Skimmer()
    print('Generating graph:')
    g.build(m, skim_depth, d_weights)

    return g
def build_graph(path, term_depth=1000, skim_depth=10, d_weights=False, **kwargs):
    """
    Tokenize a text, index a term matrix, and build out a graph.

    Args:
        path (str): The file path.
        term_depth (int): Consider the N most frequent terms.
        skim_depth (int): Connect each word to the N closest siblings.
        d_weights (bool): If true, give "close" nodes low weights.

    Returns:
        Skimmer: The indexed graph.
    """
    # Tokenize the source text.
    click.echo('\nTokenizing text...')
    text = Text.from_file(path)
    click.echo('Extracted %d tokens' % len(text.tokens))

    # Index pairwise distances for the most frequent terms.
    matrix = Matrix()
    click.echo('\nIndexing terms:')
    matrix.index(text, text.most_frequent_terms(term_depth), **kwargs)

    # Wire the indexed terms into a skim graph.
    graph = Skimmer()
    click.echo('\nGenerating graph:')
    graph.build(text, matrix, skim_depth, d_weights)

    return graph