Python NGramSpace.NGramSpace Examples

Programming Language: Python

Namespace/Package Name: ngrams

Class/Type: NGramSpace

Method/Function: NGramSpace

Examples at hotexamples.com: 8

Python NGramSpace.NGramSpace - 8 examples found. These are the top rated real world Python examples of ngrams.NGramSpace.NGramSpace extracted from open source projects. You can rate examples to help us improve the quality of examples.

Frequently Used Methods

Show Hide

NGramSpace(8)

parse(7)

Frequently Used Methods

Example #1

0

Show file

def setup(source, pdf_path):
    """Parse every row of the CSV at *source* and cluster the results.

    Args:
        source: Path of a CSV file; rows are read with ``csv.DictReader``.
        pdf_path: Passed through unchanged to ``extract_row`` for each row.

    Returns:
        A ``(clustering, docs)`` tuple: the ``Clustering`` built from the
        ``parsed`` attribute of every extracted document, and the list of
        extracted documents.
    """
    # One NGramSpace instance is shared by every extract_row call.
    ngrams = NGramSpace(4)
    # Single-argument parenthesised print emits the same text on Python 2
    # and is also valid on Python 3.
    print("parsing documents at %s..." % source)
    # Close the CSV deterministically; the file handle used to be leaked.
    with open(source, 'r') as csv_file:
        docs = [
            extract_row(row, pdf_path, ngrams)
            for row in csv.DictReader(csv_file)
        ]
    print("clustering %d documents..." % len(docs))
    clustering = Clustering([doc.parsed for doc in docs])
    return (clustering, docs)

Example #2

0

Show file

File: tests.py Project: sunlightlabs/regulations-scraper

    def test_pairs(self):
        """Check closest_pair and farthest_pair over three index groups."""
        space = NGramSpace(1)
        parsed = [space.parse(text) for text in test_docs]
        clusters = Clustering(parsed)

        # (expected pair, document indices), checked in this order.
        closest_cases = (
            ((1, 0), [0, 1, 2]),
            ((5, 3), [3, 4, 5]),
            ((7, 6), [6, 7]),
        )
        for expected, members in closest_cases:
            self.assertEqual(expected, clusters.closest_pair(members))

        farthest_cases = (
            ((2, 0), [0, 1, 2]),
            ((5, 4), [3, 4, 5]),
            ((7, 6), [6, 7]),
        )
        for expected, members in farthest_cases:
            self.assertEqual(expected, clusters.farthest_pair(members))

Example #3

0

Show file

File: tests.py Project: sunlightlabs/regulations-scraper

    def test_nonseeded_clustering(self):
        """Step through min_link/merge on test_docs and check each link."""
        space = NGramSpace(1)
        parsed = [space.parse(text) for text in test_docs]
        clusters = Clustering(parsed)

        # Each step: assert the current min_link, then merge that same pair.
        for link in ((1, 0), (2, 1)):
            self.assertEqual(link, clusters.min_link())
            clusters.merge(*link)

        # Either of these two pairs is accepted as the next link.
        next_link = clusters.min_link()
        self.assertTrue(next_link in [(4, 3), (5, 3)])

        for other in (4, 5):
            clusters.merge(3, other)
        self.assertEqual((7, 6), clusters.min_link())

Example #4

0

Show file

File: tests.py Project: sunlightlabs/regulations-scraper

    def test_distance(self):
        """Check entries of Clustering.distance for three short documents."""
        space = NGramSpace(1)
        parsed = [space.parse(text) for text in ['a b c', 'b c d', 'd e f']]

        clusters = Clustering(parsed)

        # (expected value, (row, column)) in the order the entries are read.
        expectations = (
            (0, (0, 0)),
            (0.5, (1, 0)),
            (0, (1, 1)),
            (1.0, (2, 0)),
            (0.8, (2, 1)),
            (0, (2, 2)),
        )
        for expected, cell in expectations:
            self.assertEqual(expected, clusters.distance[cell])

Example #5

0

Show file

File: tests.py Project: sunlightlabs/regulations-scraper

    def test_nearest_neighbors(self):
        """Check closest_neighbors for several seed sets and result sizes."""
        space = NGramSpace(1)
        parsed = [space.parse(text) for text in test_docs]
        clusters = Clustering(parsed)

        # NOTE(review): presumably prints the distance matrix for debugging;
        # confirm against Clustering.pp_distance.
        clusters.pp_distance(range(len(test_docs)))

        # (expected neighbours, seed indices, how many to return)
        cases = (
            ([1], [0], 1),
            ([1, 2], [0], 2),
            ([1, 2, 3], [0], 3),
            ([1, 2, 3, 5], [0], 4),
            ([5], [3, 4], 1),
            ([5, 1], [3, 4], 2),
        )
        for expected, seeds, count in cases:
            self.assertEqual(expected, clusters.closest_neighbors(seeds, count))

Example #6

0

Show file

File: tests.py Project: sunlightlabs/regulations-scraper

    def test_trigram(self):
        """Parse two sentences with NGramSpace(3); expect no shared ids."""
        trigrams = NGramSpace(3)

        first = trigrams.parse('This is a sentence')
        second = trigrams.parse('This is another sentence')

        self.assertEqual([1, 2], first)
        self.assertEqual([3, 4], second)

        # Both measures are checked in both argument orders.
        for measure in (overlap, jaccard):
            for left, right in ((first, second), (second, first)):
                self.assertEqual(0, measure(left, right))

Example #7

0

Show file

File: tests.py Project: sunlightlabs/regulations-scraper

    def test_bigram(self):
        """Parse two sentences with NGramSpace(2); expect one shared id."""
        bigrams = NGramSpace(2)

        first = bigrams.parse('This is a sentence')
        second = bigrams.parse('This is another sentence')

        self.assertEqual([1, 2, 3], first)
        self.assertEqual([1, 4, 5], second)

        # (measure, expected value); each is checked in both argument orders.
        checks = ((overlap, 1), (jaccard, 1.0 / 5.0))
        for measure, expected in checks:
            for left, right in ((first, second), (second, first)):
                self.assertEqual(expected, measure(left, right))

Example #8

0

Show file

File: tests.py Project: sunlightlabs/regulations-scraper

    def test_clustering(self):
        """Merge three short documents step by step and check assignments."""
        space = NGramSpace(1)
        parsed = [space.parse(text) for text in ['a b c', 'b c d', 'd e f']]

        clusters = Clustering(parsed)

        # (expected min_link, arguments to merge, expected assignments after).
        # The second merge is called with (2, 0), not the reported link (2, 1).
        steps = (
            ((1, 0), (1, 0), [1, 1, 2]),
            ((2, 1), (2, 0), [2, 2, 2]),
        )
        for link, merge_args, assignments in steps:
            self.assertEqual(link, clusters.min_link())
            clusters.merge(*merge_args)
            self.assertEqual(assignments, clusters.assignments)