def test_basic(self):
     """Parse two texts that hold the same sentences in a different order.

     Both texts are run through one shared sequencer. The expected values
     give every sentence the same id in both texts, although the second
     text differs in letter case and punctuation ("Maybe...three?").
     """
     first_text = "This is a basic text. Two sentences. Maybe three?"
     reordered_text = "Two sentences. Maybe...three? this is a basic text."
     sequencer = PhraseSequencer(self.corpus)

     # Both texts are parsed, in order, before anything is asserted.
     first_parse, reordered_parse = [
         sentence_parse(text, sequencer)
         for text in (first_text, reordered_text)
     ]

     # Entries look like (id, [(start, end), ...]) with character offsets
     # into the parsed text -- inferred from the values themselves.
     expected_first = [(0, [(0, 21)]), (1, [(22, 36)]), (2, [(37, 49)])]
     expected_reordered = [(0, [(30, 51)]), (1, [(0, 14)]), (2, [(15, 29)])]

     self.assertEqual(expected_first, first_parse)
     self.assertEqual(expected_reordered, reordered_parse)
    def test_duplicated_phrases(self):
        """Parse and ingest a document that repeats one sentence three times.

        The expected parse groups all three repeats (including the one
        spelled "The SAME sentence.") under id 0 with three spans, and the
        single different sentence under id 1.
        """
        doc = 'The same sentence. The same sentence. A different sentence. The SAME sentence.'

        ingester = DocumentIngester(self.corpus)

        expected = [(0, [(0, 18), (19, 37), (60, 78)]), (1, [(38, 59)])]
        parsed = sentence_parse(doc, ingester.sequencer)
        self.assertEqual(expected, parsed)

        # No assertion follows: the call only has to complete without raising.
        ingester.ingest([doc])
    def test_sentence_parse(self):
        """Check sentence_parse on empty, blank, padded and repeated input.

        All cases share one sequencer and run in the listed order; the last
        case relies on 'Of two sentences.' having received id 1 earlier.
        """
        sequencer = PhraseSequencer(self.corpus)

        # (input text, expected parse) pairs, evaluated top to bottom.
        cases = [
            ('', []),
            ('   ', []),
            (
                'A simple test case. Of two sentences.',
                [(0, [(0, 19)]), (1, [(20, 37)])],
            ),
            (
                ' \n A simple test case. \t \t \n Of two sentences.\n',
                [(0, [(3, 22)]), (1, [(29, 46)])],
            ),
            (
                'of two sentences. of two sentences?',
                [(1, [(0, 17), (18, 35)])],
            ),
        ]

        for text, expected in cases:
            self.assertEqual(expected, sentence_parse(text, sequencer))
 def test_empty(self):
     """Parsing an empty string yields no phrases and stores no rows."""
     sequencer = PhraseSequencer(self.corpus)
     cursor = connection.cursor()

     self.assertEqual([], sentence_parse('', sequencer))

     # The phrases table must still be empty after the parse.
     cursor.execute('select count(*) from phrases')
     phrase_rows = cursor.fetchone()[0]
     self.assertEqual(0, phrase_rows)
    def test_ingester(self):
        """Record documents in two rounds and check the stored row counts.

        Round one records two documents and expects 2 document rows and
        5 phrase occurrence rows. Round two uses a fresh ingester and
        sequencer, records one more document, and expects the totals to
        grow to 3 and 7.
        """
        ingester = DocumentIngester(self.corpus)
        sequencer = PhraseSequencer(self.corpus)

        three_sentences = 'This document has three sentences. One of which matches. Two of which do not.'
        two_sentences = 'This document has only two sentences. One of which matches.'

        for text in (three_sentences, two_sentences):
            ingester._record_document(text, sentence_parse(text, sequencer), {})

        sequencer.upload_new_phrases()
        ingester._upload_new_documents()

        cursor = connection.cursor()

        def scalar(query):
            # Run a single-value query and return the first column.
            cursor.execute(query)
            return cursor.fetchone()[0]

        self.assertEqual(2, scalar("select count(*) from documents"))
        self.assertEqual(5, scalar("select count(*) from phrase_occurrences"))

        # make sure we can add on to existing data
        ingester = DocumentIngester(self.corpus)
        sequencer = PhraseSequencer(self.corpus)

        follow_up = 'This document has only two sentences. Only one of which is new.'
        follow_up_parse = sentence_parse(follow_up, sequencer)

        new_doc_id = ingester._record_document(follow_up, follow_up_parse, {})
        self.assertEqual(2, new_doc_id)
        # The first sentence reuses id 3 from round one; the second gets id 4.
        self.assertEqual([(3, [(0, 37)]), (4, [(38, 63)])], follow_up_parse)

        sequencer.upload_new_phrases()
        ingester._upload_new_documents()

        self.assertEqual(3, scalar("select count(*) from documents"))
        self.assertEqual(7, scalar("select count(*) from phrase_occurrences"))