Example #1
    def testCn(self):
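        # Index a Chinese source text with LANGUAGE CHINESE and verify that
        # SUMMARIZE/HIGHLIGHT return usable fragments for both CJK and English queries.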
        text = open(SRCTEXT).read()
        self.cmd('ft.create', 'idx', 'schema', 'txt', 'text')
        self.cmd('ft.add', 'idx', 'doc1', 1.0, 'LANGUAGE', 'CHINESE', 'FIELDS', 'txt', text)
        res = self.cmd('ft.search', 'idx', '之旅', 'SUMMARIZE', 'HIGHLIGHT', 'LANGUAGE', 'chinese')
        res[2] = [safe_unicode(x) for x in res[2]]
        #self.assertEqual([long(1), 'doc1', ['txt', '2009\xe5\xb9\xb4\xef\xbc\x98\xe6\x9c\x88\xef\xbc\x96\xe6\x97\xa5\xe5\xbc\x80\xe5\xa7\x8b\xe5\xa4\xa7\xe5\xad\xa6<b>\xe4\xb9\x8b\xe6\x97\x85</b>\xef\xbc\x8c\xe5\xb2\xb3\xe9\x98\xb3\xe4\xbb\x8a\xe5\xa4\xa9\xe7\x9a\x84\xe6\xb0\x94\xe6\xb8\xa9\xe4\xb8\xba38.6\xe2\x84\x83, \xe4\xb9\x9f\xe5\xb0\xb1\xe6\x98\xaf101.48\xe2\x84\x89... \xef\xbc\x8c \xe5\x8d\x95\xe4\xbd\x8d \xe5\x92\x8c \xe5\x85\xa8\xe8\xa7\x92 : 2009\xe5\xb9\xb4 8\xe6\x9c\x88 6\xe6\x97\xa5 \xe5\xbc\x80\xe5\xa7\x8b \xe5\xa4\xa7\xe5\xad\xa6 <b>\xe4\xb9\x8b\xe6\x97\x85</b> \xef\xbc\x8c \xe5\xb2\xb3\xe9\x98\xb3 \xe4\xbb\x8a\xe5\xa4\xa9 \xe7\x9a\x84 \xe6\xb0\x94\xe6\xb8\xa9 \xe4\xb8\xba 38.6\xe2\x84\x83 , \xe4\xb9\x9f\xe5\xb0\xb1\xe6\x98\xaf 101... ')]], res)

        res = self.cmd('ft.search', 'idx', 'hacker', 'summarize', 'highlight')
        res[2] = [safe_unicode(x) for x in res[2]]
        #self.assertEqual([long(1), 'doc1', ['txt', ' visit http://code.google.com/p/jcseg, we all admire the <b>hacker</b> spirit!\xe7\x89\xb9\xe6\xae\x8a\xe6\x95\xb0\xe5\xad\x97: \xe2\x91\xa0 \xe2\x91\xa9 \xe2\x91\xbd \xe3\x88\xa9. ... p / jcseg , we all admire appreciate like love enjoy the <b>hacker</b> spirit mind ! \xe7\x89\xb9\xe6\xae\x8a \xe6\x95\xb0\xe5\xad\x97 : \xe2\x91\xa0 \xe2\x91\xa9 \xe2\x91\xbd \xe3\x88\xa9 . ~~~ ... ']], res)

        # Check that we can tokenize English with friso (sub-optimal, but we don't want gibberish)
        gentxt = open(GENTXT).read()
        self.cmd('ft.add', 'idx', 'doc2', 1.0, 'LANGUAGE', 'chinese', 'FIELDS', 'txt', gentxt)
        res = self.cmd('ft.search', 'idx', 'abraham', 'summarize', 'highlight')
        self.assertEqual(long(1), res[0])
        self.assertEqual('doc2', res[1])
        res[2] = [safe_unicode(x) for x in res[2]]
        self.assertTrue(u'<b>Abraham</b>' in res[2][1])

        # Add an empty document. Hope we don't crash!
        self.cmd('ft.add', 'idx', 'doc3', 1.0, 'language', 'chinese', 'fields', 'txt1', '')

        # Check splitting. TODO - see how to actually test for matches
        self.cmd('ft.search', 'idx', 'redis客户端', 'language', 'chinese')
        self.cmd('ft.search', 'idx', '简介Redisson 是一个高级的分布式协调Redis客户端', 'language', 'chinese')
Example #2
    def testSummarization(self):
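        # Exercise SUMMARIZE/HIGHLIGHT options (LEN, TAGS, FRAGS, SEPARATOR) against
        # the Genesis corpus, including queries with no matched terms.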
        # Load the file
        self.setupGenesis()
        res = self.cmd('FT.SEARCH', 'idx', 'abraham isaac jacob', 'SUMMARIZE',
                       'FIELDS', 1, 'txt', 'LEN', 20, 'HIGHLIGHT', 'FIELDS', 1,
                       'txt', 'TAGS', '<b>', '</b>')
        self.assertEqual(1, res[0])
        # print res
        res_txt = res[2][1]
        # print res_txt

        self.assertTrue("<b>Abraham</b>" in res_txt)
        self.assertTrue("<b>Isaac</b>" in res_txt)
        self.assertTrue("<b>Jacob</b>" in res_txt)

        res = self.cmd('FT.SEARCH', 'idx', 'abraham isaac jacob', 'HIGHLIGHT',
                       'fields', 1, 'txt', 'TAGS', '<i>', '</i>')
        res_txt = res[2][1]
        self.assertGreaterEqual(len(res_txt), 160000)

        res = self.cmd('FT.SEARCH', 'idx', 'abraham isaac jacob', 'SUMMARIZE',
                       'FIELDS', 1, 'txt', 'FRAGS', 10000)
        # print res
        res_list = res[2][1]
        #self.assertIsInstance(res_list, list)

        # Search with custom separator
        res = self.cmd('FT.SEARCH', 'idx', 'isaac', 'SUMMARIZE', 'FIELDS', 1,
                       'txt', 'SEPARATOR', '\r\n', 'FRAGS', 4, 'LEN', 3)
        res[2] = [safe_unicode(x) for x in res[2]]
        self.assertEqual([
            long(1), u'gen1',
            [
                u'txt',
                u'name Isaac: and\r\nwith Isaac,\r\nIsaac. {21:4} And Abraham circumcised his son Isaac\r\nson Isaac was\r\n'
            ]
        ], res)

        # Attempt a query which doesn't have a corresponding matched term
        res = self.cmd('FT.SEARCH', 'idx', '-blah', 'SUMMARIZE', 'LEN', 3)
        self.assertEqual(long(1), res[0])
        self.assertEqual('gen1', res[1])
        res[2] = [safe_unicode(x) for x in res[2]]
        self.assertTrue(
            u'The First Book of Moses, called Genesis {1:1}' in res[2][1])

        # Try the same, but attempting to highlight
        res = self.cmd('FT.SEARCH', 'idx', '-blah', 'HIGHLIGHT')
        res[2] = [safe_unicode(x) for x in res[2]]
        self.assertTrue(215000 >= len(res[2][1]) >= 211000)
Example #3
    def testSummarizationMultiField(self):
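        # Summarize and highlight across two TEXT fields of the same document,
        # checking that each field yields its own fragment.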
        p1 = "Redis is an open-source in-memory database project implementing a networked, in-memory key-value store with optional durability. Redis supports different kinds of abstract data structures, such as strings, lists, maps, sets, sorted sets, hyperloglogs, bitmaps and spatial indexes. The project is mainly developed by Salvatore Sanfilippo and is currently sponsored by Redis Labs.[4] Redis Labs creates and maintains the official Redis Enterprise Pack."
        p2 = "Redis typically holds the whole dataset in memory. Versions up to 2.4 could be configured to use what they refer to as virtual memory[19] in which some of the dataset is stored on disk, but this feature is deprecated. Persistence is now achieved in two different ways: one is called snapshotting, and is a semi-persistent durability mode where the dataset is asynchronously transferred from memory to disk from time to time, written in RDB dump format. Since version 1.1 the safer alternative is AOF, an append-only file (a journal) that is written as operations modifying the dataset in memory are processed. Redis is able to rewrite the append-only file in the background in order to avoid an indefinite growth of the journal."

        self.cmd('FT.CREATE', 'idx', 'SCHEMA', 'txt1', 'TEXT', 'txt2', 'TEXT')
        self.cmd('FT.ADD', 'idx', 'redis', 1.0, 'FIELDS', 'txt1', p1, 'txt2',
                 p2)

        # Multi-field search with HIGHLIGHT and RETURN; just make sure it executes
        self.cmd('FT.SEARCH', 'idx', 'memory persistence salvatore',
                 'HIGHLIGHT', 'TAGS', '<b>', '</b>', 'SUMMARIZE', 'LEN', 5,
                 'RETURN', 2, 'txt1', 'txt2')

        # Multi-field search with SUMMARIZE over both fields
        res = self.cmd('FT.SEARCH', 'idx', 'memory persistence salvatore',
                       'SUMMARIZE', 'FIELDS', 2, 'txt1', 'txt2', 'LEN', 5)
        # print res
        self.assertEqual(long(1), res[0])
        self.assertEqual('redis', res[1])
        res[2] = [safe_unicode(x) for x in res[2]]
        self.assertTrue(u'txt1' in res[2])
        self.assertTrue(
            u'memory database project implementing a networked, in-memory ... by Salvatore Sanfilippo... '
            in res[2])
        self.assertTrue(u'txt2' in res[2])
        self.assertTrue(
            u'dataset in memory. Versions... as virtual memory[19] in... persistent durability mode where the dataset is asynchronously transferred from memory... '
            in res[2])
Example #4
    def testOverflow1(self):
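        # Index a single small record and verify that HIGHLIGHT wraps the matched
        # term in the title field.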
     #"FT.CREATE" "netflix" "SCHEMA" "title" "TEXT" "WEIGHT" "1" "rating" "TEXT" "WEIGHT" "1" "level" "TEXT" "WEIGHT" "1" "description" "TEXT" "WEIGHT" "1" "year" "NUMERIC" "uscore" "NUMERIC" "usize" "NUMERIC"
     #FT.ADD" "netflix" "15ad80086ccc7f" "1" "FIELDS" "title" "The Vampire Diaries" "rating" "TV-14" "level" "Parents strongly cautioned. May be unsuitable for children ages 14 and under." "description" "90" "year" "2017" "uscore" "91" "usize" "80"
     self.cmd('FT.CREATE', 'netflix', 'SCHEMA', 'title', 'TEXT', 'rating',
              'TEXT', 'leve', 'TEXT', 'description', 'TEXT', 'year',
              'NUMERIC', 'uscore', 'NUMERIC', 'usize', 'NUMERIC')
     self.cmd(
         'FT.ADD', "netflix", "15ad80086ccc7f", "1.0", "FIELDS", "title",
         "The Vampire Diaries", "rating", "TV-14", "level",
         "Parents strongly cautioned. May be unsuitable for children ages 14 and under.",
         "description", "90", "year", "2017", "uscore", "91", "usize", "80")
     res = self.cmd('ft.search', 'netflix', 'vampire', 'highlight')
     self.assertTrue(res[0] == long(1))
     self.assertTrue(res[1] == u'15ad80086ccc7f')
     res[2] = [safe_unicode(x) for x in res[2]]
     self.assertTrue(u'The <b>Vampire</b> Diaries' in res[2])
Example #5
    def testPrefixExpansion(self):
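        # Apply HIGHLIGHT and SUMMARIZE to a prefix query ('begi*') against the
        # Genesis corpus.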
        # Search with prefix
        self.setupGenesis()
        res = self.cmd('FT.SEARCH', 'idx', 'begi*', 'HIGHLIGHT', 'FIELDS', 1,
                       'txt', 'TAGS', '<b>', '</b>', 'SUMMARIZE', 'FIELDS', 1,
                       'txt', 'LEN', 20)
        res[2] = [safe_unicode(x) for x in res[2]]

        # Prefix expansion uses an "early exit" strategy, so the highlighted term
        # won't necessarily be the best-matching term
        self.assertEqual([
            long(1), 'gen1',
            [
                u'txt',
                'is] one, and they have all one language; and this they <b>begin</b> to do: and now nothing will be restrained from them, which... '
            ]
        ], res)
Example #6
    def annotate_exception(self, exception, number, command):
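        # Prefix the exception message with the index and text of the pipeline
        # command that raised it, keeping any remaining exception args intact.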
        cmd = safe_unicode(' ').join(imap(safe_unicode, command))
        msg = unicode('Command # %d (%s) of pipeline caused error: %s') % (
            number, cmd, safe_unicode(exception.args[0]))
        exception.args = (msg,) + exception.args[1:]