Python PLexicon.termToWordIdsの例

プログラミング言語: Python

名前空間/パッケージ名: Products.ZCTextIndex.ZCTextIndex

クラス/型: PLexicon

メソッド/関数: termToWordIds

hotexamples.comのコード掲載数: 3

Python PLexicon.termToWordIds - 3件のコード例が見つかりました。すべてオープンソースプロジェクトから抽出されたPythonのProducts.ZCTextIndex.ZCTextIndex.PLexicon.termToWordIdsの実例で、最も評価が高いものを厳選しています。コード例の評価を行っていただくことで、より質の高いコード例が表示されるようになります。

よく使われるメソッド

表示 / 非表示

PLexicon(30)

termToWordIds(2)

_pipeline(1)

よく使われるメソッド

PLexicon (30)

termToWordIds (2)

_pipeline (1)

コード例 #1

ファイルを表示

ファイル: testZCTextIndex.py プロジェクト: nacho22martin/tesis

class ZCIndexTestsBase:
    """Index-independent ZCTextIndex test cases.

    Mix-in: the methods rely on ``self.IndexFactory`` and on the ``assert*``
    helpers, neither of which is defined here, so a concrete test class has
    to provide them (presumably by also inheriting ``unittest.TestCase`` --
    confirm against the subclasses).

    Test methods carry ``#`` comments instead of docstrings so that
    unittest's verbose output keeps printing the method names.
    """

    def setUp(self):
        # Every test gets a fresh lexicon (built from a Splitter, a
        # CaseNormalizer and a StopWordRemover) and an index on 'text'.
        self.lexicon = PLexicon('lexicon', '',
                                Splitter(),
                                CaseNormalizer(),
                                StopWordRemover())
        caller = LexiconHolder(self.lexicon)
        self.zc_index = ZCTextIndex('name',
                                    None,
                                    caller,
                                    self.IndexFactory,
                                    'text',
                                    'lexicon')
        self.index = self.zc_index.index

    def parserFailure(self, query):
        # The query must be rejected with ParseError.
        self.assertRaises(ParseError, self.zc_index.query, query)

    def parserSuccess(self, query, n):
        # The query must report exactly n matches.  When n is non-zero the
        # first field of the top result must be 1 (presumably the docid the
        # tests index under -- confirm against ZCTextIndex.query).
        r, num = self.zc_index.query(query)
        self.assertEqual(num, n)
        if n:
            self.assertEqual(r[0][0], 1)

    def testMultipleAttributes(self):
        # Index built with the field name 'text1,text2' (presumably a
        # comma-separated attribute list).  It shares self.lexicon from
        # setUp; no second lexicon is needed.
        caller = LexiconHolder(self.lexicon)
        zc_index = ZCTextIndex('name',
                               None,
                               caller,
                               self.IndexFactory,
                               'text1,text2',
                               'lexicon')
        doc = Indexable2('foo bar', 'alpha omega')
        zc_index.index_object(1, doc)
        nbest, total = zc_index.query('foo')
        self.assertEqual(len(nbest), 1)
        # terms coming from different values still match the same document
        nbest, total = zc_index.query('foo alpha')
        self.assertEqual(len(nbest), 1)
        # 'gamma' occurs in neither value, so nothing is returned
        nbest, total = zc_index.query('foo alpha gamma')
        self.assertEqual(len(nbest), 0)

    def testListAttributes(self):
        # Same as testMultipleAttributes, but the second value is a list of
        # strings rather than a single string.
        caller = LexiconHolder(self.lexicon)
        zc_index = ZCTextIndex('name',
                               None,
                               caller,
                               self.IndexFactory,
                               'text1,text2',
                               'lexicon')
        doc = Indexable2('Hello Tim',
                         ['Now is the winter of our discontent',
                          'Made glorious summer by this sun of York', ])
        zc_index.index_object(1, doc)
        nbest, total = zc_index.query('glorious')
        self.assertEqual(len(nbest), 1)
        nbest, total = zc_index.query('York Tim')
        self.assertEqual(len(nbest), 1)
        # 'Tuesday' occurs nowhere in the document
        nbest, total = zc_index.query('Tuesday Tim York')
        self.assertEqual(len(nbest), 0)

    def testStopWords(self):
        # the only non-stopword is question
        text = ("to be or not to be "
                "that is the question")
        doc = Indexable(text)
        self.zc_index.index_object(1, doc)
        # no stop word may have been given a word id by the lexicon
        for word in text.split():
            if word != "question":
                wids = self.lexicon.termToWordIds(word)
                self.assertEqual(wids, [])
        self.assertEqual(len(self.index.get_words(1)), 1)

        # stop words combined with a real term are accepted
        self.parserSuccess('question', 1)
        self.parserSuccess('question AND to AND be', 1)
        self.parserSuccess('to AND question AND be', 1)
        self.parserSuccess('question AND NOT gardenia', 1)
        self.parserSuccess('question AND gardenia', 0)
        self.parserSuccess('gardenia', 0)
        self.parserSuccess('question OR gardenia', 1)
        self.parserSuccess('question AND NOT to AND NOT be', 1)
        self.parserSuccess('question OR to OR be', 1)
        self.parserSuccess('question to be', 1)

        # each of these queries must raise ParseError
        self.parserFailure('to be')
        self.parserFailure('to AND be')
        self.parserFailure('to OR be')
        self.parserFailure('to AND NOT be')
        self.parserFailure('to AND NOT question')
        self.parserFailure('to AND NOT gardenia')

    def testDocUpdate(self):
        # Index every entry of the module-level ``text`` (defined outside
        # this class; presumably a sequence of versions of one document)
        # under the same docid and check that after each re-index the words
        # unique to the other versions are not found.
        docid = 1   # doesn't change -- we index the same doc repeatedly
        N = len(text)
        stop = get_stopdict()

        d = {}  # word -> list of version numbers containing that word
        for i, version in enumerate(text):
            # use a simple splitter rather than an official one
            words = [w for w in re.split(r"\W+", version.lower())
                     if len(w) > 1 and w not in stop]
            # record each version number only once per word
            word_seen = set()
            for w in words:
                if w not in word_seen:
                    d.setdefault(w, []).append(i)
                    word_seen.add(w)

        unique = {}  # version number -> list of words unique to that version
        common = []  # list of words common to all versions
        for w, versionlist in d.items():
            if len(versionlist) == 1:
                unique.setdefault(versionlist[0], []).append(w)
            elif len(versionlist) == N:
                common.append(w)
        # the checks below are vacuous unless both groups are non-empty
        self.assertTrue(len(common) > 0)
        self.assertTrue(len(unique) > 0)

        for i, version in enumerate(text):
            doc = Indexable(version)
            self.zc_index.index_object(docid, doc)
            for w in common:
                nbest, total = self.zc_index.query(w)
                self.assertEqual(total, 1, "did not find %s" % w)
            for k, v in unique.items():
                if k == i:
                    # words unique to the version just indexed are skipped
                    continue
                for w in v:
                    nbest, total = self.zc_index.query(w)
                    self.assertEqual(total, 0, "did not expect to find %s" % w)

コード例 #2

ファイルを表示

ファイル: testZCTextIndex.py プロジェクト: icemac/Products.ZCatalog

class ZCIndexTestsBase(object):
    """Index-agnostic ZCTextIndex checks meant to be mixed into a test case.

    ``self.IndexFactory`` and the ``assert*`` methods are used but not
    defined here; the concrete test class has to bring them along.

    Test methods are described with ``#`` comments rather than docstrings so
    that unittest's verbose output keeps showing the method names.
    """

    def setUp(self):
        # Each test starts from a new lexicon and an index on 'text'.
        lexicon = PLexicon(
            'lexicon', '',
            Splitter(), CaseNormalizer(), StopWordRemover(),
        )
        self.lexicon = lexicon
        holder = LexiconHolder(lexicon)
        self.zc_index = ZCTextIndex(
            'name', None, holder, self.IndexFactory, 'text', 'lexicon',
        )
        self.index = self.zc_index.index

    def parserFailure(self, query):
        # Querying must raise ParseError.
        self.assertRaises(ParseError, self.zc_index.query, query)

    def parserSuccess(self, query, n):
        # The match count must equal n; for a non-zero n the first field of
        # the top result must be 1 (presumably the docid -- confirm).
        results, count = self.zc_index.query(query)
        self.assertEqual(count, n)
        if not n:
            return
        self.assertEqual(results[0][0], 1)

    def testMultipleAttributes(self):
        # Index created with the field name 'text1,text2'.
        holder = LexiconHolder(self.lexicon)
        index = ZCTextIndex(
            'name', None, holder, self.IndexFactory, 'text1,text2', 'lexicon',
        )
        index.index_object(1, Indexable2('foo bar', 'alpha omega'))

        hits, _total = index.query('foo')
        self.assertEqual(len(hits), 1)
        hits, _total = index.query('foo alpha')
        self.assertEqual(len(hits), 1)
        # 'gamma' is in neither value
        hits, _total = index.query('foo alpha gamma')
        self.assertEqual(len(hits), 0)

    def testListAttributes(self):
        # As above, with a list of strings as the second value.
        holder = LexiconHolder(self.lexicon)
        index = ZCTextIndex(
            'name', None, holder, self.IndexFactory, 'text1,text2', 'lexicon',
        )
        lines = [
            'Now is the winter of our discontent',
            'Made glorious summer by this sun of York',
        ]
        index.index_object(1, Indexable2('Hello Tim', lines))

        hits, _total = index.query('glorious')
        self.assertEqual(len(hits), 1)
        hits, _total = index.query('York Tim')
        self.assertEqual(len(hits), 1)
        # 'Tuesday' is nowhere in the document
        hits, _total = index.query('Tuesday Tim York')
        self.assertEqual(len(hits), 0)

    def testReindex(self):
        # Indexing the same docid again must replace the previous words.
        holder = LexiconHolder(self.lexicon)
        index = ZCTextIndex(
            'name', None, holder, self.IndexFactory, 'text', 'lexicon',
        )
        doc = Indexable('Hello Tim')
        index.index_object(1, doc)
        hits, _total = index.query('glorious')
        self.assertEqual(len(hits), 0)
        hits, _total = index.query('Tim')
        self.assertEqual(len(hits), 1)

        # second pass: different text under the same docid
        doc.text = 'Goodbye George'
        index.index_object(1, doc)
        hits, _total = index.query('Tim')
        self.assertEqual(len(hits), 0)
        hits, _total = index.query('Goodbye')
        self.assertEqual(len(hits), 1)

        # third pass: empty text
        doc.text = ''
        index.index_object(1, doc)
        hits, _total = index.query('George')
        self.assertEqual(len(hits), 0)

    def testStopWords(self):
        # 'question' is the only word of the sentence that is not a stop word
        sentence = 'to be or not to be that is the question'
        self.zc_index.index_object(1, Indexable(sentence))
        stop_words = [word for word in sentence.split() if word != 'question']
        for word in stop_words:
            # the lexicon must know no word id for a stop word
            self.assertEqual(self.lexicon.termToWordIds(word), [])
        self.assertEqual(len(self.index.get_words(1)), 1)

        # queries that contain at least one real term are accepted
        self.parserSuccess('question', 1)
        self.parserSuccess('question AND to AND be', 1)
        self.parserSuccess('to AND question AND be', 1)
        self.parserSuccess('question AND NOT gardenia', 1)
        self.parserSuccess('question AND gardenia', 0)
        self.parserSuccess('gardenia', 0)
        self.parserSuccess('question OR gardenia', 1)
        self.parserSuccess('question AND NOT to AND NOT be', 1)
        self.parserSuccess('question OR to OR be', 1)
        self.parserSuccess('question to be', 1)

        # all of these must raise ParseError
        self.parserFailure('to be')
        self.parserFailure('to AND be')
        self.parserFailure('to OR be')
        self.parserFailure('to AND NOT be')
        self.parserFailure('to AND NOT question')
        self.parserFailure('to AND NOT gardenia')

    def testDocUpdate(self):
        # Index each entry of the module-level ``text`` (defined outside
        # this class) under one docid and verify what can be found after
        # every re-index.
        docid = 1   # constant: every version goes in under the same id
        N = len(text)
        stop = get_stopdict()

        # word -> version numbers whose text contains that word
        versions_by_word = {}
        for number, body in enumerate(text):
            # deliberately a crude splitter, not one of the official ones
            seen = set()
            for word in re.split(r'\W+', body.lower()):
                if len(word) <= 1 or word in stop or word in seen:
                    continue
                seen.add(word)
                versions_by_word.setdefault(word, []).append(number)

        unique = {}  # version number -> words that occur only in that version
        common = []  # words that occur in every version
        for word, numbers in versions_by_word.items():
            occurrences = len(numbers)
            if occurrences == 1:
                unique.setdefault(numbers[0], []).append(word)
            elif occurrences == N:
                common.append(word)
        self.assertGreater(len(common), 0)
        self.assertGreater(len(unique), 0)

        for number, body in enumerate(text):
            self.zc_index.index_object(docid, Indexable(body))
            for word in common:
                _nbest, total = self.zc_index.query(word)
                self.assertEqual(total, 1, 'did not find {0}'.format(word))
            for owner, words in unique.items():
                if owner == number:
                    # words unique to the version just indexed are skipped
                    continue
                for word in words:
                    _nbest, total = self.zc_index.query(word)
                    self.assertEqual(
                        total, 0,
                        'did not expect to find {0}'.format(word)
                    )

    def testLexiconIsNotFoundRaisesLookupError(self):
        # Constructing the index without a lexicon id must raise LookupError.
        holder = LexiconHolder(self.lexicon)
        with self.assertRaises(LookupError):
            ZCTextIndex('name', extra=None, caller=holder)

    def testInvalidIndexTypeRaisesValueError(self):
        # An ``extra`` whose index_type is not recognised must raise
        # ValueError.
        holder = LexiconHolder(self.lexicon)

        class Extra(object):
            index_type = 'Some invalid index type'

        with self.assertRaises(ValueError):
            ZCTextIndex(
                'name',
                extra=Extra,
                caller=holder,
                index_factory=None,
                lexicon_id='lexicon'
            )

コード例 #3

ファイルを表示

ファイル: testZCTextIndex.py プロジェクト: bendavis78/zope

class ZCIndexTestsBase:
    """ZCTextIndex test cases shared by the concrete index test classes.

    Mix-in: ``self.IndexFactory`` and the ``assert*`` helpers are used here
    but defined elsewhere, so the concrete class must supply them
    (presumably via ``unittest.TestCase`` -- confirm against the subclasses).

    Test methods use ``#`` comments instead of docstrings so that unittest's
    verbose output keeps printing the method names.
    """

    def setUp(self):
        # Fresh lexicon and an index on the 'text' attribute for every test.
        self.lexicon = PLexicon('lexicon', '', Splitter(), CaseNormalizer(),
                                StopWordRemover())
        caller = LexiconHolder(self.lexicon)
        self.zc_index = ZCTextIndex('name', None, caller, self.IndexFactory,
                                    'text', 'lexicon')
        self.index = self.zc_index.index

    def parserFailure(self, query):
        # The query must be rejected with ParseError.
        self.assertRaises(ParseError, self.zc_index.query, query)

    def parserSuccess(self, query, n):
        # The query must report exactly n matches.  When n is non-zero the
        # first field of the top result must be 1 (presumably the docid the
        # tests index under -- confirm against ZCTextIndex.query).
        r, num = self.zc_index.query(query)
        self.assertEqual(num, n)
        if n:
            self.assertEqual(r[0][0], 1)

    def testMultipleAttributes(self):
        # Index built with the field name 'text1,text2' (presumably a
        # comma-separated attribute list).  It reuses self.lexicon from
        # setUp; building another lexicon here would be dead code.
        caller = LexiconHolder(self.lexicon)
        zc_index = ZCTextIndex('name', None, caller, self.IndexFactory,
                               'text1,text2', 'lexicon')
        doc = Indexable2('foo bar', 'alpha omega')
        zc_index.index_object(1, doc)
        nbest, total = zc_index.query('foo')
        self.assertEqual(len(nbest), 1)
        # terms coming from different values still match the same document
        nbest, total = zc_index.query('foo alpha')
        self.assertEqual(len(nbest), 1)
        # 'gamma' occurs in neither value, so nothing is returned
        nbest, total = zc_index.query('foo alpha gamma')
        self.assertEqual(len(nbest), 0)

    def testListAttributes(self):
        # Same as testMultipleAttributes, but the second value is a list of
        # strings rather than a single string.
        caller = LexiconHolder(self.lexicon)
        zc_index = ZCTextIndex('name', None, caller, self.IndexFactory,
                               'text1,text2', 'lexicon')
        doc = Indexable2('Hello Tim',
                         ['Now is the winter of our discontent',
                          'Made glorious summer by this sun of York', ])
        zc_index.index_object(1, doc)
        nbest, total = zc_index.query('glorious')
        self.assertEqual(len(nbest), 1)
        nbest, total = zc_index.query('York Tim')
        self.assertEqual(len(nbest), 1)
        # 'Tuesday' occurs nowhere in the document
        nbest, total = zc_index.query('Tuesday Tim York')
        self.assertEqual(len(nbest), 0)

    def testStopWords(self):
        # the only non-stopword is question
        text = ("to be or not to be " "that is the question")
        doc = Indexable(text)
        self.zc_index.index_object(1, doc)
        # no stop word may have been given a word id by the lexicon
        for word in text.split():
            if word != "question":
                wids = self.lexicon.termToWordIds(word)
                self.assertEqual(wids, [])
        self.assertEqual(len(self.index.get_words(1)), 1)

        # stop words combined with a real term are accepted
        self.parserSuccess('question', 1)
        self.parserSuccess('question AND to AND be', 1)
        self.parserSuccess('to AND question AND be', 1)
        self.parserSuccess('question AND NOT gardenia', 1)
        self.parserSuccess('question AND gardenia', 0)
        self.parserSuccess('gardenia', 0)
        self.parserSuccess('question OR gardenia', 1)
        self.parserSuccess('question AND NOT to AND NOT be', 1)
        self.parserSuccess('question OR to OR be', 1)
        self.parserSuccess('question to be', 1)

        # each of these queries must raise ParseError
        self.parserFailure('to be')
        self.parserFailure('to AND be')
        self.parserFailure('to OR be')
        self.parserFailure('to AND NOT be')
        self.parserFailure('to AND NOT question')
        self.parserFailure('to AND NOT gardenia')

    def testDocUpdate(self):
        # Index every entry of the module-level ``text`` (defined outside
        # this class; presumably a sequence of versions of one document)
        # under the same docid and check that after each re-index the words
        # unique to the other versions are not found.
        docid = 1  # doesn't change -- we index the same doc repeatedly
        N = len(text)
        stop = get_stopdict()

        d = {}  # word -> list of version numbers containing that word
        for i, version in enumerate(text):
            # use a simple splitter rather than an official one
            words = [
                w for w in re.split(r"\W+", version.lower())
                if len(w) > 1 and w not in stop
            ]
            # record each version number only once per word
            word_seen = set()
            for w in words:
                if w not in word_seen:
                    d.setdefault(w, []).append(i)
                    word_seen.add(w)

        unique = {}  # version number -> list of words unique to that version
        common = []  # list of words common to all versions
        for w, versionlist in d.items():
            if len(versionlist) == 1:
                unique.setdefault(versionlist[0], []).append(w)
            elif len(versionlist) == N:
                common.append(w)
        # the checks below are vacuous unless both groups are non-empty
        self.assertTrue(len(common) > 0)
        self.assertTrue(len(unique) > 0)

        for i, version in enumerate(text):
            doc = Indexable(version)
            self.zc_index.index_object(docid, doc)
            for w in common:
                nbest, total = self.zc_index.query(w)
                self.assertEqual(total, 1, "did not find %s" % w)
            for k, v in unique.items():
                if k == i:
                    # words unique to the version just indexed are skipped
                    continue
                for w in v:
                    nbest, total = self.zc_index.query(w)
                    self.assertEqual(total, 0, "did not expect to find %s" % w)