def make_old_index():
    from Products.PluginIndexes.TextIndex.TextIndex import TextIndex
    from Products.PluginIndexes.TextIndex.Lexicon import Lexicon
    from Products.ZCTextIndex.StopDict import get_stopdict

    # Build an old-style TextIndex on the "read" attribute, seeding the
    # lexicon with the ZCTextIndex stop-word dictionary.  MySplitter is a
    # splitter callable defined elsewhere in the source module.
    lexicon = Lexicon(get_stopdict())
    lexicon.SplitterFunc = MySplitter()
    return TextIndex("read", lexicon=lexicon)
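MySplitter is not defined in this snippet; it is whatever splitter callable the surrounding module provides. As a rough, hypothetical stand-in (the class name and the exact SplitterFunc interface are assumptions, not the module's actual code), a regex-based splitter might look like this:

import re

class MySplitter:
    # Hypothetical splitter: lowercase the text and return the alphanumeric
    # word tokens.  The real SplitterFunc may take extra arguments, so the
    # signature is kept deliberately permissive.
    def __call__(self, text, *args, **kwargs):
        return re.findall(r"\w+", text.lower())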
Example #3
from Products.ZCTextIndex.StopDict import get_stopdict


class StopWordRemover:

    dict = get_stopdict().copy()

    # Prefer the C-accelerated stopper if it is available; otherwise fall
    # back to a pure-Python filter over the stop-word dictionary.
    try:
        from Products.ZCTextIndex.stopper import process as _process
    except ImportError:

        def process(self, lst):
            has_key = self.dict.has_key
            return [w for w in lst if not has_key(w)]
    else:

        def process(self, lst):
            return self._process(self.dict, lst)
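A quick illustration of process(), assuming the pure-Python fallback branch and that common English words such as "the" and "is" appear in the stop dictionary:

remover = StopWordRemover()
words = remover.process(["the", "quick", "brown", "fox", "is", "lazy"])
# words should now be ['quick', 'brown', 'fox', 'lazy']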
Example #4
    def testDocUpdate(self):
        # `text` is a sequence of successive versions of the same document,
        # defined elsewhere in the test module.
        docid = 1   # doesn't change -- we index the same doc repeatedly
        N = len(text)
        stop = get_stopdict()

        d = {}  # word -> list of version numbers containing that word
        for version, i in zip(text, range(N)):
            # use a simple splitter rather than an official one
            words = [w for w in re.split(r'\W+', version.lower())
                     if len(w) > 1 and w not in stop]
            word_seen = {}
            for w in words:
                if w not in word_seen:
                    d.setdefault(w, []).append(i)
                    word_seen[w] = 1

        unique = {}  # version number -> list of words unique to that version
        common = []  # list of words common to all versions
        for w, versionlist in d.items():
            if len(versionlist) == 1:
                unique.setdefault(versionlist[0], []).append(w)
            elif len(versionlist) == N:
                common.append(w)
        self.assertGreater(len(common), 0)
        self.assertGreater(len(unique), 0)

        for version, i in zip(text, range(N)):
            doc = Indexable(version)
            self.zc_index.index_object(docid, doc)
            for w in common:
                nbest, total = self.zc_index.query(w)
                self.assertEqual(total, 1, 'did not find {0}'.format(w))
            for k, v in unique.items():
                if k == i:
                    continue
                for w in v:
                    nbest, total = self.zc_index.query(w)
                    self.assertEqual(
                        total, 0,
                        'did not expect to find {0}'.format(w)
                    )
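The bookkeeping in the loops above is easier to follow on a tiny input. With two invented versions (example data, assuming "the" and "on" are stop words):

text = ["the cat sat on the mat", "the cat slept on the mat"]
# After the first loop, d maps each non-stop word to the versions containing it:
#     {'cat': [0, 1], 'sat': [0], 'mat': [0, 1], 'slept': [1]}
# so common contains 'cat' and 'mat', and unique is {0: ['sat'], 1: ['slept']}.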
Example #5
    def __init__(self, datafs, writable=0, trans=0, pack=0):
        self.trans_limit = trans
        self.pack_limit = pack
        self.trans_count = 0
        self.pack_count = 0
        self.stopdict = get_stopdict()
        self.mh = mhlib.MH()
        self.filestorage = FileStorage(datafs, read_only=(not writable))
        self.database = DB(self.filestorage)
        self.connection = self.database.open()
        self.root = self.connection.root()
        # Create the persistent containers on first use.
        try:
            self.index = self.root["index"]
        except KeyError:
            self.index = self.root["index"] = TextIndex()
        try:
            self.docpaths = self.root["docpaths"]
        except KeyError:
            self.docpaths = self.root["docpaths"] = IOBTree()
        try:
            self.doctimes = self.root["doctimes"]
        except KeyError:
            self.doctimes = self.root["doctimes"] = IIBTree()
        try:
            self.watchfolders = self.root["watchfolders"]
        except KeyError:
            self.watchfolders = self.root["watchfolders"] = {}
        # Rebuild the in-memory reverse mapping from path to docid.
        self.path2docid = OIBTree()
        for docid in self.docpaths.keys():
            path = self.docpaths[docid]
            self.path2docid[path] = docid
        try:
            self.maxdocid = max(self.docpaths.keys())
        except ValueError:
            self.maxdocid = 0
        print len(self.docpaths), "Document ids"
        print len(self.path2docid), "Pathnames"
        print self.index.lexicon.length(), "Words"
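The try/except KeyError blocks follow the usual ZODB bootstrap pattern: open a FileStorage, fetch the root mapping, and create any containers that do not exist yet. A compressed sketch of that pattern (the file name is a placeholder, and TextIndex is assumed to be imported as in the surrounding module):

from ZODB.FileStorage import FileStorage
from ZODB.DB import DB

storage = FileStorage("Data.fs")   # example path
db = DB(storage)
root = db.open().root()
if "index" not in root:
    # same effect as the try/except KeyError above
    root["index"] = TextIndex()
index = root["index"]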
Example #7
class StopWordAndSingleCharRemover(StopWordRemover):

    # Extend the stop set so that every single-character token
    # (chr(0) through chr(254)) is dropped as well.
    dict = get_stopdict().copy()
    for c in range(255):
        dict[chr(c)] = None
Example #8
from Products.ZCTextIndex.StopDict import get_stopdict


class StopWordRemover(object):

    dict = get_stopdict().copy()

    def process(self, lst):
        # Keep only the words that are not in the stop-word dictionary.
        return [w for w in lst if w not in self.dict]