Example #1
0
class Lexicon(Persistent):
    """
    Implementation of :class:`zope.index.text.interfaces.ILexicon`.
    """

    def __init__(self, *pipeline):
        self._wids = OIBTree()  # word -> wid
        self._words = IOBTree() # wid -> word
        # wid 0 is reserved for words that aren't in the lexicon (OOV -- out
        # of vocabulary).  This can happen, e.g., if a query contains a word
        # we never saw before, and that isn't a known stopword (or otherwise
        # filtered out).  Returning a special wid value for OOV words is a
        # way to let clients know when an OOV word appears.
        self.wordCount = Length()
        self._pipeline = pipeline

    def wordCount(self):
        """Return the number of unique terms in the lexicon."""
        # overridden per instance
        return len(self._wids)

    def words(self):
        return self._wids.keys()

    def wids(self):
        return self._words.keys()

    def items(self):
        return self._wids.items()

    def sourceToWordIds(self, text):
        if text is None:
            text = ''
        last = _text2list(text)
        for element in self._pipeline:
            last = element.process(last)
        if not isinstance(self.wordCount, Length):
            # Make sure wordCount is overridden with a BTrees.Length.Length
            self.wordCount = Length(self.wordCount())
        # Strategically unload the length value so that we get the most
        # recent value written to the database to minimize conflicting wids
        # Because length is independent, this will load the most
        # recent value stored, regardless of whether MVCC is enabled
        self.wordCount._p_deactivate()
        return list(map(self._getWordIdCreate, last))

    def termToWordIds(self, text):
        last = _text2list(text)
        for element in self._pipeline:
            last = element.process(last)
        wids = []
        for word in last:
            wids.append(self._wids.get(word, 0))
        return wids

    def parseTerms(self, text):
        last = _text2list(text)
        for element in self._pipeline:
            process = getattr(element, "processGlob", element.process)
            last = process(last)
        return last

    def isGlob(self, word):
        return "*" in word or "?" in word

    def get_word(self, wid):
        return self._words[wid]

    def get_wid(self, word):
        return self._wids.get(word, 0)

    def globToWordIds(self, pattern):
        # Implement * and ? just as in the shell, except the pattern
        # must not start with either of these
        prefix = ""
        while pattern and pattern[0] not in "*?":
            prefix += pattern[0]
            pattern = pattern[1:]
        if not pattern:
            # There were no globbing characters in the pattern
            wid = self._wids.get(prefix, 0)
            if wid:
                return [wid]
            else:
                return []
        if not prefix:
            # The pattern starts with a globbing character.
            # This is too efficient, so we raise an exception.
            raise QueryError(
                "pattern %r shouldn't start with glob character" % pattern)
        pat = prefix
        for c in pattern:
            if c == "*":
                pat += ".*"
            elif c == "?":
                pat += "."
            else:
                pat += re.escape(c)
        pat += "$"
        prog = re.compile(pat)
        keys = self._wids.keys(prefix) # Keys starting at prefix
        wids = []
        for key in keys:
            if not key.startswith(prefix):
                break
            if prog.match(key):
                wids.append(self._wids[key])
        return wids

    def _getWordIdCreate(self, word):
        wid = self._wids.get(word)
        if wid is None:
            wid = self._new_wid()
            self._wids[word] = wid
            self._words[wid] = word
        return wid

    def _new_wid(self):
        count = self.wordCount
        count.change(1)
        while count() in self._words:
            # just to be safe
            count.change(1)
        return count()
Example #2
0
class UUIDIndex(UnIndex):
    """Index for uuid fields with an unique value per key.

    The internal structure is:

    self._index = {datum:documentId]}
    self._unindex = {documentId:datum}

    For each datum only one documentId can exist.
    """

    meta_type = "UUIDIndex"

    manage_options = (
        {
            'label': 'Settings',
            'action': 'manage_main'
        },
        {
            'label': 'Browse',
            'action': 'manage_browse'
        },
    )

    query_options = ["query", "range"]

    manage = manage_main = DTMLFile('dtml/manageUUIDIndex', globals())
    manage_main._setName('manage_main')
    manage_browse = DTMLFile('../dtml/browseIndex', globals())

    def clear(self):
        self._length = Length()
        self._index = OIBTree()
        self._unindex = IOBTree()

    def numObjects(self):
        """Return the number of indexed objects. Since we have a 1:1 mapping
        from documents to values, we can reuse the stored length.
        """
        return self.indexSize()

    def uniqueValues(self, name=None, withLengths=0):
        """returns the unique values for name

        if withLengths is true, returns a sequence of
        tuples of (value, length)
        """
        if name is None:
            name = self.id
        elif name != self.id:
            return []

        if not withLengths:
            return tuple(self._index.keys())
        # We know the length for each value is one
        return [(k, 1) for k in self._index.keys()]

    def insertForwardIndexEntry(self, entry, documentId):
        """Take the entry provided and put it in the correct place
        in the forward index.
        """
        if entry is None:
            return

        old_docid = self._index.get(entry, _marker)
        if old_docid is _marker:
            self._index[entry] = documentId
            self._length.change(1)
        elif old_docid != documentId:
            logger.error("A different document with value '%s' already "
                         "exists in the index.'" % entry)

    def removeForwardIndexEntry(self, entry, documentId):
        """Take the entry provided and remove any reference to documentId
        in its entry in the index.
        """
        old_docid = self._index.get(entry, _marker)
        if old_docid is not _marker:
            del self._index[entry]
            self._length.change(-1)

    def _get_object_datum(self, obj, attr):
        # for a uuid it never makes sense to acquire a parent value via
        # Acquisition
        has_attr = getattr(aq_base(obj), attr, _marker)
        if has_attr is _marker:
            return _marker
        return super(UUIDIndex, self)._get_object_datum(obj, attr)
Example #3
0
class Indexer(object):

    filestorage = database = connection = root = None

    def __init__(self, datafs, writable=0, trans=0, pack=0):
        self.trans_limit = trans
        self.pack_limit = pack
        self.trans_count = 0
        self.pack_count = 0
        self.stopdict = get_stopdict()
        self.mh = mhlib.MH()
        self.filestorage = FileStorage(datafs, read_only=(not writable))
        self.database = DB(self.filestorage)
        self.connection = self.database.open()
        self.root = self.connection.root()
        try:
            self.index = self.root["index"]
        except KeyError:
            self.index = self.root["index"] = TextIndexWrapper()
        try:
            self.docpaths = self.root["docpaths"]
        except KeyError:
            self.docpaths = self.root["docpaths"] = IOBTree()
        try:
            self.doctimes = self.root["doctimes"]
        except KeyError:
            self.doctimes = self.root["doctimes"] = IIBTree()
        try:
            self.watchfolders = self.root["watchfolders"]
        except KeyError:
            self.watchfolders = self.root["watchfolders"] = {}
        self.path2docid = OIBTree()
        for docid in self.docpaths.keys():
            path = self.docpaths[docid]
            self.path2docid[path] = docid
        try:
            self.maxdocid = max(self.docpaths.keys())
        except ValueError:
            self.maxdocid = 0
        print(len(self.docpaths), "Document ids")
        print(len(self.path2docid), "Pathnames")
        print(self.index.lexicon.length(), "Words")

    def dumpfreqs(self):
        lexicon = self.index.lexicon
        index = self.index.index
        assert isinstance(index, OkapiIndex)
        L = []
        for wid in lexicon.wids():
            freq = 0
            for f in index._wordinfo.get(wid, {}).values():
                freq += f
            L.append((freq, wid, lexicon.get_word(wid)))
        L.sort()
        L.reverse()
        for freq, wid, word in L:
            print("%10d %10d %s" % (wid, freq, word))

    def dumpwids(self):
        lexicon = self.index.lexicon
        index = self.index.index
        assert isinstance(index, OkapiIndex)
        for wid in lexicon.wids():
            freq = 0
            for f in index._wordinfo.get(wid, {}).values():
                freq += f
            print("%10d %10d %s" % (wid, freq, lexicon.get_word(wid)))

    def dumpwords(self):
        lexicon = self.index.lexicon
        index = self.index.index
        assert isinstance(index, OkapiIndex)
        for word in lexicon.words():
            wid = lexicon.get_wid(word)
            freq = 0
            for f in index._wordinfo.get(wid, {}).values():
                freq += f
            print("%10d %10d %s" % (wid, freq, word))

    def close(self):
        self.root = None
        if self.connection is not None:
            self.connection.close()
            self.connection = None
        if self.database is not None:
            self.database.close()
            self.database = None
        if self.filestorage is not None:
            self.filestorage.close()
            self.filestorage = None

    def interact(self, nbest=NBEST, maxlines=MAXLINES):
        try:
            import readline
        except ImportError:
            pass
        text = ""
        top = 0
        results = []
        while 1:
            try:
                line = raw_input("Query: ")
            except EOFError:
                print("\nBye.")
                break
            line = line.strip()
            if line.startswith("/"):
                self.specialcommand(line, results, top - nbest)
                continue
            if line:
                text = line
                top = 0
            else:
                if not text:
                    continue
            try:
                results, n = self.timequery(text, top + nbest)
            except KeyboardInterrupt:
                raise
            except:
                reportexc()
                text = ""
                continue
            if len(results) <= top:
                if not n:
                    print("No hits for %r." % text)
                else:
                    print("No more hits for %r." % text)
                text = ""
                continue
            print("[Results %d-%d from %d" % (top+1, min(n, top+nbest), n),
                  end=" ")
            print("for query %s]" % repr(text))
            self.formatresults(text, results, maxlines, top, top+nbest)
            top += nbest

    def specialcommand(self, line, results, first):
        assert line.startswith("/")
        line = line[1:]
        if not line:
            n = first
        else:
            try:
                n = int(line) - 1
            except:
                print("Huh?")
                return
        if n < 0 or n >= len(results):
            print("Out of range")
            return
        docid, score = results[n]
        path = self.docpaths[docid]
        i = path.rfind("/")
        assert i > 0
        folder = path[:i]
        n = path[i+1:]
        cmd = "show +%s %s" % (folder, n)
        if os.getenv("DISPLAY"):
            os.system("xterm -e  sh -c '%s | less' &" % cmd)
        else:
            os.system(cmd)

    def query(self, text, nbest=NBEST, maxlines=MAXLINES):
        results, n = self.timequery(text, nbest)
        if not n:
            print("No hits for %r." % text)
            return
        print("[Results 1-%d from %d]" % (len(results), n))
        self.formatresults(text, results, maxlines)

    def timequery(self, text, nbest):
        t0 = time.time()
        c0 = time.clock()
        results, n = self.index.query(text, 0, nbest)
        t1 = time.time()
        c1 = time.clock()
        print("[Query time: %.3f real, %.3f user]" % (t1-t0, c1-c0))
        return results, n

    def formatresults(self, text, results, maxlines=MAXLINES,
                      lo=0, hi=sys.maxint):
        stop = self.stopdict.has_key
        words = [w for w in re.findall(r"\w+\*?", text.lower()) if not stop(w)]
        pattern = r"\b(" + "|".join(words) + r")\b"
        pattern = pattern.replace("*", ".*") # glob -> re syntax
        prog = re.compile(pattern, re.IGNORECASE)
        print('='*70)
        rank = lo
        for docid, score in results[lo:hi]:
            rank += 1
            path = self.docpaths[docid]
            score *= 100.0
            print("Rank:    %d   Score: %d%%   File: %s" % (rank, score, path))
            path = os.path.join(self.mh.getpath(), path)
            try:
                fp = open(path)
            except (IOError, OSError) as msg:
                print("Can't open:", msg)
                continue
            msg = mhlib.Message("<folder>", 0, fp)
            for header in "From", "To", "Cc", "Bcc", "Subject", "Date":
                h = msg.getheader(header)
                if h:
                    print("%-8s %s" % (header+":", h))
            text = self.getmessagetext(msg)
            if text:
                print()
                nleft = maxlines
                for part in text:
                    for line in part.splitlines():
                        if prog.search(line):
                            print(line)
                            nleft -= 1
                            if nleft <= 0:
                                break
                    if nleft <= 0:
                        break
            print('-'*70)

    def update(self, args):
        folder = None
        seqs = []

        for arg in args:
            if arg.startswith("+"):
                if folder is None:
                    folder = arg[1:]
                else:
                    print("only one folder at a time")
                    return
            else:
                seqs.append(arg)

        if not folder:
            folder = self.mh.getcontext()
        if not seqs:
            seqs = ['all']

        try:
            f = self.mh.openfolder(folder)
        except mhlib.Error as msg:
            print(msg)
            return

        dict = {}
        for seq in seqs:
            try:
                nums = f.parsesequence(seq)
            except mhlib.Error as msg:
                print(msg or "unparsable message sequence: %s" % repr(seq))
                return
            for n in nums:
                dict[n] = n
        msgs = dict.keys()
        msgs.sort()

        self.updatefolder(f, msgs)
        self.commit()

    def optimize(self, args):
        uniqwords = {}
        for folder in args:
            if folder.startswith("+"):
                folder = folder[1:]
            print("\nOPTIMIZE FOLDER", folder)
            try:
                f = self.mh.openfolder(folder)
            except mhlib.Error as msg:
                print(msg)
                continue
            self.prescan(f, f.listmessages(), uniqwords)
        L = [(uniqwords[word], word) for word in uniqwords.keys()]
        L.sort()
        L.reverse()
        for i in range(100):
            print("%3d. %6d %s" % ((i+1,) + L[i]))
        self.index.lexicon.sourceToWordIds([word for (count, word) in L])

    def prescan(self, f, msgs, uniqwords):
        pipeline = [Splitter(), CaseNormalizer(), StopWordRemover()]
        for n in msgs:
            print("prescanning", n)
            m = f.openmessage(n)
            text = self.getmessagetext(m, f.name)
            for p in pipeline:
                text = p.process(text)
            for word in text:
                uniqwords[word] = uniqwords.get(word, 0) + 1

    def bulkupdate(self, args):
        if not args:
            print("No folders specified; use ALL to bulk-index all folders")
            return
        if "ALL" in args:
            i = args.index("ALL")
            args[i:i+1] = self.mh.listfolders()
        for folder in args:
            if folder.startswith("+"):
                folder = folder[1:]
            print("\nFOLDER", folder)
            try:
                f = self.mh.openfolder(folder)
            except mhlib.Error as msg:
                print(msg)
                continue
            self.updatefolder(f, f.listmessages())
            print("Total", len(self.docpaths))
        self.commit()
        print("Indexed", self.index.lexicon._nbytes, "bytes and",)
        print(self.index.lexicon._nwords, "words;",)
        print(len(self.index.lexicon._words), "unique words.")

    def updatefolder(self, f, msgs):
        self.watchfolders[f.name] = self.getmtime(f.name)
        for n in msgs:
            path = "%s/%s" % (f.name, n)
            docid = self.path2docid.get(path, 0)
            if docid and self.getmtime(path) == self.doctimes.get(docid, 0):
                print("unchanged", docid, path)
                continue
            docid = self.newdocid(path)
            try:
                m = f.openmessage(n)
            except IOError:
                print("disappeared", docid, path)
                self.unindexpath(path)
                continue
            text = self.getmessagetext(m, f.name)
            if not text:
                self.unindexpath(path)
                continue
            print("indexing", docid, path)
            self.index.index_doc(docid, text)
            self.maycommit()
        # Remove messages from the folder that no longer exist
        for path in list(self.path2docid.keys(f.name)):
            if not path.startswith(f.name + "/"):
                break
            if self.getmtime(path) == 0:
                self.unindexpath(path)
        print("done.")

    def unindexpath(self, path):
        if self.path2docid.has_key(path):
            docid = self.path2docid[path]
            print("unindexing", docid, path)
            del self.docpaths[docid]
            del self.doctimes[docid]
            del self.path2docid[path]
            try:
                self.index.unindex_doc(docid)
            except KeyError as msg:
                print("KeyError", msg)
            self.maycommit()

    def getmessagetext(self, m, name=None):
        L = []
        if name:
            L.append("_folder " + name) # To restrict search to a folder
            self.getheaders(m, L)
        try:
            self.getmsgparts(m, L, 0)
        except KeyboardInterrupt:
            raise
        except:
            print("(getmsgparts failed:)")
            reportexc()
        return L

    def getmsgparts(self, m, L, level):
        ctype = m.gettype()
        if level or ctype != "text/plain":
            print(". "*level + str(ctype))
        if ctype == "text/plain":
            L.append(m.getbodytext())
        elif ctype in ("multipart/alternative", "multipart/mixed"):
            for part in m.getbodyparts():
                self.getmsgparts(part, L, level+1)
        elif ctype == "message/rfc822":
            f = StringIO(m.getbodytext())
            m = mhlib.Message("<folder>", 0, f)
            self.getheaders(m, L)
            self.getmsgparts(m, L, level+1)

    def getheaders(self, m, L):
        H = []
        for key in "from", "to", "cc", "bcc", "subject":
            value = m.get(key)
            if value:
                H.append(value)
        if H:
            L.append("\n".join(H))

    def newdocid(self, path):
        docid = self.path2docid.get(path)
        if docid is not None:
            self.doctimes[docid] = self.getmtime(path)
            return docid
        docid = self.maxdocid + 1
        self.maxdocid = docid
        self.docpaths[docid] = path
        self.doctimes[docid] = self.getmtime(path)
        self.path2docid[path] = docid
        return docid

    def getmtime(self, path):
        path = os.path.join(self.mh.getpath(), path)
        try:
            st = os.stat(path)
        except os.error as msg:
            return 0
        return int(st[ST_MTIME])

    def maycommit(self):
        self.trans_count += 1
        if self.trans_count >= self.trans_limit > 0:
            self.commit()

    def commit(self):
        if self.trans_count > 0:
            print("committing...")
            transaction.commit()
            self.trans_count = 0
            self.pack_count += 1
            if self.pack_count >= self.pack_limit > 0:
                self.pack()

    def pack(self):
        if self.pack_count > 0:
            print("packing...")
            self.database.pack()
            self.pack_count = 0
class UUIDIndex(UnIndex):
    """Index for uuid fields with an unique value per key.

    The internal structure is:

    self._index = {datum:documentId]}
    self._unindex = {documentId:datum}

    For each datum only one documentId can exist.
    """

    meta_type = "UUIDIndex"

    manage_options = (
        {'label': 'Settings', 'action': 'manage_main'},
        {'label': 'Browse', 'action': 'manage_browse'},
    )

    query_options = ["query", "range"]

    manage = manage_main = DTMLFile('dtml/manageUUIDIndex', globals())
    manage_main._setName('manage_main')
    manage_browse = DTMLFile('../dtml/browseIndex', globals())

    def clear(self):
        self._length = Length()
        self._index = OIBTree()
        self._unindex = IOBTree()
        self._counter = Length()

    def numObjects(self):
        """Return the number of indexed objects. Since we have a 1:1 mapping
        from documents to values, we can reuse the stored length.
        """
        return self.indexSize()

    def uniqueValues(self, name=None, withLengths=0):
        """returns the unique values for name

        if withLengths is true, returns a sequence of
        tuples of (value, length)
        """
        if name is None:
            name = self.id
        elif name != self.id:
            raise StopIteration

        if not withLengths:
            for key in self._index.keys():
                yield key
        else:
            # We know the length for each value is one
            for key in self._index.keys():
                yield (key, 1)

    def insertForwardIndexEntry(self, entry, documentId):
        """Take the entry provided and put it in the correct place
        in the forward index.
        """
        if entry is None:
            return

        old_docid = self._index.get(entry, _marker)
        if old_docid is _marker:
            self._index[entry] = documentId
            self._length.change(1)
        elif old_docid != documentId:
            logger.error("A different document with value '%s' already "
                "exists in the index.'" % entry)

    def removeForwardIndexEntry(self, entry, documentId):
        """Take the entry provided and remove any reference to documentId
        in its entry in the index.
        """
        old_docid = self._index.get(entry, _marker)
        if old_docid is not _marker:
            del self._index[entry]
            self._length.change(-1)

    def _get_object_datum(self, obj, attr):
        # for a uuid it never makes sense to acquire a parent value via
        # Acquisition
        has_attr = getattr(aq_base(obj), attr, _marker)
        if has_attr is _marker:
            return _marker
        return super(UUIDIndex, self)._get_object_datum(obj, attr)
Example #5
0
class Lexicon(Persistent):

    implements(ILexicon)

    def __init__(self, *pipeline):
        self._wids = OIBTree()  # word -> wid
        self._words = IOBTree() # wid -> word
        # wid 0 is reserved for words that aren't in the lexicon (OOV -- out
        # of vocabulary).  This can happen, e.g., if a query contains a word
        # we never saw before, and that isn't a known stopword (or otherwise
        # filtered out).  Returning a special wid value for OOV words is a
        # way to let clients know when an OOV word appears.
        self._nextwid = 1
        self._pipeline = pipeline

        # Keep some statistics about indexing
        self._nbytes = 0 # Number of bytes indexed (at start of pipeline)
        self._nwords = 0 # Number of words indexed (after pipeline)

    def wordCount(self):
        """Return the number of unique terms in the lexicon."""
        return self._nextwid - 1

    def words(self):
        return self._wids.keys()

    def wids(self):
        return self._words.keys()

    def items(self):
        return self._wids.items()

    def sourceToWordIds(self, text):
        last = _text2list(text)
        for t in last:
            self._nbytes += len(t)
        for element in self._pipeline:
            last = element.process(last)
        self._nwords += len(last)
        return map(self._getWordIdCreate, last)

    def termToWordIds(self, text):
        last = _text2list(text)
        for element in self._pipeline:
            last = element.process(last)
        wids = []
        for word in last:
            wids.append(self._wids.get(word, 0))
        return wids

    def parseTerms(self, text):
        last = _text2list(text)
        for element in self._pipeline:
            process = getattr(element, "processGlob", element.process)
            last = process(last)
        return last

    def isGlob(self, word):
        return "*" in word or "?" in word

    def get_word(self, wid):
        return self._words[wid]

    def get_wid(self, word):
        return self._wids.get(word, 0)

    def globToWordIds(self, pattern):
        # Implement * and ? just as in the shell, except the pattern
        # must not start with either of these
        prefix = ""
        while pattern and pattern[0] not in "*?":
            prefix += pattern[0]
            pattern = pattern[1:]
        if not pattern:
            # There were no globbing characters in the pattern
            wid = self._wids.get(prefix, 0)
            if wid:
                return [wid]
            else:
                return []
        if not prefix:
            # The pattern starts with a globbing character.
            # This is too efficient, so we raise an exception.
            raise QueryError(
                "pattern %r shouldn't start with glob character" % pattern)
        pat = prefix
        for c in pattern:
            if c == "*":
                pat += ".*"
            elif c == "?":
                pat += "."
            else:
                pat += re.escape(c)
        pat += "$"
        prog = re.compile(pat)
        keys = self._wids.keys(prefix) # Keys starting at prefix
        wids = []
        for key in keys:
            if not key.startswith(prefix):
                break
            if prog.match(key):
                wids.append(self._wids[key])
        return wids

    def _getWordIdCreate(self, word):
        wid = self._wids.get(word)
        if wid is None:
            wid = self._new_wid()
            self._wids[word] = wid
            self._words[wid] = word
        return wid

    def _new_wid(self):
        wid = self._nextwid
        self._nextwid += 1
        return wid
Example #6
0
class Lexicon(Persistent):

    _v_nextid = None
    _wid_length_based = True  # Flag to distinguish new and old lexica

    def __init__(self, *pipeline):
        self.clear()
        self._pipeline = pipeline

    def clear(self):
        """Empty the lexicon.
        """
        self.length = Length()
        self._wid_length_based = False
        self._wids = OIBTree()  # word -> wid
        self._words = IOBTree()  # wid -> word
        # wid 0 is reserved for words that aren't in the lexicon (OOV -- out
        # of vocabulary).  This can happen, e.g., if a query contains a word
        # we never saw before, and that isn't a known stopword (or otherwise
        # filtered out).  Returning a special wid value for OOV words is a
        # way to let clients know when an OOV word appears.

    def length(self):
        """Return the number of unique terms in the lexicon.
        """
        # Overridden in instances with a BTrees.Length.Length
        raise NotImplementedError

    def words(self):
        return self._wids.keys()

    def wids(self):
        return self._words.keys()

    def items(self):
        return self._wids.items()

    def sourceToWordIds(self, text):
        last = _text2list(text)
        for element in self._pipeline:
            last = element.process(last)
        return list(map(self._getWordIdCreate, last))

    def termToWordIds(self, text):
        last = _text2list(text)
        for element in self._pipeline:
            process = getattr(element, "process_post_glob", element.process)
            last = process(last)
        wids = []
        for word in last:
            wids.append(self._wids.get(word, 0))
        return wids

    def parseTerms(self, text):
        last = _text2list(text)
        for element in self._pipeline:
            process = getattr(element, "processGlob", element.process)
            last = process(last)
        return last

    def isGlob(self, word):
        return "*" in word or "?" in word

    def get_word(self, wid):
        return self._words[wid]

    def get_wid(self, word):
        return self._wids.get(word, 0)

    def globToWordIds(self, pattern):
        # Implement * and ? just as in the shell, except the pattern
        # must not start with either of these
        prefix = ""
        while pattern and pattern[0] not in "*?":
            prefix += pattern[0]
            pattern = pattern[1:]
        if not pattern:
            # There were no globbing characters in the pattern
            wid = self._wids.get(prefix, 0)
            if wid:
                return [wid]
            else:
                return []
        if not prefix:
            # The pattern starts with a globbing character.
            # This is too efficient, so we raise an exception.
            raise QueryError(
                "pattern %r shouldn't start with glob character" % pattern)
        pat = prefix
        for c in pattern:
            if c == "*":
                pat += ".*"
            elif c == "?":
                pat += "."
            else:
                pat += re.escape(c)
        pat += "$"
        prog = re.compile(pat)
        keys = self._wids.keys(prefix)  # Keys starting at prefix
        wids = []
        for key in keys:
            if not key.startswith(prefix):
                break
            if prog.match(key):
                wids.append(self._wids[key])
        return wids

    def _getWordIdCreate(self, word):
        wid = self._wids.get(word)
        if wid is None:
            # WidCode requires us to use at least 0x4000 as a base number.
            # The algorithm in versions before 2.13 used the length as a base
            # number. So we don't even try to generate numbers below the
            # length as they are likely all taken
            minimum = 0x4000
            if self._wid_length_based:
                minimum = max(self.length(), 0x4000)

            while True:
                if self._v_nextid is None:
                    self._v_nextid = randrange(minimum, 0x10000000)

                wid = self._v_nextid
                self._v_nextid += 1

                if wid not in self._words:
                    break

                self._v_nextid = None

            self.length.change(1)
            self._wids[word] = wid
            self._words[wid] = word
        return wid
Example #7
0
class ContentTypeScopeManager(BTreeScopeManager):
    """
    A scope manager based on content types.

    This scope manager validates the request using the content type of
    the accessed object and the subpath of the request against a content
    type mapping.  The content type mapping to be used will be one of
    specified by the resource access key, the client key or default, and
    is resolved in this order.

    One more restriction imposed by this scope manager: mappings are
    enforced absolutely for access keys.  This allows clients to request
    new default scopes for themselves at will and/or have site-wide
    default scope changes without compromising the scopes already
    granted by the resource owner referenced by the access key.

    This however does not address the case where additional global
    restrictions that may be placed by the site owner as the focus is
    ultimately on the access keys.  Workaround is to revoke those keys
    and have the content owners issue new ones regardless of changes.

    Pruning of unused scope is not implemented.
    """

    zope.interface.implements(IContentTypeScopeManager)

    default_mapping_id = fieldproperty.FieldProperty(
        IContentTypeScopeManager['default_mapping_id'])

    def __init__(self):
        super(ContentTypeScopeManager, self).__init__()
        self._mappings = IOBTree()

        # Methods permitted to access this mapping with.  Originally
        # I wanted to provide alternative sets of mapping on a per
        # mapping_id basis, however this proved to be complex and
        # complicated due to extra relationships involved.
        self._methods = IOBTree()

        # For metadata related to the above.
        self._mappings_metadata = IOBTree()

        # To ease the usage of scopes, the mappings are referenced by
        # names and are called profiles which add a few useful fields to
        # allow slightly easier usage.  This separates the name from the
        # already active tokens such that once a token is instantiated
        # with a scope, the mapping is stuck until the token is revoked.
        self._named_mappings = OIBTree()  # name to id.

        # To not overburden the named mappings with work-in-progress
        # profiles, instantiate one here also.
        self._edit_mappings = OOBTree()

        self.default_mapping_id = self.addMapping({})

    # Main mapping related management methods.

    def addMapping(self, mapping, methods='GET HEAD OPTIONS', metadata=None):
        key = 0  # default?
        if len(self._mappings) > 0:
            # Can calculate the next key.
            key = self._mappings.maxKey() + 1
        self._mappings[key] = mapping
        self._methods[key] = methods.split()
        if metadata is not None:
            self._mappings_metadata[key] = metadata
        return key

    def getMapping(self, mapping_id, default=_marker):
        result = self._mappings.get(mapping_id, default)
        if result is _marker:
            raise KeyError()
        return result

    def getMappingMetadata(self, mapping_id, default=None):
        result = self._mappings_metadata.get(mapping_id, default)
        return result

    def getMappingId(self, name):
        # Returned ID could potentially not exist, what do?
        return self._named_mappings[name]

    def getMappingMethods(self, mapping_id, default=_marker):
        result = self._methods.get(mapping_id, default)
        if result is _marker:
            raise KeyError()
        return result

    def checkMethodPermission(self, mapping_id, method):
        methods = self.getMappingMethods(mapping_id, ())
        return method in methods

    def setMappingNameToId(self, name, mapping_id):
        self._named_mappings[name] = mapping_id

    def delMappingName(self, name):
        saved = self._named_mappings.pop(name, None)
        edits = self._edit_mappings.pop(name, None)
        return (saved, edits)

    def getMappingByName(self, name, default=_marker):
        try:
            mapping_id = self.getMappingId(name)
            mapping = self.getMapping(mapping_id)
        except KeyError:
            if default == _marker:
                raise
            mapping = default
        return mapping

    def getMappingNames(self):
        return self._named_mappings.keys()

    # Temporary/edited mapping profiles

    def getEditProfile(self, name, default=None):
        return self._edit_mappings.get(name, default)

    def setEditProfile(self, name, value):
        assert IContentTypeScopeProfile.providedBy(value) or value is None
        self._edit_mappings[name] = value

    def commitEditProfile(self, name):
        profile = self.getEditProfile(name)
        if not (IContentTypeScopeProfile.providedBy(profile)):
            raise KeyError('edit profile does not exist')
        new_mapping = profile.mapping
        methods = profile.methods
        metadata = {
            'title': profile.title,
            'description': profile.description,
            # Should really not duplicate this there but this is easy
            # shortcut to take for now.
            'methods': methods,
        }
        new_id = self.addMapping(new_mapping, methods=methods,
            metadata=metadata)
        self.setMappingNameToId(name, new_id)

    def getEditProfileNames(self):
        return self._edit_mappings.keys()

    def isProfileModified(self, name):
        # TODO I would like some way to compare the two profiles in a
        # sane way but only using active types and types that have
        # stuff assigned.  So for now just use this naive method.
        profile = self.getEditProfile(name)
        try:
            mapping_id = self.getMappingId(name)
            mapping = self.getMapping(mapping_id)
            metadata = self.getMappingMetadata(mapping_id, {})
        except KeyError:
            # If profile exists, no associated ID, definitely modified.
            return True

        return not (profile.mapping == mapping and 
            profile.title == metadata.get('title') and
            profile.description == metadata.get('description') and
            profile.methods == metadata.get('methods')
        )

    # Scope handling.

    def requestScope(self, request_key, raw_scope):
        """
        This manager references scope by ids internally.  Resolve the
        raw scope id by the client into the mapping ids.
        """

        raw_scopes = raw_scope and raw_scope.split(',') or []
        result = set()
        for rs in raw_scopes:
            # Ignoring the current site URI and just capture the final
            # fragment.
            name = rs.split('/')[-1]
            try:
                mapping_id = self.getMappingId(name)
                # This verifies the existence of the mapping with id.
                mapping = self.getMapping(mapping_id)
            except KeyError:
                # Failed to fulfill the requested scope.
                return False
            result.add(mapping_id)

        if not result:
            result.add(self.default_mapping_id)

        self.setScope(request_key, result)
        return True

    def validate(self, request, client_key, access_key,
            accessed, container, name, value):
        """
        See IScopeManager.
        """

        mappings = self.resolveMapping(client_key, access_key)
        # multiple rights were requested, check through all of them.
        for mapping_id in mappings:
            mapping = self.getMapping(mapping_id, default={})
            result = self.validateTargetWithMapping(accessed, name, mapping)
            method_allowed = self.checkMethodPermission(mapping_id,
                request.method)
            if result and method_allowed:
                return True

        # no matching mappings.
        return False

    def resolveMapping(self, client_key, access_key):
        """
        See IDefaultScopeManager.
        """

        # As all mappings are referenced byh access keys.
        return self.getAccessScope(access_key, None)

    def resolveTarget(self, accessed, name):
        """
        Accessed target resolution.

        Find the type of the container object of the accessed object by
        traversing upwards, and gather the path to resolve into the 
        content type id.  Return both these values.
        """

        logger.debug('resolving %s into types', accessed)
        # use getSite() instead of container?
        pt_tool = getToolByName(accessed, 'portal_types', None)
        if pt_tool is None:
            return None, None

        context = aq_inner(accessed)
        typeinfo = None
        subpath = [name]

        while context is not None:
            typeinfo = pt_tool.getTypeInfo(context)
            if typeinfo:
                subpath.reverse()
                return typeinfo.id, '/'.join(subpath)
            # It should have a name...
            subpath.append(context.__name__)
            context = aq_parent(context)

        logger.debug('parent of %s failed to resolve into typeinfo', accessed)
        return None, None

    def validateTargetWithMapping(self, accessed, name, mapping):
        atype, subpath = self.resolveTarget(accessed, name)
        return self.validateTypeSubpathMapping(atype, subpath, mapping)

    def validateTypeSubpathMapping(self, accessed_type, subpath, mapping):
        # A simple lookup method.
        valid_scopes = mapping.get(accessed_type, {})
        if not valid_scopes:
            logger.debug('out of scope: %s has no mapping', accessed_type)
            return False
        logger.debug('%s got mapping', accessed_type)

        for vs in valid_scopes:
            # XXX ignores second last asterisk, preventing validation
            # against items that have an asterisk in its name for 
            # whatever reason...
            if vs.endswith('*') and '/' in vs:
                match = subpath.startswith(vs[:vs.rindex('*')])
            else:
                match = subpath == vs
            if match:
                logger.debug('subpath:%s within scope', subpath)
                return True
        logger.debug('out of scope: %s not a subpath in mapping for %s',
            subpath, accessed_type)
        return False
Example #8
0
class Indexer(object):

    filestorage = database = connection = root = None

    def __init__(self, datafs, writable=0, trans=0, pack=0):
        self.trans_limit = trans
        self.pack_limit = pack
        self.trans_count = 0
        self.pack_count = 0
        self.stopdict = get_stopdict()
        self.mh = mhlib.MH()
        self.filestorage = FileStorage(datafs, read_only=(not writable))
        self.database = DB(self.filestorage)
        self.connection = self.database.open()
        self.root = self.connection.root()
        try:
            self.index = self.root["index"]
        except KeyError:
            self.index = self.root["index"] = TextIndexWrapper()
        try:
            self.docpaths = self.root["docpaths"]
        except KeyError:
            self.docpaths = self.root["docpaths"] = IOBTree()
        try:
            self.doctimes = self.root["doctimes"]
        except KeyError:
            self.doctimes = self.root["doctimes"] = IIBTree()
        try:
            self.watchfolders = self.root["watchfolders"]
        except KeyError:
            self.watchfolders = self.root["watchfolders"] = {}
        self.path2docid = OIBTree()
        for docid in self.docpaths.keys():
            path = self.docpaths[docid]
            self.path2docid[path] = docid
        try:
            self.maxdocid = max(self.docpaths.keys())
        except ValueError:
            self.maxdocid = 0
        print(len(self.docpaths), "Document ids")
        print(len(self.path2docid), "Pathnames")
        print(self.index.lexicon.length(), "Words")

    def dumpfreqs(self):
        lexicon = self.index.lexicon
        index = self.index.index
        assert isinstance(index, OkapiIndex)
        L = []
        for wid in lexicon.wids():
            freq = 0
            for f in index._wordinfo.get(wid, {}).values():
                freq += f
            L.append((freq, wid, lexicon.get_word(wid)))
        L.sort()
        L.reverse()
        for freq, wid, word in L:
            print("%10d %10d %s" % (wid, freq, word))

    def dumpwids(self):
        lexicon = self.index.lexicon
        index = self.index.index
        assert isinstance(index, OkapiIndex)
        for wid in lexicon.wids():
            freq = 0
            for f in index._wordinfo.get(wid, {}).values():
                freq += f
            print("%10d %10d %s" % (wid, freq, lexicon.get_word(wid)))

    def dumpwords(self):
        lexicon = self.index.lexicon
        index = self.index.index
        assert isinstance(index, OkapiIndex)
        for word in lexicon.words():
            wid = lexicon.get_wid(word)
            freq = 0
            for f in index._wordinfo.get(wid, {}).values():
                freq += f
            print("%10d %10d %s" % (wid, freq, word))

    def close(self):
        self.root = None
        if self.connection is not None:
            self.connection.close()
            self.connection = None
        if self.database is not None:
            self.database.close()
            self.database = None
        if self.filestorage is not None:
            self.filestorage.close()
            self.filestorage = None

    def interact(self, nbest=NBEST, maxlines=MAXLINES):
        try:
            import readline
        except ImportError:
            pass
        text = ""
        top = 0
        results = []
        while 1:
            try:
                line = raw_input("Query: ")
            except EOFError:
                print("\nBye.")
                break
            line = line.strip()
            if line.startswith("/"):
                self.specialcommand(line, results, top - nbest)
                continue
            if line:
                text = line
                top = 0
            else:
                if not text:
                    continue
            try:
                results, n = self.timequery(text, top + nbest)
            except KeyboardInterrupt:
                raise
            except:
                reportexc()
                text = ""
                continue
            if len(results) <= top:
                if not n:
                    print("No hits for %r." % text)
                else:
                    print("No more hits for %r." % text)
                text = ""
                continue
            print("[Results %d-%d from %d" % (top+1, min(n, top+nbest), n), end=' ')
            print("for query %s]" % repr(text))
            self.formatresults(text, results, maxlines, top, top+nbest)
            top += nbest

    def specialcommand(self, line, results, first):
        assert line.startswith("/")
        line = line[1:]
        if not line:
            n = first
        else:
            try:
                n = int(line) - 1
            except:
                print("Huh?")
                return
        if n < 0 or n >= len(results):
            print("Out of range")
            return
        docid, score = results[n]
        path = self.docpaths[docid]
        i = path.rfind("/")
        assert i > 0
        folder = path[:i]
        n = path[i+1:]
        cmd = "show +%s %s" % (folder, n)
        if os.getenv("DISPLAY"):
            os.system("xterm -e  sh -c '%s | less' &" % cmd)
        else:
            os.system(cmd)

    def query(self, text, nbest=NBEST, maxlines=MAXLINES):
        results, n = self.timequery(text, nbest)
        if not n:
            print("No hits for %r." % text)
            return
        print("[Results 1-%d from %d]" % (len(results), n))
        self.formatresults(text, results, maxlines)

    def timequery(self, text, nbest):
        t0 = time.time()
        c0 = time.clock()
        results, n = self.index.query(text, 0, nbest)
        t1 = time.time()
        c1 = time.clock()
        print("[Query time: %.3f real, %.3f user]" % (t1-t0, c1-c0))
        return results, n

    def formatresults(self, text, results, maxlines=MAXLINES,
                      lo=0, hi=sys.maxint):
        stop = self.stopdict.has_key
        words = [w for w in re.findall(r"\w+\*?", text.lower()) if not stop(w)]
        pattern = r"\b(" + "|".join(words) + r")\b"
        pattern = pattern.replace("*", ".*") # glob -> re syntax
        prog = re.compile(pattern, re.IGNORECASE)
        print('='*70)
        rank = lo
        for docid, score in results[lo:hi]:
            rank += 1
            path = self.docpaths[docid]
            score *= 100.0
            print("Rank:    %d   Score: %d%%   File: %s" % (rank, score, path))
            path = os.path.join(self.mh.getpath(), path)
            try:
                fp = open(path)
            except (IOError, OSError) as msg:
                print("Can't open:", msg)
                continue
            msg = mhlib.Message("<folder>", 0, fp)
            for header in "From", "To", "Cc", "Bcc", "Subject", "Date":
                h = msg.getheader(header)
                if h:
                    print("%-8s %s" % (header+":", h))
            text = self.getmessagetext(msg)
            if text:
                print()
                nleft = maxlines
                for part in text:
                    for line in part.splitlines():
                        if prog.search(line):
                            print(line)
                            nleft -= 1
                            if nleft <= 0:
                                break
                    if nleft <= 0:
                        break
            print('-'*70)

    def update(self, args):
        folder = None
        seqs = []

        for arg in args:
            if arg.startswith("+"):
                if folder is None:
                    folder = arg[1:]
                else:
                    print("only one folder at a time")
                    return
            else:
                seqs.append(arg)

        if not folder:
            folder = self.mh.getcontext()
        if not seqs:
            seqs = ['all']

        try:
            f = self.mh.openfolder(folder)
        except mhlib.Error as msg:
            print(msg)
            return

        dict = {}
        for seq in seqs:
            try:
                nums = f.parsesequence(seq)
            except mhlib.Error as msg:
                print(msg or "unparsable message sequence: %s" % repr(seq))
                return
            for n in nums:
                dict[n] = n
        msgs = dict.keys()
        msgs.sort()

        self.updatefolder(f, msgs)
        self.commit()

    def optimize(self, args):
        uniqwords = {}
        for folder in args:
            if folder.startswith("+"):
                folder = folder[1:]
            print("\nOPTIMIZE FOLDER", folder)
            try:
                f = self.mh.openfolder(folder)
            except mhlib.Error as msg:
                print(msg)
                continue
            self.prescan(f, f.listmessages(), uniqwords)
        L = [(uniqwords[word], word) for word in uniqwords.keys()]
        L.sort()
        L.reverse()
        for i in range(100):
            print("%3d. %6d %s" % ((i+1,) + L[i]))
        self.index.lexicon.sourceToWordIds([word for (count, word) in L])

    def prescan(self, f, msgs, uniqwords):
        pipeline = [Splitter(), CaseNormalizer(), StopWordRemover()]
        for n in msgs:
            print("prescanning", n)
            m = f.openmessage(n)
            text = self.getmessagetext(m, f.name)
            for p in pipeline:
                text = p.process(text)
            for word in text:
                uniqwords[word] = uniqwords.get(word, 0) + 1

    def bulkupdate(self, args):
        if not args:
            print("No folders specified; use ALL to bulk-index all folders")
            return
        if "ALL" in args:
            i = args.index("ALL")
            args[i:i+1] = self.mh.listfolders()
        for folder in args:
            if folder.startswith("+"):
                folder = folder[1:]
            print("\nFOLDER", folder)
            try:
                f = self.mh.openfolder(folder)
            except mhlib.Error as msg:
                print(msg)
                continue
            self.updatefolder(f, f.listmessages())
            print("Total", len(self.docpaths))
        self.commit()
        print("Indexed", self.index.lexicon._nbytes, "bytes and", end=' ')
        print(self.index.lexicon._nwords, "words;", end=' ')
        print(len(self.index.lexicon._words), "unique words.")

    def updatefolder(self, f, msgs):
        self.watchfolders[f.name] = self.getmtime(f.name)
        for n in msgs:
            path = "%s/%s" % (f.name, n)
            docid = self.path2docid.get(path, 0)
            if docid and self.getmtime(path) == self.doctimes.get(docid, 0):
                print("unchanged", docid, path)
                continue
            docid = self.newdocid(path)
            try:
                m = f.openmessage(n)
            except IOError:
                print("disappeared", docid, path)
                self.unindexpath(path)
                continue
            text = self.getmessagetext(m, f.name)
            if not text:
                self.unindexpath(path)
                continue
            print("indexing", docid, path)
            self.index.index_doc(docid, text)
            self.maycommit()
        # Remove messages from the folder that no longer exist
        for path in list(self.path2docid.keys(f.name)):
            if not path.startswith(f.name + "/"):
                break
            if self.getmtime(path) == 0:
                self.unindexpath(path)
        print("done.")

    def unindexpath(self, path):
        if path in self.path2docid:
            docid = self.path2docid[path]
            print("unindexing", docid, path)
            del self.docpaths[docid]
            del self.doctimes[docid]
            del self.path2docid[path]
            try:
                self.index.unindex_doc(docid)
            except KeyError as msg:
                print("KeyError", msg)
            self.maycommit()

    def getmessagetext(self, m, name=None):
        L = []
        if name:
            L.append("_folder " + name) # To restrict search to a folder
            self.getheaders(m, L)
        try:
            self.getmsgparts(m, L, 0)
        except KeyboardInterrupt:
            raise
        except:
            print("(getmsgparts failed:)")
            reportexc()
        return L

    def getmsgparts(self, m, L, level):
        ctype = m.gettype()
        if level or ctype != "text/plain":
            print(". "*level + str(ctype))
        if ctype == "text/plain":
            L.append(m.getbodytext())
        elif ctype in ("multipart/alternative", "multipart/mixed"):
            for part in m.getbodyparts():
                self.getmsgparts(part, L, level+1)
        elif ctype == "message/rfc822":
            f = StringIO(m.getbodytext())
            m = mhlib.Message("<folder>", 0, f)
            self.getheaders(m, L)
            self.getmsgparts(m, L, level+1)

    def getheaders(self, m, L):
        H = []
        for key in "from", "to", "cc", "bcc", "subject":
            value = m.get(key)
            if value:
                H.append(value)
        if H:
            L.append("\n".join(H))

    def newdocid(self, path):
        docid = self.path2docid.get(path)
        if docid is not None:
            self.doctimes[docid] = self.getmtime(path)
            return docid
        docid = self.maxdocid + 1
        self.maxdocid = docid
        self.docpaths[docid] = path
        self.doctimes[docid] = self.getmtime(path)
        self.path2docid[path] = docid
        return docid

    def getmtime(self, path):
        path = os.path.join(self.mh.getpath(), path)
        try:
            st = os.stat(path)
        except os.error as msg:
            return 0
        return int(st[ST_MTIME])

    def maycommit(self):
        self.trans_count += 1
        if self.trans_count >= self.trans_limit > 0:
            self.commit()

    def commit(self):
        if self.trans_count > 0:
            print("committing...")
            transaction.commit()
            self.trans_count = 0
            self.pack_count += 1
            if self.pack_count >= self.pack_limit > 0:
                self.pack()

    def pack(self):
        if self.pack_count > 0:
            print("packing...")
            self.database.pack()
            self.pack_count = 0
Example #9
0
class Lexicon(Persistent):
    """Maps words to word ids """

    __implements__ = LexiconInterface

    def __init__(self, truncate_left=0):
        self.truncate_left = truncate_left
        self.clear()

    def clear(self):
        self._nextid      = BTrees.Length.Length()
        self._forward_idx = OIBTree()
        self._inverse_idx = IOBTree()
        if self.truncate_left:
            self._lforward_idx = OIBTree()
        else:
            self._lforward_idx = None

    def getWordIdList(self, words):
        """ return a list of wordIds for a list of words """
    
        fw_idx = self._forward_idx
        fw_idx_get = fw_idx.get
        rev_idx = self._inverse_idx
        if self.truncate_left: lfw_idx = self._lforward_idx
        nextid = self._nextid

        wids = []
        append = wids.append

        for word in words:
            wid = fw_idx_get(word)
            if not wid:         
                nextid.change(1)
                wid = nextid()
                fw_idx[word] = wid
                rev_idx[wid] = word
                if self.truncate_left:
                    lfw_idx[word[::-1]] = wid
            append(wid)
        return wids

    def getWordId(self, word, default=None):
        """Return the matched word against the key."""
        return  self._forward_idx.get(word, default)

    def getWord(self, wid):
        """ return a word by its wid"""
        return self._inverse_idx[wid]

    def deleteWord(self, word):
        wid = self._forward_idx[word]
        del self._inverse_idx[wid]
        del self._forward_idx[word]

    def deleteWordId(self, wid):
        word = self._inverse_idx[wid]
        del self._forward_idx[word]
        del self._inverse_idx[wid]

    def getWordsForRightTruncation(self, prefix):
        """ Return a list for wordIds that match against prefix.
            We use the BTrees range search to perform the search
        """
        assert isinstance(prefix, unicode)
        return  self._forward_idx.keys(prefix, prefix + u'\uffff') 

    def getWordsForLeftTruncation(self, suffix):
        """ Return a sequence of word ids for a common suffix """
        suffix = suffix[::-1]
        assert isinstance(suffix, unicode)
        return  [w[::-1] for w in  self._lforward_idx.keys(suffix, suffix + u'\uffff') ] 

    def createRegex(self, pattern):
        """Translate a PATTERN to a regular expression """
        return '%s$' % pattern.replace( '*', '.*').replace( '?', '.')

    def getSimiliarWords(self, term, threshold=0.75): 
        """ return a list of similar words based on the levenshtein distance """
        return [ (w, ratio(w,term)) for w in self._forward_idx.keys() if ratio(w, term) > threshold  ]

    def getWordsForPattern(self, pattern):
        """ perform full pattern matching """

        # search for prefix in word
        mo = re.search('([\?\*])', pattern)
        if mo is None:
            return [ pattern ] 

        pos = mo.start(1)
        if pos==0: raise QueryParserError, \
            'word "%s" should not start with a globbing character' % pattern

        prefix = pattern[:pos]
        words = self._forward_idx.keys(prefix, prefix + u'\uffff')
        regex = re.compile( self.createRegex(pattern) )
        return [word  for word in words if regex.match(word) ] 

    def getWordsInRange(self, w1, w2):
        """ return all words within w1...w2 """
        return self._forward_idx.keys(w1, w2)

    def getWordsForSubstring(self, sub):
        """ return all words that match *sub* """
        return [word for word in self._forward_idx.keys() if sub in word]

    def getWordIds(self):
        """ return all wids """
        return self._inverse_idx.keys()

    def removeWordId(self, wid):
        """ remove word id 'wid' """
        word = self._inverse_idx[wid]
        del self._inverse_idx[wid]
        del self._forward_idx[word]

    def __len__(self):
        return len(self._inverse_idx.keys())