Code Example #1
File: text.py  Project: yonglehou/zerodb
class Lexicon(_Lexicon):
    family = trees.family32  # use bigger buckets than the standard Lexicon

    def __init__(self, *pipeline):
        self._wids = self.family.OI.BTree()
        self._words = self.family.IO.BTree()
        self.wordCount = Length()
        self._pipeline = pipeline

    def sourceToWordIds(self, text):
        if text is None:
            text = ''
        last = _text2list(text)
        for element in self._pipeline:
            last = element.process(last)
        if not isinstance(self.wordCount, Length):
            # Make sure wordCount is overridden with a BTrees.Length.Length
            self.wordCount = Length(self.wordCount())
        # Strategically unload the length value so that we read the most
        # recent value written to the database, minimizing conflicting wids.
        # Because the Length object is an independent persistent object,
        # deactivating it forces a reload of the most recently stored
        # value, regardless of whether MVCC is enabled.
        self.wordCount._p_deactivate()
        # Prefetch the _wids BTree nodes for every term in one batch
        parallel_traversal(self._wids, last)
        return list(map(self._getWordIdCreate, last))
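
The pattern to note above is the counter handling: BTrees.Length.Length resolves concurrent increments without write conflicts, and _p_deactivate() ghosts the object so the next access reloads the freshest committed value. A minimal standalone sketch of that counter behavior (only the BTrees package is assumed; with no ZODB connection attached, _p_deactivate() is a harmless no-op):

from BTrees.Length import Length

counter = Length()   # conflict-resolving persistent counter
counter.change(1)    # increment by 1
counter.change(1)
print(counter())     # -> 2; calling the object returns its value

# Inside a ZODB connection this would ghost the object so the next
# access reloads the most recently committed value from storage.
counter._p_deactivate()
print(counter())     # -> 2 (no connection here, so nothing changes)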
Code Example #2
File: canopy_index.py  Project: pombredanne/dedupe
class CanopyLexicon(Lexicon):  # pragma: no cover
    def sourceToWordIds(self, last):
        if last is None:
            last = []
        if not isinstance(self.wordCount, Length):
            self.wordCount = Length(self.wordCount())
        self.wordCount._p_deactivate()
        return list(map(self._getWordIdCreate, last))
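
Since this override drops the base class's _text2list call and runs no pipeline, sourceToWordIds here expects an already-tokenized list rather than raw text. A hypothetical call (the token list is illustrative):

lexicon = CanopyLexicon()
wids = lexicon.sourceToWordIds(['smith', 'john', 'smith'])
# one wid per token; repeated tokens resolve to the same wid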
Code Example #3
File: index.py  Project: shahin/dedupe
class CanopyLexicon(Lexicon):  # pragma: no cover
    def __init__(self, stop_words):
        super(CanopyLexicon, self).__init__()
        self._pipeline = [CustomStopWordRemover(stop_words)]

    def sourceToWordIds(self, last):
        if last is None:
            last = []
        for element in self._pipeline:
            last = element.process(last)
        if not isinstance(self.wordCount, Length):
            self.wordCount = Length(self.wordCount())
        self.wordCount._p_deactivate()
        return list(map(self._getWordIdCreate, last))
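
CustomStopWordRemover is defined elsewhere in dedupe and not shown in these excerpts; a minimal sketch of a pipeline element with the interface the loop above expects (process takes and returns a list of terms) might look like this:

class CustomStopWordRemover(object):
    """Pipeline element: drop user-supplied stop words (sketch)."""

    def __init__(self, stop_words):
        self.stop_words = set(stop_words)

    def process(self, lst):
        return [w for w in lst if w not in self.stop_words]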
Code Example #4
File: index.py  Project: rkiddy/dedupe
class CanopyLexicon(Lexicon):  # pragma: no cover
    def __init__(self, stop_words):
        super(CanopyLexicon, self).__init__()
        self._pipeline = [
            Splitter(),
            CustomStopWordRemover(stop_words),
            OperatorEscaper()
        ]

    def sourceToWordIds(self, doc):
        if doc is None:
            doc = ''
        last = stringify(doc)  # changed relative to the upstream Lexicon, which uses _text2list
        for element in self._pipeline:
            last = element.process(last)
        if not isinstance(self.wordCount, Length):
            self.wordCount = Length(self.wordCount())
        self.wordCount._p_deactivate()
        return list(map(self._getWordIdCreate, last))
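
With Splitter back in the pipeline this variant accepts raw text again; stringify is a dedupe helper (not shown here) that normalizes the document into the list-of-strings form the pipeline expects. A hypothetical call, with illustrative arguments:

lexicon = CanopyLexicon(stop_words=['the', 'of'])
wids = lexicon.sourceToWordIds('the university of chicago')
# the text flows through Splitter, CustomStopWordRemover and
# OperatorEscaper in order; one wid is returned per surviving token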
Code Example #5
File: lexicon.py  Project: the-cc-dev/zope.index
import re

from persistent import Persistent
from BTrees.IOBTree import IOBTree
from BTrees.Length import Length
from BTrees.OIBTree import OIBTree
from zope.index.text.parsetree import QueryError


class Lexicon(Persistent):
    """
    Implementation of :class:`zope.index.text.interfaces.ILexicon`.
    """

    def __init__(self, *pipeline):
        self._wids = OIBTree()  # word -> wid
        self._words = IOBTree()  # wid -> word
        # wid 0 is reserved for words that aren't in the lexicon (OOV -- out
        # of vocabulary).  This can happen, e.g., if a query contains a word
        # we never saw before, and that isn't a known stopword (or otherwise
        # filtered out).  Returning a special wid value for OOV words is a
        # way to let clients know when an OOV word appears.
        self.wordCount = Length()
        self._pipeline = pipeline

    def wordCount(self):
        """Return the number of unique terms in the lexicon."""
        # Shadowed by the instance attribute set in __init__; kept as a
        # fallback for instances created before wordCount became a
        # BTrees.Length.Length (see the isinstance check in sourceToWordIds).
        return len(self._wids)

    def words(self):
        return self._wids.keys()

    def wids(self):
        return self._words.keys()

    def items(self):
        return self._wids.items()

    def sourceToWordIds(self, text):
        if text is None:
            text = ''
        last = _text2list(text)
        for element in self._pipeline:
            last = element.process(last)
        if not isinstance(self.wordCount, Length):
            # Make sure wordCount is overridden with a BTrees.Length.Length
            self.wordCount = Length(self.wordCount())
        # Strategically unload the length value so that we read the most
        # recent value written to the database, minimizing conflicting wids.
        # Because the Length object is an independent persistent object,
        # deactivating it forces a reload of the most recently stored
        # value, regardless of whether MVCC is enabled.
        self.wordCount._p_deactivate()
        return list(map(self._getWordIdCreate, last))

    def termToWordIds(self, text):
        last = _text2list(text)
        for element in self._pipeline:
            last = element.process(last)
        wids = []
        for word in last:
            wids.append(self._wids.get(word, 0))
        return wids

    def parseTerms(self, text):
        last = _text2list(text)
        for element in self._pipeline:
            process = getattr(element, "processGlob", element.process)
            last = process(last)
        return last

    def isGlob(self, word):
        return "*" in word or "?" in word

    def get_word(self, wid):
        return self._words[wid]

    def get_wid(self, word):
        return self._wids.get(word, 0)

    def globToWordIds(self, pattern):
        # Implement * and ? just as in the shell, except the pattern
        # must not start with either of these
        prefix = ""
        while pattern and pattern[0] not in "*?":
            prefix += pattern[0]
            pattern = pattern[1:]
        if not pattern:
            # There were no globbing characters in the pattern
            wid = self._wids.get(prefix, 0)
            if wid:
                return [wid]
            else:
                return []
        if not prefix:
            # The pattern starts with a globbing character.
            # Matching it would require scanning every key, which is too
            # inefficient, so we raise an exception.
            raise QueryError(
                "pattern %r shouldn't start with glob character" % pattern)
        pat = prefix
        for c in pattern:
            if c == "*":
                pat += ".*"
            elif c == "?":
                pat += "."
            else:
                pat += re.escape(c)
        pat += "$"
        prog = re.compile(pat)
        keys = self._wids.keys(prefix)  # all keys >= prefix (BTree range search)
        wids = []
        for key in keys:
            if not key.startswith(prefix):
                break
            if prog.match(key):
                wids.append(self._wids[key])
        return wids

    def _getWordIdCreate(self, word):
        wid = self._wids.get(word)
        if wid is None:
            wid = self._new_wid()
            self._wids[word] = wid
            self._words[wid] = word
        return wid

    def _new_wid(self):
        count = self.wordCount
        count.change(1)
        while count() in self._words:
            # skip wids that are already taken, just to be safe
            count.change(1)
        return count()
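
(_text2list, used by sourceToWordIds and parseTerms, is a small helper defined alongside this class that wraps a bare string in a one-element list and passes lists through unchanged.) A usage sketch of this upstream class with the pipeline elements zope.index.text ships:

from zope.index.text.lexicon import (
    Lexicon, Splitter, CaseNormalizer, StopWordRemover)

lexicon = Lexicon(Splitter(), CaseNormalizer(), StopWordRemover())
wids = lexicon.sourceToWordIds('The quick brown Fox')
print(wids)                           # e.g. [1, 2, 3] ('the' is a stop word)
print(lexicon.termToWordIds('fox'))   # same wid as above; 0 means out-of-vocabulary
print(lexicon.globToWordIds('qui*'))  # wids of all words matching the glob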