コード例 #1
ファイル: wordnet.py プロジェクト: archatas/whoosh
def parse_file(f):
    """Parse the WordNet ``wn_s.pl`` prolog file into two lookup tables.

    Only lines that start with ``s(`` are read. From each such line the
    leading integer (everything before the first comma) and the first
    single-quoted string (lower-cased) are extracted. Entries whose word is
    not purely alphabetic are ignored.

    :param f: an iterable of text lines, for example an open file.
    :returns: a ``(word2nums, num2words)`` tuple of ``defaultdict(list)``
        objects; ``word2nums`` maps a word to the numbers it appeared with
        and ``num2words`` maps a number to the words it appeared with.
    """
    word2nums = defaultdict(list)
    num2words = defaultdict(list)

    for line in f:
        # NOTE(review): the guard bodies were missing from this excerpt;
        # ``continue`` is the only reading under which the loop makes sense.
        if not line.startswith("s("):
            continue

        line = line[2:]
        num = int(line[:line.find(",")])

        # The word is the text between the first pair of single quotes.
        qt = line.find("'")
        line = line[qt + 1:]
        qt = line.find("'")
        word = line[:qt].lower()

        if not word.isalpha():
            continue

        # NOTE(review): the lines that fill the two tables were also missing;
        # restored from the variable names and the documented return value.
        word2nums[word].append(num)
        num2words[num].append(word)

    return word2nums, num2words
コード例 #2
ファイル: wordnet.py プロジェクト: ra2003/whoosh
def parse_file(f):
    """Parse the WordNet ``wn_s.pl`` prolog file and return two dictionaries.

    Each line beginning with ``s(`` contributes one (number, word) pair: the
    number is the integer before the first comma and the word is the
    lower-cased content of the first single-quoted string. Pairs whose word
    contains anything other than letters are dropped.

    :param f: an iterable of text lines, for example an open file.
    :returns: ``(word2nums, num2words)``, both ``defaultdict(list)``.
    """

    word2nums = defaultdict(list)
    num2words = defaultdict(list)

    for line in f:
        # NOTE(review): this excerpt had lost the body of both ``if`` guards;
        # skipping the line is the evident intent.
        if not line.startswith("s("):
            continue

        line = line[2:]
        num = int(line[:line.find(",")])
        qt = line.find("'")
        line = line[qt + 1:]
        qt = line.find("'")
        word = line[:qt].lower()

        if not word.isalpha():
            continue

        # NOTE(review): population of the two tables was missing from the
        # excerpt and is restored here -- confirm against upstream.
        word2nums[word].append(num)
        num2words[num].append(word)

    return word2nums, num2words
コード例 #3
ファイル: paicehusk.py プロジェクト: ra2003/whoosh
 def __init__(self, ruletable):
     """Initialise an empty rule table.

     :param ruletable: a string containing the rule data, separated
         by newlines.
     """
     # NOTE(review): ``ruletable`` is not consumed in the visible excerpt;
     # presumably it is parsed into ``self.rules`` after this point -- confirm.
     self.rules = defaultdict(list)
コード例 #4
ファイル: classify.py プロジェクト: archatas/whoosh
 def __init__(self, ixreader, fieldname, model=Bo1Model):
     """Set up per-field statistics used while expanding query terms.

     :param ixreader: A :class:`whoosh.reading.IndexReader` object.
     :param fieldname: The name of the field in which to search.
     :param model: (classify.ExpansionModel) The model to use for expanding
         the query terms. May be given as a class, in which case it is
         instantiated with ``(ixreader, fieldname)``, or as a ready-made
         instance. If you omit this parameter, the expander uses
         scoring.Bo1Model by default.
     """
     self.fieldname = fieldname

     # Accept either a model class or an instance. ``isinstance`` also
     # recognises classes built with a custom metaclass, which
     # ``type(model) is type`` would have stored uninstantiated.
     if isinstance(model, type):
         model = model(ixreader, fieldname)
     self.model = model

     # Cache the collection frequency of every term in this field. This
     # turns out to be much faster than reading each individual weight from
     # the term index as we add words.
     self.collection_freq = dict((word, freq) for word, _, freq
                                 in ixreader.iter_field(fieldname))

     # Maps words to their weight in the top N documents.
     self.topN_weight = defaultdict(float)
     # Total weight of all terms in the top N documents.
     self.top_total = 0
コード例 #5
ファイル: classify.py プロジェクト: ra2003/whoosh
    def __init__(self, ixreader, fieldname, model=Bo1Model):
        """Prepare the term-frequency caches for one field of an index.

        :param ixreader: A :class:`whoosh.reading.IndexReader` object.
        :param fieldname: The name of the field in which to search.
        :param model: (classify.ExpansionModel) The model to use for expanding
            the query terms, given either as a class (instantiated here with
            ``(ixreader, fieldname)``) or as an instance. If you omit this
            parameter, the expander uses scoring.Bo1Model by default.
        """

        self.fieldname = fieldname

        # ``isinstance`` (rather than ``type(model) is type``) so that model
        # classes defined with a metaclass are instantiated as well.
        if isinstance(model, type):
            model = model(ixreader, fieldname)
        self.model = model

        # Cache the collection frequency of every term in this field. This
        # turns out to be much faster than reading each individual weight from
        # the term index as we add words.
        self.collection_freq = dict(
            (word, freq) for word, _, freq in ixreader.iter_field(fieldname))

        # Maps words to their weight in the top N documents.
        self.topN_weight = defaultdict(float)

        # Total weight of all terms in the top N documents.
        self.top_total = 0
コード例 #6
ファイル: paicehusk.py プロジェクト: archatas/whoosh
 def __init__(self, ruletable):
     """Start with an empty mapping of rules.

     :param ruletable: a string containing the rule data, separated
         by newlines.
     """
     # NOTE(review): nothing in this excerpt reads ``ruletable``; the code
     # that loads it into ``self.rules`` is presumably cut off -- confirm.
     self.rules = defaultdict(list)
コード例 #7
    def word_values(self, value, doc_boost=1.0, **kwargs):
        """Return a generator of ``(text, frequency, encoded)`` triples.

        The value is run through ``self.analyzer`` (extra keyword arguments
        are forwarded), tokens rejected by ``unstopped`` are dropped, and the
        remaining tokens are counted by their ``text``. Each triple carries
        ``self.encode((frequency, doc_boost))`` as its third item.
        """
        counts = defaultdict(int)
        tokens = self.analyzer(value, **kwargs)
        for token in unstopped(tokens):
            counts[token.text] += 1

        pack = self.encode
        return ((text, n, pack((n, doc_boost)))
                for text, n in counts.iteritems())
コード例 #8
ファイル: formats.py プロジェクト: archatas/whoosh
 def word_values(self, value, doc_boost=1.0, **kwargs):
     """Count token texts in ``value`` and pair each with an encoded value.

     Returns a generator of ``(text, frequency, encoded)`` where ``encoded``
     is ``self.encode((frequency, doc_boost))``. Keyword arguments are passed
     through to ``self.analyzer``; only tokens yielded by ``unstopped`` are
     counted.
     """
     tally = defaultdict(int)
     token_stream = unstopped(self.analyzer(value, **kwargs))
     for tok in token_stream:
         tally[tok.text] += 1
     to_bytes = self.encode
     return ((term, count, to_bytes((count, doc_boost)))
             for term, count in tally.iteritems())
コード例 #9
ファイル: formats.py プロジェクト: archatas/whoosh
 def word_values(self, value, start_pos=0, **kwargs):
     """Collect the positions at which each token text occurs in ``value``.

     ``self.analyzer`` is called with ``positions=True`` and ``start_pos``;
     each recorded position is ``start_pos + token.pos``. Returns a generator
     of ``(text, number_of_positions, self.encode(position_list))``.
     """
     where = defaultdict(list)
     token_stream = self.analyzer(value, positions=True,
                                  start_pos=start_pos, **kwargs)
     for tok in unstopped(token_stream):
         where[tok.text].append(start_pos + tok.pos)
     to_bytes = self.encode
     return ((term, len(spots), to_bytes(spots))
             for term, spots in where.iteritems())
コード例 #10
ファイル: ramindex.py プロジェクト: archatas/whoosh
 def __init__(self, schema):
     """Create the empty in-memory structures for an index over ``schema``.

     :param schema: the schema object; its length gives the number of
         fields and ``stored_fields()`` lists the stored field numbers.
     """
     self.schema = schema
     self.maxdoc = 0
     self._sync_lock = Lock()
     self.termlists = defaultdict(list)

     # One defaultdict(list) per field number.
     per_field = {}
     field_count = len(schema)
     for fieldnum in xrange(field_count):
         per_field[fieldnum] = defaultdict(list)
     self.invertedindex = per_field

     self.indexfreqs = defaultdict(int)
     self.storedfields = {}
     self.fieldlengths = defaultdict(int)
     self.fieldlength_totals = defaultdict(int)
     self.vectors = {}
     self.deleted = set()

     # Field number -> index of that field in the stored-fields sequence.
     stored_positions = {}
     for pos, fnum in enumerate(self.schema.stored_fields()):
         stored_positions[fnum] = pos
     self._stored_to_pos = stored_positions
コード例 #11
ファイル: ramindex.py プロジェクト: ra2003/whoosh
    def __init__(self, schema):
        """Initialise empty in-memory index state for the given schema.

        :param schema: the schema object; ``len(schema)`` is used as the
            field count and ``schema.stored_fields()`` as the sequence of
            stored field numbers.
        """
        self.schema = schema
        self.maxdoc = 0
        self._sync_lock = Lock()

        self.termlists = defaultdict(list)
        # Field number -> its own defaultdict(list).
        self.invertedindex = inverted = {}
        for num in xrange(len(schema)):
            inverted[num] = defaultdict(list)
        self.indexfreqs = defaultdict(int)

        self.storedfields = {}
        self.fieldlengths = defaultdict(int)
        self.fieldlength_totals = defaultdict(int)
        self.vectors = {}
        self.deleted = set()

        # Field number -> position within the stored-fields sequence.
        stored = self.schema.stored_fields()
        self._stored_to_pos = dict(
            (field_number, index) for index, field_number in enumerate(stored))
コード例 #12
ファイル: formats.py プロジェクト: archatas/whoosh
 def word_values(self, value, **kwargs):
     """Return a generator of ``(text, frequency, encoded_frequency)``.

     When ``self.boost_as_freq`` is true the analyzer is asked for boosts
     and each token adds ``int(token.boost)`` to its text's frequency;
     otherwise each token adds 1. Extra keyword arguments are forwarded to
     ``self.analyzer``.
     """
     seen = defaultdict(int)
     if self.boost_as_freq:
         for t in unstopped(self.analyzer(value, boosts=True, **kwargs)):
             seen[t.text] += int(t.boost)
     else:
         # The excerpt had lost this ``else``: the plain count then ran on
         # top of the boost count (boost + 1 per token) and nothing at all
         # was counted when ``boost_as_freq`` was false.
         for t in unstopped(self.analyzer(value, **kwargs)):
             seen[t.text] += 1
     encode = self.encode
     return ((w, freq, encode(freq)) for w, freq in seen.iteritems())
コード例 #13
ファイル: formats.py プロジェクト: archatas/whoosh
 def word_values(self, value, start_pos=0, start_char=0, **kwargs):
     """Collect ``(pos, startchar, endchar)`` triples for each token text.

     ``self.analyzer`` is called with ``positions=True``, ``chars=True`` and
     ``start_char``; character offsets are shifted by ``start_char`` while
     ``pos`` is stored as reported. ``start_pos`` is accepted but not used
     in this body. Returns a generator of
     ``(text, number_of_triples, self.encode(triples))``.
     """
     spans = defaultdict(list)
     token_stream = self.analyzer(value, positions=True, chars=True,
                                  start_char=start_char, **kwargs)
     for tok in unstopped(token_stream):
         entry = (tok.pos, start_char + tok.startchar,
                  start_char + tok.endchar)
         spans[tok.text].append(entry)
     to_bytes = self.encode
     return ((term, len(entries), to_bytes(entries))
             for term, entries in spans.iteritems())
コード例 #14
    def word_values(self, value, **kwargs):
        """Return a generator of ``(text, frequency, encoded_frequency)``.

        With ``self.boost_as_freq`` set, the analyzer is called with
        ``boosts=True`` and a token contributes ``int(token.boost)`` to its
        text's frequency; without it, every token contributes 1. Extra
        keyword arguments are forwarded to ``self.analyzer``.
        """
        seen = defaultdict(int)
        if self.boost_as_freq:
            for t in unstopped(self.analyzer(value, boosts=True, **kwargs)):
                seen[t.text] += int(t.boost)
        else:
            # Restored ``else``: without it both loops ran when the flag was
            # set (double counting) and neither ran when it was clear.
            for t in unstopped(self.analyzer(value, **kwargs)):
                seen[t.text] += 1

        encode = self.encode
        return ((w, freq, encode(freq)) for w, freq in seen.iteritems())
コード例 #15
    def word_values(self, value, start_pos=0, **kwargs):
        """Collect the positions at which each token text occurs in ``value``.

        Each recorded position is ``start_pos + token.pos``. Returns a
        generator of ``(text, number_of_positions, self.encode(positions))``.
        """
        seen = defaultdict(list)
        # The ``for`` header was cut off after ``unstopped(`` in this excerpt;
        # the analyzer call below is the one used by the intact copy of this
        # method elsewhere in the listing.
        for t in unstopped(self.analyzer(value, positions=True,
                                         start_pos=start_pos, **kwargs)):
            seen[t.text].append(start_pos + t.pos)

        encode = self.encode
        return ((w, len(poslist), encode(poslist))
                for w, poslist in seen.iteritems())
コード例 #16
ファイル: spelling.py プロジェクト: ra2003/whoosh
    def suggestions_and_scores(self, text, weighting=None):
        """Returns a list of possible alternative spellings of 'text', as
        ('word', score, weight) triples, where 'word' is the suggested
        word, 'score' is the score that was assigned to the word using
        :meth:`SpellChecker.add_field` or :meth:`SpellChecker.add_scored_words`,
        and 'weight' is the score the word received in the search for the
        original word's ngrams.

        You must add words to the dictionary (using add_field, add_words,
        and/or add_scored_words) before you can use this.

        This is a lower-level method, in case an expert user needs access to
        the raw scores, for example to implement a custom suggestion ranking
        algorithm. Most people will want to call :meth:`~SpellChecker.suggest`
        instead, which simply returns the top N valued words.

        :param text: The word to check.
        :param weighting: the weighting object handed to the searcher; a
            ``TF_IDF()`` instance is created when omitted.
        :rtype: list
        """

        if weighting is None:
            weighting = TF_IDF()

        # Break the word into n-grams of every configured size.
        grams = defaultdict(list)
        for size in xrange(self.mingram, self.maxgram + 1):
            key = "gram%s" % size
            nga = analysis.NgramAnalyzer(size)
            for t in nga(text):
                # NOTE(review): this loop body was missing from the excerpt;
                # collecting the token text under ``key`` is inferred from
                # how ``grams[key]`` is read below -- confirm.
                grams[key].append(t.text)

        queries = []
        for size in xrange(self.mingram, min(self.maxgram + 1, len(text))):
            key = "gram%s" % size
            gramlist = grams[key]
            # NOTE(review): the "start" term was truncated in the excerpt;
            # ``gramlist[0]`` and ``self.booststart`` mirror the intact "end"
            # term on the next statement -- confirm the attribute name.
            queries.append(
                query.Term("start%s" % size, gramlist[0],
                           boost=self.booststart))
            queries.append(
                query.Term("end%s" % size, gramlist[-1], boost=self.boostend))
            for gram in gramlist:
                queries.append(query.Term(key, gram))

        q = query.Or(queries)
        ix = self.index()
        s = ix.searcher(weighting=weighting)
        # NOTE(review): the search/return lines were indented under a
        # statement the excerpt dropped; a try/finally that closes the
        # searcher is assumed -- confirm.
        try:
            result = s.search(q)
            # The queried word itself is not a suggestion.
            return [(fs["word"], fs["score"], result.score(i))
                    for i, fs in enumerate(result) if fs["word"] != text]
        finally:
            s.close()
コード例 #17
ファイル: filewriting.py プロジェクト: archatas/whoosh
    def __init__(self, ix, postlimit, blocklimit, name=None):
        """Open the files and tables needed to write a new segment.

        :param ix: the Index object in which to write the new segment.
        :param postlimit: the maximum size for a run in the posting pool.
        :param blocklimit: the maximum number of postings in a posting block.
        :param name: the name of the segment. When omitted or empty,
            ``ix._next_segment_name()`` supplies one.
        """

        self.index = ix
        self.schema = ix.schema
        self.storage = storage = ix.storage
        self.name = name or ix._next_segment_name()

        self.max_doc = 0

        self.pool = postpool.PostingPool(postlimit)

        # Create mappings of field numbers to the position of that field in the
        # lists of scorable and stored fields. For example, consider a schema
        # with fields (A, B, C, D, E, F). If B, D, and E are scorable, then the
        # list of scorable fields is (B, D, E). The _scorable_to_pos dictionary
        # would then map B -> 0, D -> 1, and E -> 2.
        self._scorable_to_pos = dict((fnum, i)
                                     for i, fnum
                                     in enumerate(self.schema.scorable_fields()))
        self._stored_to_pos = dict((fnum, i)
                                   for i, fnum
                                   in enumerate(self.schema.stored_fields()))

        # Create a temporary segment object just so we can access its
        # *_filename attributes (so if we want to change the naming convention,
        # we only have to do it in one place).
        tempseg = Segment(self.name, 0, 0, None)
        self.termtable = create_terms(storage, tempseg)
        self.docslist = create_storedfields(storage, tempseg)
        # Stays None when the schema has no scorable fields.
        self.doclengths = None
        if self.schema.scorable_fields():
            self.doclengths = create_doclengths(storage, tempseg,
                                                len(self._scorable_to_pos))

        postfile = storage.create_file(tempseg.posts_filename)
        self.postwriter = FilePostingWriter(postfile, blocklimit=blocklimit)

        # ``vpostwriter`` is only assigned when the schema has vectored
        # fields; ``vectortable`` is None otherwise.
        self.vectortable = None
        if self.schema.has_vectored_fields():
            # Table associating document fields with (postoffset, postcount)
            self.vectortable = create_vectors(storage, tempseg)
            vpostfile = storage.create_file(tempseg.vectorposts_filename)
            self.vpostwriter = FilePostingWriter(vpostfile, stringids=True)

        # Keep track of the total number of tokens (across all docs)
        # in each field
        self.field_length_totals = defaultdict(int)
コード例 #18
ファイル: filewriting.py プロジェクト: ra2003/whoosh
    def __init__(self, ix, postlimit, blocklimit, name=None):
        """Open the files and tables needed to write a new segment.

        :param ix: the Index object in which to write the new segment.
        :param postlimit: the maximum size for a run in the posting pool.
        :param blocklimit: the maximum number of postings in a posting block.
        :param name: the name of the segment. When omitted or empty,
            ``ix._next_segment_name()`` supplies one.
        """

        self.index = ix
        self.schema = ix.schema
        self.storage = storage = ix.storage
        self.name = name or ix._next_segment_name()

        self.max_doc = 0

        self.pool = postpool.PostingPool(postlimit)

        # Create mappings of field numbers to the position of that field in the
        # lists of scorable and stored fields. For example, consider a schema
        # with fields (A, B, C, D, E, F). If B, D, and E are scorable, then the
        # list of scorable fields is (B, D, E). The _scorable_to_pos dictionary
        # would then map B -> 0, D -> 1, and E -> 2.
        self._scorable_to_pos = dict(
            (fnum, i) for i, fnum in enumerate(self.schema.scorable_fields()))
        self._stored_to_pos = dict(
            (fnum, i) for i, fnum in enumerate(self.schema.stored_fields()))

        # Create a temporary segment object just so we can access its
        # *_filename attributes (so if we want to change the naming convention,
        # we only have to do it in one place).
        tempseg = Segment(self.name, 0, 0, None)
        self.termtable = create_terms(storage, tempseg)
        self.docslist = create_storedfields(storage, tempseg)
        # Stays None when the schema has no scorable fields.
        self.doclengths = None
        if self.schema.scorable_fields():
            # The final argument was cut off in this excerpt; it is restored
            # from the other copy of this constructor in the listing.
            self.doclengths = create_doclengths(storage, tempseg,
                                                len(self._scorable_to_pos))

        postfile = storage.create_file(tempseg.posts_filename)
        self.postwriter = FilePostingWriter(postfile, blocklimit=blocklimit)

        # ``vpostwriter`` is only assigned when the schema has vectored
        # fields; ``vectortable`` is None otherwise.
        self.vectortable = None
        if self.schema.has_vectored_fields():
            # Table associating document fields with (postoffset, postcount)
            self.vectortable = create_vectors(storage, tempseg)
            vpostfile = storage.create_file(tempseg.vectorposts_filename)
            self.vpostwriter = FilePostingWriter(vpostfile, stringids=True)

        # Keep track of the total number of tokens (across all docs)
        # in each field
        self.field_length_totals = defaultdict(int)
コード例 #19
ファイル: spelling.py プロジェクト: archatas/whoosh
    def suggestions_and_scores(self, text, weighting=None):
        """Returns a list of possible alternative spellings of 'text', as
        ('word', score, weight) triples, where 'word' is the suggested
        word, 'score' is the score that was assigned to the word using
        :meth:`SpellChecker.add_field` or :meth:`SpellChecker.add_scored_words`,
        and 'weight' is the score the word received in the search for the
        original word's ngrams.

        You must add words to the dictionary (using add_field, add_words,
        and/or add_scored_words) before you can use this.

        This is a lower-level method, in case an expert user needs access to
        the raw scores, for example to implement a custom suggestion ranking
        algorithm. Most people will want to call :meth:`~SpellChecker.suggest`
        instead, which simply returns the top N valued words.

        :param text: The word to check.
        :param weighting: the weighting object handed to the searcher; a
            ``TF_IDF()`` instance is created when omitted.
        :rtype: list
        """

        if weighting is None:
            weighting = TF_IDF()

        # Break the word into n-grams of every configured size.
        grams = defaultdict(list)
        for size in xrange(self.mingram, self.maxgram + 1):
            key = "gram%s" % size
            nga = analysis.NgramAnalyzer(size)
            for t in nga(text):
                # NOTE(review): loop body missing from the excerpt; storing
                # the token text under ``key`` is inferred from the later
                # ``grams[key]`` lookup -- confirm.
                grams[key].append(t.text)

        queries = []
        for size in xrange(self.mingram, min(self.maxgram + 1, len(text))):
            key = "gram%s" % size
            gramlist = grams[key]
            # NOTE(review): both calls below were cut off after their second
            # argument in this excerpt. ``boost=self.boostend`` appears in
            # the other copy of this method; ``self.booststart`` is assumed
            # by symmetry -- confirm.
            queries.append(query.Term("start%s" % size, gramlist[0],
                                      boost=self.booststart))
            queries.append(query.Term("end%s" % size, gramlist[-1],
                                      boost=self.boostend))
            for gram in gramlist:
                queries.append(query.Term(key, gram))

        q = query.Or(queries)
        ix = self.index()
        s = ix.searcher(weighting=weighting)
        # NOTE(review): the statement that introduced the indented block
        # below was dropped; try/finally closing the searcher is assumed.
        try:
            result = s.search(q)
            # The queried word itself is not a suggestion.
            return [(fs["word"], fs["score"], result.score(i))
                    for i, fs in enumerate(result)
                    if fs["word"] != text]
        finally:
            s.close()
コード例 #20
    def word_values(self, value, start_pos=0, start_char=0, **kwargs):
        """Collect ``(pos, startchar, endchar)`` triples for each token text.

        Character offsets are shifted by ``start_char``. Returns a generator
        of ``(text, number_of_triples, self.encode(triples))``.
        """
        seen = defaultdict(list)

        # The ``for`` header and the ``append`` call were cut off in this
        # excerpt; they are restored from the intact copy of this method
        # elsewhere in the listing, where ``start_pos`` is likewise unused.
        for t in unstopped(self.analyzer(value, positions=True, chars=True,
                                         start_char=start_char, **kwargs)):
            seen[t.text].append(
                (t.pos, start_char + t.startchar, start_char + t.endchar))

        encode = self.encode
        return ((w, len(ls), encode(ls)) for w, ls in seen.iteritems())
コード例 #21
ファイル: __init__.py プロジェクト: ra2003/whoosh
        def wrapper(self, *args):
            """Call ``func(self, *args)`` through a per-instance LRU cache.

            The cache dict, access queue and reference counts are stored on
            the instance under attribute names built from ``prefix``, so
            every instance has its own cache. ``args`` is used as the cache
            key and must therefore be hashable. At most ``size`` results are
            retained.
            """
            if not hasattr(self, prefix + "cache"):
                # First call on this instance: create and attach the state.
                cache = {}
                queue = deque()
                refcount = defaultdict(int)
                setattr(self, prefix + "cache", cache)
                setattr(self, prefix + "queue", queue)
                setattr(self, prefix + "refcount", refcount)
            else:
                # NOTE(review): this ``else`` was missing from the excerpt,
                # which made the lookups below redundant.
                cache = getattr(self, prefix + "cache")
                queue = getattr(self, prefix + "queue")
                refcount = getattr(self, prefix + "refcount")
            qpend = queue.append
            qpop = queue.popleft

            # Get cache entry or compute if not found
            try:
                result = cache[args]
            except KeyError:
                result = cache[args] = func(self, *args)

            # Record that this key was recently accessed
            # NOTE(review): the ``qpend(args)`` call was missing, so the
            # queue never grew and the purge loop below would pop from an
            # empty deque. Restored per the published LRU recipe this code
            # follows -- confirm.
            qpend(args)
            refcount[args] += 1

            # Purge least recently accessed cache contents
            while len(cache) > size:
                k = qpop()
                refcount[k] -= 1
                if not refcount[k]:
                    del cache[k]
                    del refcount[k]

            # Periodically compact the queue by removing duplicate keys
            if len(queue) > size * 4:
                for _ in xrange(len(queue)):
                    k = qpop()
                    if refcount[k] == 1:
                        # Last remaining occurrence of this key: keep it.
                        qpend(k)
                    else:
                        # An older duplicate: drop it and its count.
                        refcount[k] -= 1
                #assert len(queue) == len(cache) == len(refcount) == sum(refcount.itervalues())

            return result
コード例 #22
ファイル: __init__.py プロジェクト: archatas/whoosh
        def wrapper(self, *args):
            """Call ``func(self, *args)`` through a per-instance LRU cache.

            State (cache dict, access queue, reference counts) lives on the
            instance under attribute names derived from ``prefix``. ``args``
            is the cache key, so it must be hashable. At most ``size``
            results are kept.
            """
            if not hasattr(self, prefix + "cache"):
                # First call on this instance: create and attach the state.
                cache = {}
                queue = deque()
                refcount = defaultdict(int)
                setattr(self, prefix + "cache", cache)
                setattr(self, prefix + "queue", queue)
                setattr(self, prefix + "refcount", refcount)
            else:
                # NOTE(review): ``else`` restored; the excerpt ran these
                # lookups unconditionally inside the ``if`` branch.
                cache = getattr(self, prefix + "cache")
                queue = getattr(self, prefix + "queue")
                refcount = getattr(self, prefix + "refcount")
            qpend = queue.append
            qpop = queue.popleft

            # Get cache entry or compute if not found
            try:
                result = cache[args]
            except KeyError:
                result = cache[args] = func(self, *args)

            # Record that this key was recently accessed
            # NOTE(review): ``qpend(args)`` restored. Without it ``qpend`` is
            # never used, the queue stays empty, and eviction below fails on
            # ``qpop()``. Matches the published LRU recipe -- confirm.
            qpend(args)
            refcount[args] += 1

            # Purge least recently accessed cache contents
            while len(cache) > size:
                k = qpop()
                refcount[k] -= 1
                if not refcount[k]:
                    del cache[k]
                    del refcount[k]

            # Periodically compact the queue by removing duplicate keys
            if len(queue) > size * 4:
                for _ in xrange(len(queue)):
                    k = qpop()
                    if refcount[k] == 1:
                        # Last remaining occurrence of this key: keep it.
                        qpend(k)
                    else:
                        # An older duplicate: drop it and its count.
                        refcount[k] -= 1
                #assert len(queue) == len(cache) == len(refcount) == sum(refcount.itervalues())

            return result
コード例 #23
ファイル: filetables.py プロジェクト: archatas/whoosh
 def __init__(self, dbfile):
     """Remember ``dbfile`` and start with an empty ``hashes`` table.

     :param dbfile: stored unchanged on ``self.dbfile``.
     """
     # Missing keys in ``hashes`` start out as empty lists.
     self.hashes = defaultdict(list)
     self.dbfile = dbfile
コード例 #24
ファイル: lovins.py プロジェクト: ra2003/whoosh
            ("erid", "eris"),
            ("pand", "pans"),
            ("end", "ens", "s"),
            ("ond", "ons"),
            ("lud", "lus"),
            ("rud", "rus"),
            ("her", "hes", "pt"),
            ("mit", "mis"),
            ("ent", "ens", "m"),
            ("ert", "ers"),
            ("et", "es", "n"),
            ("yt", "ys"),
            ("yz", "ys"))

# Hash the ending rules by the last letter of the target ending
_endingrules = defaultdict(list)
for rule in _endings:
    # NOTE(review): the loop body was missing from this excerpt. Keying on
    # the last letter of rule[0] (the target ending) follows the comment
    # above and the ``_endingrules[word[-1]]`` lookup in fix_ending.
    _endingrules[rule[0][-1]].append(rule)

# Doubled letters whose second character fix_ending drops from the end of a
# word.
_doubles = frozenset(("dd", "gg", "ll", "mm", "nn", "pp", "rr", "ss", "tt"))

def fix_ending(word):
    # Purpose: normalise the ending of ``word`` using the module-level
    # ``_doubles`` set and the ``_endingrules`` table.
    # NOTE(review): this excerpt stops in the middle of the rule loop; the
    # code that applies ``newend`` / ``exceptafter`` and returns a value is
    # not visible here, so the return contract cannot be documented.

    # A word ending in one of the doubled pairs loses its final letter.
    if word[-2:] in _doubles:
        word = word[:-1]
    # Only rules filed under the word's last letter are tried. ``word[-1]``
    # raises IndexError for an empty string.
    for endingrule in _endingrules[word[-1]]:
        # A rule is (target ending, replacement ending[, extra element]).
        target, newend = endingrule[:2]
        if word.endswith(target):
            if len(endingrule) > 2:
                # Optional third element of the rule; how it is applied is
                # cut off in this excerpt.
                exceptafter = endingrule[2]
コード例 #25
 def __init__(self, dbfile):
     """Keep a reference to ``dbfile`` and create the empty ``hashes`` map.

     :param dbfile: stored unchanged on ``self.dbfile``.
     """
     # Each missing key in ``hashes`` defaults to a new empty list.
     self.hashes = defaultdict(list)
     self.dbfile = dbfile