Example #1
 def embed(self, other):
     # Copy all transitions from the other NFA into this one
     for s, othertrans in iteritems(other.transitions):
         trans = self.transitions.setdefault(s, {})
         for label, otherdests in iteritems(othertrans):
             dests = trans.setdefault(label, set())
             dests.update(otherdests)
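For context, a runnable sketch of the pattern (TinyNFA and the inline iteritems shim are hypothetical stand-ins, not Whoosh's actual classes): embed merges the other automaton's dict-of-dicts transition table into this one, unioning destination sets label by label.

def iteritems(d):
    # Stand-in for the Python 2/3 compat helper
    return iter(d.items())

class TinyNFA(object):
    # Hypothetical minimal stand-in: transitions maps
    # state -> {label: set_of_destination_states}
    def __init__(self):
        self.transitions = {}

    def add_transition(self, src, label, dest):
        self.transitions.setdefault(src, {}).setdefault(label, set()).add(dest)

    def embed(self, other):
        # Copy all transitions from the other NFA into this one
        for s, othertrans in iteritems(other.transitions):
            trans = self.transitions.setdefault(s, {})
            for label, otherdests in iteritems(othertrans):
                dests = trans.setdefault(label, set())
                dests.update(otherdests)

a = TinyNFA()
a.add_transition(0, "x", 1)
b = TinyNFA()
b.add_transition(0, "y", 2)
a.embed(b)
assert a.transitions[0] == {"x": {1}, "y": {2}}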
Example #2
def test_bigtable():
    with TempStorage("bigtable") as st:

        def randstring(min, max):
            return "".join(
                chr(randint(1, 255)) for _ in xrange(randint(min, max)))

        count = 100000
        samp = dict(
            (randstring(1, 50), randstring(1, 50)) for _ in xrange(count))

        fhw = HashWriter(st.create_file("big.hsh"))
        fhw.add_all(iteritems(samp))
        fhw.close()

        fhr = HashReader(st.open_file("big.hsh"))
        keys = list(samp.keys())
        shuffle(keys)
        for key in keys:
            assert_equal(samp[key], fhr[key])

        set1 = set(iteritems(samp))
        set2 = set(fhr.items())
        assert_equal(set1, set2)

        fhr.close()
Example #3
def test_bigtable():
    with TempStorage("bigtable") as st:
        def randstring(min, max):
            return "".join(chr(randint(1, 255))
                           for _ in xrange(randint(min, max)))

        count = 100000
        samp = dict((randstring(1, 50), randstring(1, 50))
                    for _ in xrange(count))

        fhw = HashWriter(st.create_file("big.hsh"))
        fhw.add_all(iteritems(samp))
        fhw.close()

        fhr = HashReader(st.open_file("big.hsh"))
        keys = list(samp.keys())
        shuffle(keys)
        for key in keys:
            assert_equal(samp[key], fhr[key])

        set1 = set(iteritems(samp))
        set2 = set(fhr.items())
        assert_equal(set1, set2)

        fhr.close()
Example #4
 def embed(self, other):
     # Copy all transitions from the other NFA into this one
     for s, othertrans in iteritems(other.transitions):
         trans = self.transitions.setdefault(s, {})
         for label, otherdests in iteritems(othertrans):
             dests = trans.setdefault(label, set())
             dests.update(otherdests)
Example #5
def reverse_nfa(n):
    s = object()
    nfa = NFA(s)
    for src, trans in iteritems(n.transitions):
        for label, destset in iteritems(trans):
            for dest in destset:
                nfa.add_transition(dest, label, src)
    for finalstate in n.final_states:
        nfa.add_transition(s, EPSILON, finalstate)
    nfa.add_final_state(n.initial)
    return nfa
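The heart of the construction is flipping every edge; the epsilon edges from the fresh start state to the old final states, plus marking the old initial state final, then make the result accept the reversed language. A standalone sketch of the edge flip over the plain dict-of-dicts representation (reverse_edges is a hypothetical helper, not Whoosh API):

def iteritems(d):
    # Stand-in for the Python 2/3 compat helper
    return iter(d.items())

def reverse_edges(transitions):
    # Flip each src -[label]-> dest edge into dest -[label]-> src
    rev = {}
    for src, trans in iteritems(transitions):
        for label, destset in iteritems(trans):
            for dest in destset:
                rev.setdefault(dest, {}).setdefault(label, set()).add(src)
    return rev

assert reverse_edges({0: {"a": {1}}, 1: {"b": {2}}}) == \
    {1: {"a": {0}}, 2: {"b": {1}}}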
Example #6
def reverse_nfa(n):
    s = object()
    nfa = NFA(s)
    for src, trans in iteritems(n.transitions):
        for label, destset in iteritems(trans):
            for dest in destset:
                nfa.add_transition(dest, label, src)
    for finalstate in n.final_states:
        nfa.add_transition(s, EPSILON, finalstate)
    nfa.add_final_state(n.initial)
    return nfa
Example #7
 def __repr__(self):
     attrs = ""
     if self.__dict__:
         attrs = ", ".join("%s=%r" % (key, value)
                           for key, value
                           in iteritems(self.__dict__))
     return self.__class__.__name__ + "(%s)" % attrs
Example #8
def test_random_hash():
    with TempStorage("randomhash") as st:
        domain = "abcdefghijklmnopqrstuvwxyz"
        domain += domain.upper()
        times = 1000
        minlen = 1
        maxlen = len(domain)
        
        samp = dict((randstring(domain, minlen, maxlen),
                     randstring(domain, minlen, maxlen)) for _ in xrange(times))
        
        hwf = st.create_file("test.hsh")
        hw = HashWriter(hwf)
        for k, v in iteritems(samp):
            hw.add(k, v)
        hw.close()
        
        keys = list(samp.keys())
        random.shuffle(keys)
        hrf = st.open_file("test.hsh")
        hr = HashReader(hrf)
        for k in keys:
            v = hr[k]
            assert_equal(v, b(samp[k]))
        hr.close()
Example #9
    def expanded_terms(self, number, normalize=True):
        """Returns the N most important terms in the vectors added so far.
        
        :param number: The number of terms to return.
        :param normalize: Whether to normalize the weights.
        :returns: A list of ("term", weight) tuples.
        """

        model = self.model
        fieldname = self.fieldname
        ixreader = self.ixreader
        tlist = []
        maxweight = 0

        # If no terms have been added, return an empty list
        if not self.topN_weight:
            return []

        for word, weight in iteritems(self.topN_weight):
            if (fieldname, word) in ixreader:
                cf = ixreader.frequency(fieldname, word)
                score = model.score(weight, cf, self.top_total)
                if score > maxweight:
                    maxweight = score
                tlist.append((score, word))

        if normalize:
            norm = model.normalizer(maxweight, self.top_total)
        else:
            norm = maxweight
        tlist = [(weight / norm, t) for weight, t in tlist]
        tlist.sort(key=lambda x: (0 - x[0], x[1]))

        return [(t, weight) for weight, t in tlist[:number]]
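The tail of the method is a generic score, normalize, and rank pipeline; a standalone sketch with the model's normalizer replaced by a plain divisor (rank_terms is hypothetical, for illustration only):

def rank_terms(scored, number, norm):
    # scored: [(score, term), ...] -> top `number` (term, weight) pairs,
    # sorted by descending normalized weight, then alphabetically
    tlist = [(score / norm, t) for score, t in scored]
    tlist.sort(key=lambda x: (0 - x[0], x[1]))
    return [(t, weight) for weight, t in tlist[:number]]

assert rank_terms([(2.0, "b"), (4.0, "a")], 1, 4.0) == [("a", 1.0)]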
Example #10
    def expanded_terms(self, number, normalize=True):
        """Returns the N most important terms in the vectors added so far.
        
        :param number: The number of terms to return.
        :param normalize: Whether to normalize the weights.
        :returns: A list of ("term", weight) tuples.
        """

        model = self.model
        tlist = []
        maxweight = 0
        collection_freq = self.collection_freq

        for word, weight in iteritems(self.topN_weight):
            if word in collection_freq:
                score = model.score(weight, collection_freq[word],
                                    self.top_total)
                if score > maxweight:
                    maxweight = score
                tlist.append((score, word))

        if normalize:
            norm = model.normalizer(maxweight, self.top_total)
        else:
            norm = maxweight
        tlist = [(weight / norm, t) for weight, t in tlist]
        tlist.sort(key=lambda x: (0 - x[0], x[1]))

        return [(t, weight) for weight, t in tlist[:number]]
Example #11
 def from_file(cls, dbfile, doccount=None):
     obj = cls()
     obj._read_header(dbfile, doccount)
     for fieldname, start in iteritems(obj.starts):
         obj.lengths[fieldname] = dbfile.get_array(start, "B", obj._count)
     dbfile.close()
     return obj
Example #12
def test_random_hash():
    from string import ascii_letters as domain

    times = 1000
    minlen = 1
    maxlen = len(domain)

    def randstring():
        s = "".join(random.sample(domain, random.randint(minlen, maxlen)))
        return b(s)

    with TempStorage("randomhash") as st:
        samp = dict((randstring(), randstring()) for _ in xrange(times))

        hwf = st.create_file("test.hsh")
        hw = HashWriter(hwf)
        for k, v in iteritems(samp):
            hw.add(k, v)
        hw.close()

        keys = list(samp.keys())
        random.shuffle(keys)
        hrf = st.open_file("test.hsh")
        hr = HashReader(hrf)
        for k in keys:
            assert_equal(hr[k], samp[k])
        hr.close()
Example #13
def test_random_hash():
    from string import ascii_letters as domain

    times = 1000
    minlen = 1
    maxlen = len(domain)

    def randstring():
        s = "".join(random.sample(domain, random.randint(minlen, maxlen)))
        return b(s)

    with TempStorage("randomhash") as st:
        samp = dict((randstring(), randstring()) for _ in xrange(times))

        hw = HashWriter(st.create_file("test.hsh"))
        for k, v in iteritems(samp):
            hw.add(k, v)
        hw.close()

        keys = list(samp.keys())
        random.shuffle(keys)
        hr = HashReader.open(st, "test.hsh")
        for k in keys:
            assert hr[k] == samp[k]
        hr.close()
Example #14
def test_random_hash():
    with TempStorage("randomhash") as st:
        domain = "abcdefghijklmnopqrstuvwxyz"
        domain += domain.upper()
        times = 1000
        minlen = 1
        maxlen = len(domain)

        samp = dict((randstring(domain, minlen, maxlen),
                     randstring(domain, minlen, maxlen))
                    for _ in xrange(times))

        hwf = st.create_file("test.hsh")
        hw = HashWriter(hwf)
        for k, v in iteritems(samp):
            hw.add(k, v)
        hw.close()

        keys = list(samp.keys())
        random.shuffle(keys)
        hrf = st.open_file("test.hsh")
        hr = HashReader(hrf)
        for k in keys:
            v = hr[k]
            assert_equal(v, b(samp[k]))
        hr.close()
Example #15
 def from_file(cls, dbfile, doccount=None):
     obj = cls()
     obj._read_header(dbfile, doccount)
     for fieldname, start in iteritems(obj.starts):
         obj.lengths[fieldname] = dbfile.get_array(start, "B", obj._count)
     dbfile.close()
     return obj
Example #16
    def expanded_terms(self, number, normalize=True):
        """Returns the N most important terms in the vectors added so far.
        
        :param number: The number of terms to return.
        :param normalize: Whether to normalize the weights.
        :returns: A list of ("term", weight) tuples.
        """

        model = self.model
        fieldname = self.fieldname
        ixreader = self.ixreader
        tlist = []
        maxweight = 0

        # If no terms have been added, return an empty list
        if not self.topN_weight:
            return []

        for word, weight in iteritems(self.topN_weight):
            if (fieldname, word) in ixreader:
                cf = ixreader.frequency(fieldname, word)
                score = model.score(weight, cf, self.top_total)
                if score > maxweight:
                    maxweight = score
                tlist.append((score, word))

        if normalize:
            norm = model.normalizer(maxweight, self.top_total)
        else:
            norm = maxweight
        tlist = [(weight / norm, t) for weight, t in tlist]
        tlist.sort(key=lambda x: (0 - x[0], x[1]))

        return [(t, weight) for weight, t in tlist[:number]]
Example #17
    def word_values(self, value, analyzer, **kwargs):
        fb = self.field_boost
        seen = defaultdict(list)

        kwargs["positions"] = True
        kwargs["chars"] = True
        kwargs["boosts"] = True
        for t in tokens(value, analyzer, kwargs):
            seen[t.text].append((t.pos, t.startchar, t.endchar, t.boost))

        for w, poses in iteritems(seen):
            # posns_chars_boosts = [(pos, startchar, endchar, boost), ...]
            codes = []
            posbase = 0
            charbase = 0
            summedboost = 0
            for pos, startchar, endchar, boost in poses:
                codes.append((pos - posbase, startchar - charbase,
                              endchar - startchar, boost))
                posbase = pos
                charbase = endchar
                summedboost += boost

            value = (pack_uint(len(poses)) + pack_float(summedboost * fb)
                     + dumps(codes, -1)[2:-1])

            yield (w, len(poses), summedboost * fb, value)
Example #18
 def stored_fields(self, docnum):
     if self.is_closed:
         raise ReaderClosed
     assert docnum >= 0
     schema = self.schema
     sfs = self._perdoc.stored_fields(docnum)
     # Double-check with schema to filter out removed fields
     return dict(item for item in iteritems(sfs) if item[0] in schema)
Example #19
 def stored_fields(self, docnum):
     if self.is_closed:
         raise ReaderClosed
     assert docnum >= 0
     schema = self.schema
     sfs = self._perdoc.stored_fields(docnum)
     # Double-check with schema to filter out removed fields
     return dict(item for item in iteritems(sfs) if item[0] in schema)
Example #20
 def __hash__(self):
     if self._hash is not None:
         return self._hash
     h = int(self.final)
     for key, node in iteritems(self._edges):
         h ^= hash(key) ^ hash(node)
     self._hash = h
     return h
Example #21
    def __init__(self, dbfile):
        super(TermIndexReader, self).__init__(dbfile)

        dbfile.seek(self.indexbase + self.length * _LONG_SIZE)
        self.fieldmap = dbfile.read_pickle()
        self.names = [None] * len(self.fieldmap)
        for name, num in iteritems(self.fieldmap):
            self.names[num] = name
Example #22
 def __hash__(self):
     if self._hash is not None:
         return self._hash
     h = int(self.final)
     for key, node in iteritems(self._edges):
         h ^= hash(key) ^ hash(node)
     self._hash = h
     return h
Example #23
 def extract(self, match):
     d = match.groupdict()
     for key, value in iteritems(d):
         try:
             value = int(value)
             d[key] = value
         except (ValueError, TypeError):
             pass
     return Props(**d)
Example #24
 def flush(self):
     for fieldname, lst in iteritems(self.postbuf):
         con = self._con(fieldname)
         con.executemany("insert into postings values (?, ?, ?, ?)", lst)
         con.commit()
         con.close()
     self.postbuf = defaultdict(list)
     self.bufsize = 0
     self._flushed = True
Example #25
 def flush(self):
     for fieldname, lst in iteritems(self.postbuf):
         con = self._con(fieldname)
         con.executemany("insert into postings values (?, ?, ?, ?)", lst)
         con.commit()
         con.close()
     self.postbuf = defaultdict(list)
     self.bufsize = 0
     self._flushed = True
Example #26
    def __init__(self, B=0.75, K1=1.2, **kwargs):
        self.B = B
        self.K1 = K1

        self._field_B = {}
        for k, v in iteritems(kwargs):
            if k.endswith("_B"):
                fieldname = k[:-2]
                self._field_B[fieldname] = v
Example #27
    def __init__(self, dbfile, postfile):
        CodedOrderedReader.__init__(self, dbfile)
        self.postfile = postfile

        dbfile.seek(self.indexbase + self.length * _LONG_SIZE)
        self.fieldmap = dbfile.read_pickle()
        self.names = [None] * len(self.fieldmap)
        for name, num in iteritems(self.fieldmap):
            self.names[num] = name
Example #28
 def extract(self, match):
     d = match.groupdict()
     for key, value in iteritems(d):
         try:
             value = int(value)
             d[key] = value
         except (ValueError, TypeError):
             pass
     return Props(**d)
Example #29
    def __init__(self, dbfile, postfile):
        CodedOrderedReader.__init__(self, dbfile)
        self.postfile = postfile

        dbfile.seek(self.indexbase + self.length * _LONG_SIZE)
        self.fieldmap = dbfile.read_pickle()
        self.names = [None] * len(self.fieldmap)
        for name, num in iteritems(self.fieldmap):
            self.names[num] = name
Example #30
 def _suggestions(self, text, maxdist, prefix):
     op = self.op
     seen = {}
     for corr in self.correctors:
         for score, sug in corr._suggestions(text, maxdist, prefix):
             if sug in seen:
                 seen[sug] = op(seen[sug], score)
             else:
                 seen[sug] = score
     return iteritems(seen)
Example #31
 def _print_line(self, indent, command, **kwargs):
     self._dbfile.write(b("  ") * indent)
     self._dbfile.write(command.encode("latin1"))
     for k, v in iteritems(kwargs):
         if isinstance(v, memoryview):
             v = bytes(v)
         if v is not None and not isinstance(v, _reprable):
             raise TypeError(type(v))
         self._dbfile.write(("\t%s=%r" % (k, v)).encode("latin1"))
     self._dbfile.write(b("\n"))
Example #32
    def __init__(self, codec, dbfile, length, postfile):
        self._codec = codec
        self._dbfile = dbfile
        self._tindex = filetables.OrderedHashReader(dbfile, length)
        self._fieldmap = self._tindex.extras["fieldmap"]
        self._postfile = postfile

        self._fieldunmap = [None] * len(self._fieldmap)
        for fieldname, num in iteritems(self._fieldmap):
            self._fieldunmap[num] = fieldname
Example #33
    def __init__(self, codec, dbfile, length, postfile):
        self._codec = codec
        self._dbfile = dbfile
        self._tindex = filetables.OrderedHashReader(dbfile, length)
        self._fieldmap = self._tindex.extras["fieldmap"]
        self._postfile = postfile

        self._fieldunmap = [None] * len(self._fieldmap)
        for fieldname, num in iteritems(self._fieldmap):
            self._fieldunmap[num] = fieldname
Example #34
 def _print_line(self, indent, command, **kwargs):
     self._dbfile.write(b("  ") * indent)
     self._dbfile.write(command.encode("latin1"))
     for k, v in iteritems(kwargs):
         if isinstance(v, memoryview):
             v = bytes(v)
         if v is not None and not isinstance(v, _reprable):
             raise TypeError(type(v))
         self._dbfile.write(("\t%s=%r" % (k, v)).encode("latin1"))
     self._dbfile.write(b("\n"))
Example #35
 def _suggestions(self, text, maxdist, prefix):
     op = self.op
     seen = {}
     for corr in self.correctors:
         for score, sug in corr._suggestions(text, maxdist, prefix):
             if sug in seen:
                 seen[sug] = op(seen[sug], score)
             else:
                 seen[sug] = score
     return iteritems(seen)
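The merge is a small reduce-by-key over (score, suggestion) pairs; a standalone sketch (merge_scores is hypothetical; min is one plausible op, keeping the best score seen for each suggestion):

def merge_scores(suggestion_lists, op=min):
    # Combine scores for suggestions that appear in several correctors
    seen = {}
    for suggestions in suggestion_lists:
        for score, sug in suggestions:
            seen[sug] = op(seen[sug], score) if sug in seen else score
    return seen.items()

merged = dict(merge_scores([[(1, "whoosh"), (2, "whose")],
                            [(3, "whoosh")]]))
assert merged == {"whoosh": 1, "whose": 2}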
Example #36
def renumber_dfa(dfa, base=0):
    c = itertools.count(base)
    mapping = {}

    def remap(state):
        if state in mapping:
            newnum = mapping[state]
        else:
            newnum = next(c)
            mapping[state] = newnum
        return newnum

    newdfa = DFA(remap(dfa.initial))
    for src, trans in iteritems(dfa.transitions):
        for label, dest in iteritems(trans):
            newdfa.add_transition(remap(src), label, remap(dest))
    for finalstate in dfa.final_states:
        newdfa.add_final_state(remap(finalstate))
    for src, dest in iteritems(dfa.defaults):
        newdfa.set_default_transition(remap(src), remap(dest))
    return newdfa
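The remap closure is a memoized renumbering: the first time a state is seen it gets the next integer from the counter, and every later occurrence reuses the same number, so edges, final states, and default transitions all agree. A standalone sketch of just that pattern:

import itertools

counter = itertools.count(0)
mapping = {}

def remap(state):
    # First visit assigns the next integer; later visits reuse it
    if state not in mapping:
        mapping[state] = next(counter)
    return mapping[state]

assert [remap(s) for s in ("q2", "q0", "q2")] == [0, 1, 0]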
Example #37
    def word_values(self, value, analyzer, **kwargs):
        seen = defaultdict(list)

        kwargs["positions"] = True
        kwargs["chars"] = True
        kwargs["boosts"] = True
        for t in tokens(value, analyzer, kwargs):
            seen[t.text].append((t.pos, t.startchar, t.endchar, t.boost))

        for w, poses in iteritems(seen):
            value, summedboost = self.encode(poses)
            yield (w, len(poses), summedboost, value)
Example #38
def renumber_dfa(dfa, base=0):
    c = itertools.count(base)
    mapping = {}

    def remap(state):
        if state in mapping:
            newnum = mapping[state]
        else:
            newnum = next(c)
            mapping[state] = newnum
        return newnum

    newdfa = DFA(remap(dfa.initial))
    for src, trans in iteritems(dfa.transitions):
        for label, dest in iteritems(trans):
            newdfa.add_transition(remap(src), label, remap(dest))
    for finalstate in dfa.final_states:
        newdfa.add_final_state(remap(finalstate))
    for src, dest in iteritems(dfa.defaults):
        newdfa.set_default_transition(remap(src), remap(dest))
    return newdfa
Example #39
    def word_values(self, value, analyzer, **kwargs):
        seen = defaultdict(list)

        kwargs["positions"] = True
        kwargs["chars"] = True
        kwargs["boosts"] = True
        for t in tokens(value, analyzer, kwargs):
            seen[t.text].append((t.pos, t.startchar, t.endchar, t.boost))

        for w, poses in iteritems(seen):
            value, summedboost = self.encode(poses)
            yield (w, len(poses), summedboost, value)
Example #40
    def __init__(self, dbfile):
        self.dbfile = dbfile

        dbfile.seek(0)
        pos = dbfile.read_long()
        self.length = dbfile.read_uint()

        dbfile.seek(pos)
        name_map = dbfile.read_pickle()
        self.names = [None] * len(name_map)
        for name, pos in iteritems(name_map):
            self.names[pos] = name
        self.directory_offset = dbfile.tell()
Example #41
    def word_values(self, value, analyzer, **kwargs):
        fb = self.field_boost
        freqs = defaultdict(int)
        weights = defaultdict(float)

        kwargs["boosts"] = True
        for t in tokens(value, analyzer, kwargs):
            freqs[t.text] += 1
            weights[t.text] += t.boost

        encode = self.encode
        return ((w, freq, weights[w] * fb, encode(freq))
                for w, freq in iteritems(freqs))
Example #42
    def word_values(self, value, analyzer, **kwargs):
        fb = self.field_boost
        poses = defaultdict(list)
        weights = defaultdict(float)
        kwargs["positions"] = True
        kwargs["boosts"] = True
        for t in tokens(value, analyzer, kwargs):
            poses[t.text].append(t.pos)
            weights[t.text] += t.boost

        for w, poslist in iteritems(poses):
            value = self.encode(poslist)
            yield (w, len(poslist), weights[w] * fb, value)
Example #43
    def word_values(self, value, analyzer, **kwargs):
        fb = self.field_boost
        seen = defaultdict(list)

        kwargs["positions"] = True
        kwargs["chars"] = True
        kwargs["boosts"] = True
        for t in tokens(value, analyzer, kwargs):
            seen[t.text].append((t.pos, t.startchar, t.endchar, t.boost))

        encode = self.encode
        return ((w, len(poses), sum(p[3] for p in poses) * fb, encode(poses))
                for w, poses in iteritems(seen))
Example #44
    def word_values(self, value, analyzer, **kwargs):
        fb = self.field_boost
        freqs = defaultdict(int)
        weights = defaultdict(float)

        kwargs["boosts"] = True
        for t in tokens(value, analyzer, kwargs):
            freqs[t.text] += 1
            weights[t.text] += t.boost

        encode = self.encode
        return ((w, freq, weights[w] * fb, encode(freq))
                for w, freq in iteritems(freqs))
Example #45
def u_to_utf8(dfa, base=0):
    c = itertools.count(base)
    transitions = dfa.transitions

    for src, trans in iteritems(transitions):
        trans = transitions[src]
        for label, dest in list(iteritems(trans)):
            if label is EPSILON:
                continue
            elif label is ANY:
                raise Exception
            else:
                assert isinstance(label, text_type)
                label8 = label.encode("utf8")
                for i, byte in enumerate(label8):
                    if i < len(label8) - 1:
                        st = next(c)
                        dfa.add_transition(src, byte, st)
                        src = st
                    else:
                        dfa.add_transition(src, byte, dest)
                del trans[label]
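The inner loop turns one unicode-labeled edge into a chain of byte-labeled edges through fresh intermediate states, so a multi-byte UTF-8 character costs several transitions. Note that iterating an encoded label yields one-character strings on Python 2 but ints on Python 3, one reason this code sits behind a compat layer; a standalone illustration:

label8 = u"\u00e9".encode("utf8")  # e-acute encodes to two bytes
assert len(label8) == 2
# Python 3 iteration yields ints [195, 169];
# Python 2 yields the byte strings ['\xc3', '\xa9']
assert list(label8) in ([195, 169], ["\xc3", "\xa9"])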
Example #46
    def word_values(self, value, analyzer, **kwargs):
        fb = self.field_boost
        poses = defaultdict(list)
        weights = defaultdict(float)
        kwargs["positions"] = True
        kwargs["boosts"] = True
        for t in tokens(value, analyzer, kwargs):
            poses[t.text].append(t.pos)
            weights[t.text] += t.boost

        for w, poslist in iteritems(poses):
            value = self.encode(poslist)
            yield (w, len(poslist), weights[w] * fb, value)
Example #47
    def word_values(self, value, analyzer, **kwargs):
        fb = self.field_boost
        seen = defaultdict(list)

        kwargs["positions"] = True
        kwargs["chars"] = True
        kwargs["boosts"] = True
        for t in tokens(value, analyzer, kwargs):
            seen[t.text].append((t.pos, t.startchar, t.endchar, t.boost))

        encode = self.encode
        return ((w, len(poses), sum(p[3] for p in poses) * fb, encode(poses))
                for w, poses in iteritems(seen))
Example #48
def u_to_utf8(dfa, base=0):
    c = itertools.count(base)
    transitions = dfa.transitions

    for src, trans in iteritems(transitions):
        trans = transitions[src]
        for label, dest in list(iteritems(trans)):
            if label is EPSILON:
                continue
            elif label is ANY:
                raise Exception
            else:
                assert isinstance(label, text_type)
                label8 = label.encode("utf8")
                for i, byte in enumerate(label8):
                    if i < len(label8) - 1:
                        st = next(c)
                        dfa.add_transition(src, byte, st)
                        src = st
                    else:
                        dfa.add_transition(src, byte, dest)
                del trans[label]
Example #49
    def word_values(self, value, analyzer, **kwargs):
        fb = self.field_boost
        seen = defaultdict(list)

        kwargs["positions"] = True
        kwargs["boosts"] = True
        for t in tokens(value, analyzer, kwargs):
            pos = t.pos
            boost = t.boost
            seen[t.text].append((pos, boost))

        for w, poses in iteritems(seen):
            value = self.encode(poses)
            yield (w, len(poses), sum(p[1] for p in poses) * fb, value)
Example #50
    def word_values(self, value, analyzer, **kwargs):
        fb = self.field_boost
        seen = defaultdict(list)

        kwargs["positions"] = True
        kwargs["boosts"] = True
        for t in tokens(value, analyzer, kwargs):
            pos = t.pos
            boost = t.boost
            seen[t.text].append((pos, boost))

        for w, poses in iteritems(seen):
            value = self.encode(poses)
            yield (w, len(poses), sum(p[1] for p in poses) * fb, value)
Example #51
    def word_values(self, value, analyzer, **kwargs):
        fb = self.field_boost
        length = 0
        freqs = defaultdict(int)
        weights = defaultdict(float)

        kwargs["boosts"] = True
        for t in tokens(value, analyzer, kwargs):
            length += 1
            freqs[t.text] += 1
            weights[t.text] += t.boost

        wvs = ((w, freq, weights[w] * fb, pack_uint(freq)) for w, freq
               in iteritems(freqs))
        return wvs
Example #52
    def word_values(self, value, analyzer, **kwargs):
        fb = self.field_boost
        seen = defaultdict(list)
        weights = defaultdict(float)

        kwargs["positions"] = True
        kwargs["chars"] = True
        kwargs["boosts"] = True
        for t in tokens(value, analyzer, kwargs):
            seen[t.text].append((t.pos, t.startchar, t.endchar))
            weights[t.text] += t.boost

        encode = self.encode
        return ((w, len(ls), weights[w] * fb, encode(ls))
                for w, ls in iteritems(seen))
Example #53
File: cache.py Project: jnm/whoosh
 def wrapper(*args):
     try:
         result = data[args]
         stats[0] += 1  # Hit
     except KeyError:
         stats[1] += 1  # Miss
         if len(data) == maxsize:
             for k, _ in nsmallest(maxsize // 10 or 1,
                                   iteritems(usecount),
                                   key=itemgetter(1)):
                 del data[k]
                 del usecount[k]
         data[args] = user_function(*args)
         result = data[args]
     finally:
         usecount[args] += 1
     return result
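wrapper reads data, stats, usecount, maxsize and user_function from an enclosing decorator's closure; a hedged reconstruction of what that least-frequently-used cache decorator might look like (an assumption for illustration, not the verbatim whoosh cache.py source):

from collections import defaultdict
from heapq import nsmallest
from operator import itemgetter

def lfu_cache(user_function, maxsize=100):
    data = {}                    # args -> cached result
    usecount = defaultdict(int)  # args -> number of uses
    stats = [0, 0]               # [hits, misses]

    def wrapper(*args):
        try:
            result = data[args]
            stats[0] += 1  # Hit
        except KeyError:
            stats[1] += 1  # Miss
            if len(data) == maxsize:
                # Evict roughly the least-used tenth of the cache
                for k, _ in nsmallest(maxsize // 10 or 1,
                                      usecount.items(),  # iteritems() in the original
                                      key=itemgetter(1)):
                    del data[k]
                    del usecount[k]
            data[args] = user_function(*args)
            result = data[args]
        finally:
            usecount[args] += 1
        return result

    return wrapper

fib_calls = []

@lfu_cache
def fib(n):
    fib_calls.append(n)
    return n if n < 2 else fib(n - 1) + fib(n - 2)

# Each n is computed once; the other lookups are cache hits
assert fib(10) == 55 and len(fib_calls) == 11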
Example #54
    def word_values(self, value, analyzer, **kwargs):
        fb = self.field_boost
        length = 0
        freqs = defaultdict(int)
        weights = defaultdict(float)

        kwargs["boosts"] = True
        for t in tokens(value, analyzer, kwargs):
            length += 1
            freqs[t.text] += 1
            weights[t.text] += t.boost

        wvs = ((w, freq, weights[w] * fb, pack_uint(freq))
               for w, freq in iteritems(freqs))
        return wvs
Example #55
    def word_values(self, value, analyzer, **kwargs):
        fb = self.field_boost
        seen = defaultdict(list)
        weights = defaultdict(float)

        kwargs["positions"] = True
        kwargs["chars"] = True
        kwargs["boosts"] = True
        for t in tokens(value, analyzer, kwargs):
            seen[t.text].append((t.pos, t.startchar, t.endchar))
            weights[t.text] += t.boost

        encode = self.encode
        return ((w, len(ls), weights[w] * fb, encode(ls))
                for w, ls in iteritems(seen))
Example #56
    def replace(self, **kwargs):
        """Returns a copy of this object with the attributes given as keyword
        arguments replaced.
        
        >>> adt = adatetime(year=2009, month=10, day=31)
        >>> adt.replace(year=2010)
        (2010, 10, 31, None, None, None, None)
        """

        newadatetime = self.copy()
        for key, value in iteritems(kwargs):
            if key in self.units:
                setattr(newadatetime, key, value)
            else:
                raise KeyError("Unknown argument %r" % key)
        return newadatetime
Example #57
    def reachable_from(self, src, inclusive=True):
        transitions = self.transitions

        reached = set()
        if inclusive:
            reached.add(src)

        stack = [src]
        seen = set()
        while stack:
            src = stack.pop()
            seen.add(src)
            for _, dest in iteritems(transitions[src]):
                reached.add(dest)
                if dest not in seen:
                    stack.append(dest)
        return reached
Example #58
 def wrapper(*args):
     try:
         result = data[args]
         stats[0] += 1  # Hit
     except KeyError:
         stats[1] += 1  # Miss
         if len(data) == maxsize:
             for k, _ in nsmallest(maxsize // 10 or 1,
                                   iteritems(lastused),
                                   key=itemgetter(1)):
                 del data[k]
                 del lastused[k]
         data[args] = user_function(*args)
         result = data[args]
     finally:
         lastused[args] = time()
     return result