def embed(self, other):
    # Copy all transitions from the other NFA into this one
    for s, othertrans in iteritems(other.transitions):
        trans = self.transitions.setdefault(s, {})
        for label, otherdests in iteritems(othertrans):
            dests = trans.setdefault(label, set())
            dests.update(otherdests)

def test_bigtable():
    with TempStorage("bigtable") as st:
        def randstring(min, max):
            return "".join(chr(randint(1, 255))
                           for _ in xrange(randint(min, max)))

        count = 100000
        samp = dict((randstring(1, 50), randstring(1, 50))
                    for _ in xrange(count))

        fhw = HashWriter(st.create_file("big.hsh"))
        fhw.add_all(iteritems(samp))
        fhw.close()

        fhr = HashReader(st.open_file("big.hsh"))
        keys = list(samp.keys())
        shuffle(keys)
        for key in keys:
            assert_equal(samp[key], fhr[key])

        set1 = set(iteritems(samp))
        set2 = set(fhr.items())
        assert_equal(set1, set2)

        fhr.close()

def reverse_nfa(n):
    s = object()
    nfa = NFA(s)
    for src, trans in iteritems(n.transitions):
        for label, destset in iteritems(trans):
            for dest in destset:
                nfa.add_transition(dest, label, src)
    for finalstate in n.final_states:
        nfa.add_transition(s, EPSILON, finalstate)
    nfa.add_final_state(n.initial)
    return nfa

def __repr__(self):
    attrs = ""
    if self.__dict__:
        attrs = ", ".join("%s=%r" % (key, value)
                          for key, value in iteritems(self.__dict__))
    return self.__class__.__name__ + "(%s)" % attrs

def test_random_hash():
    with TempStorage("randomhash") as st:
        domain = "abcdefghijklmnopqrstuvwxyz"
        domain += domain.upper()
        times = 1000
        minlen = 1
        maxlen = len(domain)

        samp = dict((randstring(domain, minlen, maxlen),
                     randstring(domain, minlen, maxlen))
                    for _ in xrange(times))

        hwf = st.create_file("test.hsh")
        hw = HashWriter(hwf)
        for k, v in iteritems(samp):
            hw.add(k, v)
        hw.close()

        keys = list(samp.keys())
        random.shuffle(keys)
        hrf = st.open_file("test.hsh")
        hr = HashReader(hrf)
        for k in keys:
            v = hr[k]
            assert_equal(v, b(samp[k]))
        hr.close()

def expanded_terms(self, number, normalize=True):
    """Returns the N most important terms in the vectors added so far.

    :param number: The number of terms to return.
    :param normalize: Whether to normalize the weights.
    :returns: A list of ("term", weight) tuples.
    """

    model = self.model
    fieldname = self.fieldname
    ixreader = self.ixreader
    tlist = []
    maxweight = 0

    # If no terms have been added, return an empty list
    if not self.topN_weight:
        return []

    for word, weight in iteritems(self.topN_weight):
        if (fieldname, word) in ixreader:
            cf = ixreader.frequency(fieldname, word)
            score = model.score(weight, cf, self.top_total)
            if score > maxweight:
                maxweight = score
            tlist.append((score, word))

    if normalize:
        norm = model.normalizer(maxweight, self.top_total)
    else:
        norm = maxweight
    tlist = [(weight / norm, t) for weight, t in tlist]
    tlist.sort(key=lambda x: (0 - x[0], x[1]))

    return [(t, weight) for weight, t in tlist[:number]]

def expanded_terms(self, number, normalize=True):
    """Returns the N most important terms in the vectors added so far.

    :param number: The number of terms to return.
    :param normalize: Whether to normalize the weights.
    :returns: A list of ("term", weight) tuples.
    """

    model = self.model
    tlist = []
    maxweight = 0
    collection_freq = self.collection_freq

    for word, weight in iteritems(self.topN_weight):
        if word in collection_freq:
            score = model.score(weight, collection_freq[word],
                                self.top_total)
            if score > maxweight:
                maxweight = score
            tlist.append((score, word))

    if normalize:
        norm = model.normalizer(maxweight, self.top_total)
    else:
        norm = maxweight
    tlist = [(weight / norm, t) for weight, t in tlist]
    tlist.sort(key=lambda x: (0 - x[0], x[1]))

    return [(t, weight) for weight, t in tlist[:number]]

@classmethod
def from_file(cls, dbfile, doccount=None):
    obj = cls()
    obj._read_header(dbfile, doccount)
    for fieldname, start in iteritems(obj.starts):
        obj.lengths[fieldname] = dbfile.get_array(start, "B", obj._count)
    dbfile.close()
    return obj

def test_random_hash():
    from string import ascii_letters as domain

    times = 1000
    minlen = 1
    maxlen = len(domain)

    def randstring():
        s = "".join(random.sample(domain, random.randint(minlen, maxlen)))
        return b(s)

    with TempStorage("randomhash") as st:
        samp = dict((randstring(), randstring()) for _ in xrange(times))

        hwf = st.create_file("test.hsh")
        hw = HashWriter(hwf)
        for k, v in iteritems(samp):
            hw.add(k, v)
        hw.close()

        keys = list(samp.keys())
        random.shuffle(keys)
        hrf = st.open_file("test.hsh")
        hr = HashReader(hrf)
        for k in keys:
            assert_equal(hr[k], samp[k])
        hr.close()

def test_random_hash():
    from string import ascii_letters as domain

    times = 1000
    minlen = 1
    maxlen = len(domain)

    def randstring():
        s = "".join(random.sample(domain, random.randint(minlen, maxlen)))
        return b(s)

    with TempStorage("randomhash") as st:
        samp = dict((randstring(), randstring()) for _ in xrange(times))

        hw = HashWriter(st.create_file("test.hsh"))
        for k, v in iteritems(samp):
            hw.add(k, v)
        hw.close()

        keys = list(samp.keys())
        random.shuffle(keys)
        hr = HashReader.open(st, "test.hsh")
        for k in keys:
            assert hr[k] == samp[k]
        hr.close()

def word_values(self, value, analyzer, **kwargs):
    fb = self.field_boost
    seen = defaultdict(list)

    kwargs["positions"] = True
    kwargs["chars"] = True
    kwargs["boosts"] = True
    for t in tokens(value, analyzer, kwargs):
        seen[t.text].append((t.pos, t.startchar, t.endchar, t.boost))

    for w, poses in iteritems(seen):
        # posns_chars_boosts = [(pos, startchar, endchar, boost), ...]
        codes = []
        posbase = 0
        charbase = 0
        summedboost = 0
        for pos, startchar, endchar, boost in poses:
            codes.append((pos - posbase, startchar - charbase,
                          endchar - startchar, boost))
            posbase = pos
            charbase = endchar
            summedboost += boost

        value = (pack_uint(len(poses)) + pack_float(summedboost * fb)
                 + dumps(codes, -1)[2:-1])

        yield (w, len(poses), summedboost * fb, value)

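# A standalone sketch (names here are illustrative, not from the original) of
# the delta coding performed above: each position is stored relative to the
# previous position, each startchar relative to the previous endchar, and each
# endchar as a length from its startchar, which keeps the packed integers small.
def delta_encode(poses):
    codes = []
    posbase = charbase = 0
    for pos, startchar, endchar, boost in poses:
        codes.append((pos - posbase, startchar - charbase,
                      endchar - startchar, boost))
        posbase = pos
        charbase = endchar
    return codes

# For the two tokens of "hello world" at positions 0 and 1:
assert delta_encode([(0, 0, 5, 1.0), (1, 6, 11, 1.0)]) == \
    [(0, 0, 5, 1.0), (1, 1, 5, 1.0)]
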
def stored_fields(self, docnum):
    if self.is_closed:
        raise ReaderClosed
    assert docnum >= 0
    schema = self.schema
    sfs = self._perdoc.stored_fields(docnum)
    # Double-check with schema to filter out removed fields
    return dict(item for item in iteritems(sfs) if item[0] in schema)

def __hash__(self):
    if self._hash is not None:
        return self._hash
    h = int(self.final)
    for key, node in iteritems(self._edges):
        h ^= hash(key) ^ hash(node)
    self._hash = h
    return h

def __init__(self, dbfile):
    super(TermIndexReader, self).__init__(dbfile)
    dbfile.seek(self.indexbase + self.length * _LONG_SIZE)
    self.fieldmap = dbfile.read_pickle()
    self.names = [None] * len(self.fieldmap)
    for name, num in iteritems(self.fieldmap):
        self.names[num] = name

def extract(self, match):
    d = match.groupdict()
    for key, value in iteritems(d):
        try:
            value = int(value)
            d[key] = value
        except (ValueError, TypeError):
            pass
    return Props(**d)

def flush(self):
    for fieldname, lst in iteritems(self.postbuf):
        con = self._con(fieldname)
        con.executemany("insert into postings values (?, ?, ?, ?)", lst)
        con.commit()
        con.close()
    self.postbuf = defaultdict(list)
    self.bufsize = 0
    self._flushed = True

def __init__(self, B=0.75, K1=1.2, **kwargs):
    self.B = B
    self.K1 = K1

    self._field_B = {}
    for k, v in iteritems(kwargs):
        if k.endswith("_B"):
            fieldname = k[:-2]
            self._field_B[fieldname] = v

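# Hedged usage sketch: assuming this is a BM25F-style scorer, B and K1 set the
# global defaults while per-field B values are passed as "<fieldname>_B"
# keyword arguments. The class name below is an assumption for illustration.
#
#     w = BM25F(B=0.75, K1=1.2, title_B=1.0, body_B=0.5)
#     # w._field_B == {"title": 1.0, "body": 0.5}
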
def __init__(self, dbfile, postfile):
    CodedOrderedReader.__init__(self, dbfile)
    self.postfile = postfile

    dbfile.seek(self.indexbase + self.length * _LONG_SIZE)
    self.fieldmap = dbfile.read_pickle()
    self.names = [None] * len(self.fieldmap)
    for name, num in iteritems(self.fieldmap):
        self.names[num] = name

def _suggestions(self, text, maxdist, prefix):
    op = self.op
    seen = {}
    for corr in self.correctors:
        for score, sug in corr._suggestions(text, maxdist, prefix):
            if sug in seen:
                seen[sug] = op(seen[sug], score)
            else:
                seen[sug] = score
    return iteritems(seen)

def _print_line(self, indent, command, **kwargs):
    self._dbfile.write(b(" ") * indent)
    self._dbfile.write(command.encode("latin1"))
    for k, v in iteritems(kwargs):
        if isinstance(v, memoryview):
            v = bytes(v)
        if v is not None and not isinstance(v, _reprable):
            raise TypeError(type(v))
        self._dbfile.write(("\t%s=%r" % (k, v)).encode("latin1"))
    self._dbfile.write(b("\n"))

def __init__(self, codec, dbfile, length, postfile):
    self._codec = codec
    self._dbfile = dbfile
    self._tindex = filetables.OrderedHashReader(dbfile, length)
    self._fieldmap = self._tindex.extras["fieldmap"]
    self._postfile = postfile

    self._fieldunmap = [None] * len(self._fieldmap)
    for fieldname, num in iteritems(self._fieldmap):
        self._fieldunmap[num] = fieldname

def renumber_dfa(dfa, base=0):
    c = itertools.count(base)
    mapping = {}

    def remap(state):
        if state in mapping:
            newnum = mapping[state]
        else:
            newnum = next(c)
            mapping[state] = newnum
        return newnum

    newdfa = DFA(remap(dfa.initial))
    for src, trans in iteritems(dfa.transitions):
        for label, dest in iteritems(trans):
            newdfa.add_transition(remap(src), label, remap(dest))
    for finalstate in dfa.final_states:
        newdfa.add_final_state(remap(finalstate))
    for src, dest in iteritems(dfa.defaults):
        newdfa.set_default_transition(remap(src), remap(dest))
    return newdfa

def word_values(self, value, analyzer, **kwargs):
    seen = defaultdict(list)

    kwargs["positions"] = True
    kwargs["chars"] = True
    kwargs["boosts"] = True
    for t in tokens(value, analyzer, kwargs):
        seen[t.text].append((t.pos, t.startchar, t.endchar, t.boost))

    for w, poses in iteritems(seen):
        value, summedboost = self.encode(poses)
        yield (w, len(poses), summedboost, value)

def __init__(self, dbfile):
    self.dbfile = dbfile

    dbfile.seek(0)
    pos = dbfile.read_long()
    self.length = dbfile.read_uint()

    dbfile.seek(pos)
    name_map = dbfile.read_pickle()
    self.names = [None] * len(name_map)
    for name, pos in iteritems(name_map):
        self.names[pos] = name
    self.directory_offset = dbfile.tell()

def word_values(self, value, analyzer, **kwargs):
    fb = self.field_boost
    freqs = defaultdict(int)
    weights = defaultdict(float)

    kwargs["boosts"] = True
    for t in tokens(value, analyzer, kwargs):
        freqs[t.text] += 1
        weights[t.text] += t.boost

    encode = self.encode
    return ((w, freq, weights[w] * fb, encode(freq))
            for w, freq in iteritems(freqs))

def word_values(self, value, analyzer, **kwargs):
    fb = self.field_boost
    poses = defaultdict(list)
    weights = defaultdict(float)

    kwargs["positions"] = True
    kwargs["boosts"] = True
    for t in tokens(value, analyzer, kwargs):
        poses[t.text].append(t.pos)
        weights[t.text] += t.boost

    for w, poslist in iteritems(poses):
        value = self.encode(poslist)
        yield (w, len(poslist), weights[w] * fb, value)

def word_values(self, value, analyzer, **kwargs):
    fb = self.field_boost
    # Maps token text -> [(pos, startchar, endchar, boost), ...]; the
    # factory must be list, since tuples are appended per token below
    seen = defaultdict(list)

    kwargs["positions"] = True
    kwargs["chars"] = True
    kwargs["boosts"] = True
    for t in tokens(value, analyzer, kwargs):
        seen[t.text].append((t.pos, t.startchar, t.endchar, t.boost))

    encode = self.encode
    return ((w, len(poses), sum(p[3] for p in poses) * fb, encode(poses))
            for w, poses in iteritems(seen))

def u_to_utf8(dfa, base=0):
    c = itertools.count(base)
    transitions = dfa.transitions

    for src, trans in iteritems(transitions):
        trans = transitions[src]
        for label, dest in list(iteritems(trans)):
            if label is EPSILON:
                continue
            elif label is ANY:
                raise Exception
            else:
                assert isinstance(label, text_type)
                label8 = label.encode("utf8")
                for i, byte in enumerate(label8):
                    if i < len(label8) - 1:
                        st = next(c)
                        dfa.add_transition(src, byte, st)
                        src = st
                    else:
                        dfa.add_transition(src, byte, dest)
                del trans[label]

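# Standalone illustration (dict layout and names assumed, not the real DFA
# class) of the expansion above: a transition labelled with a multi-byte
# character becomes a chain of byte-labelled transitions through fresh
# intermediate states. Assumes Python 3, where iterating bytes yields ints.
import itertools

def expand_label(transitions, src, label, dest, counter):
    label8 = label.encode("utf8")
    for i, byte in enumerate(label8):
        if i < len(label8) - 1:
            st = next(counter)
            transitions.setdefault(src, {})[byte] = st
            src = st
        else:
            transitions.setdefault(src, {})[byte] = dest

trans = {}
expand_label(trans, 0, u"\u00e9", 1, itertools.count(100))
assert trans == {0: {0xC3: 100}, 100: {0xA9: 1}}  # "é" is 0xC3 0xA9 in UTF-8
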
def word_values(self, value, analyzer, **kwargs):
    fb = self.field_boost
    seen = defaultdict(list)

    kwargs["positions"] = True
    kwargs["boosts"] = True
    for t in tokens(value, analyzer, kwargs):
        pos = t.pos
        boost = t.boost
        seen[t.text].append((pos, boost))

    for w, poses in iteritems(seen):
        value = self.encode(poses)
        yield (w, len(poses), sum(p[1] for p in poses) * fb, value)

def word_values(self, value, analyzer, **kwargs):
    fb = self.field_boost
    length = 0
    freqs = defaultdict(int)
    weights = defaultdict(float)

    kwargs["boosts"] = True
    for t in tokens(value, analyzer, kwargs):
        length += 1
        freqs[t.text] += 1
        weights[t.text] += t.boost

    wvs = ((w, freq, weights[w] * fb, pack_uint(freq))
           for w, freq in iteritems(freqs))
    return wvs

def word_values(self, value, analyzer, **kwargs):
    fb = self.field_boost
    seen = defaultdict(list)
    weights = defaultdict(float)

    kwargs["positions"] = True
    kwargs["chars"] = True
    kwargs["boosts"] = True
    for t in tokens(value, analyzer, kwargs):
        seen[t.text].append((t.pos, t.startchar, t.endchar))
        weights[t.text] += t.boost

    encode = self.encode
    return ((w, len(ls), weights[w] * fb, encode(ls))
            for w, ls in iteritems(seen))

def wrapper(*args):
    try:
        result = data[args]
        stats[0] += 1  # Hit
    except KeyError:
        stats[1] += 1  # Miss
        if len(data) == maxsize:
            for k, _ in nsmallest(maxsize // 10 or 1,
                                  iteritems(usecount),
                                  key=itemgetter(1)):
                del data[k]
                del usecount[k]
        data[args] = user_function(*args)
        result = data[args]
    finally:
        usecount[args] += 1
    return result

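# Hedged, self-contained sketch of the decorator this wrapper appears to
# belong to: "data", "usecount", "stats", and "maxsize" above are closure
# variables of an LFU-style cache shaped roughly like this (names assumed).
# When the cache is full, the least-frequently-used ~10% of entries are
# evicted in one pass before the new result is stored.
from collections import defaultdict
from functools import wraps
from heapq import nsmallest
from operator import itemgetter

def lfu_cache(maxsize=100):
    def decorator(user_function):
        data = {}                    # args -> cached result
        usecount = defaultdict(int)  # args -> number of lookups
        stats = [0, 0]               # [hits, misses]

        @wraps(user_function)
        def wrapper(*args):
            try:
                result = data[args]
                stats[0] += 1  # Hit
            except KeyError:
                stats[1] += 1  # Miss
                if len(data) == maxsize:
                    for k, _ in nsmallest(maxsize // 10 or 1,
                                          usecount.items(),
                                          key=itemgetter(1)):
                        del data[k]
                        del usecount[k]
                data[args] = user_function(*args)
                result = data[args]
            finally:
                usecount[args] += 1
            return result

        return wrapper
    return decorator
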
def replace(self, **kwargs):
    """Returns a copy of this object with the attributes given as
    keyword arguments replaced.

    >>> adt = adatetime(year=2009, month=10, day=31)
    >>> adt.replace(year=2010)
    (2010, 10, 31, None, None, None, None)
    """

    newadatetime = self.copy()
    for key, value in iteritems(kwargs):
        if key in self.units:
            setattr(newadatetime, key, value)
        else:
            raise KeyError("Unknown argument %r" % key)
    return newadatetime

def reachable_from(self, src, inclusive=True):
    transitions = self.transitions

    reached = set()
    if inclusive:
        reached.add(src)

    stack = [src]
    seen = set()
    while stack:
        src = stack.pop()
        seen.add(src)
        for _, dest in iteritems(transitions[src]):
            reached.add(dest)
            if dest not in seen:
                stack.append(dest)
    return reached

def wrapper(*args):
    try:
        result = data[args]
        stats[0] += 1  # Hit
    except KeyError:
        stats[1] += 1  # Miss
        if len(data) == maxsize:
            for k, _ in nsmallest(maxsize // 10 or 1,
                                  iteritems(lastused),
                                  key=itemgetter(1)):
                del data[k]
                del lastused[k]
        data[args] = user_function(*args)
        result = data[args]
    finally:
        lastused[args] = time()
    return result