def save_result(self, furi):
    """Upsert the result fields of ``furi`` into ``self.coll``.

    Nothing is written when ``furi`` has no ``'a'`` entry.  The document
    key is ``int(furi['id'])`` when an ``'id'`` entry exists, otherwise
    ``urihash.urikey(furi['u'])``.  Only the ``'a'`` and ``'u'`` fields are
    stored, through a ``$set`` update issued with ``upsert=True`` and
    ``multi=False``.
    """
    if 'a' not in furi:
        return
    key = int(furi['id']) if 'id' in furi else urihash.urikey(furi['u'])
    selector = {'_id': key}
    changes = {'$set': {'a': furi['a'], 'u': furi['u']}}
    self.coll.update(selector, changes, upsert=True, multi=False)
def already_seen(self, furi):
    """Report whether the URI described by ``furi`` has been seen.

    Waits on ``self.ready`` before doing anything else.  The lookup key is
    ``furi['id']`` when that entry is truthy, otherwise
    ``urihash.urikey(furi['u'])`` (so a falsy id such as ``0`` is treated
    as missing).

    Returns a dict ``{'_id': key, 'e': value}`` where ``value`` is:

    * ``self.memhash[key]`` when the key is in ``self.memhash``;
    * ``self.EXPIRE_NEVER`` when ``self.seendb.get(key)`` is truthy;
    * ``0`` otherwise, after ``self._mark_seen(key)`` has been called.
    """
    self.ready.wait()

    key = furi.get('id')
    if not key:
        key = urihash.urikey(furi['u'])

    # The in-memory table is consulted before the seen database.
    if key in self.memhash:
        return {'_id': key, 'e': self.memhash[key]}

    if self.seendb.get(key):
        return {'_id': key, 'e': self.EXPIRE_NEVER}

    # First sighting: record it and report expiry 0.
    self._mark_seen(key)
    return {'_id': key, 'e': 0}
def already_seen(self, furi):
    """Check ``self.db`` for the URI in ``furi``, recording it if absent.

    The lookup key is ``furi['id']`` unless that is missing or ``None``,
    in which case ``urihash.urikey(furi['u'])`` is used.  A key of type
    ``long`` is rejected with ``ValueError``.

    Returns ``{"_id": ..., "e": ...}``.  ``"e"`` is ``(1 << 32) - 1`` when
    the key was already in ``self.db``; otherwise the key is stored with
    that value and ``"e"`` is ``0``.

    NOTE(review): ``"_id"`` is always ``furi.get("id")`` -- i.e. ``None``
    when the key had to be computed from ``furi["u"]`` -- rather than the
    key actually used.  Confirm callers expect that.
    """
    never_expires = (1 << 32) - 1
    given_id = furi.get("id")

    key = given_id if given_id is not None else urihash.urikey(furi["u"])
    if isinstance(key, long):
        raise ValueError

    if key in self.db:
        return {"_id": given_id, "e": never_expires}

    self.db[key] = never_expires
    return {"_id": given_id, "e": 0}
def create_seen(dispatcher, urls):
    """Write a ``SEEN`` file for ``urls`` under ``dispatcher.seendir``.

    The directory is created when it does not exist yet.  Every element of
    ``urls`` gets an ``'id'`` entry set to ``urihash.urikey(element['u'])``
    and ``urls`` itself is sorted in place by that id (both mutations are
    visible to the caller).  One ``(id, 0)`` tuple per element is then
    written through ``SeenFile`` opened in ``'wb'`` mode.

    Returns the path of the file written.
    """
    seendir = dispatcher.seendir
    if not os.path.isdir(seendir):
        os.makedirs(seendir)

    for entry in urls:
        entry['id'] = urihash.urikey(entry['u'])
    urls.sort(key=lambda entry: entry['id'])

    seenfile = os.path.join(seendir, 'SEEN')
    with SeenFile(seenfile, 'wb') as writer:
        for entry in urls:
            writer.write((entry['id'], 0))
    return seenfile
def do_seen(self):
    """Look up the request's ``u`` parameter in ``db.seen``; reply as JSON.

    Reads ``u`` and ``j`` from ``web.input`` (both default to ``None``).
    The reply always echoes them back under ``"u"`` and ``"j"``.  When
    either is ``None`` nothing else is added.  Otherwise the document whose
    ``_id`` is ``urihash.urikey(u)`` is fetched and the reply gains:

    * ``d=None, msg="not found"`` when there is no such document;
    * ``d=None, msg="fp conflict", alt=<stored u>`` when the stored ``"u"``
      differs from the requested one;
    * ``d=<document>`` otherwise.

    Returns the reply serialized with ``json.dumps``.
    """
    params = web.input(u=None, j=None)
    url, job = params.u, params.j
    reply = {'u': url, 'j': job}

    if url is None or job is None:
        return json.dumps(reply)

    # The original carried a commented-out alternative: self._fp64.sfp(url)
    fingerprint = urihash.urikey(url)
    found = db.seen.find_one({"_id": fingerprint})

    if found is None:
        reply.update(d=None, msg="not found")
    elif found["u"] != url:
        # Same key, different URI stored under it.
        reply.update(d=None, msg="fp conflict", alt=found["u"])
    else:
        reply.update(d=found)
    return json.dumps(reply)
def already_seen(self, furi):
    """Report whether ``furi`` is in ``self.seendb``, queueing it if not.

    Waits on ``self.ready`` first.  The lookup key is ``furi['id']`` when
    that entry is truthy, otherwise ``urihash.urikey(furi['u'])`` (a falsy
    id such as ``0`` is treated as missing).

    When ``self.seendb.get(key)`` is truthy, returns
    ``{'_id': key, 'e': self.EXPIRE_NEVER}``.  Otherwise the key is pushed
    onto ``self.putqueue`` without blocking -- calling
    ``self.drain_putqueue()`` and retrying for as long as the queue raises
    ``Full`` -- ``self.addedcount`` is incremented once, and
    ``{'_id': key, 'e': 0}`` is returned.
    """
    self.ready.wait()

    key = furi.get('id')
    if not key:
        key = urihash.urikey(furi['u'])

    if self.seendb.get(key):
        return {'_id': key, 'e': self.EXPIRE_NEVER}

    # The key is queued here rather than written to seendb directly (the
    # original kept a disabled ``self.seendb.put(key, '1')`` at this spot).
    while True:
        try:
            self.putqueue.put_nowait(key)
        except Full:
            self.drain_putqueue()
        else:
            self.addedcount += 1
            break
    return {'_id': key, 'e': 0}
def urikey(o):
    """Return ``urihash.urikey`` applied to the ``'u'`` entry of ``o``."""
    uri = o['u']
    return urihash.urikey(uri)
def mark_seen(self, furi):
    """Mark the URI described by ``furi`` as seen.

    Waits on ``self.ready`` first.  When ``furi`` has no ``'id'`` entry (or
    it is ``None``), ``urihash.urikey(furi['u'])`` is computed and stored
    back into ``furi['id']``.  The resulting id is passed to
    ``self._mark_seen``.
    """
    self.ready.wait()
    if furi.get('id') is None:
        # Cache the computed key on the caller's dict.
        furi['id'] = urihash.urikey(furi['u'])
    self._mark_seen(furi['id'])
def uriquery(uri):
    """Return ``Seen.keyquery`` for the ``urihash.urikey`` of ``uri``."""
    key = urihash.urikey(uri)
    return Seen.keyquery(key)
def put(self, curi):
    """Append ``curi`` to ``self.buffer``, flushing once it is full.

    ``curi['id']`` is overwritten with ``urihash.urikey(curi['u'])`` before
    the item is buffered.  ``self.flush()`` is called when the buffer
    length has reached ``options.batchsize``.
    """
    key = urihash.urikey(curi['u'])
    curi['id'] = key
    self.buffer.append(curi)
    if len(self.buffer) < options.batchsize:
        return
    self.flush()