Exemple #1
0
 def save_result(self, furi):
     """Upsert the 'a' and 'u' fields of ``furi`` into ``self.coll``.

     Does nothing when ``furi`` has no 'a' entry. The document key is
     ``int(furi['id'])`` when an 'id' is present, otherwise it is derived
     from the URL with ``urihash.urikey(furi['u'])``.
     """
     if 'a' not in furi:
         return
     doc_key = int(furi['id']) if 'id' in furi else urihash.urikey(furi['u'])
     selector = {'_id': doc_key}
     changes = {'$set': {'a': furi['a'], 'u': furi['u']}}
     # Single-document upsert: create the record if the key is not there yet.
     self.coll.update(selector, changes, upsert=True, multi=False)
Exemple #2
0
 def already_seen(self, furi):
     """Report whether the URI described by ``furi`` has been seen before.

     Waits on ``self.ready`` first. The lookup key is ``furi['id']`` when
     one is supplied, otherwise ``urihash.urikey(furi['u'])``.

     Returns a dict ``{'_id': key, 'e': value}`` where ``value`` is:
       * the entry stored in ``self.memhash`` when the key is found there;
       * ``0`` when ``self.seendb`` has no truthy entry for the key (the
         key is then recorded through ``self._mark_seen``);
       * ``self.EXPIRE_NEVER`` when ``self.seendb`` already holds the key.
     """
     self.ready.wait()
     key = furi.get('id')
     # Only a *missing* id falls back to hashing the URL. A present but
     # falsy id (such as 0) is a legitimate key and must not be re-hashed;
     # this matches the ``is None`` check used by mark_seen.
     if key is None:
         key = urihash.urikey(furi['u'])
     if key in self.memhash:
         return {'_id': key, 'e': self.memhash[key]}
     if not self.seendb.get(key):
         self._mark_seen(key)
         return {'_id': key, 'e': 0}
     return {'_id': key, 'e': self.EXPIRE_NEVER}
Exemple #3
0
 def already_seen(self, furi):
     """Check ``self.db`` for the URI in ``furi`` and record it if absent.

     The lookup key is ``furi['id']`` when supplied, otherwise
     ``urihash.urikey(furi['u'])``. A key of type ``long`` is rejected
     with ValueError.

     Returns ``{"_id": ..., "e": ...}`` where "e" is ``(1 << 32) - 1`` if
     the key was already in ``self.db`` and ``0`` if it has just been
     added. Note that "_id" carries the caller-supplied id (None when
     ``furi`` had none), not the computed key.
     """
     never_expires = (1 << 32) - 1
     supplied_id = furi.get("id")
     if supplied_id is not None:
         key = supplied_id
     else:
         key = urihash.urikey(furi["u"])
     if isinstance(key, long):
         raise ValueError
     was_seen = key in self.db
     if not was_seen:
         self.db[key] = never_expires
     return {"_id": supplied_id, "e": never_expires if was_seen else 0}
def create_seen(dispatcher, urls):
    """Write a 'SEEN' file for ``urls`` under ``dispatcher.seendir``.

    The directory is created when it does not exist. Each element of
    ``urls`` gets an 'id' entry set to ``urihash.urikey`` of its 'u'
    value, and ``urls`` is then sorted in place by that id (the caller's
    list and dicts are modified). One ``(id, 0)`` record per URL is
    written through ``SeenFile`` in sorted order.

    Returns the path of the file written.
    """
    target_dir = dispatcher.seendir
    if not os.path.isdir(target_dir):
        os.makedirs(target_dir)
    for entry in urls:
        entry['id'] = urihash.urikey(entry['u'])
    urls.sort(key=lambda entry: entry['id'])
    seen_path = os.path.join(target_dir, 'SEEN')
    with SeenFile(seen_path, 'wb') as writer:
        for entry in urls:
            writer.write((entry['id'], 0))
    return seen_path
Exemple #5
0
 def do_seen(self):
     """Look up the requested URL in ``db.seen`` and return a JSON report.

     Reads the ``u`` and ``j`` inputs through ``web.input`` (both default
     to None). The JSON object always echoes ``u`` and ``j``. When both
     are given it additionally contains:
       * ``d=None, msg="not found"`` if no record has the URL's key;
       * ``d=None, msg="fp conflict", alt=<stored url>`` if the record
         under that key holds a different URL;
       * ``d=<record>`` otherwise.
     """
     params = web.input(u=None, j=None)
     url = params.u
     job = params.j
     report = {'u': url, 'j': job}
     if url is not None and job is not None:
         record = db.seen.find_one({"_id": urihash.urikey(url)})
         if record is None:
             report.update(d=None, msg="not found")
         elif record["u"] != url:
             # Same key, different URL: two URLs hashed to the same value.
             report.update(d=None, msg="fp conflict", alt=record["u"])
         else:
             report.update(d=record)
     return json.dumps(report)
Exemple #6
0
 def already_seen(self, furi):
     """Report whether the URI in ``furi`` is in ``self.seendb``.

     Waits on ``self.ready`` first. The lookup key is ``furi['id']`` when
     one is supplied, otherwise ``urihash.urikey(furi['u'])``.

     When ``self.seendb`` has no truthy entry for the key, the key is
     pushed onto ``self.putqueue`` (calling ``self.drain_putqueue()`` and
     retrying whenever the queue is full), ``self.addedcount`` is
     incremented, and ``{'_id': key, 'e': 0}`` is returned. Otherwise
     ``{'_id': key, 'e': self.EXPIRE_NEVER}`` is returned.
     """
     self.ready.wait()
     key = furi.get('id')
     # Only a *missing* id falls back to hashing the URL. A present but
     # falsy id (such as 0) is a legitimate key and must not be re-hashed.
     if key is None:
         key = urihash.urikey(furi['u'])
     if self.seendb.get(key):
         return {'_id': key, 'e': self.EXPIRE_NEVER}
     while True:
         try:
             self.putqueue.put_nowait(key)
         except Full:
             # Queue is full: make room, then try the same key again.
             self.drain_putqueue()
         else:
             self.addedcount += 1
             break
     return {'_id': key, 'e': 0}
Exemple #7
0
 def urikey(o):
     """Return ``urihash.urikey`` of the URL stored under ``o['u']``."""
     url = o['u']
     return urihash.urikey(url)
Exemple #8
0
 def mark_seen(self, furi):
     """Record the URI in ``furi`` as seen via ``self._mark_seen``.

     Waits on ``self.ready`` first. When ``furi`` has no 'id' (or it is
     None), the id is computed with ``urihash.urikey(furi['u'])`` and
     stored back into ``furi['id']`` before marking.
     """
     self.ready.wait()
     uri_id = furi.get('id')
     if uri_id is None:
         uri_id = urihash.urikey(furi['u'])
         # Store the computed id on the caller's dict as well.
         furi['id'] = uri_id
     self._mark_seen(uri_id)
Exemple #9
0
 def uriquery(uri):
     """Return ``Seen.keyquery`` for the ``urihash.urikey`` of ``uri``."""
     key = urihash.urikey(uri)
     return Seen.keyquery(key)
Exemple #10
0
 def put(self, curi):
     """Append ``curi`` to ``self.buffer``, flushing once the batch is full.

     ``curi['id']`` is always (re)assigned to ``urihash.urikey(curi['u'])``
     before buffering. ``self.flush()`` is called when the buffer length
     reaches ``options.batchsize``.
     """
     curi['id'] = urihash.urikey(curi['u'])
     pending = self.buffer
     pending.append(curi)
     if len(pending) < options.batchsize:
         return
     self.flush()