def save_to_db(self, url, html):
    """Store *html* for *url* in crawler_html, keyed by the 64-bit farmhash of the URL.

    Returns True when the row already exists (including the hash-collision
    case, which is logged) or after a successful insert; a concurrent
    duplicate insert (MySQL error 1062) also counts as success. Any other
    database error is printed and re-raised.
    """
    urlhash = farmhash.hash64(url)
    sql = 'select url from crawler_html where urlhash=%s'
    d = self.db.get(sql, urlhash)
    if d:
        if d['url'] != url:
            # Two different URLs hashed to the same 64-bit value; keep the
            # existing row and record the collision.
            msg = 'farmhash collision: %s <=> %s' % (url, d['url'])
            self.logger.error(msg)
        return True
    if isinstance(html, str):
        html = html.encode('utf8')
    html_lzma = lzma.compress(html)
    sql = ('insert into crawler_html(urlhash, url, html_lzma) '
           'values(%s, %s, %s)')
    try:
        self.db.execute(sql, urlhash, url, html_lzma)
        return True
    except Exception as e:
        # Guard e.args before indexing: an exception with empty args would
        # otherwise raise IndexError here and mask the real failure.
        if e.args and e.args[0] == 1062:  # Duplicate entry: lost a race, fine
            return True
        traceback.print_exc()
        raise  # bare raise preserves the original traceback
def save_qo_sql(self, url, html):
    """Store *html* for *url* in crawler_html (zlib-compressed), keyed by farmhash64.

    Mirrors save_to_db but compresses with zlib. Returns True when the row
    already exists or the insert succeeds (duplicate-key 1062 included);
    other database errors are printed and re-raised.
    """
    urlhash = farmhash.hash64(url)
    # BUGFIX: the lookup previously queried a different table
    # ("create_html") than the insert targets, and wrapped the placeholder
    # in quotes ('"%s"'), which breaks parameterized queries (the driver
    # does its own quoting). Both are aligned with save_to_db.
    sql = 'select url from crawler_html where urlhash = %s'
    d = self.db.get(sql, urlhash)
    if d:
        if d['url'] != url:
            msg = 'farmhash collision:%s <=> %s' % (url, d['url'])
            self.logger.error(msg)
        return True
    if isinstance(html, str):
        html = html.encode('utf-8')
    html_zlib = zlib.compress(html)  # compress the page body
    sql = "insert into crawler_html(urlhash,url,html_zlib) values (%s,%s,%s)"
    try:
        self.db.execute(sql, urlhash, url, html_zlib)
        return True
    except Exception as e:
        # MySQL 1062 == duplicate entry: a concurrent insert won the race.
        # Guard e.args before indexing so arg-less exceptions don't mask
        # the real failure with an IndexError.
        if e.args and e.args[0] == 1062:
            return True
        traceback.print_exc()
        raise
def get_entity(schema_name, key, source=True):
    """Fetch an entity document from the entity store by the farmhash64 of *key*.

    Returns the '_source' payload by default, or the raw response JSON when
    *source* is falsy. Raises EntityError for any error status other
    than 404 (a 404 body is still parsed and returned).
    """
    url = '{0}/{1}/entity/{2}'.format(options.es, schema_name, farmhash.hash64(key))
    resp = requests.get(url)
    status = resp.status_code
    if status >= 300 and status != 404:
        raise EntityError('get entity {0} error: {1}'.format(key, resp.text))
    body = resp.json()
    return body.get('_source') if source else body
def transform(self, doc):
    """Project *doc* (an iterable of integers) into a signed count vector of length self.size.

    Each element is salted, bucketed modulo self.size, and contributes +1 or
    -1 depending on the parity of the salted value's farmhash64.
    """
    sketch = np.array([0] * self.size)
    for token in doc:
        salted = token + self.salt
        bucket = salted % self.size
        # The sign comes from the parity of the salted token's hash.
        if farmhash.hash64(str(salted)) % 2 == 0:
            sketch[bucket] += 1
        else:
            sketch[bucket] -= 1
    return sketch
def make_cache_key(f, *args, **kwargs):
    """Build the memoization cache key for *f* and its call arguments."""
    _timeout = getattr(timeout, 'cache_timeout', timeout)
    fname, version_data = self._memoize_version(f, args=args, timeout=_timeout)
    #: this must run after version_data is computed, so that it
    #: does not break the delete_memoized functionality.
    altfname = make_name(fname) if callable(make_name) else fname
    if callable(f):
        keyargs, keykwargs = self._memoize_kwargs_to_args(f, *args, **kwargs)
    else:
        keyargs, keykwargs = args, kwargs
    try:
        updated = "{0}{1}{2}{3}".format(altfname, keyargs, keykwargs, version_data)
    except AttributeError:
        updated = "%s%s%s%s" % (altfname, keyargs, keykwargs, version_data)
    # NOTE(review): version_data is folded in twice (once inside `updated`,
    # once here) — kept as-is to preserve existing cache keys.
    return farmhash.hash64('{0}{1}'.format(updated, version_data))
def make_template_fragment_key(fragment_name, vary_on=None):
    """Make a cache key for a specific template fragment name.

    *vary_on* is an optional sequence of values that differentiate
    otherwise identically-named fragments; each value is stringified and
    the results are joined with underscores.
    """
    # BUGFIX: the original used a mutable default argument ([]) and
    # "_".join(str(vary_on)), which joined the *characters* of the list's
    # repr (e.g. ['a'] -> "[_'_a_'_]") instead of its items.
    if vary_on:
        fragment_name = "%s_" % fragment_name
        joined = "_".join(str(v) for v in vary_on)
    else:
        joined = ""
    return farmhash.hash64(TEMPLATE_FRAGMENT_KEY_TEMPLATE % (fragment_name, joined))
def make_cache_key(*args, **kwargs):
    """Derive the cache key for the current request from key_prefix."""
    if callable(key_prefix):
        prefix = key_prefix()
    elif '%s' in key_prefix:
        # A format-style prefix is filled in with the request path.
        prefix = key_prefix % request.path
    else:
        prefix = key_prefix
    return farmhash.hash64(prefix)
def minhash(self, doc, salts):
    """Return the MinHash signature of *doc*: one element per salt.

    For each salt, selects the element of *doc* whose salted farmhash64
    (mod self.col_size) is smallest.
    """
    assert len(doc) > 0, "empty signature found; hashing aborted"
    # BUGFIX: the original built a list of lambdas inside a comprehension
    # (`lambda x: ... salt ...` for salt in salts). All lambdas closed over
    # the same loop variable and were only called *after* the comprehension
    # finished, so every permutation used the LAST salt. Iterating the
    # salts directly gives each permutation its own salt.
    signature = []
    for salt in salts:
        hashes = [farmhash.hash64(str(x + salt)) % self.col_size for x in doc]
        signature.append(doc[np.argmin(hashes)])
    return signature
def __init__(self, tiles=None):
    """Create the farm; generate a fresh 20x16 debris-scattered grid when *tiles* is None."""
    self.current_ver = farmhash.hash64(datetime.utcnow().isoformat())
    if tiles is None:
        # BUGFIX: self.tiles was never initialized before the append loop,
        # which raised AttributeError (or silently mutated a shared
        # class-level list, if one exists).
        self.tiles = []
        # Each cell draws a digit 0-9; a few values map to debris objects.
        obj_points = np.random.randint(0, 10, (20, 16))
        debris = {7: 'tree', 6: 'weeds', 5: 'brown_leaves'}
        for (x, y), value in np.ndenumerate(obj_points):
            tile = Tile(x, y, 'untilled')
            if value in debris:
                tile.obj = Obj(debris[value])
            self.tiles.append(tile)
    else:
        self.tiles = tiles
def post(self, schema_name):
    """Create or update an entity under *schema_name*.

    Takes a per-schema ZooKeeper node as a lock, validates the payload
    against the schema, versions it via '_meta', writes it to the
    entity_history index, then upserts the entity keyed by the farmhash64
    of its primary key. Responds 422 if the schema is already locked and
    500 if either Elasticsearch write fails.
    """
    node = os.path.join(options.root, schema_name)
    try:
        # Acquiring the lock: creation fails with NodeExistsError when
        # another request holds it.
        self.application.zk.create(node)
        schema = SchemaHandler.get_schema(schema_name)
        payload = self.get_payload()
        EntityHandler.validate_entity(schema, payload)
        entity = EntityHandler.get_entity(schema_name, payload[schema['pk']])
        # Version 0 for a brand-new entity, otherwise previous version + 1;
        # timestamp is epoch milliseconds.
        if entity is None:
            payload['_meta'] = {'schema': schema_name, 'version': 0, 'timestamp': int(datetime.datetime.now().timestamp() * 1000)}
        else:
            payload['_meta'] = {'schema': schema_name, 'version': entity['_meta']['version'] + 1, 'timestamp': int(datetime.datetime.now().timestamp() * 1000)}
        # Write the history record first; abort before touching the live
        # entity if that fails.
        r = requests.post('{0}/{1}/entity_history'.format(options.es, schema_name), json=payload)
        if r.status_code >= 300:
            logging.error('put entity history error: {0}'.format(r.text))
            raise HTTPError(status_code=500, reason='put entity history error: {0}'.format(r.text))
        key = farmhash.hash64(payload[schema['pk']])
        r = requests.put('{0}/{1}/entity/{2}'.format(options.es, schema_name, key), json=payload)
        if r.status_code >= 300:
            logging.error('put entity error: {0}'.format(r.text))
            raise HTTPError(status_code=500, reason='put entity error: {0}'.format(r.text))
        # Echo back the freshly stored entity.
        self.jsonify(code=200, entity=EntityHandler.get_entity(schema_name, payload[schema['pk']]))
    except NodeExistsError:
        # Lock held by someone else: clear `node` so the finally block does
        # not delete the other request's lock.
        node = None
        raise HTTPError(status_code=422, reason='schema {0} is locked'.format(schema_name))
    finally:
        if node is not None:
            self.application.zk.delete(node)
def generate_hashes(peaks, fan_value=DEFAULT_FAN_VALUE):
    """Yield (fingerprint, time_offset) pairs for a list of spectrogram peaks.

    Each anchor peak is paired with up to fan_value - 1 following peaks;
    pairs whose time delta lies within [MIN_HASH_TIME_DELTA,
    MAX_HASH_TIME_DELTA] yield the 64-bit farmhash of "freq1|freq2|delta"
    rendered as 16 uppercase hex digits, together with the anchor's time.

    Hash list structure: hash_hex, time_offset
    [(e05b341a9b77a51fd26, 32), ... ]
    """
    if PEAK_SORT:
        peaks = sorted(peaks, key=itemgetter(1))
    total = len(peaks)
    for anchor in range(total):
        for offset in range(1, fan_value):
            other = anchor + offset
            if other >= total:
                continue
            freq1 = peaks[anchor][IDX_FREQ_I]
            freq2 = peaks[other][IDX_FREQ_I]
            t1 = peaks[anchor][IDX_TIME_J]
            t_delta = peaks[other][IDX_TIME_J] - t1
            if MIN_HASH_TIME_DELTA <= t_delta <= MAX_HASH_TIME_DELTA:
                h = farmhash.hash64("%s|%s|%s" % (str(freq1), str(freq2), str(t_delta)))
                yield (format(h, '016X'), t1)
def get_hash_for_key(key):
    """Return the farmhash64 of *key* as a decimal string, dropping non-ASCII characters."""
    encoded = key.encode('ascii', 'ignore')
    return str(farmhash.hash64(encoded))
# Scratch script: print the farmhash64 of a sample string.
# (An earlier leveldb experiment lived here — put/get of 'hello'/'world'
# against a local './db' — kept only as a note.)
import farmhash

print(farmhash.hash64('abc'))
def get_hash_for_key(key):
    """Stringified farmhash64 of the ASCII-encodable part of *key*."""
    ascii_key = key.encode('ascii', 'ignore')
    digest = farmhash.hash64(ascii_key)
    return str(digest)
def _hash_func(self, d):
    """Hash *d* with 64-bit farmhash."""
    return farmhash.hash64(d)
def _hashfunc64(self, str_value):
    """Return the 64-bit farmhash of *str_value*."""
    return farmhash.hash64(str_value)