def writeWebMatrix(self, item):
    ''' Builds web graph in matrix market format file '''
    u = self.r.get("%s:%s" % (self.URL2ID, hashxx(item['url'])))
    v = 0
    for link in set(item['link_set']):
        v = self.r.get("%s:%s" % (self.URL2ID, hashxx(link)))
        self.f_mat.write("%s\t%s\t1\n" % (u, v))
def hash_one_value(val):
    '''
    Use the hashxx function to initialize, add data, and compute the digest
    in one go. Add the seed parameter if you need to control the initial
    state of the digest.
    '''
    print("Hash of %s = %d" % (repr(val), hashxx(val, seed=0)))
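# Usage sketch for hash_one_value, assuming pyhashxx is installed. hashxx()
# hashes one or more bytes objects in a single call, and different seeds give
# different digests. The incremental Hashxx class shown at the end is my
# assumption about the pyhashxx API; it is not used in the snippet above.
from pyhashxx import hashxx, Hashxx

hash_one_value(b'hello world')            # prints the 32-bit digest
print(hashxx(b'hello world', seed=0))
print(hashxx(b'hello world', seed=1))     # different seed, different digest

hasher = Hashxx(seed=0)                   # build a digest incrementally instead
hasher.update(b'hello ')
hasher.update(b'world')
print(hasher.digest())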
def g(key, obj):
    for pat, e in extractors.items():
        if fnmatch.fnmatch(key, pat):
            unhashable = False
            vals = []
            for f in e:
                v = f(obj)
                if isinstance(v, bool):
                    if not v:
                        unhashable = True
                        break
                else:
                    vals.append(f(obj))
            if unhashable:
                logger.debug('pat %s obj %s is unhashable' % (pat, key))
                continue
            for v in itertools.product(*vals):
                s = str('\0'.join(v))
                logger.debug('hash string for %s %s' % (key, s))
                yield pyhashxx.hashxx(s), s
            break
def push(self, batch):
    """"""
    if batch is None:
        map(lambda i: self._qs[i].push(None), range(len(self._qs)))
        return

    self._lock.acquire()
    self._records += len(batch)
    self._lock.release()

    # [todo] - performance: splitting batch too small?
    rdef = batch.record_def()

    # distribute records using hash function
    partitioned_records = [
        []  # array of Record
        for i in range(len(self._qs))
    ]
    key_idx = rdef.colindex_by_colname(self._key)
    for rec in batch:
        val = rec[key_idx]
        h = pyhashxx.hashxx(bytes(val))  # [todo] - customize hash function?
        records = partitioned_records[h % len(self._qs)]
        records.append(rec)

    # really push distributed records into BatchQueue
    for i in range(len(self._qs)):
        self._qs[i].push(Batch(rdef, partitioned_records[i]))
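# Standalone sketch of the hash-partitioning step above, using only pyhashxx:
# hashing a record's key and taking it modulo the partition count sends records
# with the same key to the same partition. The names N, records and buckets
# are illustrative, not from the original class.
from pyhashxx import hashxx

N = 4
buckets = [[] for _ in range(N)]
records = [("alice", 1), ("bob", 2), ("alice", 3)]
for key, payload in records:
    buckets[hashxx(key.encode('utf-8')) % N].append((key, payload))
# Both "alice" records land in the same bucket.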
def get(self):
    # Looks up literally every filter anyone has ever looked up in descending order by how often
    # For now...let's build all of them!
    allFilterSetsWithCounts = db.session.query(
        Filter.filter_params,
        func.count(Filter.filter_params).label('times_filtered')).group_by(
            Filter.filter_params).order_by('times_filtered DESC').all()

    results = {}
    for (filter_string, times_filtered) in allFilterSetsWithCounts:
        print 'running ' + filter_string
        print 'has been run ' + str(times_filtered) + ' times.'
        filter_params = json.loads(filter_string)
        rules = filter_params['rules']

        cache_key = str(hashxx(json.dumps(filter_params)))
        from_cache = client.get(cache_key)

        results[cache_key] = {}
        results[cache_key]['rules'] = rules
        if from_cache is None:
            tasks.buildCache.delay(cache_key, rules)
            results[cache_key]['existing-cache'] = False
        else:
            results[cache_key]['existing-cache'] = True

    return results
def hash_strings(strings, seed, mod, encoded=False):
    if not encoded:
        strings = map(str.encode, strings)
    arr = [hashxx(s, seed=seed) for s in strings]
    return np.mod(arr, mod)
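# Minimal usage sketch for hash_strings, assuming numpy and pyhashxx are
# available: hash a few tokens with a fixed seed and fold them into a small
# index space (e.g. for feature hashing). The tokens and mod are illustrative.
import numpy as np
from pyhashxx import hashxx

tokens = ["red", "green", "blue"]
indices = hash_strings(tokens, seed=42, mod=16)
print(indices)   # array of values in [0, 16)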
def get_acmuevent_hash(glpatt="/sys/class/tty/ttyACM?/device/uevent"):
    f = glob(glpatt)
    if len(f) == 0:
        return ""
    else:
        with open(f[0], "rb") as ue:
            return str(hashxx(ue.read()))
def predict(self, target, static_data, temporal_data):
    """
    Predicts the model's output (loss, prediction) for the given target and instance.
    In general, at least one of static_data and temporal_data must be non-empty.
    In this case, the model only uses static_data.

    Args:
        target: classification label or regression output (scalar value)
        static_data: static data (vector)
        temporal_data: temporal data (matrix, where number of rows are variable across instances)

    Returns:
        loss: model's output loss
        prediction: model's output prediction, only used for classifiers
    """
    hashval = hashxx(static_data.data.tobytes())
    self.rng.seed(hashval)
    if self.noise_type == constants.NO_NOISE:
        prediction = self.model_fn(static_data, [])
    elif self.noise_type == constants.EPSILON_IRRELEVANT:
        # Add noise - small random non-zero coefficients for irrelevant features
        noise = self.noise_multiplier * self.rng.uniform(-1, 1, static_data.size)
        prediction = self.model_fn(static_data, noise)
    elif self.noise_type == constants.ADDITIVE_GAUSSIAN:
        # Add noise - additive Gaussian, sampled for every instance/perturbed instance
        prediction = self.model_fn(static_data, self.rng.normal(0, self.noise_multiplier))
    else:
        raise NotImplementedError("Unknown noise type")
    loss = self.loss(prediction, target)
    return (loss, prediction)
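# Sketch of the deterministic-noise trick used above, assuming numpy and
# pyhashxx: seed an RNG from the hash of an instance's bytes, so the same
# instance always receives the same "random" noise across calls. hashxx
# returns a 32-bit unsigned int, which is a valid numpy seed.
import numpy as np
from pyhashxx import hashxx

rng = np.random.RandomState()
instance = np.array([0.1, 0.2, 0.3])

rng.seed(hashxx(instance.tobytes()))
noise_a = rng.uniform(-1, 1, instance.size)

rng.seed(hashxx(instance.tobytes()))      # re-seeding reproduces the same noise
noise_b = rng.uniform(-1, 1, instance.size)
assert np.array_equal(noise_a, noise_b)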
def post(self):
    try:
        parser = reqparse.RequestParser()
        parser.add_argument('filter_params', type=str)
        args = parser.parse_args()

        filter_params = json.loads(args['filter_params'])
        rules = filter_params['rules']
        if 'prediction_threshold' in filter_params.keys():
            metaseek_power = filter_params['prediction_threshold']
            print "getting metaseek power"
        else:
            metaseek_power = 0.9
        print metaseek_power

        cache_key = str(hashxx(json.dumps(filter_params)))
        from_cache = client.get(cache_key)
        if from_cache:
            print "cached"

        db.session.add(Filter(args['filter_params']))
        db.session.commit()

        if from_cache is None:
            summary = summarizeDatasets(Dataset.query, rules, sampleRate=0.05,
                                        metaseek_power=metaseek_power)
            client.set(cache_key, summary)
            return summary
        else:
            return from_cache
    except Exception as e:
        return {'error': str(e)}
def request_seen(self, request):
    print "filter:", request.url
    uid = self.r.get("%s:%s" % (self.URL2ID, hashxx(request.url)))
    if not uid or int(uid) > 0:
        pass
    else:
        log.msg("FILTER SEEN:%s" % request.url, level=log.CRITICAL)
        return True
def makeHashes(self, inp):
    self.inp = inp
    partial = []
    self.spooky = hash64(inp) % size
    partial.append(self.spooky)
    self.hashxx = hashxx(inp) % size
    partial.append(self.hashxx)
    self.mmh = abs(mmh3.hash(inp)) % size
    partial.append(self.mmh)
    return partial
def get_identifier(entity_name, data):
    """
    Get identifier from BrAPI object or generate one from hashed string json representation
    """
    entity_id = entity_name + 'DbId'
    data_id = data.get(entity_id)
    if not data_id:
        simplified_object = remove_falsey(
            data, predicate=lambda x: x and not isinstance(x, set))
        json_rep = json.dumps(simplified_object, sort_keys=True)
        data_id = str(hashxx(json_rep.encode()))
    data[entity_id] = str(data_id)
    return data_id
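# Sketch of the "stable id from canonical JSON" idea used above, assuming only
# the standard library and pyhashxx: sort_keys=True produces a canonical
# string, so logically equal dicts hash to the same identifier.
import json
from pyhashxx import hashxx

a = {"name": "wheat", "year": 2020}
b = {"year": 2020, "name": "wheat"}          # same content, different key order
id_a = str(hashxx(json.dumps(a, sort_keys=True).encode()))
id_b = str(hashxx(json.dumps(b, sort_keys=True).encode()))
assert id_a == id_b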
def make_key(key, key_prefix, version):
    """
    Makes a memcached-safe cache key using pyhashxx. Use as a KEY_FUNCTION
    """
    clean_key = bad_key_chars.sub('', key)
    full_key = '%s:%s:%s' % (key_prefix, version, clean_key)
    if clean_key != key or len(full_key) > MAX_LENGTH:
        hashed_key = str(hashxx(key))
        abbrev_keylen = MAX_LENGTH - len(hashed_key) - 4 - len(key_prefix) - len(str(version))
        new_key = '%s[%s]' % (clean_key[:abbrev_keylen], hashed_key)
        full_key = '%s:%s:%s' % (key_prefix, version, new_key)
    return full_key
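# Hedged wiring sketch: Django's cache framework accepts a dotted path to a
# key function via the KEY_FUNCTION setting, which is how make_key above is
# meant to be plugged in. The backend choice and the module path
# "myapp.cache.make_key" are illustrative assumptions, not from the original.
CACHES = {
    'default': {
        'BACKEND': 'django.core.cache.backends.memcached.PyMemcacheCache',
        'LOCATION': '127.0.0.1:11211',
        'KEY_FUNCTION': 'myapp.cache.make_key',
    }
}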
def buildWordIndex(self, item):
    '''
    Get current url id
    For each word in current url's text, add the url to the set of urls
    which contain that word
    '''
    url_id = self.r.get("%s:%s" % (self.URL2ID, hashxx(item['url'])))
    word_id = ""
    for word in item['words']:
        if self.r.sadd(self.WORD_SET, word):
            word_id = str(self.r.incr(self.WORD_CTR, 1))
            self.r.set("%s:%s" % (self.WORD2ID, word), word_id)
        else:
            word_id = self.r.get("%s:%s" % (self.WORD2ID, word))
        self.r.sadd("%s:%s" % (self.WORD_IN, word_id), url_id)
def steps_cost(self, state_to_find):
    # Compute the hash_index for state_to_find, look that many bytes into the
    # file/self.content and retrieve a single hex character. This hex character
    # is the number of steps required to solve the corresponding state.
    hash_raw = hashxx(state_to_find.encode('utf-8'))
    hash_index = int(hash_raw % self.bucketcount)
    result = int(chr(self.content[hash_index]), 16)

    # This should never be zero
    if not result:
        #log.warning("%s: state_to_find %s, hash_raw %s. hash_index %s, result is %s" % (self, state_to_find, hash_raw, hash_index, result))
        raise SolveError(
            "%s: state_to_find %s, hash_raw %s. hash_index %s, result is %s" %
            (self, state_to_find, hash_raw, hash_index, result))

    return result
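# Standalone sketch of the bucketed lookup scheme above, assuming only
# pyhashxx: a table of bucketcount hex digits is indexed by
# hashxx(state) % bucketcount, and the digit at that position is the cost of
# the state. The toy table, states, and cost are illustrative.
from pyhashxx import hashxx

bucketcount = 16
table = ['f'] * bucketcount                 # 'f' stands for "no entry / max cost"

def store(state, cost):
    # hex(cost)[2] keeps only the single hex digit, as in the converter below
    table[hashxx(state.encode('utf-8')) % bucketcount] = hex(cost)[2]

def lookup(state):
    return int(table[hashxx(state.encode('utf-8')) % bucketcount], 16)

store("UUDDLLRR", 7)
print(lookup("UUDDLLRR"))   # 7 (collisions can overwrite entries in this toy version)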
def setUp(self):
    self.date = str(datetime.datetime.now())
    self.uuid = str(hashxx('wut.jpg')) + str(self.date)
    self.formatted_image = {
        'data': {
            "id": self.uuid,
            "title": "Monkey Socks",
            "type": "image/jpeg",
            "creation_date": self.date,
            "size": 52191,
            "url": "http://dumpon.us/imgs/" + self.uuid + '.jpg'
        },
        'status': 200
    }
    self.image_keys = ['id', 'title', 'type', 'creation_date', 'size', 'url']
    self.json_image = json.dumps(self.formatted_image)
    self.app = dumponus.app.test_client()
    self.image_model = Photo('wut.jpg', 'Monkey Socks', 'image/jpeg', 52191,
                             "http://dumpon.us/imgs/" + self.uuid + '.jpg',
                             creation_date=self.date)
    self.db = dumponus.db
    # keep track of ids (they are changing) and delete test data on tearDown
    self.ids = []
    self.images = []
    self.url = '/api/images'
def string_hash(string):
    return pyhashxx.hashxx(string.encode('utf8'))
def client_fuzzer(fd, lfd, args=None, **kwargs):
    """
    Client AFL fuzzer. Executed by AFL, fed to STDIN.
    Communicates with the fuzzer server, reads response, changes SHM.

    :param fd:
    :param lfd:
    :param args:
    :return:
    """
    global stdin_compat
    in_afl = os.getenv('PYTHON_AFL_PERSISTENT', None)

    llog(fd, 'init1')
    sys.settrace(None)
    llog(fd, 'init2, in afl: %s' % in_afl)

    # Argument processing
    tpler = Templater(args)
    llog(fd, 'templater: %s' % tpler)

    # by default, start with 4byte input - fuzz instruction with empty data
    tpler.gen_inputs()

    # Call our fuzzer
    try:
        # s = csock()  # Pre-fork connection. Needs more sophisticated reconnect if socket is broken.
        while afl.loop(3):
            sys.settrace(None)
            stdin_compat.seek(0)
            buffer = stdin_compat.read()
            buffer = tpler.transform(buffer)
            if buffer is None:
                continue

            llog(fd, 'init4, buffer: %s' % binascii.hexlify(bytes(buffer)))

            s = SockComm(server=False)
            s.connect()
            s.send(bytes([0]) + bytes(buffer))
            resp = s.read()
            llog(fd, 'Recv: %s' % binascii.hexlify(resp))

            if resp[0] != 0:
                llog(fd, 'Invalid response code: %s' % resp[0])
                continue

            sw1 = resp[1]
            sw2 = resp[2]
            timing = resp[3:5]
            data = resp[5:]
            statuscode = (sw1 << 8) + sw2
            llog(fd, 'status: %04x timing: %s' % (statuscode, timing))

            if in_afl:
                afl.trace_offset(hashxx(bytes([sw1, sw2])))
                afl.trace_offset(hashxx(timing))
                afl.trace_offset(hashxx(bytes(data)))

    except Exception as e:
        llog(fd, 'Exc: %s\n' % e)
        traceback.print_exc(file=fd)
        fd.flush()
    except KeyboardInterrupt:
        return
    finally:
        fd.close()
        os._exit(0)
def prefix_fuzzing(fd, lfd, args=None, **kwargs):
    """
    Original forking fuzzer with AFL without TCP binding. Only for demo purposes.
    Did not work well with libpcsc and forking.

    :param fd:
    :param args:
    :return:
    """
    global stdin_compat
    in_afl = os.getenv('PYTHON_AFL_PERSISTENT', None)

    # reader = get_reader()
    # card = connect_card(reader)
    llog(fd, 'init1')
    fwd = FileWriter(fd=lfd)
    sys.settrace(None)
    llog(fd, 'init2, in afl: %s' % in_afl)

    # Call our fuzzer
    try:
        while afl.loop(3):
            # afl.init()
            sys.settrace(None)
            stdin_compat.seek(0)
            buffer = stdin_compat.read()
            buffer = form_buffer(buffer)
            if buffer is None:
                continue

            llog(fd, 'init4, buffer: %s' % binascii.hexlify(bytes(buffer)))

            ln = int(buffer[4]) if len(buffer) >= 5 else 0
            test_elem = FuzzerObject(int(buffer[0]), int(buffer[1]), int(buffer[2]),
                                     int(buffer[3]), ln, list(bytearray(buffer[5:])))

            if args.dry:
                elem = test_elem
                sw1 = 0
                sw2 = 0
                out = bytes()
            else:
                card_interactor = CardInteractor(CARD_READER_ID)
                llog(fd, 'reader: %s' % (card_interactor, ))
                elem = card_interactor.send_element(test_elem)
                sw1 = elem.out['sw1']
                sw2 = elem.out['sw2']
                out = elem.out['data']

            statuscode = (sw1 << 8) + sw2
            time_bin = int(test_elem.misc['timing'] // 10)
            if time_bin < 0:
                time_bin = 0

            serialized_element = elem.serialize()
            fwd.print_to_file("%s" % json.dumps(serialized_element))
            llog(fd, 'status: %04x timing: %s' % (statuscode, time_bin))

            if in_afl:
                afl.trace_offset(hashxx(bytes([sw1, sw2])))
                afl.trace_offset(hashxx(bytes(time_bin.to_bytes(2, 'big'))))
                afl.trace_offset(hashxx(out))

        os._exit(0)
    except Exception as e:
        llog(fd, 'Exc: %s\n' % e)
        traceback.print_exc(file=fd)
        fd.flush()
    finally:
        fd.close()
        os._exit(0)
def get_hash(string):
    return hashxx(string.encode('utf-8'))
def test_seeds(self):
    self.assertNotEqual(hashxx(b'hello', seed=0), hashxx(b'hello', seed=1))
    self.assertEqual(hashxx(b'hello', seed=0), self.hash_value(b'hello', seed=0))
    self.assertEqual(hashxx(b'hello', seed=1), self.hash_value(b'hello', seed=1))
    self.assertEqual(hashxx(b'hello', seed=2), self.hash_value(b'hello', seed=2))
def generate_hash(name, group_name, posted_by, total_parts):
    """Generates a mostly-unique temporary hash for a part."""
    return pyhashxx.hashxx(name.encode('utf-8'),
                           posted_by.encode('utf-8'),
                           group_name.encode('utf-8'),
                           total_parts.encode('utf-8'))
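# Usage sketch for generate_hash above. As the snippet shows, pyhashxx.hashxx
# accepts several bytes arguments in one call and combines them into a single
# digest, which is what makes this composite-key hashing work. The sample
# values below are illustrative.
import pyhashxx

h = generate_hash('Some.Release.Part01', 'alt.binaries.test',
                  'poster@example.com', '42')
print(h)   # 32-bit unsigned int, stable for the same four inputs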
def create_id(name, lang, typ, visibility):
    time_now = dt.now().strftime('%H:%M:%S')
    return str(hashxx("{}{}{}{}{}".format(name[0], lang[0], typ[0],
                                          visibility[0], time_now)))[2:6]
from pprint import pprint
import cPickle
import glob
import sys
from itertools import *
import os.path
import pyhashxx
from UserDict import UserDict
import random
import operator
from gurobipy import *

nulhash = int(pyhashxx.hashxx('\0' * 4096) / 10)
freq = {}
tables = []
fi = 0
frqc = 0

if os.path.isfile("dumps/dumps.cache"):
    print "Loading hashlist... (cached)"
    (frqc, tables) = cPickle.load(open("dumps/dumps.cache", "rb"))
else:
    print "Loading hashes and purging sole occurences"
    for file in sorted(glob.glob("dumps/*.dump")):
        tp = cPickle.load(open(file, "rb"))
        table = UserDict(tp)
        table.filename = os.path.basename(file).replace(".dump", "")
        table.idx = fi
        fi += 1
        table.memory = (768, 1024)[random.randint(0, 1)]
def convert_to_cost_only(filename, bucketcount, filename_statetargets):
    state_targets = set()

    with open(filename_statetargets, 'r') as fh:
        for line in fh:
            line = line.replace("'", "").replace(",", "").strip()
            state_targets.add(line)

    filename_new = filename.replace('.txt', '.hash-cost-only.txt')
    prev_state_int = None
    first_permutation_rank = None
    bucket = bytearray(bucketcount)
    collisions = 0

    with open(filename, 'r') as fh:
        for (line_number, line) in enumerate(fh):
            (state, steps) = line.strip().split(':')
            steps = steps.split()
            hash_raw = hashxx(state.encode('utf-8'))
            hash_index = int(hash_raw % bucketcount)

            # Write the steps_len
            if state in state_targets:
                #log.info("found state_target %s" % state)
                steps_len = 0
            else:
                if steps[0].isdigit():
                    steps_len = int(steps[0])
                else:
                    steps_len = len(steps)

            #log.info("state: %s, hash_index %s, steps_len %s" % (state, hash_index, steps_len))

            if not bucket[hash_index]:
                bucket[hash_index] = steps_len
            else:
                collisions += 1
                if bucket[hash_index] > steps_len:
                    bucket[hash_index] = steps_len

            if line_number % 1000000 == 0:
                log.info(line_number)

            #if line_number >= 1000:
            #    break

    log.info("%d collisions" % collisions)
    log.info("begin writing %s" % filename_new)

    with open(filename_new, 'w') as fh_new:
        to_write = []

        for (index, x) in enumerate(bucket):
            if x > 15:
                to_write.append('f')
            else:
                # Convert steps_len to hex and ignore the 0x part of the string
                to_write.append(hex(x)[2])

            if index % 100000 == 0:
                fh_new.write(''.join(to_write))
                to_write = []

        if to_write:
            fh_new.write(''.join(to_write))

        fh_new.write('\n')

    log.info("end writing %s" % filename_new)
'''
Assign id to current url
Each link's url is assigned an ID and vice versa

This stage will only be reached if the 'if' condition in nofilter.py fails and
the function returns true. The only way the 'if' condition fails is if the
url_id of this item's url exists and is negative (=> it has been processed
before).

Thus, either the url has been assigned an id or it hasn't. If it has, negate
its current id. If it hasn't, get a new id from URL_CTR, negate it and assign
it to this url. Finally, change URL2ID and ID2URL correspondingly in either
case.

Ultimately,
+ve id => assigned id but not processed
-ve id => assigned id and processed
no id  => not assigned id and not processed
'''
hashed_url = hashxx(item['url'])
url_id = self.r.get("%s:%s" % (self.URL2ID, hashed_url))
if not url_id:
    url_id = -1 * self.r.incr(self.URL_CTR, 1)
else:
    self.r.delete("%s:%s" % (self.ID2URL, url_id))
    url_id = -1 * int(url_id)

self.r.sadd(self.URL_SET, url_id)
self.r.set("%s:%d" % (self.ID2URL, url_id), item['url'])
self.r.set("%s:%s" % (self.URL2ID, hashed_url), url_id)

for link in item['link_set']:
    hashed_link = hashxx(link)
    if not self.r.get("%s:%s" % (self.URL2ID, hashed_link)):
def generate_seed(key, value):
    combination = key + value
    value = hashxx(combination.encode())
    return value
def test_empty_string(self):
    self.assertEqual(hashxx(b''), self.hash_value(b''))
def generate_hash(subject, posted_by, group_name, total_segments):
    """Generates a mostly-unique temporary hash for a part."""
    return pyhashxx.hashxx(subject.encode('utf-8'),
                           posted_by.encode('utf-8'),
                           group_name.encode('utf-8'),
                           struct.pack('I', total_segments))
def make_hash_name(name):
    # makes the hashed name to be used as the id
    i = str(hashxx(name + str(datetime.datetime.now()))) + '.'
    return i
def _gen_hash_fn(self, seed, num_hash_fns):
    return lambda x: hashxx(str(x), seed=seed) % self.bits
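# Standalone sketch of the seeded-hash-family idea behind _gen_hash_fn,
# assuming only pyhashxx: each seed yields an independent hash function over
# the same bit space, which is the usual Bloom-filter construction. The bit
# count, seeds, and key are illustrative.
from pyhashxx import hashxx

BITS = 1024
hash_fns = [lambda x, s=seed: hashxx(x.encode('utf-8'), seed=s) % BITS
            for seed in range(4)]

positions = [fn("example-key") for fn in hash_fns]
print(positions)   # four (usually distinct) bit positions for the same key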
def my_hashxx(x, seed=0):
    x = str(x)
    seed = int(seed)
    return hashxx(x, seed=seed)
from pyhashxx import hashxx
import leveldb

db = leveldb.LevelDB('./db')
new_db = leveldb.LevelDB('./new_db')

count = 0
for record in db.RangeIter():
    count += 1
    if count % 1000 == 0:
        print(count)
    new_db.Put(str(hashxx(record[0])), record[1])

# count = 0
# for line in iter(log):
#     json_line = json.loads(line)
#     if 'terms' in json_line['val']:
#         count += 1
#         if count % 1000 == 0:
#             print(count)
#         db.Put(str(json_line['key']), json.dumps(json_line['val']))

# count = 0
# for doc in db.RangeIter():
#     count += 1
#     if count % 1000 == 0:
def _gen_hash_fn(self, seed, slices):
    return lambda x: hashxx(str(x), seed=seed) % slices
def test_string(self):
    self.assertEqual(hashxx(b'hello'), self.hash_value(b'hello'))
def ft_hashxx(text):
    global m
    hash_value = hashxx(text.encode())
    return (hash_value % m)
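# Minimal usage sketch for ft_hashxx, assuming the module-level bucket count m
# that the function reads via `global m`: map arbitrary tokens into m buckets,
# as in the hashing trick for feature vectorization. The value of m and the
# tokens are illustrative.
m = 2 ** 18

for token in ["the", "quick", "brown", "fox"]:
    print(token, ft_hashxx(token))   # bucket index in [0, m)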
def hxx(x):
    return post_hash_fn(hashxx(x))