def test_32bit_basic_string( self ):
    """Check the default-seed 32-bit hash of every line of pg1260.txt.

    The expected values come from whatever mapping
    ``self._load_solutions('solution_hash32_seed0.txt')`` returns; it is
    indexed by the raw line (bytes, including the line terminator) read
    from the text file.
    """
    expected_by_line = self._load_solutions('solution_hash32_seed0.txt')
    corpus_path = os.path.join( file_dir, 'pg1260.txt' )
    with open( corpus_path, 'rb' ) as corpus:
        # Iterating a binary file yields the same lines as readlines().
        for line in corpus:
            expected = expected_by_line[line]
            actual = pymmh3.hash( line )
            message = 'different hash for line: "%s"\n0x%08X != 0x%08X' % ( line, expected, actual )
            self.assertEqual( expected, actual, message )
def test_32bit_custom_seed_bytearray( self ):
    """Check 32-bit hashes of bytearray input with seed 0x1234ABCD.

    Every line of pg1260.txt (read as bytes) is wrapped in a ``bytearray``
    and hashed with the custom seed; the result must equal the entry that
    ``self._load_solutions('solution_hash32_seed1234ABCD.txt')`` holds for
    that line.
    """
    expected_by_line = self._load_solutions('solution_hash32_seed1234ABCD.txt')
    corpus_path = os.path.join( file_dir, 'pg1260.txt' )
    with open( corpus_path, 'rb' ) as corpus:
        for line in corpus:
            expected = expected_by_line[line]
            actual = pymmh3.hash( bytearray( line ), seed = 0x1234ABCD )
            message = 'different hash for line: "%s"\n0x%08X != 0x%08X' % ( line, expected, actual )
            self.assertEqual( expected, actual, message )
def test_32bit_custom_seed_bytearray( self ):
    """Check 32-bit hashes of bytearray input with seed 0x1234ABCD.

    Expected values are loaded with
    ``self._load_solutions('solution_hash32_seed1234ABCD.txt', 10)``.
    NOTE(review): the second argument (10) is presumably the numeric base
    of the stored values -- confirm against ``_load_solutions``.
    """
    expected_by_line = self._load_solutions('solution_hash32_seed1234ABCD.txt', 10)
    corpus_path = os.path.join( file_dir, 'pg1260.txt' )
    with open( corpus_path, 'rb' ) as corpus:
        for line in corpus:
            expected = expected_by_line[line]
            actual = pymmh3.hash( bytearray( line ), seed = 0x1234ABCD )
            message = 'different hash for line: "%s"\n0x%08X != 0x%08X' % ( line, expected, actual )
            self.assertEqual( expected, actual, message )
def add_server(self, key):
    """Register *key* as a server unless it is already in ``server_keys``.

    The server's position is ``pymmh3.hash(key) % self.size``.  That
    position is recorded in ``server_keys_dict`` and ``nodes``, appended to
    ``server_node_loc``, and ``server_node_loc`` is re-sorted afterwards.
    Nothing happens for a key that is already registered.
    """
    if key in self.server_keys:
        return

    slot = pymmh3.hash(key) % self.size
    self.server_keys_dict[key] = slot
    self.server_keys.append(key)
    self.nodes[slot] = key
    self.server_node_loc.append(slot)
    self.server_node_loc.sort()
def get_x(csv_row, D):
    """Hash the fields of one CSV row into a sparse feature dict.

    Each ``key=value`` pair of *csv_row* is hashed with the built-in
    ``hash`` and reduced modulo *D* to obtain a feature index.

    Args:
        csv_row: mapping of column name to value.  Names and values are
            joined with ``+``, so both must be strings.
        D: modulus applied to every hash (the size of the index space).

    Returns:
        dict mapping feature index to its accumulated value.  When the
        module-level flag ``signed`` is truthy each occurrence contributes
        +1 or -1 (decided by the parity of ``hash(str(index))``); otherwise
        each occurrence contributes +1.

    NOTE: on Python 3 the built-in ``hash`` of a string is salted per
    process unless PYTHONHASHSEED is fixed, so indices are only stable
    within a single run.
    """
    fullind = [hash(key + '=' + value) % D for key, value in csv_row.items()]

    x = {}
    for index in fullind:
        if signed:
            # Costs one extra hash per feature; disable `signed` for speed.
            delta = 1 if (hash(str(index)) % 2) == 1 else -1
        else:
            delta = 1
        # dict.get() replaces dict.has_key(), which was removed in Python 3.
        x[index] = x.get(index, 0) + delta
    return x  # x contains indices of features that have a value as number of occurrences
def test_32bit_basic_string( self ):
    """Check the default-seed 32-bit hash of every line of pg1260.txt.

    Expected values are loaded with
    ``self._load_solutions('solution_hash32_seed0.txt', 10)``.
    NOTE(review): the second argument (10) is presumably the numeric base
    of the stored values -- confirm against ``_load_solutions``.
    """
    expected_by_line = self._load_solutions('solution_hash32_seed0.txt', 10)
    corpus_path = os.path.join( file_dir, 'pg1260.txt' )
    with open( corpus_path, 'rb' ) as corpus:
        for line in corpus:
            expected = expected_by_line[line]
            actual = pymmh3.hash( line )
            message = 'different hash for line: "%s"\n0x%08X != 0x%08X' % ( line, expected, actual )
            self.assertEqual( expected, actual, message )
def lookup(self, string):
    """Return the smallest counter recorded for *string*.

    *string* is converted with ``str()`` and encoded to bytes, then hashed
    once per seed in ``range(1, self.hash_count)``.  Seed 0 is never used,
    so ``self.arrays[0]`` is not consulted.  ``min`` raises ``ValueError``
    when that range is empty.
    """
    encoded = str(string).encode()

    def counter_for(seed):
        # Column within row `seed` selected by the seeded hash.
        column = mmh3.hash(encoded, seed) % self.size
        return self.arrays[seed][column]

    return min([counter_for(seed) for seed in range(1, self.hash_count)])
def _get_hash_values(self, element: str, range_size: int) -> List[int]:
    """
    Compute one cell index per hash function for the given element.

    :param element: The element to hash
    :param range_size: Modulus applied to every hash value
    :return: A list of ``self.num_of_hash_functions`` indexes; entry ``i``
        is ``mmh3.hash(element, i) % range_size``
    """
    return [
        mmh3.hash(element, seed) % range_size
        for seed in range(self.num_of_hash_functions)
    ]
def get_request(url):
    """Fetch *url*, hash the base64-encoded body and index the URL by hash.

    The request goes through the module-level session ``s`` with TLS
    verification disabled and ``max_time`` as timeout.  On success the URL
    is appended to the list stored under the hash in the module-level
    mapping ``a``.

    Returns:
        ``(url, hash, None)`` on success, ``(url, None, exception)`` when
        anything in the fetch/encode/hash/index sequence raises.
    """
    try:
        response = s.get(url, verify=False, timeout=max_time)
        encoded_body = codecs.encode(response.content, "base64")
        body_hash = mmh3.hash(encoded_body)
        a.setdefault(body_hash, []).append(url)
    except Exception as error:
        return url, None, error
    return url, body_hash, None
def getfaviconhash(url):
    """Return the MurmurHash of the favicon served at *url*, or ``None``.

    The body is hashed only when the response's ``Content-Type`` header is
    exactly ``image/x-icon``; it is base64-encoded, passed through
    ``change_format`` and then through ``pymmh3.hash``.  Any exception
    (including a missing header) prints ``[!] Request Error`` and yields
    ``None``.  TLS verification is disabled for the request.
    """
    try:
        response = requests.get(url, verify=False)
        if response.headers['Content-Type'] != "image/x-icon":
            return None
        encoded_icon = base64.b64encode(response.content).decode('utf-8')
        return pymmh3.hash(change_format(encoded_icon))
    except Exception:
        print("[!] Request Error")
        return None
def getfaviconhash(url):
    """Return the MurmurHash of the favicon served at *url*, or ``None``.

    The request is made through the module-level session ``s`` with TLS
    verification disabled.  The body is hashed only when the
    ``Content-Type`` header contains ``image``; it is base64-encoded,
    passed through ``change_format`` and then through ``pymmh3.hash``.

    Returns:
        The hash, or ``None`` when the response is not an image or when
        any exception occurs (the exception text is printed).
    """
    try:
        resp = s.get(url, verify=False)
        try:
            if "image" not in resp.headers['Content-Type']:
                return None
            favicon = base64.b64encode(resp.content).decode('utf-8')
            return pymmh3.hash(change_format(favicon))
        finally:
            # Close on every path; previously an exception raised after the
            # request succeeded skipped close() and leaked the response.
            resp.close()
    except Exception as ex:
        print("[!] Request Error"+"\n"+str(ex))
        return None
def set_server_nodes(self):
    """Place ``self.virtual_nodes`` ring positions for every server key.

    For virtual node ``i`` of a key the candidate position is
    ``pymmh3.hash(key, i + probe) % self.size``.  ``probe`` starts at 0 and
    is incremented until the candidate is not already in ``self.nodes``.
    The chosen position is stored in ``nodes`` and ``server_node_loc``, and
    ``server_keys_dict[key]`` ends up holding the last position placed for
    that key.  ``server_node_loc`` is sorted once at the end.
    """
    for key in self.server_keys:
        for replica in range(self.virtual_nodes):
            probe = 0
            node_index = pymmh3.hash(key, replica + probe) % self.size
            # Re-seed until a free position is found.
            while node_index in self.nodes:
                probe += 1
                node_index = pymmh3.hash(key, replica + probe) % self.size
            self.nodes[node_index] = key
            self.server_node_loc.append(node_index)
            self.server_keys_dict[key] = node_index
    self.server_node_loc.sort()
def get_node(self, key):
    """Print which server position serves *key* and where it is replicated.

    The key's position is ``pymmh3.hash(key) % self.size``.  The first
    entry of ``self.server_node_loc`` that is >= that position serves the
    key, and the following entry (wrapping to the first) holds the replica.
    If no entry qualifies, the first entry serves the key.
    NOTE(review): this relies on ``server_node_loc`` being sorted
    ascending -- confirm every writer keeps it sorted.

    Returns:
        None; the result is only printed.

    Raises:
        IndexError: if ``server_node_loc`` is empty.
    """
    node_index = pymmh3.hash(key) % self.size
    ring = self.server_node_loc

    # do server_node walk, e.g. over [5, 14, 15, 23, 26, 38, 51, 53, 66, 81, 86, 90, 95, 98]
    for idx, node in enumerate(ring):
        if node_index <= node:
            print("1 Data sharded at node_index " + str(node_index) + " serviced by server at node_index: " + str(node) + "\n")
            if idx == len(ring) - 1:
                print("1.1 Data replicated at node_index " + str(ring[0]) + "\n")
            else:
                print("1.2 Data replicated at node_index " + str(ring[idx + 1]) + "\n")
            return

    # not in between, so must be on the first
    print("2 Data sharded at node_index " + str(node_index) + " serviced by server at node_index " + str(ring[0]) + "\n")
    # With a single server ring[1] does not exist; wrap around so the
    # replica falls back to the only server instead of raising IndexError.
    replica = ring[1 % len(ring)]
    print("2.1 Data replicated at server at node_index " + str(replica) + "\n")
def get_x(csv_row, D):
    """Hash one CSV row into a sparse feature dict with a bias term.

    Each ``key=value`` pair of *csv_row* is hashed with the built-in
    ``hash`` and reduced modulo *D*.  When the module-level flag
    ``interaction`` is truthy, the XOR of every unordered pair of those
    indices is added as an extra feature.

    Args:
        csv_row: mapping of column name to value.  Names and values are
            joined with ``+``, so both must be strings.
        D: modulus applied to every field hash.

    Returns:
        dict mapping feature index to its accumulated value.  Index 0 is
        the bias term and starts at 1.  When the module-level flag
        ``signed`` is truthy each occurrence contributes +1 or -1 (parity
        of ``hash(str(index))``); otherwise +1.

    NOTE: the XOR of two indices below *D* can be >= *D* when *D* is not a
    power of two, so interaction indices are not bounded by *D*.
    NOTE: on Python 3 the built-in ``hash`` of a string is salted per
    process unless PYTHONHASHSEED is fixed.
    """
    fullind = [hash(key + '=' + value) % D for key, value in csv_row.items()]

    if interaction:
        count = len(fullind)
        # Creating interactions using XOR, one per pair (i < j).
        fullind = fullind + [
            fullind[i] ^ fullind[j]
            for i in range(count)
            for j in range(i + 1, count)
        ]

    x = {0: 1}  # 0 is the index of the bias term
    for index in fullind:
        if signed:
            # Costs one extra hash per feature; disable `signed` for speed.
            delta = 1 if (hash(str(index)) % 2) == 1 else -1
        else:
            delta = 1
        # dict.get() replaces dict.has_key(), which was removed in Python 3.
        x[index] = x.get(index, 0) + delta
    return x  # x contains indices of features that have a value as number of occurrences
def Mapping1(clientvalue, cohort, num_hashes, bfsize, allunique):
    """Return *num_hashes* distinct indices for a client value.

    The hashed text is ``str(cohort) + clientvalue``.  Seeds 0, 1, 2, ...
    are tried in order and ``mmh3.hash(text, seed) % bfsize`` is kept
    whenever it has not been produced before, until *num_hashes* distinct
    indices are collected.

    Args:
        clientvalue: string appended to the cohort number.
        cohort: cohort identifier; converted with ``str()``.
        num_hashes: number of distinct indices to return.
        bfsize: modulus applied to each hash.
        allunique: accepted but not used by this function.

    Returns:
        list of distinct indices in the order they were found.

    Raises:
        ValueError: if *num_hashes* exceeds the number of distinct values
            a hash can take modulo *bfsize*; the search could never finish.
    """
    # Only |bfsize| distinct residues exist, so asking for more would spin
    # forever.  bfsize == 0 is left to fail in the modulo as it always did.
    if bfsize != 0 and num_hashes > abs(bfsize):
        raise ValueError(
            "num_hashes (%r) cannot exceed bfsize (%r)" % (num_hashes, bfsize)
        )

    inptomd = str(cohort) + clientvalue
    listofindx = []
    seed = 0
    while len(listofindx) < num_hashes:
        digest = mmh3.hash(inptomd, seed) % bfsize
        if digest not in listofindx:
            listofindx.append(digest)
        seed += 1
    return listofindx
def ShadonFaviconHash(): arg = request.form.get('faviconUrl', type=str, default=None) if arg: response = requests.get(arg) if response.headers['Content-Type'] == "image/x-icon": favicon = base64.b64encode(response.content).decode('utf-8') hash = pymmh3.hash(change_format(favicon)) result = ''' 其结果为:{} '''.format(hash) return render_template_string(result) else: return '''
def InsertStructuredInfo(listofSources):
    """Derive structured features for stored articles and persist them.

    For every source name in *listofSources* and every ``hashprefix`` in
    0..2, the ``NewsArticle`` rows in state ``"content"`` are read.  For
    each article this computes n-gram bags for content and title, a
    document vector, a centroid and keywords, creates a ``NewsArticleTM``
    row from them, and sets the source article's state to
    ``"structinfo"``.

    Any exception raised while handling one article is printed and that
    article is skipped; processing continues with the next one.
    """
    for listofObjects in listofSources:
        for num in range(3):
            q = NewsArticle.objects.filter(sourcename=listofObjects, hashprefix=num).limit(None)
            Articles = q.filter(state="content").limit(None)
            # Progress output: source, prefix and number of pending articles.
            print(listofObjects, num, len(Articles))
            for a in range(len(Articles)):
                article = Articles[a]
                try:
                    #QueryForArticle = QueryForEachArticle(article.articlelink,EmdWords)
                    datestring = article.updated
                    dt = datetime.strptime(datestring, '%Y-%m-%d %H:%M:%S')
                    # "month-year" without zero padding, e.g. "3-2017".
                    date = str(dt.month) + "-" + str(dt.year)
                    BOWnGrams = constructNGramsBOW(article.content)
                    BOWnGramsTitle = constructNGramsBOW(article.title)
                    #ner = NER(article.content)
                    # Named-entity extraction is disabled; an empty string is stored.
                    ner = ""
                    d2v = doc2vec(article.articlelink, article.content)
                    centroidOfDocument = CentroidOfDocument( BOWnGrams, EmdWords)
                    key_words = keywords(article.content)
                    #NewsArticleTM.create(sourcename=listofObjects,hashprefix=num,date=date,articlelink=article.articlelink,language="EN",key_words = key_words,bag_of_words=BOWnGrams,bag_of_words_title=BOWnGramsTitle,document_vector =d2v,centroid_of_document=centroidOfDocument)
                    #CentroidOfDocument = list(CentroidOfDocument)#.tolist()
                    #d2v = list(d2v)
                    # The new row's prefix is recomputed from the article link
                    # (hash modulo 3) rather than copied from `num`.
                    NewsArticleTM.create(
                        sourcename=listofObjects,
                        hashprefix=(pymmh3.hash(article.articlelink) % 3),
                        date=date,
                        articlelink=article.articlelink,
                        language="EN",
                        key_words=key_words,
                        named_entities=ner,
                        bag_of_words=BOWnGrams,
                        bag_of_words_title=BOWnGramsTitle,
                        document_vector=d2v,
                        centroid_of_document=centroidOfDocument)
                    # Mark the source article as processed.
                    NewsArticle.objects(
                        sourcename=article.sourcename,
                        hashprefix=article.hashprefix,
                        articlelink=article.articlelink,
                        language="EN").update(state="structinfo")
                except Exception as ex:
                    # Best effort: report and move on to the next article.
                    print("in exception", ex)
                    pass
def save_link(cls, title, url, body="", tags=None, clicks=0, unread=True):
    """Store a link and add its search document.

    The URL is normalised with ``norm``; the ``LinkModel`` key id is
    ``mmh3.hash`` of the normalised URL.  The domain is the URL's network
    location with a leading ``www.`` removed.
    NOTE(review): ``mmh3.hash`` is a 32-bit hash by default, so two URLs
    with the same hash would share one key -- confirm this is acceptable.

    Args:
        title: link title.
        url: link URL (normalised before use).
        body: optional body text.
        tags: list of tags; a fresh empty list is used when omitted.
        clicks: initial click count.
        unread: whether the link starts as unread.

    Returns:
        ``cls(doc, link)`` for the stored model and its document.
    """
    # A literal [] default would be one list shared by every call.
    if tags is None:
        tags = []

    url = norm(url)
    link_id = mmh3.hash(url)
    key = ndb.Key(LinkModel, link_id)

    domain = urlparse(url).netloc
    if len(domain)>4 and domain.startswith('www.'):
        domain = domain[4:]

    link = LinkModel(
        key = key,
        title = title,
        url = url,
        domain = domain,
        body = body,
        tags = tags,
        clicks = clicks,
        unread = unread
    )
    link.put()

    doc_id = str(link.id)
    doc = cls._buildDoc(doc_id, title, body, domain, tags)
    cls.add(doc)
    return cls(doc, link)
def save_link(cls, title, url, body="", tags=None, clicks=0, unread=True):
    """Store a link and add its search document.

    The URL is normalised with ``norm``; the ``LinkModel`` key id is
    ``mmh3.hash`` of the normalised URL.  The domain is the URL's network
    location with a leading ``www.`` removed.
    NOTE(review): ``mmh3.hash`` is a 32-bit hash by default, so two URLs
    with the same hash would share one key -- confirm this is acceptable.

    Args:
        title: link title.
        url: link URL (normalised before use).
        body: optional body text.
        tags: list of tags; a fresh empty list is used when omitted.
        clicks: initial click count.
        unread: whether the link starts as unread.

    Returns:
        ``cls(doc, link)`` for the stored model and its document.
    """
    # A literal [] default would be one list shared by every call.
    if tags is None:
        tags = []

    url = norm(url)
    link_id = mmh3.hash(url)
    key = ndb.Key(LinkModel, link_id)

    domain = urlparse(url).netloc
    if len(domain) > 4 and domain.startswith('www.'):
        domain = domain[4:]

    link = LinkModel(key=key, title=title, url=url, domain=domain,
                     body=body, tags=tags, clicks=clicks, unread=unread)
    link.put()

    doc_id = str(link.id)
    doc = cls._buildDoc(doc_id, title, body, domain, tags)
    cls.add(doc)
    return cls(doc, link)
def mapBloomFilter(clientvalue, icohort, nhashes, bfsize):
    """Build a 0/1 vector with *nhashes* positions set for a client value.

    The hashed text is ``str(icohort) + clientvalue``.  Seeds 0, 1, 2, ...
    are tried in order and ``mmh3.hash(text, seed) % bfsize`` is kept
    whenever it has not been produced before, until *nhashes* distinct
    positions are collected.

    Args:
        clientvalue: string appended to the cohort number.
        icohort: cohort identifier; converted with ``str()``.
        nhashes: number of distinct positions to set.
        bfsize: length of the returned vector and modulus for each hash.

    Returns:
        ``numpy`` array from ``np.zeros(bfsize)`` with the selected
        positions set to 1.

    Raises:
        ValueError: if *nhashes* exceeds the number of distinct values a
            hash can take modulo *bfsize*; the search could never finish.
    """
    # Only |bfsize| distinct residues exist, so asking for more would spin
    # forever.  bfsize == 0 is left to fail in the modulo as it always did.
    if bfsize != 0 and nhashes > abs(bfsize):
        raise ValueError(
            "nhashes (%r) cannot exceed bfsize (%r)" % (nhashes, bfsize)
        )

    inptomd = str(icohort) + clientvalue
    listofindx = []
    seed = 0
    while len(listofindx) < nhashes:
        digest = mmh3.hash(inptomd, seed) % bfsize
        if digest not in listofindx:
            listofindx.append(digest)
        seed += 1

    vec = np.zeros(bfsize)
    for ind in listofindx:
        vec[ind] = 1
    return vec
import base64
import re
import time

import pymmh3
import requests


def change_format(content):
    """Wrap *content* into 76-character lines, each ending with a newline.

    For non-empty base64 text this is the layout ``base64.encodebytes``
    produces.  The text is split purely by position.  An empty *content*
    yields a single ``"\\n"``.

    Args:
        content: base64 text without line breaks.

    Returns:
        The wrapped text, terminated by a newline.
    """
    # Slicing by position avoids the old `content[-0:]` trap: when the
    # length was an exact multiple of 76 the remainder slice was the WHOLE
    # string, so the content was appended a second time.
    lines = [content[start:start + 76] for start in range(0, len(content), 76)]
    return "{}\n".format("\n".join(lines))


if __name__ == "__main__":
    start_time = time.time()
    response = requests.get('https://www.baidu.com/favicon.ico')
    # Only hash the body when the server says it is an .ico image.
    if response.headers['Content-Type'] == "image/x-icon":
        favicon = base64.b64encode(response.content).decode('utf-8')
        favicon_hash = pymmh3.hash(change_format(favicon))
        print(favicon_hash)
    end_time = time.time()
    # The message below reads "Total time taken: {}".
    print("总用时为:{}".format((end_time - start_time)))
def add(self, string, val):
    """Add *val* to the counter selected for *string* in each hash row.

    *string* is converted with ``str()`` and encoded to bytes.  For every
    seed in ``range(1, self.hash_count)`` the column
    ``mmh3.hash(bytes, seed) % self.size`` of ``self.arrays[seed]`` is
    updated.  Seed 0 is never used.
    """
    encoded = str(string).encode()
    for seed in range(1, self.hash_count):
        column = mmh3.hash(encoded, seed) % self.size
        row = self.arrays[seed]
        row[column] = val + row[column]
def add(self, string):
    """Set the bits selected for *string* in ``self.bit_array``.

    *string* is converted with ``str()`` and encoded to bytes.  For every
    seed in ``range(1, self.hash_count)`` the position
    ``mmh3.hash(bytes, seed) % self.size`` is set to 1.  Seed 0 is never
    used.
    """
    encoded = str(string).encode()
    for seed in range(1, self.hash_count):
        position = mmh3.hash(encoded, seed) % self.size
        self.bit_array[position] = 1
def get_hash(self):
    """Compute the favicon hash and store it in ``self.hash``.

    The data returned by ``self.get_favicon()`` is passed through
    ``self.chang_format`` and then hashed with ``pymmh3.hash``.
    """
    formatted = self.chang_format(self.get_favicon())
    self.hash = pymmh3.hash(formatted)
import base64
import os
import sys

import pymmh3 as mmh3
import requests
import shodan

# Usage: script <base-url>
# Prints the Shodan `http.favicon.hash` filter for <base-url>/favicon.ico and,
# when the API_KEY environment variable is set, the number of Shodan matches.

if len(sys.argv) < 2:
    sys.exit(f'usage: {sys.argv[0]} <base-url>')

url = f'{sys.argv[1]}/favicon.ico'
try:
    content = requests.get(url).content
except Exception as e:
    # Python 3 exceptions have no `.message`; print the exception itself.
    print(e)
    # `os.exit` does not exist; sys.exit is the way to leave with a status.
    sys.exit(1)

# The hash is taken over the base64 text as wrapped by encodebytes.
h = mmh3.hash(base64.encodebytes(content))
print(f'http.favicon.hash:{h}')

api_key = os.getenv('API_KEY', '')
if api_key:
    api = shodan.Shodan(api_key)
    count = api.count(f'http.favicon.hash:{h}')
    print(f'count: {count.get("total", "not available")}')
def add(self, item):
    """Set the filter positions selected for *item*.

    For every seed in ``range(self.num_hashes)`` the position
    ``pymmh3.hash(item, seed) % self.size`` of ``self.filter`` is set to 1.
    """
    for seed in range(self.num_hashes):
        position = pymmh3.hash(item, seed) % self.size
        self.filter[position] = 1
def hash_func(self, key, node):
    """Return ``pymmh3.hash`` of *key* concatenated with ``str(node)``."""
    combined = key + str(node)
    return pymmh3.hash(combined)
def is_member(self, item):
    """Report whether every filter position selected for *item* is set.

    Positions are ``pymmh3.hash(item, seed) % self.size`` for each seed in
    ``range(self.num_hashes)``.  Returns ``False`` as soon as one of them
    holds 0, otherwise ``True``.
    """
    return not any(
        self.filter[pymmh3.hash(item, seed) % self.size] == 0
        for seed in range(self.num_hashes)
    )
def bloomHash(self, inStr, hashSeed):
    """Map a string to an integer index in ``{0, ..., m-1}``.

    The 32-bit ``pymmh3.hash(inStr, hashSeed)`` is scaled by ``2**31 - 1``
    into roughly [-1, 1] and then stretched linearly over ``self._blmM``.

    Args:
        inStr: string to hash.
        hashSeed: seed passed to ``pymmh3.hash``.

    Returns:
        int index, clamped to ``[0, self._blmM - 1]``.
    """
    # Use 32-bit hash function pymmh3 to map a string to an integer in {0, ..., m-1}
    maxHash = 2**31 - 1
    # float() keeps this a true division on Python 2 as well.
    h = pymmh3.hash(inStr, hashSeed) / float(maxHash)
    index = int(0.5 * self._blmM * (h + 1))
    # A hash of exactly 2**31 - 1 gives h == 1.0 and therefore index == m,
    # one past the last valid slot; clamp it back into range.
    if index >= self._blmM:
        index = int(self._blmM) - 1
    if index < 0:
        index = 0
    return index
# -*- coding: utf-8 -*-