Example 1
 def _hash_bits(self, key):
     # http://spyced.blogspot.com/2009/01/all-you-ever-wanted-to-know-about.html
     hash1 = mmh3.hash(key, 0)
     hash2 = mmh3.hash(key, hash1)
     for i in range(self._hash_funcs):
         yield abs((hash1 + i * hash2) % self._bits_per_slice)
def data(path, label_path=None):
    fd = open(path)
    fd.readline() # skip headers
    hash_cols = [3,4,34,35,61,64,65,91,94,95]
    npairs = len(hash_cols)
    x = [0] * (146 + npairs*(npairs-1)/2)
    if label_path:
        label = open(label_path)
        label.readline() # skip headers
    for t, line in enumerate(fd):
        # parse x
        row = line.rstrip().split(',')
        for m, feat in enumerate(row):
            if m == 0:
                ID = int(feat)
            else:
                # one-hot encode everything with hash trick
                # categorical: one-hotted
                # boolean: ONE-HOTTED
                # numerical: ONE-HOTTED!
                # note: the built-in hash(), although fast, is not stable,
                #       i.e., same value won't always have the same hash
                #       on different machines
                x[m] = abs(mmh3.hash(str(m) + '_' + feat)) % D
        for i in xrange(10):
            for j in xrange(i+1,10):
                m += 1
                x[m] = abs(mmh3.hash(str(m)+'_'+row[hash_cols[i]]+"_x_"+row[hash_cols[j]])) % D
        # parse y, if provided
        if label_path:
            # use float() to prevent future type casting, [1:] to ignore id
            y = [float(y) for y in label.readline().split(',')[1:]]
        yield (ID, x, y) if label_path else (ID, x)
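For reference, the core of the hashing trick used above can be isolated into a small helper; the helper name and the feature-space size D below are illustrative, not part of the original script:

import mmh3

D = 2 ** 20  # hypothetical size of the hashed feature space

def hash_row(row):
    # one-hot encode every column with the hashing trick: the column
    # position is prepended so equal values in different columns map
    # to different feature indices
    return [abs(mmh3.hash(str(m) + '_' + feat)) % D for m, feat in enumerate(row)]

# hash_row(['a', 'b', '42']) -> three indices in [0, D)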
Example 3
File: m_api.py Project: J4LP/mumble
 def authenticate(self, name, password, certificates, certhash, certstrong, current=None):
     with self.app.app_context():
         if name == 'SuperUser':
             return RET_FALLTHROUGH
         user = User.query.filter_by(user_id=name).first()
         if not user:
             try:
                 uuid.UUID(name, version=4)
             except ValueError:
                 return RET_DENIED
             guest_user = GuestUser.query.get(name)
             if guest_user:
                 if not guest_user.password == password or guest_user.banned:
                     return RET_DENIED
                 if guest_user.corporation:
                     self.app.logger.debug('Authenticating guest with: {} {} {}'.format(abs(mmh3.hash(guest_user.id.hex)), '[{}][GUEST] {}'.format(self.get_ticker(guest_user.corporation), guest_user.name), [u'Guest']))
                     return abs(mmh3.hash(guest_user.id.hex)), '[{}][GUEST] {}'.format(self.get_ticker(guest_user.corporation), guest_user.name), [u'Guest']
                 else:
                     self.app.logger.debug('Authenticating guest with: {} {} {}'.format(abs(mmh3.hash(guest_user.id.hex)), '[GUEST] {}'.format(guest_user.name), [u'Guest']))
                     return abs(mmh3.hash(guest_user.id.hex)), '[GUEST] {}'.format(guest_user.name), ['Guest']
             else:
                 return RET_DENIED
         if not user.mumble_password == password:
             return RET_DENIED
         self.app.logger.debug('Authenticating user with: {} {} {}'.format(mmh3.hash(user.user_id), '[{}] {}'.format(self.get_ticker(user.corporation_name), user.main_character), user.groups))
         return mmh3.hash(user.user_id), '[{}] {}'.format(self.get_ticker(user.corporation_name), user.main_character), user.groups
Example 4
    def hash(self, string):
        hash_arr = []
        hash1 = mmh3.hash(string, 0)
        hash2 = mmh3.hash(string, hash1)
        for i in range(self.k):
            hash_arr.append(abs((hash1 + i * hash2) % self.m))

        return hash_arr
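The two MurmurHash3 calls above implement the Kirsch-Mitzenmacher double-hashing scheme (index_i = h1 + i*h2), which derives k Bloom-filter positions from just two hash computations. A minimal filter built on the same idea might look like the sketch below; the class name, the fixed seeds, and the parameters m and k are illustrative:

import mmh3
from bitarray import bitarray

class TinyBloom(object):
    def __init__(self, m, k):
        self.m, self.k = m, k
        self.bits = bitarray(m)
        self.bits.setall(False)

    def _indices(self, item):
        # two base hashes with fixed seeds, combined k times
        h1 = mmh3.hash(item, 0)
        h2 = mmh3.hash(item, 1)
        return [(h1 + i * h2) % self.m for i in range(self.k)]

    def add(self, item):
        for idx in self._indices(item):
            self.bits[idx] = True

    def __contains__(self, item):
        return all(self.bits[idx] for idx in self._indices(item))

# bf = TinyBloom(m=10000, k=7); bf.add("rajiv"); "rajiv" in bf  -> True (probably)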
Example 5
 def Hashmap_WordVector(self,nbits):
     length=len(self.Words_Vector)
     self.bl_bits=nbits
     self.bloom_vector=self.bl_bits*bitarray('0')
     for i in range(length):
         self.hashmap1.append(mmh3.hash(self.Words_Vector[i]) % self.bl_bits )
         self.hashmap2.append(mmh3.hash(self.Words_Vector[i],self.hashmap1[i]) % self.bl_bits )
         self.hashmap3.append(mmh3.hash(self.Words_Vector[i],self.hashmap2[i]) % self.bl_bits )
         self.bloom_vector[self.hashmap1[i]]=1
         self.bloom_vector[self.hashmap2[i]]=1
         self.bloom_vector[self.hashmap3[i]]=1
Example 6
def get_hash(label,namespace,feature,stride,mask):
    if namespace:
        namespace_hash = mmh3.hash(namespace,0)
    else:
        namespace_hash = 0
    if is_number(feature):
        feature_hash = int(feature) + namespace_hash
    else:
        feature_hash = mmh3.hash(feature,namespace_hash)
    feature_hash_oaa = feature_hash * stride
    return (feature_hash_oaa + label - 1) & mask
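A hedged usage sketch for get_hash: the is_number helper is assumed to be a plain integer check (it is not shown in the snippet), and the stride/mask values are hypothetical Vowpal-Wabbit-style parameters (a weight table of 2**b slots, one-against-all over several labels):

import mmh3

def is_number(feature):
    # assumed helper: True if the feature token is an integer literal
    try:
        int(feature)
        return True
    except ValueError:
        return False

b = 18
mask = (1 << b) - 1   # hashed indices wrap into a 2**18-slot weight table
stride = 3            # e.g. three one-against-all labels

# namespace left out to keep the sketch minimal; with a namespace string,
# its hash seeds the feature hash exactly as in the function above
slot = get_hash(label=2, namespace=None, feature='age', stride=stride, mask=mask)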
Example 7
 def parse_block(block):
     index_block = []
     for file_path in block:
         file_path_hash = mmh3.hash(file_path)
         with open(file_path, 'r') as input_file:
             for line in input_file:
                 items = line.strip().split(' ')
                 index_block.append(
                     (mmh3.hash(items[0]),
                      [file_path_hash,
                      items[1]])
                 )
     return index_block
Example 8
def select_terms_meta(query_terms, term_dict_stream):
    """
    reads term dictionary generator and selects query terms meta info
    """
    terms_meta_dict = {}

    for term in query_terms:
        term_hash = mmh3.hash(term.encode("utf-8"))
        terms_meta_dict[term_hash] = {
            "term": term,
            "seek_offset": None,
            "size": None
        }

    seek_offset = 0
    unseen_terms = set(terms_meta_dict)
    for dict_term_hash, dict_term_size in term_dict_stream:
        if dict_term_hash in unseen_terms:
            terms_meta_dict[dict_term_hash]["seek_offset"] = seek_offset
            terms_meta_dict[dict_term_hash]["size"] = dict_term_size

            unseen_terms.remove(dict_term_hash)
            if len(unseen_terms) == 0:
                break

        seek_offset += dict_term_size

    query_terms_dict = {}
    for _, term_meta in terms_meta_dict.items():
        query_terms_dict[term_meta["term"]] = {
            "seek_offset": term_meta["seek_offset"],
            "size": term_meta["size"]
        }

    return query_terms_dict
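The term_dict_stream argument is assumed to yield (term_hash, size) pairs in the order the posting lists are laid out on disk, so that summing the sizes of the preceding entries gives each term's seek offset. A tiny illustrative call (all values made up):

import mmh3

term_dict_stream = iter([
    (mmh3.hash(u"foo".encode("utf-8")), 120),
    (mmh3.hash(u"bar".encode("utf-8")), 64),
])

meta = select_terms_meta([u"bar"], term_dict_stream)
# meta == {u"bar": {"seek_offset": 120, "size": 64}}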
Example 9
    def _indices(self, x):
        ''' A helper generator that yields the indices in x

            The purpose of this generator is to make the following
            code a bit cleaner when doing feature interaction.
        '''

        # first yield index of the bias term
        yield 0, 1.

        # then yield the linear indices
        if self.interaction != 2:
            for i,val in x:
                yield i,val

        # now yield interactions (if applicable)
        if self.interaction:
            D = self.D
            L = len(x)

            x = sorted(x)
            for i in xrange(L):
                for j in xrange(i+1, L):
                    # one-hot encode interactions with hash trick
                    yield abs(hash(str(x[i][0]) + '_' + str(x[j][0]))) % D, x[i][1]*x[j][1]
Example 10
 def process(self):
     # load data
     data = self.load()
     # index to elastic search
     print "\nStart processing"
     cursor = Cursor(self.es, self.data_from)
     cursor_num = cursor.get_new_cursor()
     for each_data in data:
         key_string = ''
         for each_key_string in key_value:
             key_string += each_data[each_key_string]
         hashkey = mmh3.hash(key_string)
         print "parsing id: ", hashkey
         # try to read record
         try:
             res = self.es.get(   index="deltadb", 
                             doc_type="data", 
                             id=hashkey)
             if res["found"]:
                 node = self.update_node(res["_source"], each_data, cursor_num)
             else:
                 node = self.create_node(each_data, cursor_num)
         except:
             node = self.create_node(each_data, cursor_num)
         # insert back to es
         try:
             res = self.es.index(index="deltadb", 
                                 doc_type="data", 
                                 id=hashkey, 
                                 body=node)
         except:
             continue
     print "\nProcess finish."
Example 11
def getHash(word):
	'''
		Return the hash value ANDed with 0xffffffff to force an unsigned 32-bit result
	'''
	curHash = mmh3.hash(word)
	curHash = curHash & 0xffffffff
	return curHash
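Newer mmh3 releases can return the unsigned 32-bit value directly through the signed keyword, which gives the same result as the mask above; a minimal equivalent (function name is illustrative):

import mmh3

def getHashUnsigned(word):
    # same value as mmh3.hash(word) & 0xffffffff
    return mmh3.hash(word, signed=False)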
Example 12
def shingles2sketch(shingles, m_baskets=20):
    baskets = defaultdict(lambda: -float("inf"))
    for shingle in shingles:
        h = mmh3.hash(shingle.encode('utf8'))
        if baskets[h % m_baskets] < h:
            baskets[h % m_baskets] = h
    return sorted(baskets.values())
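One way to compare two sketches produced this way is the fraction of hash values they share, as a rough stand-in for the Jaccard similarity of the underlying shingle sets; a small helper (name illustrative):

def sketch_similarity(sketch_a, sketch_b):
    # fraction of hash values common to both sketches
    a, b = set(sketch_a), set(sketch_b)
    return len(a & b) / float(len(a | b))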
Example 13
 def save_cursor(self, cursor_data):
     cursor_id = mmh3.hash(self.data_from)
     res = self.es.index(index="lookup", 
                         doc_type="data", 
                         id=cursor_id, 
                         body=cursor_data)
     return
Example 14
def saveSuccessCrawlDoc(crawldoc):
    '''step1: save crawl success crawldoc to crawl_result, make sure docid is unique
       step2: save outlinks(found new url) to crawl_pending
       step3: update crawl url status which at crawl_pending to crawled'''
    values = crawldoc.convert
    # only string can save to db, change dict or list to string.
    # reference: cccrawler.proto.db.models.CrawlResult
    values['reservation_dict'] = str(crawldoc.reservation_dict)
    values['history'] = str(values['history'])
    values['header'] = str(values['header'])
    values['created_at'] = timeutils.utcnow_ts()
    utils.convert_datetimes(values, 'created_at', 'deleted_at', 'updated_at')
    crawldoc_ref = models.CrawlResult()
    crawldoc_ref.update(values)
    crawldoc_ref.save()

    _updateCrawlStatus(crawldoc.pending_id,'crawled',crawlfail=False)

    cl = deweight.get_client()
    fresh_docs = []
    for doc in crawldoc.outlinks:
        real_url = urlutils.normalize(doc.url)
        docid = mmh3.hash(real_url)
        if not cl.has(docid):
            fresh_doc = addPendingCrawlDocDict(doc.url, int(crawldoc.level),
                        crawldoc.docid, crawldoc.reservation_dict,doc.text,
                        real_url, docid)
            print '@'*60
            print fresh_doc
            print '@'*60
            fresh_docs.append(fresh_doc)
    rushPendingCrawlDoc(fresh_docs)
Example 15
 def lookup(self, string):
     for seed in xrange(self.hash_count):
         result = mmh3.hash(string, seed) % self.size
         if self.bit_array[result] == 0:
             return "Nope"
     return "Probably"
Example 16
    def lookup(self, string):

        hashlist = [mmh3.hash(string, seed=x) % 1000000 for x in xrange(Bloom.numberofhash)]
        for x in hashlist:
            if not Bloom.bit[x]:
                return False
        return True
Example 17
    def add(self, string):

        # Hash the string
        hashlist = [mmh3.hash(string, seed=x) % 1000000 for x in xrange(Bloom.numberofhash)]

        for x in hashlist:
            Bloom.bit[x] = 1
Example 18
    def _bit_offsets(self, value):
        '''The bit offsets to set/check in this Bloom filter for a given value.

        Instantiate a Bloom filter:

            >>> dilberts = BloomFilter(
            ...     num_values=100,
            ...     false_positives=0.01,
            ...     key='dilberts',
            ... )

        Now let's look at a few examples:

            >>> tuple(dilberts._bit_offsets('rajiv'))
            (183, 319, 787, 585, 8, 471, 711)
            >>> tuple(dilberts._bit_offsets('raj'))
            (482, 875, 725, 667, 109, 714, 595)
            >>> tuple(dilberts._bit_offsets('dan'))
            (687, 925, 954, 707, 615, 914, 620)

        Thus, if we want to insert the value 'rajiv' into our Bloom filter,
        then we must set bits 183, 319, 787, 585, 8, 471, and 711 all to 1.  If
        any/all of them are already 1, no problems.

        Similarly, if we want to check to see if the value 'rajiv' is in our
        Bloom filter, then we must check to see if the bits 183, 319, 787, 585,
        8, 471, and 711 are all set to 1.  If even one of those bits is set to
        0, then the value 'rajiv' must never have been inserted into our Bloom
        filter.  But if all of those bits are set to 1, then the value 'rajiv'
        was *probably* inserted into our Bloom filter.
        '''
        encoded_value = self._encode(value)
        for seed in range(self.num_hashes()):
            yield mmh3.hash(encoded_value, seed=seed) % self.size()
Example 19
def makeHashFuncs(key, size, numHashes):
    hashValue = []
    for i in range(1, (numHashes+1)):
        value = mmh3.hash(key,i) % size
        #print value
        hashValue.append(value)
    return hashValue
Example 20
def get_image_cache_name(url):
    last_segment = url.split('/')[-1]
    if last_segment.count('.') == 1:
        extension = '.' + url.split('.')[-1]
    else:
        extension = ""
    return 'img' + str(mmh3.hash(url.encode('utf-8'))) + extension.lower()
Example 21
	def in_bf(self, elem):
		for x in xrange(self.hash_count):
			index = mmh3.hash(elem, x) % self.size
			if (self.bit_arr[index] == 0):
				return False
		return True
			
Example 22
    def add_document_indexes(self, text, url, is_print=False):
        # TODO: Maybe, it is good idea to change key from string to hash
        self.documents.append(url)
        doc_id = len(self.documents)-1
        word_list = self._split_text(text.lower())

        for word in word_list:
            #"""
            try:
                word = word.encode('utf-8')

                w_hash = mmh3.hash(word) % self.count_of_files
                if is_print:
                    print word, w_hash
                r_index = self.full_index[w_hash]

                if r_index.has_key(word):
                    r_index[word]["docs"].append(doc_id)
                else:
                    r_index[word] = {}
                    r_index[word]["docs"] = [doc_id]

                if not r_index.has_key('encoding'):
                    r_index['encoding'] = self._encoding

            except Exception as e:
                print "EXCEPTION", word
                traceback.print_exc()
Example 23
def select_hash(hashkind, line):
    """Select the kind of hashing for the line.

    :param hashkind: -- (str) The name of the hash
    :param line: -- (str) The string to hash.

    This function is a kind of hash selector which will use the hash passed
    in argument to hash the string also passed in argument.

    """
    if hashkind == "md5":
        hashline = hashlib.md5(line).hexdigest()

    elif hashkind == "sha1":
        hashline = hashlib.sha1(line).hexdigest()

    elif hashkind == "crc":
        crc32 = crcmod.Crc(0x104c11db7, initCrc=0, xorOut=0xFFFFFFFF)
        crc32.update(line)
        hashline = crc32.hexdigest()

    elif hashkind == "murmur":
        hashline = mmh3.hash(line)

    return str(hashline)
 def count(self, item):
     counts = []
     for k, v in zip(self.sketch, range(self.hashes)):
         search_key = mmh3.hash(item, v) % self.size
         counts.append(k[search_key])
     return min(counts)
 def last_seen(self, item):
     timestamps = []
     for k, v in zip(self.sketch, range(self.hashes)):
         search_key = mmh3.hash(item, v) % self.size
         timestamps.append(k[search_key])
     return max(timestamps)
Example 26
    def contingentParitiesFunction(pop, verbose=False):
        assert(pop.shape[1] == order * height)
        popMissteps = []
        traceAndFitness = []
        for c in xrange(pop.shape[0]):
            output = 0
            ctr = 0
            length = pop.shape[1]
            loci = np.arange(length)
            missteps = []
            trace = ""
            while ctr < height:
                rng.seed(abs(mmh3.hash(trace)))
                acc = 0
                trace += "|"
                for i in xrange(order):
                    idx = rng.randint(length - (ctr * order + i)) + 1
                    swap = loci[-idx]
                    loci[-idx] = loci[ctr * order + i]
                    loci[ctr * order + i] = swap
                    trace += "%2d:%s|" % (swap + 1, int(pop[c, swap]))
                    acc += pop[c, swap]
                output += acc % 2

                if acc % 2 == 0:
                    missteps.append(ctr + 1)

                ctr +=1
            popMissteps.append(missteps)
            traceAndFitness.append((trace, height - len(missteps)))
        if verbose:
            for t in sorted(traceAndFitness):
                print "%s   %s " % t
        return np.array([height - len(missteps) for missteps in popMissteps]), popMissteps
Example 27
def alert_factory(location=None,
                bssid=None,
                channel=None,
                essid=None,
                tx=None,
                intent=None):

    # all arguments are required
    assert not any([
                location is None,
                bssid is None,
                channel is None,
                essid is None,
                tx is None,
                intent is None,
            ])

    # return dict from arguments
    _id = str(mmh3.hash(''.join([ bssid, str(channel), intent])))

    return {
        
        'id' : _id,
        'location' : location,
        'bssid' : bssid,
        'channel' : channel,
        'tx' : tx,
        'essid' : essid,
        'intent' : intent,
        'timestamp' : time.time(),
    }
Example 28
def get_scatter_prop(element_list):
  """ Gets the scatter property for an entity's key path.

  This will return a property for only a small percentage of entities.

  Args:
    element_list: A list of entity_pb.Path_Element objects.
  Returns:
    An entity_pb.Property object or None.
  """
  def id_from_element(element):
    if element.has_name():
      return element.name()
    elif element.has_id():
      return str(element.id())
    else:
      return ''

  to_hash = ''.join([id_from_element(element) for element in element_list])
  full_hash = mmh3.hash(to_hash)
  hash_bytes = struct.pack('i', full_hash)[0:2]
  hash_int = struct.unpack('H', hash_bytes)[0]
  if hash_int >= dbconstants.SCATTER_PROPORTION:
    return None

  scatter_property = entity_pb.Property()
  scatter_property.set_name('__scatter__')
  scatter_property.set_meaning(entity_pb.Property.BYTESTRING)
  scatter_property.set_multiple(False)
  property_value = scatter_property.mutable_value()
  property_value.set_stringvalue(hash_bytes)

  return scatter_property
Example 29
	def readHash(self):
		hll = Hll(self.p)
		x = sys.stdin.readline().rstrip('\n')
		while x:
			hll.AddItem(mmh3.hash(x))
			x = sys.stdin.readline().rstrip('\n')
		print hll.Count()
Example 30
 def lookup(self, string):
     for seed in range(self.hash_count):
         result = mmh3.hash(string, seed) % self.size
         if self.bit_array[result] == 0:
             #return "Nope"
             return False
     return True
 def lookup(self, element):
     for seed in self.seeds:
         result = mmh3.hash(element, seed) % self.size
         if self.hash_values[result] == 0:
             return False
     return True
Example 32
 def exists(self, item):
     for i in range(self.hash_count):
         hashed_index = mmh3.hash(item, i) % self.size
         if self.bit_array[hashed_index] == 0:
             return False
     return True
Example 33
from ll import LL
import math


class HyperLogLog(LL):
    def __len__(self):
        indicator = sum(2**-m.counter for m in self.registers)
        E = self.alpha * (self.num_registers**2) / float(indicator)

        if E <= 5.0 / 2.0 * self.num_registers:
            V = sum(1 for m in self.registers if m.counter == 0)
            if V != 0:
                Estar = self.num_registers * \
                    math.log(self.num_registers / (1.0 * V), 2)
            else:
                Estar = E
        else:
            if E <= 2**32 / 30.0:
                Estar = E
            else:
                Estar = -2**32 * math.log(1 - E / 2**32, 2)
        return Estar


if __name__ == "__main__":
    import mmh3
    hll = HyperLogLog(8)
    for i in xrange(100000):
        hll.add(mmh3.hash(str(i)))
    print len(hll)
Example 34
 def add(self, s):
     for seed in range(self.hash_num):
         result = mmh3.hash(s, seed) % self.size
         self.bit_array[result] = 1
Example 35
    def get_machoc_hash(self):
        # Get Machoc Hash adapted from https://github.com/conix-security/machoke
        binary = self.r2p
        binary.cmd("aaa")
        mmh3_line = ""
        machoke_line = ""

        funcs = json.loads(binary.cmd("aflj"))
        if funcs is None:
            print("r2 could not retrieve functions list")

        def get_machoke_from_function(r2p, function):
            """Return machoke from specific
            :rtype: object
            """
            r2p.cmd("s {}".format(function["offset"]))
            agj_error = 0
            while True:
                try:
                    fcode = json.loads(r2p.cmd("agj"))
                    break
                except:
                    print >> sys.stderr, "Fail agj: %s" % hex(
                        function["offset"])
                if agj_error == 5:
                    break
                agj_error += 1
            blocks = []
            id_block = 1
            try:
                for block in fcode[0]["blocks"]:
                    blocks.append({
                        "id_block": id_block,
                        "offset": hex(block["offset"])
                    })
                    id_block += 1
            except:
                return ""
            line = ""
            id_block = 1
            for block in fcode[0]["blocks"]:
                word = "{}:".format(id_block)
                for instruction in block["ops"]:
                    # Check if call
                    if instruction["type"] == "call":
                        word = "{}c,".format(word)
                        for ublock in blocks:
                            if hex(instruction["offset"] +
                                   2) == ublock["offset"]:
                                word = "{}{},".format(word, ublock["id_block"])

                    # Check if jmp
                    if instruction["type"] == "jmp":
                        for ublock in blocks:
                            if instruction["esil"] == ublock["offset"]:
                                word = "{}{},".format(word, ublock["id_block"])

                    # Check if conditional jmp
                    elif instruction["type"] == "cjmp":
                        for ublock in blocks:
                            if hex(instruction["jump"]) == ublock["offset"]:
                                word = "{}{},".format(word, ublock["id_block"])
                            if hex(instruction["offset"] +
                                   2) == ublock["offset"]:
                                word = "{}{},".format(word, ublock["id_block"])
                    else:
                        pass
                if word[-2] == "c":
                    for ublock in blocks:
                        if hex(instruction["offset"] + 4) == ublock["offset"]:
                            word = "{}{},".format(word, ublock["id_block"])

                    if word[-2] == "c":
                        word = "{}{},".format(word, id_block + 1)

                if word[-1] == ":" and id_block != len(fcode[0]["blocks"]):
                    word = "{}{},".format(word, id_block + 1)
                # Clean word
                if word[-1] == ",":
                    word = "{};".format(word[:-1])
                elif word[-1] == ":":
                    word = "{};".format(word)
                line = "{}{}".format(line, word)
                id_block += 1
            return line

        for function in funcs:
            machoke = get_machoke_from_function(binary, function)
            machoke_line = "{}{}".format(machoke_line, machoke)
            mmh3_line = "{}{}".format(
                mmh3_line,
                hex(mmh3.hash(machoke) & 0xffffffff).replace("0x", "").replace(
                    "L", ""),
            )
        binary.quit()

        return mmh3_line
Example 36
  def test_hash_values(self):
    """ Test that on randomized data, values computed from mmh3 and pymmh3 match. """

    for i in range(10):
      random_value = str(random.random())
      self.assertEqual(mmh3.hash(random_value), pymmh3.hash(random_value))
Example 37
def normalized_hash(identifier: str, activation_group: str) -> int:
    return mmh3.hash("{}:{}".format(identifier, activation_group)) % 100 + 1
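The 1-100 bucket returned above is typically compared against a rollout percentage; a hypothetical feature gate built on the function:

def is_enabled(identifier: str, activation_group: str, rollout_percent: int) -> bool:
    # identifiers whose bucket falls in [1, rollout_percent] get the feature
    return normalized_hash(identifier, activation_group) <= rollout_percent

# is_enabled("user-42", "new-checkout", 20) turns the feature on for roughly 20% of identifiers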
Example 38
 def lookup(self, string):
     for seed in range(self.hash_count):
         result = mmh3.hash(string, seed) % self.size
         if self.bit_array[result] == 0:
             return "Nope"
     return "Probably"
Example 39
 def add(self, string):
     for seed in range(self.hash_count):
         result = mmh3.hash(string, seed) % self.size
         self.bit_array[result] = 1
def lookup(string, bit_array, hash_count, size):
    for seed in range(hash_count):
        result = mmh3.hash(string, seed) % size
        if bit_array[result] == 0:
            return False
    return True
	def add(self, item):
		digests = []
		for i in range (self.hash_counts):
			digest = mmh3.hash(item, i) % self.bit_array_use
			self.bit_array_size[digest] = True
Example 42
def get_feature(feat_str, model):
    # The feature string may be unicode, but MurmurHash3 expects ASCII encoded strings.
    return mmh3.hash(feat_str.encode('ascii', 'xmlcharrefreplace')) % model.num_features
Example 43
def schingling(doc):
    return [mmh3.hash(doc[i:i + 9], signed=False) for i in range(len(doc) - 8)]
Example 44
def normalized_hash(identifier, activation_group):
    return mmh3.hash("{}:{}".format(activation_group, identifier),
                     signed=False) % 100 + 1
                "mimeFile": mimeFile,
                "normHtmlFile": normHtmlFile,
                "plainTextFile": plainTextFile
            }

    # If enabled, remove boilerplate HTML
    if options.boilerpipe:
        logging.info(url + ": deboiling html")
        extractor = ExtrB(extractor='ArticleExtractor', html=text)
        deboiled = str(extractor.getHTML())
    else:
        deboiled = text

    # We compute a hash on the HTML (either normalized one or after boilerpipe if enabled):
    # if we get duplicate files we discard them
    html_hash = mmh3.hash(deboiled, signed=False)
    # checking for duplicate content (duplicates are discarded)
    if html_hash in seen_html:
        logging.info("Repeated file:\t" + url)
        continue

    # get text with Alcazar library
    if options.parser == "alcazar":
        logging.info(url + ": Getting text with Alcazar")
        btext = alcazar.bodytext.parse_article(deboiled)
        if btext.body_text:
            plaintext = btext.body_text
        else:
            plaintext = ""

    # or get text with beautifulsoup
Example 46
def hash32(data: bytes) -> bytes:
    return struct.pack('i', mmh3.hash(data))
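The packed bytes can be turned back into the signed 32-bit integer with the matching unpack format code:

import struct
import mmh3

packed = hash32(b"example")           # 4 bytes in native byte order
value, = struct.unpack('i', packed)   # value == mmh3.hash(b"example")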
Example 47
 def _hashes(self, key):
     for i in xrange(self.k):
         yield mmh3.hash(key, i)
Example 48
 def add(self, item):
     for i in range(self.hash_count):
         hashed_index = mmh3.hash(item, i) % self.size
         self.bit_array[hashed_index] = 1
Example 49
 def _hash(self, element):
     return [
         b % self.size
         for b in [mmh3.hash(element, i) for i in range(self.hash_count)]
     ]
Example 50
 def _hashes_opt(self, key):
     # Kirsch-Mitzenmacher optimization
     h0 = mmh3.hash(key, 1)
     h1 = mmh3.hash(key, 10)
     for i in xrange(self.k):
         yield h0 + i * h1
Example 51
def murmur3_32(text):
    val = mmh3.hash(text)
    return val if val >= 0 else val + 2**32
def runSim(args):

    ## avoid one processes starting multiple threads
    os.environ["MKL_NUM_THREADS"] = "1"
    os.environ["NUMEXPR_NUM_THREADS"] = "1"
    os.environ["OMP_NUM_THREADS"] = "1"

    dataset = args[0]
    if dataset[-1] != '/':
        dataset += '/'
    ground_truth, theta = args[1]

    ## load dna reads into reads_lst
    reads_lst = []
    fastaFile = dataset + "/reads.fasta"
    with open(fastaFile) as handle:
        for values in SimpleFastaParser(handle):
            reads_lst.append(values[1])
    n = len(reads_lst)

    ## load precomputed Jaccard Similarities
    JSims = np.loadtxt(dataset + "/minHashes/JSims.txt")

    ## load alignments
    gt_file = "{}/{}_ground_truth.txt".format(dataset, ground_truth)

    with open(gt_file) as f:
        lines = [[float(x) for x in line.rstrip('\n').split('\t')]
                 for line in f]

    refDict = {}
    for i in range(n):
        refDict[i] = {}
    for line in lines:
        refDict[int(line[0]) - 1][int(line[1]) -
                                  1] = line[2] / (line[3] + line[4] - line[2])
        refDict[int(line[1]) - 1][int(line[0]) -
                                  1] = line[2] / (line[3] + line[4] - line[2])

    ## convert each read into it's k-mers
    symLength = 7  # k in k-mer

    def generateSymSets(reads_lst, symLength):
        symSets = {}
        for i, read in enumerate(reads_lst):
            lst = set()
            for j in range(len(read) - symLength):
                lst.add(read[j:j + symLength])
            symSets[i] = lst
        return symSets

    symSets = generateSymSets(reads_lst, symLength)

    ## load precomputed minHashes
    minHashArr = np.zeros((n, 1000))
    for i in range(n):
        minHashArr[i] = np.load(dataset +
                                "minHashes/minHashes_{}.txt".format(i),
                                allow_pickle=True)

    ## load precomputed iid sequences and their minhashes
    numRandReads = 5
    randMinHashArr = np.zeros((numRandReads, 1000))
    for i in range(numRandReads):
        randMinHashArr[i] = np.load(dataset +
                                    "randReads/randMinHashes_{}.txt".format(i),
                                    allow_pickle=True)

    minHashArrExtended = np.vstack((minHashArr[:, :1000], randMinHashArr))

    ## checking to see precomputed minHashes works
    i = np.random.randint(0, n, size=100)
    j = np.random.randint(0, 1000, size=100)
    lst = []
    for iter_round in range(100):

        iterLst = list(symSets[i[iter_round]])

        lst.append(
            min([
                mmh3.hash(sym, j[iter_round], signed=False) for sym in iterLst
            ]))

    assert (np.alltrue(minHashArrExtended[i, j] == lst))

    ## checking to see precomputed JSim works
    i = np.random.randint(0, n, size=100)
    j = np.random.randint(0, n, size=100)
    lst = []
    for iter_round in range(100):
        i1 = i[iter_round]
        j1 = j[iter_round]

        lst.append(JSims[i1, j1] == 1.0 *
                   len(symSets[i1].intersection(symSets[j1])) /
                   (len(symSets[i1].union(symSets[j1]))))

    assert (np.alltrue(lst) and np.allclose(JSims, JSims.T))

    ## Testing SVD, JSimEmp, JSim Exact, reference vs all
    storageArrGround = []
    storageArrpHatSVD = []
    storageArrJsimExact = []
    storageArrJsimEmp = []
    storageArrNumOnesCol = []
    storageArrQjs = []
    storageArrwSJS = []

    h = 1000

    for ref_read in trange(n):

        groundTruthLocs = np.array(list(refDict[ref_read].keys()))
        if len(groundTruthLocs) == 0:  ## read has no alignments in dataset
            continue
        refReadMatches = refDict[ref_read]
        groundTruthVals = [refReadMatches[i] for i in groundTruthLocs]

        lst = set(list(groundTruthLocs))
        rangeN = set(range(n))
        rangeN.discard(ref_read)

        toAppend = list(rangeN - lst)
        groundTruthLocs = np.hstack((groundTruthLocs, np.array(toAppend)))
        groundTruthVals += [0] * len(toAppend)

        empiricalMatrix = (minHashArrExtended == minHashArrExtended[ref_read])
        empiricalMatrix = np.delete(empiricalMatrix, ref_read, axis=0)[:, :h]

        updatedGroundTruthLocs = groundTruthLocs - 1 * (groundTruthLocs >=
                                                        ref_read)
        updatedGroundTruthLocs = updatedGroundTruthLocs.astype(int)

        jSimEmpirical = np.mean(empiricalMatrix, axis=1)
        jSimExact = np.delete(JSims[ref_read], ref_read)

        ## here we can modify what normalization is used without having to rerun SVDs
        # u = np.loadtxt(dataset+"/SVD/raw_pi_refread_{}.txt".format(ref_read))
        # pHatSVD = 1-np.abs(u[:n-1])/np.abs(np.median(u[:n-1])) ## normalize median of p_i
        # pHatSVD = 1-np.abs(u[:n-1])/np.abs(np.median(u[n-1:])) ## random read normalization
        # pHatSVD = 1-np.abs(u[:n-1])/np.max(np.abs(u[n-1:])) ## naive max normalziation

        pHatSVD = np.loadtxt(dataset +
                             "/SVD/pi_refread_{}.txt".format(ref_read))
        qSVD = np.loadtxt(dataset + "/SVD/qj_refread_{}.txt".format(ref_read))

        ## for approximation
        empQ = empiricalMatrix.sum(axis=0)
        x = np.matmul(empiricalMatrix - np.ones(empiricalMatrix.shape),
                      1 - np.array(empQ / np.max(empQ)))[:n - 1]
        x = np.abs(x - np.min(x))
        x /= np.max(x)
        storageArrwSJS.extend(x[updatedGroundTruthLocs])

        storageArrGround.extend(groundTruthVals)
        storageArrpHatSVD.extend(pHatSVD[updatedGroundTruthLocs])
        storageArrJsimEmp.extend(jSimEmpirical[:n - 1][updatedGroundTruthLocs])
        storageArrJsimExact.extend(jSimExact[updatedGroundTruthLocs])
        storageArrNumOnesCol.extend(np.mean(empiricalMatrix, axis=0))
        storageArrQjs.extend(qSVD)

    fpr, tpr, _ = roc_curve(
        np.array(storageArrGround) >= theta, storageArrpHatSVD)
    fpr_jsim, tpr_jsim, _ = roc_curve(
        np.array(storageArrGround) >= theta, storageArrJsimExact)
    fpr_js_emp, tpr_js_emp, _ = roc_curve(
        np.array(storageArrGround) >= theta, storageArrJsimEmp)
    fpr_wsjs, tpr_wsjs, _ = roc_curve(
        np.array(storageArrGround) >= theta, storageArrwSJS)

    pickle.dump([
        auc(fpr, tpr),
        auc(fpr_jsim, tpr_jsim),
        auc(fpr_js_emp, tpr_js_emp),
        auc(fpr_wsjs, tpr_wsjs), storageArrNumOnesCol,
        np.corrcoef(storageArrpHatSVD, storageArrGround)[0, 1],
        np.corrcoef(storageArrJsimExact, storageArrGround)[0, 1],
        np.corrcoef(storageArrJsimEmp, storageArrGround)[0, 1], storageArrQjs,
        'SJS AUC,JS AUC, JS emp AUC, wSJS AUC,numOnes per col,SJS r^2,JS r^2,JS emp r^2,storageArrQjs'
    ],
                open(
                    "AUCs/{}_{}_{}.pkl".format(dataset[:-1], ground_truth,
                                               str(theta % 1).split('.')[1]),
                    "wb"))
Example 53
 def insert(self, item):
   for i in range(self.qty_hash):
     t = mmh3.hash(bytes(item), i) % self.size
     self.bitarray[t] = True
Example 54
 def faviconHash(self, data, web_source=None):
     if web_source:
         b64data = base64.encodebytes(data).decode()
     else:
         b64data = base64.encodebytes(data)
     return mmh3.hash(b64data)
Example 55
def hash(flowkey):
    global width
    flowkey_bytes = struct.pack("L", flowkey)
    r = mmh3.hash(flowkey_bytes, signed=False)
    return r % width
 def add(self, element):
     for seed in self.seeds:
         result = mmh3.hash(element, seed) % self.size
         self.hash_values[result] = 1
     return self.hash_values
Example 57
def string_digest(item, index):
    return mmh3.hash(bytes(item, 'utf-8'), index)
	def is_member(self, item):
		for i in range (self.hash_counts):
			digest = mmh3.hash(item, i) % self.bit_array_use
			if self.bit_array_size[digest] == False:
				return False
		return True
Example 59
def murmur(key):
    return mmh3.hash(key)
Example 60
    #np.dot(m.transpose(),m)
    #Jaccard(m[0],m[10])

    #s=signature(m,10000)
    #s.shape

    #m_new = firma(m)

    m_new2 = m.dot(rndVecs) # projected matrix
    # Indexing text collection
    for doc_id in range(m_new2.shape[0]):
        docSgt = np.array(m_new2[doc_id, :] >= 0, dtype=int)
        for blk in range(NRBLK):
            # (blk*BLKSZ):((blk+1)*BLKSZ)
            blkData = docSgt[(blk*BLKSZ):((blk+1)*BLKSZ)]
            docHashVal = mmh3.hash(''.join(map(str, blkData))) % MAXBKTS
            hshTbl_blk = HshTabls[blk]
            if docHashVal not in hshTbl_blk:
                hshTbl_blk[docHashVal] = set()
            hshTbl_blk[docHashVal].add(doc_id + 1)
    collision = np.zeros((m.shape[0], m.shape[0]), dtype=int)
    for hshTbl_blk in HshTabls:
        for e in hshTbl_blk:
            for i in hshTbl_blk[e]:
                for o in hshTbl_blk[e]:
                    collision[i - 1][o - 1] += 1
    pldHaming=penalizedHcc(m_new2)
    simcos=(np.pi / 2) * (1 - hmmg(m_new2))
    simcospenalized =(np.pi / 2) * (1 - pldHaming)
    print("End!")