def __init__(self, tokens, length=100000):

        """Calculates a Charikar simhash with appropriate bitlength.
        
        Input can be any iterable, but for strings it will automatically
        break it into words first, assuming you don't want to iterate
        over the individual characters. Returns nothing.
        
        """
        if isinstance(tokens, str):
            tokens = tokens.split()

        v = {}
        if isinstance(tokens, dict):
            for value, w in tokens.items():
                k = xxhash.xxh64(value).intdigest()
                x = v.get(k%length,0)
                if k & 1 << 63:
                    v[k%length] = x + w
                else:
                    v[k%length] = x - w
        else:
            for value in tokens:
                k = xxhash.xxh64(value).intdigest()
                x = v.get(k%length,0)
                if k & 1 << 63:
                    v[k%length] = x + 1
                else:
                    v[k%length] = x - 1
    
        self.hash = v
        self.vector = v
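
The constructor above accumulates a sparse, bucketed weight vector rather than a packed bit fingerprint. As a rough standalone sketch (the function and variable names below are made up, not part of the class above), the same bucketing can be reused to compare two token streams with a simple dot product:

import xxhash

def simhash_vector(tokens, length=100000):
    # bucket each token's 64-bit hash and add a +1/-1 weight based on the sign bit
    v = {}
    for token in tokens:
        k = xxhash.xxh64(token).intdigest()
        v[k % length] = v.get(k % length, 0) + (1 if k & (1 << 63) else -1)
    return v

def similarity(a, b):
    # dot product over the buckets present in both sparse vectors
    return sum(w * b[i] for i, w in a.items() if i in b)

print(similarity(simhash_vector("the quick brown fox".split()),
                 simhash_vector("the quick brown dog".split())))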
Example #2
 def __hash_from_argument(self, argument):
     arg_string = ""
     if hasattr(argument, 'md5hash'):
         return argument.md5hash
     if hasattr(argument, 'xxhash64'):
         return argument.xxhash64
     if type(argument) is numpy.ndarray:
         if argument.size > 181440000:
             return self.__hash_choice(argument.data)
         else:
             return xxhash.xxh64(argument.data).hexdigest()
     if type(argument) is pandas.core.frame.DataFrame:
         col_values_list = list(argument.columns.values)
         try:
             col_values_string = ''.join(col_values_list)
             arg_string = col_values_string
             if argument.values.size > 181440000:
                 return xxhash.xxh64(argument.values.data).hexdigest() + "+" + xxhash.xxh64(arg_string).hexdigest()
             else:
                 return self.__hash_choice(argument.values.data) + "+" + xxhash.xxh64(arg_string).hexdigest()
         except:
             if argument.values.size > 181440000:
                 return xxhash.xxh64(argument.values.data).hexdigest()
             else:
                 return self.__hash_choice(argument.values.data)
     if type(argument) is list or type(argument) is tuple:
         arg_string = str(len(argument))
     arg_string += str(argument)
     return self.__hash_choice(arg_string)
Example #3
    def _xxhash(self):
        """
        An xxhash.xxh64 hash of the array.

        Returns
        -------------
        xx: int, xxhash.xxh64 hash of array.
        """
        # repeat the bookkeeping to get a contiguous array inside
        # the function to avoid additional function calls
        # these functions are called millions of times so everything helps
        if self._modified_x or not hasattr(self, '_hashed_xx'):
            if self.flags['C_CONTIGUOUS']:
                hasher = xxhash.xxh64(self)
                self._hashed_xx = hasher.intdigest()
            else:
                # the case where we have sliced our nice
                # contiguous array into a non-contiguous block
                # for example (note slice *after* track operation):
                # t = util.tracked_array(np.random.random(10))[::-1]
                contiguous = np.ascontiguousarray(self)
                hasher = xxhash.xxh64(contiguous)
                self._hashed_xx = hasher.intdigest()
        self._modified_x = False
        return self._hashed_xx
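
A standalone illustration (synthetic array, not the tracked-array subclass above) of why the second branch copies first: a reversed slice is not C-contiguous and cannot be handed to the hasher directly, so np.ascontiguousarray makes a contiguous copy before hashing.

import numpy as np
import xxhash

a = np.arange(10, dtype=np.float64)
view = a[::-1]                        # reversed slice: not C-contiguous
print(view.flags['C_CONTIGUOUS'])     # False
print(xxhash.xxh64(np.ascontiguousarray(view)).intdigest())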
Example #4
def hashRequests(authTicket, payload):
    baseHash = xxhash.xxh64(
        authTicket.SerializeToString(),
        seed=0x1B845238
    ).intdigest()

    # Serialize and hash each request
    return [xxhash.xxh64(
        request.SerializeToString(),
        seed=baseHash
    ).intdigest() for request in payload]
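
The base hash seeds each per-request hash, so changing the auth ticket changes every request digest. A minimal sketch of the same chaining with made-up byte strings instead of protobuf messages:

import xxhash

auth_bytes = b"serialized-auth-ticket"
request_bytes = [b"request-1", b"request-2"]

base_hash = xxhash.xxh64(auth_bytes, seed=0x1B845238).intdigest()
print([xxhash.xxh64(r, seed=base_hash).intdigest() for r in request_bytes])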
Example #5
def string_hash(value,length=11):
    s = ''
    for i in range(0,length,11):
        s = s + xxhash.xxh64(value+str(i)).hexdigest()
    s = encode_hash(int(s,16))[:length]
    if len(s) < length:
        s = s + "A" * (length - len(s))
    return s
Example #6
def string_hash_bits(value,length_in_bits=128):
    ''' Length must be a multiple of 4'''
    hex_length = length_in_bits // 4
    s = ''
    for i in range(0,length_in_bits,64):
        s = s + xxhash.xxh64(value+str(i)).hexdigest()
    s = s[:hex_length]
    x = int(s,16)
    return x
Example #7
def xxhash64(path, block_size=4096):
    try:
        with open(path, 'rb') as rf:
            h = xxhash.xxh64()
            for chunk in iter(lambda: rf.read(block_size), b''):
                h.update(chunk)
        return h.hexdigest(), path
    except IOError:
        return None, path
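
A quick usage sketch for the helper above (it assumes the xxhash64 function just defined and an import xxhash in scope); it hashes a small temporary file and cleans up afterwards:

import os
import tempfile

with tempfile.NamedTemporaryFile(delete=False) as tmp:
    tmp.write(b"hello world")

digest, path = xxhash64(tmp.name)   # (hexdigest string, path)
print(digest, path)
os.unlink(path)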
Example #8
    def hash(self):
        """Return hash of motif.

        This is a unique identifier of a motif, regardless of the id.

        Returns:
        hash : str
        """
        return xxhash.xxh64(self._pwm_to_str(3)).hexdigest()
Example #9
def subscribe(request):
    """
    Subscribe the given email to the given URL.

    TODO BEN: Include Subscription title or description in POST variables
    """
    url = request.POST['subscription_url']
    email = request.POST['email']

    user, created_user = User.objects.get_or_create(email=email)
    if created_user:
        user_verification_hash = uuid.uuid4().hex
        user_verification = Verification.objects.create(
            verified=False,
            verification_hash=user_verification_hash)
        user.verification = user_verification
        user.save()

    content, created_content = SubscribedContent.objects.get_or_create(url=url)
    if created_content:
        content_response = requests.get(url)
        content_hash = xxhash.xxh64(content_response.text).hexdigest()
        content.latest_content_hash = content_hash

    if not Subscription.objects.filter(user=user, content=content).exists():
        verification_hash = uuid.uuid4().hex
        verification_url = request.build_absolute_uri(
            reverse(
                'verify',
                kwargs={
                    'email': email,
                    'key': verification_hash}
            )
        )
        verification_item = Verification.objects.create(
            verified=False,
            verification_hash=verification_hash
        )

        Subscription.objects.create(
            user=user,
            content=content,
            verification=verification_item
        )

        email_sent = EMAIL.VERIFY_SUBSCRIPTION.send(
            email, {'verification_url': verification_url})
        if email_sent:
            message = MESSAGES.EMAIL.VERIFICATION_SENT.format(email)
        else:
            message = MESSAGES.EMAIL.ERROR_SENDING_EMAIL.format(
                email)
    else:
        message = MESSAGES.EMAIL.ALREADY_SUBSCRIBED

    messages.add_message(request, messages.INFO, message)
    return redirect(request.META['HTTP_REFERER'], {'message': message})
Example #10
    def test_XXH64_reset(self):
        x = xxhash.xxh64()
        h = x.intdigest()

        for i in range(10, 50):
            x.update(os.urandom(i))

        x.reset()

        self.assertEqual(h, x.intdigest())
Example #11
 def save(self, *args, **kwargs):
     new_hash = xxhash.xxh64(self.content_raw).hexdigest()
     mentioned_users = []
     if new_hash != self.raw_content_hash or (not self.pk):
         # To (re-)render the content if content changed or topic is newly created
         self.content_rendered, mentioned_users = render_content(self.content_raw, sender=self.user.username)
     super(Topic, self).save(*args, **kwargs)
     self.raw_content_hash = new_hash
     for to in mentioned_users:
             notify.delay(to=to.username, sender=self.user.username, topic=self.pk)
Example #12
    def cached_parse_dhcp(self, lines, cur_time=None):
        if cur_time is None:
            cur_time = dt.utcnow()
        m = xxhash.xxh64()
        m.update("".join(lines[:self.dhcp_cache_len]).encode("utf8"))
        new_hash = m.digest()
        # new_len = len(lines)

        if new_hash != self.dhcp_hash:
            self.dhcp_cache_len = 0
            self.dhcp_cache = []
            m = xxhash.xxh64()

        lines = lines[self.dhcp_cache_len:]
        self.dhcp_cache.extend(self.from_dhcp(lines, cur_time))
        m.update("".join(lines).encode("utf8"))
        self.dhcp_hash = m.digest()
        self.dhcp_cache_len += len(lines)

        return self.dhcp_cache
Example #13
def xxhash_file(srcfile, logger, block_size=2**20):
    f_name = func_name()
    logger.info(f_name+"\t\tCalculating xx-hash on : "+srcfile)
    x = xxhash.xxh64()
    # read in binary mode so the bytes hashed match the file exactly
    with open(srcfile, 'rb') as f:
        while True:
            data = f.read(block_size)
            if not data:
                break
            x.update(data)
    return x.hexdigest()
Example #14
    def generate_content_hashes(self):
        """
        Generate a dictionary which maps parl_ids to their respective hashes

        Used for speedy comparison of changes
        """
        es_response = json.loads(self.get_content())
        content_hashes = {}
        for res in es_response['result']:
            content_hashes[res['parl_id']] = xxhash.xxh64(
                json.dumps(res)).hexdigest()
        return json.dumps(content_hashes)
Example #15
 def save(self, *args, **kwargs):
     new_hash = xxhash.xxh64(self.content_raw).hexdigest()
     mentioned_users = []
     if new_hash != self.raw_content_hash or (not self.pk):
         self.content_rendered, mentioned_users = render_content(self.content_raw, sender=self.user.username)
     super(Post, self).save(*args, **kwargs)
     t = self.topic
     t.reply_count = t.get_reply_count()
     t.last_replied = t.get_last_replied()
     t.save(update_fields=['last_replied', 'reply_count'])
     for to in mentioned_users:
             notify.delay(to=to.username, sender=self.user.username, post=self.pk)
Example #16
 def memoize_wrapper(*args, **kwargs):
     hash = xxhash.xxh64(str(args) + str(kwargs)).intdigest()
     path = path_pattern.format(hash=hash)
     try:
         with open(path, 'rb') as file:
             logger.debug("Loading pickle %s", path)
             data = pickle.load(file)
     except (FileNotFoundError, EOFError):
         data = fn(*args, **kwargs)
         with open(path, 'wb') as file:
             pickle.dump(data, file)
     return data
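
The fragment above is only the inner wrapper. A self-contained sketch of the decorator factory it implies; path_pattern, fn, and the logger are assumptions here, not the original project's API:

import functools
import logging
import pickle

import xxhash

logger = logging.getLogger(__name__)

def disk_memoize(path_pattern="cache_{hash}.pkl"):
    def decorator(fn):
        @functools.wraps(fn)
        def memoize_wrapper(*args, **kwargs):
            # key the cache file on an xxh64 of the stringified arguments
            key = xxhash.xxh64(str(args) + str(kwargs)).intdigest()
            path = path_pattern.format(hash=key)
            try:
                with open(path, 'rb') as file:
                    logger.debug("Loading pickle %s", path)
                    return pickle.load(file)
            except (FileNotFoundError, EOFError):
                data = fn(*args, **kwargs)
                with open(path, 'wb') as file:
                    pickle.dump(data, file)
                return data
        return memoize_wrapper
    return decorator

@disk_memoize()
def expensive(x):
    return x ** 2

print(expensive(3))   # computed once, then served from the pickle on later calls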
Example #17
 def test_XXH64(self):
     x = xxhash.xxh64()
     x.update('a')
     self.assertEqual(xxhash.xxh64('a').digest(), x.digest())
     x.update('b')
     self.assertEqual(xxhash.xxh64('ab').digest(), x.digest())
     x.update('c')
     self.assertEqual(xxhash.xxh64('abc').digest(), x.digest())
     seed = random.randint(0, 2**32)
     x = xxhash.xxh64(seed=seed)
     x.update('a')
     self.assertEqual(xxhash.xxh64('a', seed).digest(), x.digest())
     x.update('b')
     self.assertEqual(xxhash.xxh64('ab', seed).digest(), x.digest())
     x.update('c')
     self.assertEqual(xxhash.xxh64('abc', seed).digest(), x.digest())
Example #18
    def parse(self, response):
        soup = BeautifulSoup(response.body)
        for script in soup(["script", "style"]):
            script.extract()
        text = soup.get_text()

        response.meta.update(score=KeywordScorer.score(text))
        response.meta.update(
            content_hash=xxhash.xxh64(text.encode('ascii', 'ignore')).intdigest())

        for link in self.link_extractor.extract_links(response):
            request = Request(url=link.url)
            request.meta.update(link_text=link.text)
            link_score = KeywordScorer.score(link.text)
            request.meta.update(score=link_score)
            yield request
Example #19
def _calculate(reaction, descriptorDict, verbose=False, whitelist=None):
    """Calculate descriptors for this plugin with descriptorDict already created."""
    # descriptor Value classes
    cat = DRP.models.CatRxnDescriptorValue
    perm = DRP.models.CategoricalDescriptorPermittedValue

    # reaction space descriptor
    heading = 'rxnSpaceHash1'
    if whitelist is None or heading in whitelist:
        h = xxhash.xxh64()  # generates a hash
        for reactant in reaction.compounds.order_by('abbrev'):
            h.update(reactant.abbrev)
        p = perm.objects.get_or_create(descriptor=descriptorDict[
                                       heading], value=h.hexdigest())[0]
        cat.objects.update_or_create(defaults={
                                     'value': p}, reaction=reaction, descriptor=descriptorDict['rxnSpaceHash1'])[0]
Example #20
def feature_hash_string(s, window, dim):

    start = time.time()  # time.clock() was removed in Python 3.8

    # Generate window-char Markov chains & create feature hash vector

    v = {}
    for x in range(0, dim):
        v[x] = 0
    length = len(s)
    max_num = 2.0 ** 64
    for x in range(0, length - window):
        key = xxhash.xxh64(s[x:x + window]).intdigest() % dim
        v[key] += 1

    return numpy.asarray(list(v.values()))
Example #21
 def wrapper(*args, **kwds):
     if not cache:
         return f(*args, **kwds)
     if key_func:
         key = 'django_vimeo_cache:{}'.format(key_func(*args, **kwds))
     else:
         key = 'django_vimeo_cache:' + f.__name__ + ':' +\
               str(list(args) + list(sorted(kwds.items())))
     key = xxhash.xxh64(key).hexdigest()
     value = cache.get(key)
     if value is None:
         value = f(*args, **kwds)
         cache.set(key, value, expires)
         value = cache.get(key)
         if value is None:
             raise Exception('failed to fetch cached value, try again')
     return value
Example #22
    def bv_hash(self):
        """
        Iterate over all the BinaryView (flat iteration over the hex values themselves)
        :return:(INT) Hash  of the whole file
        """

        # create file object
        br = BinaryReader(self.bv)

        # calculate file hash
        file_hash = xxhash.xxh64()
        # for some reason a BinaryReader won't read more than 1000 or so bytes
        temp_hash = br.read(1000)
        while temp_hash:
            file_hash.update(temp_hash)
            temp_hash = br.read(1000)

        return file_hash.hexdigest()
Example #23
def ducos1xxh(lastBlockHash, expectedHash, difficulty):
    # XXHASH algorithm
    # Measure starting time
    timeStart = time()
    # Loop from 0 to 100*difficulty
    for ducos1xxres in range(100 * int(difficulty) + 1):
        # Generate hash
        ducos1xx = xxhash.xxh64(str(lastBlockHash) + str(ducos1xxres),
                                seed=2811)
        ducos1xx = ducos1xx.hexdigest()
        # Check if result was found
        if ducos1xx == expectedHash:
            # Measure finish time
            timeStop = time()
            # Calculate hashrate
            timeDelta = timeStop - timeStart
            hashrate = ducos1xxres / timeDelta
            return [ducos1xxres, hashrate]
Example #24
def read(mount_point, incoming_data, **kwargs):
    outgoing_data = {}
    flock = kwargs['flock']
    offset = incoming_data['offset']
    chunk_size = incoming_data['repeats']
    f_path = ''.join([mount_point, incoming_data['target']])
    with open(f_path, 'rb') as f:
        f.seek(offset)
        flock.lockf(f.fileno(), fcntl.LOCK_SH | fcntl.LOCK_NB, chunk_size,
                    offset, 0)
        buf = f.read(chunk_size)
        flock.lockf(f.fileno(), fcntl.LOCK_UN, chunk_size, offset)
        outgoing_data['hash'] = xxhash.xxh64(buf).intdigest()
        outgoing_data['offset'] = offset
        outgoing_data['chunk_size'] = chunk_size
        outgoing_data['uuid'] = incoming_data['uuid']
        outgoing_data['tid'] = incoming_data['tid']
        # outgoing_data['buffer'] = buf[:256].decode()
        return outgoing_data
Example #25
    def extract_attribute(self, base_object: BDBasicBlock) -> int:
        # Check if value already exists
        BasicBlockHash_value = base_object.get_attribute_value(
            'BasicBlockHash')

        if BasicBlockHash_value:
            pass
        else:
            hash_value = xxhash.xxh64()
            for instruction_expression in base_object.underlying_obj:
                for instruction in instruction_expression[0]:
                    hash_value.update(instruction.text)

            base_object.add_attribute_value('BasicBlockHash',
                                            {'hash': hash_value.intdigest()})
            BasicBlockHash_value = base_object.get_attribute_value(
                'BasicBlockHash')

        return BasicBlockHash_value['hash'] if BasicBlockHash_value else None
Example #26
    def write(self, path, data, offset, fh):
        realpath = self.remotepath(path)
        cachefile = self.cachefile(realpath)
        if not os.path.exists(cachefile):
            if self.empty_file(realpath):
                self.create(path, 'wb')
            else:
                raise FuseOSError(ENOENT)

        with open(cachefile, 'rb+') as outfile:
            outfile.seek(offset, 0)
            outfile.write(data)

        self.attributes.insert(realpath, self.extract(os.lstat(cachefile)))
        task = Task(
            xxhash.xxh64(realpath).intdigest(), self._write, realpath, data,
            offset)
        self.taskpool.submit(task)
        return len(data)
Example #27
    def set_motifs(self, motifs):
        try:
            # Check if motifs is a list of Motif instances
            motifs[0].to_pwm()
            tmp = NamedTemporaryFile(mode="w", delete=False)
            for m in motifs:
                tmp.write("{}\n".format(m.to_pwm()))
            tmp.close()
            motif_file = tmp.name
        except AttributeError as e:
            motif_file = motifs

        self.motifs = motif_file
        with open(motif_file) as f:
            self.motif_ids = [m.id for m in read_motifs(f)]
        self.checksum = {}
        if self.use_cache:
            chksum = xxhash.xxh64("\n".join(sorted(self.motif_ids))).digest()
            self.checksum[self.motif_file] = chksum
Example #28
def filesget(id):
    # helper: *deep level route*

    rfile = StaticFile.get(id=id)
    guessed_type = mimetypes.guess_type(rfile.original, strict=True)[0]
    mimetype = guessed_type if guessed_type is not None else 'application/octet-stream'
    response.set_header('Accept-Ranges', 'bytes')
    response.set_header('Content-Length', str(rfile.length))
    response.set_header('Content-Type', mimetype)
    response.set_header(
        'Last-Modified',
        datetime.fromtimestamp(
            rfile.created,
            tz=pytz.timezone('GMT')).strftime('%a, %d %b %Y %H:%M:%S GMT'))
    response.set_header(
        'ETag', '"{0}"'.format(xxhash.xxh64(rfile.content).hexdigest()))

    return io.BytesIO(rfile.content)
Example #29
 def iter_archive(self, archive_dir):
     archivehashes = []
     if os.path.isdir(archive_dir):
         for path, dirs, files in walk(archive_dir):
             for filename in files:
                 filepath = joinpath(path, filename)
                 if os.path.isfile(filepath):
                     filesize = stat(filepath).st_size
                     filehash = ''
                     try:
                         with open(filepath, 'rb') as fh:
                             filehash = xxhash.xxh64(fh.read()).hexdigest()
                     except:
                         print("Didn't like this file", filepath)
                     if filehash != '':
                         archivehashes.append([{
                             'filehash': filehash,
                             'path': filepath
                         }])
     return archivehashes
Example #30
 def handle(self, *args, **kwargs):
     hashDictionary = {}
     collisionCount = 0
     for reaction in Reaction.objects.all():
         reactantString = ''
         h = xxhash.xxh64()
         for reactant in reaction.compounds:
             h.update(reactant.abbrev)
             reactantString += reactant.abbrev
         digest = h.hexdigest()
         if digest in hashDictionary:
             if hashDictionary[digest] != reactantString:
                 collisionCount += 1
         else:
             hashDictionary[digest] = reactantString
     if collisionCount > 0:
         e = EmailToAdmins('Dark Reactions Project: Hash Collision Failure',
                           'A collision between reaction space hashes has occurred. Please contact the DRP development team and file a bug report.')
         e.send()
         exit(1)
Example #31
def data(path, D):
    ''' GENERATOR: 
            Apply hash-trick to the original csv row
            and for simplicity, we one-hot-encode everything

        INPUT:
            path: path to training or testing file
            D: the max index that we can hash to

        YIELDS:
            x: a list of hashed and one-hot-encoded 'indices'
               we only need the index since all values are either 0 or 1
            y: y = 1 if we have a click, else we have y = 0
    '''

    with open(path, 'r', encoding='utf-8') as f:
        csvreader = reader(f)  # create a CSV reader
        header = next(csvreader)
        for row in csvreader:  # iterate over the available rows
            row = dict(zip(header, row))

            # ts and bid_id are used only while updating train data
            for feat in ['bid_id', 'ts']:
                if feat in row:
                    del row[feat]

            # process clicks
            y = 0.
            target = 'click'
            if target in row:
                if row[target] == '1':
                    y = 1.
                del row[target]

            # build x
            x = []
            for key in row:
                value = row[key]
                # one-hot encode everything with hash trick
                index = xxh64(key + '_' + value).intdigest() % D
                x.append(index)

            yield x, y
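
The core of the hashing trick above is the single modulo line. A toy illustration with a made-up row and dimension D, showing how each 'key_value' string maps to a stable column index without a fitted vocabulary:

from xxhash import xxh64

D = 2 ** 20
row = {'site': 'example.com', 'device': 'mobile', 'hour': '17'}
x = [xxh64(key + '_' + value).intdigest() % D for key, value in row.items()]
print(x)  # three indices in [0, D)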
Example #32
 def iter_duplicates(self, duplicates_dir):
     tobevalidated = []
     if os.path.isdir(duplicates_dir):
         for path, dirs, files in walk(duplicates_dir):
             for filename in files:
                 filepath = joinpath(path, filename)
                 if os.path.isfile(filepath):
                     filesize = stat(filepath).st_size
                     filehash = ''
                     try:
                         with open(filepath, 'rb') as fh:
                             filehash = xxhash.xxh64(fh.read()).hexdigest()
                     except:
                         print("Didn't like this file: ", filepath)
                     if filehash != '':
                         tobevalidated.append([{
                             'filehash': filehash,
                             'path': filepath
                         }])
     return tobevalidated
Example #33
def store(val, srcID):
    # if srcID not seen yet, make new entry in srcIDs
    if srcID not in srcIDs:
        srcIDs[srcID] = []
    key = xxhash.xxh64(val).intdigest() & 0xffff
    # remove expired keys in srcID's list
    srcIDs[srcID] = [k for k in srcIDs[srcID] if data[k][2] + 300 >= time.time()]
    # if over the limit, discard the store
    if len(srcIDs[srcID]) >= srcLimit:
        return
    # if key not yet stored
    if key not in data:
        data[key] = (val, srcID, time.time())
        srcIDs[srcID].append(key)
    # check for key expiration
    elif data[key][2] + 300 < time.time():
        srcIDs[data[key][1]].remove(key)
        data[key] = (val, srcID, time.time())
        srcIDs[srcID].append(key)
Example #34
    def generate_content_hashes(self, content=None):
        """
        Generate a dictionary which maps parl_ids to their respective hashes

        Used for speedy comparison of changes
        """
        if not content:
            es_response = json.loads(self.get_content())
        else:
            try:
                es_response = json.loads(content)
            except:
                es_response = json.loads(self.get_content())

        content_hashes = {}
        for res in es_response['result']:
            content_hashes[res['parl_id']] = xxhash.xxh64(
                json.dumps(res)).hexdigest()
        return json.dumps(content_hashes)
Example #35
def hash_file2(fpath, blocksize=65536, hasher='xx64'):
    r"""
    Hashes the data in a file on disk using xxHash

    xxHash is much faster than sha1, bringing computation time down from .57
    seconds to .12 seconds for a 387M file.

    my_weights_fpath_ = ub.truepath('~/tmp/my_weights.pt')


    xdata = 2 ** np.array([8, 12, 14, 16])
    ydatas = ub.ddict(list)
    for blocksize in xdata:
        print('blocksize = {!r}'.format(blocksize))
        ydatas['sha1'].append(ub.Timerit(2).call(ub.hash_file, my_weights_fpath_, hasher='sha1', blocksize=blocksize).min())
        ydatas['sha256'].append(ub.Timerit(2).call(ub.hash_file, my_weights_fpath_, hasher='sha256', blocksize=blocksize).min())
        ydatas['sha512'].append(ub.Timerit(2).call(ub.hash_file, my_weights_fpath_, hasher='sha512', blocksize=blocksize).min())
        ydatas['md5'].append(ub.Timerit(2).call(ub.hash_file, my_weights_fpath_, hasher='md5', blocksize=blocksize).min())
        ydatas['xx32'].append(ub.Timerit(2).call(hash_file2, my_weights_fpath_, hasher='xx32', blocksize=blocksize).min())
        ydatas['xx64'].append(ub.Timerit(2).call(hash_file2, my_weights_fpath_, hasher='xx64', blocksize=blocksize).min())

    import netharn as nh
    nh.util.qtensure()
    nh.util.multi_plot(xdata, ydatas)
    """
    import xxhash
    if hasher == 'xx32':
        hasher = xxhash.xxh32()
    elif hasher == 'xx64':
        hasher = xxhash.xxh64()

    with open(fpath, 'rb') as file:
        buf = file.read(blocksize)
        # hash the file contents in blocksize-sized chunks
        while len(buf) > 0:
            hasher.update(buf)
            buf = file.read(blocksize)
    # Get the hashed representation
    text = ub.util_hash._digest_hasher(hasher,
                                       hashlen=None,
                                       base=ub.util_hash.DEFAULT_ALPHABET)
    return text
Example #36
 def handle(self, *args, **kwargs):
     """Handle the command call."""
     hashDictionary = {}
     collisionCount = 0
     for reaction in Reaction.objects.all():
         reactantString = ''
         h = xxhash.xxh64()
         for reactant in reaction.compounds:
             h.update(reactant.abbrev)
             reactantString += reactant.abbrev
         digest = h.hexdigest()
         if digest in hashDictionary:
             if hashDictionary[digest] != reactantString:
                 collisionCount += 1
         else:
             hashDictionary[digest] = reactantString
     if collisionCount > 0:
         e = EmailToAdmins('Dark Reactions Project: Hash Collision Failure',
                           'A collision between reaction space hashes has occurred. Please contact the DRP development team and file a bug report.')
         e.send()
         exit(1)
Example #37
def _encValKey(v):
    '''
    Encode a value as used in a key.

    Non-negative numbers are msgpack encoded.  Negative numbers are encoded as a marker, then the
    encoded negative of that value, so that the ordering of the encodings is easily mapped to the
    ordering of the negative numbers.  Strings that are too long are hashed.  Note that this scheme prevents
    interleaving of value types: all string encodings compare larger than all negative number
    encodings, which in turn compare larger than all nonnegative encodings.
    '''
    if isinstance(v, int):
        if v >= 0:
            return s_msgpack.en(v)
        else:
            return NEGATIVE_VAL_MARKER_ENC + s_msgpack.en(-v)
    else:
        if len(v) >= LARGE_STRING_SIZE:
            return (HASH_VAL_MARKER_ENC + s_msgpack.en(xxhash.xxh64(v).intdigest()))
        else:
            return STRING_VAL_MARKER_ENC + s_msgpack.en(v)
Example #38
def hash_file(path):
    """
    Calculates hash for the file of given path

    Parameters
    ----------
    path: string
        Path to file to be hashed

    Returns
    -------
        Hash digest calculated for given file
    """
    if path is None or not os.path.isfile(path):
        return None
    fo = open(path, 'rb')
    c = fo.read()
    r = xxhash.xxh64(c).hexdigest()
    fo.close()
    return r
Example #39
        def validate(data_dst, data):
            match = False
            try:
                dst_checksum = data_dst.attrs["checksum"]
            except KeyError:
                # checksum does not exist, since it is only updated when dump was
                # completed
                logger.warning(
                    f'"{data_dst.path}" contains partial dump, rewrite')
            else:
                src_checksum = xxhash.xxh64(data.compute()).hexdigest()
                if dst_checksum == src_checksum:
                    match = True
                else:
                    # checksum mismatch, reset
                    logger.warning(
                        f'"{data_dst.path}" does not match the source')
                    del data_dst.attrs["checksum"]

            return match, (data_dst, data)
Example #40
def getHashSum(file_path):
    hashsums = {}
    result = {}
    hashsums['xxh32'] = xxhash.xxh32()
    hashsums['xxh64'] = xxhash.xxh64()
    hashsums['md5'] = hashlib.md5()

    with open(file_path, 'rb') as f:
        while True:
            chunk = f.read(64 * 1024)
            if len(chunk):
                for key in hashsums.keys():
                    hashsums[key].update(chunk)
            else:
                break

    for key, value in hashsums.items():
        result[key] = value.hexdigest()

    return result
Example #41
    def _digestAndWrite(self, myqueue, topath):
        digest = xxhash.xxh64()
        try:
            os.makedirs(os.path.dirname(topath))
        except:
            pass
        finally:
            with open(topath, 'w') as f:
                while True:
                    item = myqueue.get()

                    if isinstance(item, exitcode):
                        myqueue.put(digest.hexdigest())
                        sys.stdout.write(self.greencolor +
                                         '[COPY PASS]'.rjust(16))
                        sys.stdout.flush()
                        break

                    digest.update(item)
                    f.write(item)
Example #42
def write_content(content, **kwargs):

    h64 = xxhash.xxh64(content).hexdigest()
    filepath = 'trnews-data/' + h64[:2] + '/' + h64[2:4] + '/' + h64

    d = os.path.dirname(filepath)
    if not os.path.isdir(d):
        os.makedirs(d)
    elif os.path.exists(filepath):
        return filepath, False

    with open(filepath + ".meta", "wb") as fp:
        fp.write(yaml.safe_dump(kwargs, 
            default_flow_style = False, 
            allow_unicode=True, indent=2, encoding="utf-8"))

    with gzip.open(filepath, "wb") as fp:
        fp.write(content)

    return filepath, True
Example #43
    def constrained(pattern: Union[str, re.Pattern]) -> Type[SymbolName]:
        """Create a new SymbolName subclass using the provided string as validation RE."""
        if isinstance(pattern, re.Pattern):
            regex = pattern
            pattern = pattern.pattern
        else:
            try:
                regex = re.compile(pattern)
            except re.error as e:
                raise TypeError(
                    f"Invalid regular expression definition:  '{pattern}'."
                ) from e

        assert isinstance(pattern, str)
        xxh64 = xxhash.xxh64()
        xxh64.update(pattern.encode())
        subclass_name = f"SymbolName_{xxh64.hexdigest()[-8:]}"
        namespace = dict(regex=regex)

        return type(subclass_name, (SymbolName, ), namespace)
Example #44
def _calculate_asset_hash(asset_file, dev_mode):
    if dev_mode:
        return random.random()

    """
    1. Calculate the hash of the asset file and use it as a version number to control (maximize) the HTTP cache.
    2. The hash value is cached in memory until the python app server is restarted.
    3. Only text asset files (js and css) are processed; binary files (img, fonts) are not. ##Todo##
    """
    hash = _asset_hash_cache_.get(asset_file)

    if not hash:
        file = os.path.join(os.path.dirname(__file__), *[x for x in asset_file.split('/')])
        if os.path.isfile(file):
            with open(file, 'r', encoding='utf-8') as f:
                data = f.read()
                hash = xxhash.xxh64(data).hexdigest()
                _asset_hash_cache_[asset_file] = hash

    return hash
Example #45
def build_recursive_tree(tree, base, depth, width):
    """
    Args:
        tree: Tree
        base: Node
        depth: int
        width: int
    """
    if depth >= 0:
        depth -= 1
        for _ in range(width):
            directory = Directory(None)
            tree.create_node("{0}".format(directory.name), "{0}".format(xxhash.xxh64(directory.name).hexdigest()),
                             parent=base.identifier, data=directory)
        dirs_nodes = tree.children(base.identifier)
        for dir_node in dirs_nodes:
            newbase = tree.get_node(dir_node.identifier)
            build_recursive_tree(tree, newbase, depth, width)
    else:
        return
Example #46
    def parse(self, response):
        soup = BeautifulSoup(response.body)
        for script in soup(["script", "style"]):
            script.extract()
        text = soup.get_text()

        if text:
            response.meta.update(
                content_hash=xxhash.xxh64(text.encode('ascii', 'ignore')).intdigest())

            try:
                langid = detect(text)
            except LangDetectException:
                return

            if langid == 'en':
                tagged = filter(lambda x: x[2] >= 0.99,
                                tag_locations(MySpider.geo_names, text))
                gid_count = collections.Counter(gid for name, gid, score in tagged)

                score = scorer(
                    float(sum(gid_count.values()))/
                    float(len(text))
                )
                response.meta.update(score=score)

                for link in self.link_extractor.extract_links(response):
                    request = Request(url=link.url)
                    request.meta.update(link_text=link.text)
                    request.meta.update(score=score)
                    yield request

                date = datetime.datetime.now()
                for gid, count in gid_count.items():
                    yield LocationsItem(
                        date=date,
                        geoname_id=gid,
                        count=count
                    )
            else:
                response.meta.update(score=0)
Example #47
def _hash_image(image):
    """
    Two hash variant are possible :
    - if imgui_cv.USE_FAST_HASH is True : select 100 random pixels and hash them
    - otherwise : compute the hash of the whole image (using xxhash for performance)
    :param image:
    :return:hash
    """
    if USE_FAST_HASH:
        rng = np.random.RandomState(89)
        inds = rng.randint(low=0, high=image.size, size=100)
        b = image.flat[inds]
        result = hash(tuple(b.data))
        return result
    else:
        # cf https://stackoverflow.com/questions/16589791/most-efficient-property-to-hash-for-numpy-array
        h = xxhash.xxh64()
        h.update(image)
        result = h.intdigest()
        h.reset()
        return result
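
A tiny standalone check of the full-image branch above (synthetic array; the USE_FAST_HASH path is not shown): any pixel change alters the xxh64 digest.

import numpy as np
import xxhash

image = np.zeros((480, 640, 3), dtype=np.uint8)
before = xxhash.xxh64(image).intdigest()
image[0, 0, 0] = 255                  # mutate a single pixel
after = xxhash.xxh64(image).intdigest()
print(before != after)                # True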
Example #48
    def parse_binary_view(self):
        bv_hash = xxhash.xxh64()
        bv_hash.update(self.bv.file.filename)
        bv_object = BinaryView.BinaryViewNode(self.bv, str(bv_hash.hexdigest()), parent_uuid='0',
                                              parent_node_label='RootNode')

        # Update node list
        self.node_list.append(bv_object)

        func_index = 0
        # Iterate all functions
        for function in self.bv.functions:
            # NOTE: THE FOLLOWING CODE IS NOT THREAD SAFE~!!!!!!!!!!!!
            # Each function needs its own basic block cache
            self.basic_block_cache = dict()
            self.parse_function(function.mlil, bv_object, str(func_index))
            func_index += 1

        self.run_post_processing()

        return self.node_list
Example #49
def ducos1xxh(lastBlockHash, expectedHash, difficulty, efficiency):
    # XXHASH algorithm
    # Measure starting time
    timeStart = time()
    # Loop from 0 to 100*difficulty
    for ducos1xxres in range(100 * int(difficulty) + 1):
        # If efficiency lower than 100% sleep to use less CPU
        if ducos1xxres % 1000000 == 0 and float(100 - efficiency * 100) < 100:
            sleep(float(efficiency))
        # Generate hash
        ducos1xx = xxhash.xxh64(str(lastBlockHash) + str(ducos1xxres),
                                seed=2811)
        ducos1xx = ducos1xx.hexdigest()
        # Check if result was found
        if ducos1xx == expectedHash:
            # Measure finish time
            timeStop = time()
            # Calculate hashrate
            timeDelta = timeStop - timeStart
            hashrate = ducos1xxres / timeDelta
            return [ducos1xxres, hashrate]
Example #50
def test_hash_file(fs):
    """ Test to hash a file
    Cases
    -----
    - Not existing file (should return None)
    - Existing file (should return Digest)
    - None (should return None)
    - Pass directory (should return None)
    """
    # prepare file system
    fs.create_file('/phonyDir/testfile', contents='test')
    # Not existing file (should return None)
    assert DupFinder.fs.hash_file('notexisting.txt') is None
    # Existing file (should return Digest)
    assert DupFinder.fs.hash_file('/phonyDir/testfile') == xxhash.xxh64(
        'test').hexdigest()
    # None (should return None)
    assert DupFinder.fs.hash_file(None) is None
    # Pass directory (should return None)
    assert DupFinder.fs.hash_file('/phonyDir') is None
    assert DupFinder.fs.hash_file('/phonyDir/') is None
Example #51
    def test_key_s3_incomprehensible_range_start(self):
        """
        Check that the key is constructed as we expect
        """
        import common.tztools

        job_scope = JobScope(
            ad_account_id=gen_string_id(),
            report_type=ReportType.day_platform,
            report_variant=Entity.Campaign,
            range_start='blah-blah',
        )

        # even though range_start is provided ^ above, it's not date-like and we
        # should be ok with that and just fall back to datetime.utcnow()
        now_dt = datetime(2000, 1, 2, 3, 4, 5)
        with mock.patch.object(common.tztools, 'now', return_value=now_dt) as now_mocked, mock.patch.object(
            uuid, 'uuid4', return_value='UUID-HERE'
        ):

            storage_key = cold_storage.store({'data': 'yeah!'}, job_scope)

        assert now_mocked.called

        prefix = xxhash.xxh64(job_scope.ad_account_id).hexdigest()[:6]

        expected_key = (
            f'fb/'
            + f'{prefix}-{job_scope.ad_account_id}/'
            + f'{job_scope.report_type}/'
            + f'{now_dt.strftime("%Y")}/'
            + f'{now_dt.strftime("%m")}/'
            + f'{now_dt.strftime("%d")}/'
            + f'{now_dt.strftime("%Y-%m-%dT%H:%M:%SZ")}-'
            + f'{job_scope.job_id}-'
            + f'UUID-HERE'
            + f'.json'
        )

        assert storage_key == expected_key
Example #52
def touch_success(logger, incoming_message, dir_tree):
    logger.debug(f"Successful touch arrived incoming_message['target']")
    path = incoming_message['target'].split('/')[1:]  # folder:file
    syncdir = dir_tree.get_dir_by_name(path[0])
    dir_index = xxhash.xxh64(path[0]).hexdigest()
    if not syncdir:
        logger.debug(
            f"Directory {path[0]} already removed from active dirs list, dropping touch {path[1]}"
        )
        return
    # There might be a race where a successful mkdir message arrives after a successful touch message,
    # so we won't check here whether the dir is already synced

    f = syncdir.data.get_file_by_name(path[1])
    #  Now, when we got reply from client that file was created,
    #  we can mark it as synced
    syncdir.data.size += 1
    f.ondisk = True
    f.creation_time = datetime.datetime.strptime(incoming_message['timestamp'],
                                                 '%Y/%m/%d %H:%M:%S.%f')
    f.uuid = uuid.uuid4().hex[
        -5:]  # Unique session ID, will be modified on each file modify action
    logger.debug(f"File {path[0]}/{path[1]} was created at: {f.creation_time}")
    logger.debug(
        f"File {path[0]}/{path[1]} is synced. Directory size updated to {syncdir.data.size} bytes"
    )
    if syncdir.data.size > MAX_FILES_PER_DIR:
        try:
            logger.debug(
                f"Directory {path[0]} going to be removed from dir tree")
            dir_tree.remove_dir_by_name(path[0])
            del dir_tree.synced_nodes[dir_index]
            del dir_tree.nids[dir_index]
            logger.debug(
                f"Directory {path[0]} is reached its size limit and removed from active dirs list"
            )
        except (NodeIDAbsentError, KeyError):
            logger.debug(
                f"Directory {path[0]} already removed from active dirs list, skipping...."
            )
Example #53
    def test_xxh64_update(self):
        x = xxhash.xxh64()
        x.update('a')
        self.assertEqual(xxhash.xxh64('a').digest(), x.digest())
        self.assertEqual(xxhash.xxh64_digest('a'), x.digest())
        x.update('b')
        self.assertEqual(xxhash.xxh64('ab').digest(), x.digest())
        self.assertEqual(xxhash.xxh64_digest('ab'), x.digest())
        x.update('c')
        self.assertEqual(xxhash.xxh64('abc').digest(), x.digest())
        self.assertEqual(xxhash.xxh64_digest('abc'), x.digest())

        seed = random.randint(0, 2**64)
        x = xxhash.xxh64(seed=seed)
        x.update('a')
        self.assertEqual(xxhash.xxh64('a', seed).digest(), x.digest())
        self.assertEqual(xxhash.xxh64_digest('a', seed), x.digest())
        x.update('b')
        self.assertEqual(xxhash.xxh64('ab', seed).digest(), x.digest())
        self.assertEqual(xxhash.xxh64_digest('ab', seed), x.digest())
        x.update('c')
        self.assertEqual(xxhash.xxh64('abc', seed).digest(), x.digest())
        self.assertEqual(xxhash.xxh64_digest('abc', seed), x.digest())
Example #54
def _job_scope_to_storage_key(
        job_scope: JobScope,
        chunk_marker: Optional[int] = DEFAULT_CHUNK_NUMBER,
        custom_namespace: Optional[str] = None) -> str:
    """
    Puts together the S3 object key we need for given report data. This is
    just a helper function

    :param job_scope: The job scope (dict representation)
    :param chunk_marker: Order number of written chunk
    :param custom_namespace: Custom job namespace
    :return string: The full S3 key to use
    """
    assert isinstance(job_scope, JobScope)

    prefix = xxhash.xxh64(job_scope.ad_account_id).hexdigest()[:6]

    # datetime is a subclass of date, so we must check for date first
    if isinstance(job_scope.range_start, date):
        report_datetime = datetime.combine(job_scope.range_start,
                                           datetime.min.time())
    elif isinstance(job_scope.range_start, datetime):
        report_datetime = job_scope.range_start
    else:
        # long import line to allow mocking of call to now() in tests.
        report_datetime = common.tztools.now()

    key = (f'{custom_namespace or job_scope.namespace}/'
           f'{prefix}-{job_scope.ad_account_id}/'
           f'{job_scope.report_type}/'
           f'{report_datetime.strftime("%Y")}/'
           f'{report_datetime.strftime("%m")}/'
           f'{report_datetime.strftime("%d")}/'
           f'{report_datetime.strftime("%Y-%m-%dT%H:%M:%SZ")}-'
           f'{job_scope.job_id}-'
           f'{str(chunk_marker)+"-" if chunk_marker else ""}'
           f'{uuid.uuid4()}'
           f'.json')

    return key
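
The short prefix spreads otherwise sequential account ids across the keyspace. A minimal illustration of just the prefixing step, with a made-up account id:

import xxhash

ad_account_id = '1234567890'
prefix = xxhash.xxh64(ad_account_id).hexdigest()[:6]
print(f'fb/{prefix}-{ad_account_id}/...')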
Example #55
def import_opml(user_id, path):
    _opml = opml.parse(path)

    uncategorized = None
    for outline in _opml:
        if hasattr(outline, 'xmlUrl'):
            if uncategorized is None:  # not defined yet
                uncategorized = Category.query.filter_by(user_id=user_id, name="Uncategorized").first()
                if uncategorized is None:  # not found
                    uncategorized = Category(user_id, "Uncategorized", order_id=9999)
                    uncategorized.save()

            feed = Feed(outline.xmlUrl)
            feed.save()

            user_feed = UserFeed(user_id, uncategorized.id, feed.id, outline.text)
            user_feed.save()

        else:
            category = Category.query.filter_by(user_id=user_id, name=outline.text).first()
            if category is None:
                category = Category(user_id, outline.text)
                category.save()

            for child in outline:
                if hasattr(child, 'xmlUrl'):
                    # hash the feed URL so an existing feed can be looked up (hexdigest storage is an assumption)
                    url_hash = xxhash.xxh64(child.xmlUrl).hexdigest()
                    feed = Feed.query.filter_by(feed_url_hash=url_hash).first()
                    if feed is None:
                        feed = Feed(child.xmlUrl)
                        feed.save()

                    user_feed = UserFeed(user_id=user_id, category_id=category.id, feed_id=feed.id, feed_name=child.text)
                    user_feed.save()
                else:
                    logger.warn("Nested category is not supported yet, ignored!")
Example #56
    def hashdirectory(self,directory,map):
        hashfunc = xxhash.xxh32()
        for file in os.listdir(directory):
            if(os.path.isdir(os.path.join(directory,file))):
                #print os.path.join(directory,file)
                key = self.hashdirectory(os.path.join(directory,file),map)
                if key in map:
                    map[key] = map[key] + "?"+os.path.join(directory,file)
                else:
                    map[key] = os.path.join(directory,file)
                hashfunc.update(key)
            if(os.path.isfile(os.path.join(directory,file))):
                hf = xxhash.xxh64()
                f = open(os.path.join(directory,file),'rb').read()
                byts = bytes(f)
                #mem = memoryview(byts)
                buffersize = 1048576
                bytesize = sys.getsizeof(byts)
                self.ldb.pgb.step(bytesize/1024)
                if bytesize-buffersize>0:
                    for i in range(0,bytesize-buffersize,buffersize):
                        if bytesize-i>buffersize:
                            hf.update(byts[i:(i+buffersize)])
                        else:
                            hf.update(byts[i:])
                else:
                    hf.update(byts[0:])

                key = hf.digest()
                if key in map:
                    map[key] = map[key] + "?"+os.path.join(directory,file)
                else:
                    map[key] = os.path.join(directory,file)
                hashfunc.update(key)
        key = hashfunc.digest()
        return key
Example #57
 def _hash_xxhash(buf):
     """
     Produce an 8-byte hash of *buf* using xxHash.
     """
     return xxhash.xxh64(buf).digest()
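
For reference, the three output forms python-xxhash exposes for the same input: digest() is the 8 raw bytes used above, while hexdigest() and intdigest() are the other two encodings seen throughout these examples.

import xxhash

h = xxhash.xxh64(b"example")
print(h.digest())     # 8 raw bytes
print(h.hexdigest())  # 16 hex characters
print(h.intdigest())  # unsigned 64-bit integer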
Example #58
 def __init__(self, *args, **kwargs):
     super(Topic, self).__init__(*args, **kwargs)
     self.raw_content_hash = xxhash.xxh64(self.content_raw).hexdigest()
Example #59
def generateRequestHash(authticket, request):
    firstHash = xxhash.xxh64(authticket, seed=0x1B845238).intdigest()                      
    return xxhash.xxh64(request, seed=firstHash).intdigest()