Code Example #1
File: sarni_minhash.py Project: mattsarn/MinHash
def estimateDistinctElements(items, num_perm):
    """This function will estimate the number of distinct elements in a list.
       The default number of hash function permutations (num_perm) is 128, but
       I adjusted it after researching more:
       http://blog.cluster-text.com/tag/minhash/"""
    h = MinHash(num_perm)  # creates a minhash object with the parameter 
    for item in items:     # being the number of hash permutations
        h.digest(sha1(item.encode('utf8')))  # digests the minhash signatures 
    print("Estimated number of elements: ", h.count())
Code Example #2
File: annoy.1.py Project: PhilSk/ann-jaccard
 def query(self, v, n):
     m = MinHash(num_perm=1)
     for e in v:
         m.update(str(e).encode('utf-8'))
     print(
         self._annoy.get_nns_by_vector(m.digest().tolist(), n,
                                       self._search_k))
     return self._annoy.get_nns_by_vector(m.digest().tolist(), n,
                                          self._search_k)
Code Example #3
File: sarni_minhash.py Project: mattsarn/MinHash
def estimateDistinctElementParallel(listOfItems, num_perm):
    """Same as above, except here we have a nested for loop to iterate through the 
       lists in the list. This function will also append the estimation result 
       to a list for use in the following accuracy function."""
    h = MinHash(num_perm)
    for item in listOfItems:
        for i in item:  # nested for loop to iterate over lists within a list
            h.digest(sha1(i.encode('utf8')))
    estimate.append(h.count())
    print("Estimated number of elements: ", h.count())
Code Example #4
class VisualMinHashWithDataSketch:
    """
    minHash with sketches for near-duplicate image detection.
    This is an implementation of the minHash algorithm introduced in
    Scalable Near Identical Image and Shot Detection - Microsoft (https://www.microsoft.com/en-us/research/wp-content/uploads/2016/02/civr2007.pdf)
    by Ondrej Chum, James Philbin, Michael Isard, Andrew Zisserman
    """

    # TODO: add word weighting on this minHash algorithm.

    def __init__(self,
                 minHash_hash_num=512,
                 minHash_param_k=512,
                 minHash_param_s=3,
                 rand_seed=0):
        # We could use a minHash function as a permutation of the vocabulary.
        # However, that is memory inefficient. As an alternative, we can use a hash function and take the min value among the existing members.
        # TODO: This alternative may not work. Check this out.
        from datasketch import MinHash

        # In the paper (Sec. 4.1), they say they use 512 independent hash functions and form 512 sketches by reusing hash functions multiple times.
        # I think this is not a valid implementation, because the sketches are no longer independent.
        # Maybe that was a compromise between mathematical accuracy and speed. Calculating 512*3 hash functions is 3 times slower.
        # To reproduce the paper results, I may have to follow this implementation.
        # But let me try the correct implementation first, which makes the 512 sketches truly independent.
        self.minHash_hash_num = minHash_hash_num  # independent hash functions
        self.minHash_param_k = minHash_param_k  # number of sketches
        self.minHash_param_s = minHash_param_s  # tuple length, or sketch size

        np.random.seed(rand_seed)
        self.sketch_choices = []
        for k in range(minHash_param_k):
            rand_choice_hashfunc = []
            for s in range(minHash_param_s):
                rand_choice_hashfunc.append(
                    np.random.randint(0, minHash_hash_num))
            # print('choice:', rand_choice_hashfunc)

            self.sketch_choices.append(rand_choice_hashfunc)

        self.minHash = MinHash(num_perm=minHash_hash_num, seed=rand_seed)

    def hash_bow(self, target_set):
        # init minHashes
        self.minHash.clear()

        for elem in target_set:
            self.minHash.update_with_intval(elem)

        hashval = self.minHash.digest()
        # print('hashval:', hashval)

        result = []
        for choice_indexes in self.sketch_choices:
            # print('choice_indexes:', choice_indexes)
            sketch = hashval[choice_indexes]
            # print('sketch:', sketch)
            result.append(tuple(sketch))
        return result
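The comments above describe grouping independent min-hash values into s-tuples to form k sketches. A minimal sketch of that grouping follows, using only the standard byte-oriented datasketch API (update_with_intval in the class looks like a custom extension, so plain update() on encoded visual-word IDs is assumed here, and the IDs are hypothetical).

import numpy as np
from datasketch import MinHash

num_hashes, k, s = 512, 512, 3
rng = np.random.RandomState(0)
choices = rng.randint(0, num_hashes, size=(k, s))      # hash slots that form each sketch

m = MinHash(num_perm=num_hashes, seed=0)
for visual_word in ["12", "87", "403"]:                 # hypothetical visual-word IDs
    m.update(visual_word.encode('utf8'))

hashval = m.digest()                                    # 512 min-hash values
sketches = [tuple(int(v) for v in hashval[row]) for row in choices]
print(len(sketches), sketches[0])                       # k sketches, each an s-tuple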
Code Example #5
 def FIG(self, tr):
     #wm = self.wmg.minhash(tr) # wm1 is of the type WeightedMinHash
     #vl=np.transpose(wm.hashvalues)
     #vl=vl[0]
     m = MinHash(num_perm=self.num_perm)
     for d in tr:
         m.update(d.encode('utf8'))
     return (m.digest())
Code Example #6
 def query(self, v, n):
     if self._metric == 'angular':
         v = sklearn.preprocessing.normalize([v], axis=1, norm='l2')[0]
     m = MinHash(num_perm=128)
     for e in v:
         m.update(str(e).encode('utf-8'))
     return self._lshf.kneighbors([m.digest()],
                                  return_distance=False,
                                  n_neighbors=n)[0]
Code Example #7
 def find_minhash(self, num_perm=128):
     """
     Compute minhash, cached.
     """
     words = self.words
     doc_hash = MinHash(num_perm=num_perm)
     for word, _ in words:
         doc_hash.update(word.encode('utf8'))
     return list(doc_hash.digest())
Code Example #8
    def fit(self, X):
        self.index = numpy.empty([0, 32])
        self._index_minhash = []
        self._ball_index = []
        self._index = MinHashLSHForest(num_perm=self._n_perm, l=self._n_rep)

        for i, x in enumerate(X):
            m = MinHash(num_perm=self._n_perm)
            for e in x:
                m.update(str(e).encode('utf-8'))
            self._index.add(str(i), m)
            #self.index.append(m.digest())
            self.index = numpy.vstack((self.index, m.digest()))
            self._ball_index.append(m.digest())
            self._index_minhash.append(m)
        self._index.index()
        self._X = X

        self.tree = BallTree(self.index, leaf_size=self._n_leaves)
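For context, here is a condensed, standalone version of the index built in fit(): a MinHashLSHForest over per-item MinHash signatures, with made-up data. The l= argument and the BallTree bookkeeping from the original are omitted; this is a sketch of the datasketch API only.

from datasketch import MinHash, MinHashLSHForest

def make_minhash(tokens, num_perm=128):
    m = MinHash(num_perm=num_perm)
    for t in tokens:
        m.update(str(t).encode('utf-8'))
    return m

forest = MinHashLSHForest(num_perm=128)
for i, x in enumerate([["a", "b"], ["b", "c"], ["c", "d"]]):
    forest.add(str(i), make_minhash(x))
forest.index()                                  # must be called before querying

print(forest.query(make_minhash(["b", "c", "e"]), 2))   # keys of approximate top-2 neighbours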
Code Example #9
def minhash_implem(url_shingles_list):
    list_url_hash = []
    for url in range(len(url_shingles_list)):
        m = MinHash(num_perm=8)
        shingle_list = url_shingles_list[url][1]
        for shingle in shingle_list:
            m.update(shingle.encode('utf8'))
        list_url_hash.append(
            ["{0}".format(url_shingles_list[url][0]),
             m.digest()])
    return list_url_hash
Code Example #10
 def fit(self, X):
     self.index = numpy.empty([0, 128])
     for i, x in enumerate(X):
         m = MinHash(num_perm=128)
         for e in x:
             m.update(str(e).encode('utf-8'))
         self.index = numpy.vstack((self.index, m.digest()))
         self._index_minhash.append(m)
     self._lshf = sklearn.neighbors.LSHForest(
         n_estimators=self._n_estimators, n_candidates=self._n_candidates)
     if self._metric == 'angular':
         X = sklearn.preprocessing.normalize(X, axis=1, norm='l2')
     self._lshf.fit(self.index)
Code Example #11
def minHash_bml(SX, SY):
    print()
    print("MinHash BML")

    l = 32
    m = 8
    num_perm = pow(2, m)
    error = pow(10, -5)

    print("Number of permutations is ", num_perm)

    m1 = MinHash(num_perm)
    m2 = MinHash(num_perm)

    for d in SX:
        m1.update(d.encode('utf8'))
    for d in SY:
        m2.update(d.encode('utf8'))

    nx = m1.count()
    ny = m2.count()
    print("Estimated nx is ", nx)
    print("Estimated ny is ", ny)

    Vx = m1.digest()
    Vy = m2.digest()

    z = 0
    for i in range(0, num_perm):
        if Vx[i] >= Vy[i]:
            z = z + 1
    P = z / num_perm

    print("P is: ", P)
    print("Inclusion Coefficient: ",
          lookup(P, 0, min(nx, ny), nx, ny, error, m, num_perm, l, 0, 0))

    return
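As a sanity check on the signature comparison above, datasketch also provides a direct Jaccard estimate between two MinHash objects, and the intersection size can be recovered from that estimate and the two cardinalities. This is a separate sketch with illustrative sets, not part of minHash_bml.

from datasketch import MinHash

m1, m2 = MinHash(num_perm=256), MinHash(num_perm=256)
for d in ["a", "b", "c", "d"]:
    m1.update(d.encode('utf8'))
for d in ["c", "d", "e"]:
    m2.update(d.encode('utf8'))

j = m1.jaccard(m2)                        # estimated Jaccard(X, Y)
nx, ny = m1.count(), m2.count()
inter = j * (nx + ny) / (1 + j)           # since |union| = nx + ny - |intersection|
print("Inclusion coefficient estimate:", inter / min(nx, ny))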
Code Example #12
File: annoy.1.py Project: PhilSk/ann-jaccard
    def fit(self, X):
        self.index = numpy.empty([0, 1])

        self._index_minhash = []
        for i, x in enumerate(X):
            m = MinHash(num_perm=1)
            for e in x:
                m.update(str(e).encode('utf-8'))
            self.index = numpy.vstack((self.index, m.digest()))
            self._index_minhash.append(m)
        self._annoy = annoy.AnnoyIndex(self.index.shape[1])
        for i, x in enumerate(self.index):
            self._annoy.add_item(i, x.tolist())
        self._annoy.build(self._n_trees)
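Code Example #2 above is the query half of this class. A condensed sketch of the fit/query pair with hypothetical data follows; it assumes a recent annoy release, where AnnoyIndex takes an explicit metric, and with num_perm=1 each item is indexed as a 1-dimensional vector.

import annoy
from datasketch import MinHash

def signature(tokens, num_perm=1):
    m = MinHash(num_perm=num_perm)
    for t in tokens:
        m.update(str(t).encode('utf-8'))
    return m.digest().tolist()

index = annoy.AnnoyIndex(1, 'euclidean')        # 1 dimension because num_perm=1
for i, x in enumerate([["a", "b"], ["b", "c"], ["c", "d"]]):
    index.add_item(i, signature(x))
index.build(10)                                  # 10 trees

print(index.get_nns_by_vector(signature(["b", "c"]), 2))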
Code Example #13
    def query(self, v, n):
        print("&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&")
        m = MinHash(num_perm=self._n_perm)
        for e in v:
            m.update(str(e).encode('utf-8'))

        # for i in self._annoy.get_nns_by_vector(v.tolist(), n, 100):
        #     print(self._index_minhash[int(i)].jaccard(m))

        dist, ind = self.tree.query([m.digest()], k=n)
        for i in ind[0]:
            # print(i)
            print(self._index_minhash[int(i)].jaccard(m))
        print("=======================")
        brute_indices = self.query_with_distances(m.digest(), n)
        for i in brute_indices:
            print(self._index_minhash[int(i)].jaccard(m))
        print("-----------------------")
        ind2 = self._index.query(m, n)
        for i in ind2:
            print(self._index_minhash[int(i)].jaccard(m))

        # return map(int, ind[0])
        return self.query_with_distances(m.digest(), n)
Code Example #14
    def extract_attribute(self, base_object: BDFunction) -> int:
        # Check if value already exists
        FunctionMinHashLSH_value = base_object.get_attribute_value('FunctionMinHashLSH')

        if FunctionMinHashLSH_value:
            pass
        else:
            normalized_instr_set: set = set(base_object.get_attribute_value('FunctionNormalized'))

            # Create MinHash object
            minhash = MinHash(num_perm=Configuration.MINHASH_PERMUTATIONS, seed=Configuration.MINHASH_SEED)
            for instr in normalized_instr_set:
                minhash.update(instr.encode('utf8'))

            base_object.add_attribute_value('FunctionMinHashLSH', {'function_lsh': minhash.digest()})
            FunctionMinHashLSH_value = base_object.get_attribute_value('FunctionMinHashLSH')

        return FunctionMinHashLSH_value['function_lsh'] if FunctionMinHashLSH_value else None
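The attribute is named FunctionMinHashLSH, but this snippet only builds and stores the MinHash digest. A hedged sketch of how such per-function MinHashes are typically indexed for similar-function lookup with datasketch's MinHashLSH follows; the names, instructions, and threshold below are illustrative, not taken from this project.

from datasketch import MinHash, MinHashLSH

NUM_PERM = 256
lsh = MinHashLSH(threshold=0.7, num_perm=NUM_PERM)

def function_minhash(normalized_instrs):
    # One MinHash per function, built from its set of normalized instructions.
    m = MinHash(num_perm=NUM_PERM)
    for instr in set(normalized_instrs):
        m.update(instr.encode('utf8'))
    return m

lsh.insert("func_a", function_minhash(["mov reg, reg", "call imm", "ret"]))
candidate = function_minhash(["mov reg, reg", "call imm", "ret", "nop"])
print(lsh.query(candidate))          # -> ['func_a'] if the similarity clears the threshold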
Code Example #15
class Text(Base):
    """
    client class to control api with restful
    """
    def __init__(self, api_key=None, host_url=None):
        """
        Initialize sixecho.
        Attributes:
            api_key(string)       - Optional : api_key generated from sixecho
            host_url(string)      - Optional : the sixecho domain
        """
        self.api_key = api_key
        if host_url is not None:
            if host_url.endswith("/"):
                host_url = host_url[:-1]
            self.host_url = host_url
        self.array_words = []
        self.min_hash = MinHash(num_perm=128)
        self.max_workers = 1
        self.sha256 = ""
        self.file_size = 0
        self.meta_media = None
        self.type = "TEXT"
        self.digest = ""
        self.common_info = {}
        self.ref_info = {}
        self.detail_info = {}

    #  def digest(self):
    #  """Export the hash values, which is the internal state of the
    #  MinHash.

    #  Returns:
    #  numpy.array: The hash values which is a Numpy array.
    #  """
    #  return self.min_hash.digest()

    def set_detail_info(self, detail_info):
        """
        detail_info: Required
          "isbn" Options - string
          "author" Optons - string
          "publisher" Options - string
          "published_date Options - integer (unixtimestmap)
          "language" Options - string
          "number_of_pages" Options - integer
        """
        self.detail_info = detail_info

    def generate(self, str=None, txtpath=None, epubpath=None, pdfpath=None):
        """Generate minhash with new value from string or file
        we use minhash from https://ekzhu.github.io/datasketch/_modules/datasketch/minhash.html#MinHash.update
        Args:
            str(string)     - Optional  :   string whose minhash to be computed.
            txtpath(string)   - Optional  :   path of text file to be computed.
            epubpath(string)   - Optional  :   path of epub file to be computed.
            pdfpath(string)   - Optional  :   path of pdf file to be computed.
        """
        if txtpath:
            self.load_file(txtpath)
        elif epubpath:
            size = len(epubpath.split('.'))
            name = epubpath.split('.')[size - 2]
            size = len(name.split('/'))
            name = name.split('/')[size - 1]
            name = name.replace("/", "")
            name = name + '.txt'
            cur_path = os.path.dirname(os.path.abspath(__file__))
            self.write2text(self.readepub(epubpath), name)
            self.load_file(cur_path + '/' + name)
            os.remove(cur_path + '/' + name)
        elif pdfpath:
            size = len(pdfpath.split('.'))
            name = pdfpath.split('.')[size - 2]
            size = len(name.split('/'))
            name = name.split('/')[size - 1]
            name = name.replace("/", "")
            name = name + '.txt'
            cur_path = os.path.dirname(os.path.abspath(__file__))
            self.write2text(self.readpdf(pdfpath), name)
            self.load_file(cur_path + '/' + name)
            os.remove(cur_path + '/' + name)

        else:
            sha256 = hashlib.sha256()
            sha256.update(str.encode())
            self.sha256 = sha256.hexdigest()
            self.array_words = tokenize(str)
            self.file_size = len(str)
            for d in self.array_words:
                self.min_hash.update(d.encode('utf8'))
        self.make_digest()

    def set_meta(self, meta_books):
        """
        Args:
            meta_books(Hash)      - Require  : book struct including
                - category_id(string) - Require : category of the book; you can get it from the search category api
                - publisher_id(string) - Require : publisher of the book; you can get it from the search publisher api
                - title(string) - Require : title of the book
                - auther(string) - Require : author of the book
                - country_of_origin(string) : country, iso 3166-1
                - language(string) - Require : language, iso 639-1
                - paperback(string) - Require : total pages of the book
                - publish_date(string) - Require : publish date
        """
        self.meta_media = meta_books

    def create_sha256_signature(self, secret, message):
        secret = str(secret)
        message = str(message)
        # print(secret, message)
        # print(type(secret))
        # print(type(message))
        secret_byte = str(secret).encode('utf-8')
        message_byte = str(message).encode('utf-8')
        signature = hmac.new(secret_byte, message_byte,
                             hashlib.sha256).hexdigest()
        return signature

    def make_digest(self):
        self.digest = ",".join([str(num) for num in self.min_hash.digest()])

    def load_file(self, fpath):
        """
        method load_file
        """
        sha256 = hashlib.sha256()
        f_count = open(fpath, "r")
        f = f_count.readlines()
        f_count.close()
        list_of_groups = None
        if self.max_workers != 1:
            l = f
            n = self.max_workers
            list_of_groups = [l[i:i + n] for i in range(0, len(l), n)]
            #  list_of_groups = zip(*(iter(f), ) * self.max_workers)

        file_size = os.path.getsize(fpath)
        #  print_progress_bar(0,
        #  file_size,
        #  prefix='Progress:',
        #  suffix='Complete',
        #  length=50)
        progress = 0
        lines = []
        if self.max_workers == 1:
            for line in f:
                progress = progress + len(line)
                sha256.update(line.encode())
                words = tokenize(line)
                if len(words) != 0:
                    for d in words:
                        self.min_hash.update(d.encode('utf8'))
                #  print_progress_bar(progress,
                #  file_size,
                #  prefix='Progress:',
                #  suffix='Complete',
                #  length=50)
        else:
            for line in f:
                sha256.update(line.encode())
            for lines in list_of_groups:
                for line in lines:
                    progress = progress + len(line)
                words = tokenize_mutiline(lines)
                if len(words) != 0:
                    for d in words:
                        self.min_hash.update(d.encode('utf8'))
                #  print_progress_bar(progress,
                #  file_size,
                #  prefix='Progress:',
                #  suffix='Complete',
                #  length=50)
        self.sha256 = sha256.hexdigest()
        self.file_size = file_size

    def readepub(self, fpath):
        list_text = []
        book = open_book(fpath)
        lines = ec.utils.convert_epub_to_lines(book)
        for line in lines:

            text = ec.utils.convert_lines_to_text(str(line), "txt")
            text = list(text)
            for ele in text:
                list_text.append(ele)
        return list_text

    def readpdf(self, fpath):
        pdfFileObj = open(fpath, 'rb')  # 'rb' for read binary mode
        pdfReader = PyPDF2.PdfFileReader(pdfFileObj)
        total_page = pdfReader.numPages
        # print(total_page)
        list_text = []
        for i in range(total_page):
            pageObj = pdfReader.getPage(i)
            list_text.append(pageObj.extractText())
        return list_text

    def write2text(self, list_text, opname):
        cur_path = os.path.dirname(os.path.abspath(__file__))
        fpath = opname
        print(cur_path)
        print(cur_path + '/' + fpath)
        file = open(cur_path + '/' + fpath, 'w')
        for ele in list_text:
            file.write(ele)
        file.close()
Code Example #16
    #make corpus a dictionary. Needed to calculate true jaccard score. 
    #mycorpus={i+1:set(line.lower().split()) for i,line in enumerate(open(fname,'r')) if i+1 in linestoget}

    print("--- %s seconds ---" % (time.time() - start_time))

    print('Calculate minhash signatures')
    start_time = time.time()

    #prepare dictionary of hashes
    hashcorp=dict.fromkeys(linestoget)
    #compute hashes
    for key,doc in mycorpus:#.iteritems():
        #compute minhash signature
        m=MinHash(num_perm=num_permutations)
        #for token in doc: m.digest(sha1(token.encode('utf8')))
        for token in doc: m.digest(sha1(token))
        #for token in doc: m.digest(sha1(token.encode('utf8', 'ignore')))
        hashcorp[key]=m
    print("--- %s seconds ---" % (time.time() - start_time))

    if calc_clusters:    
        p=Pool(num_processes)
        assignment=[ (x,) for x in thresholds]
        print(assignment)
        p.map(compute_clusters,assignment)

    if calc_match:

        #create a balanced, pairwise test set
        
        #first create cluster to ad dictionary
Code Example #17
class ColumnSketch:
    """A Column Sketch contains a summary of a table column. 

    Args:
        column_name: the extracted column name.
        minhash_size: the number of permutations to use for MinHash.
        minhash_seed: the random seed used by MinHash.
        hyperloglog_p: the precision parameter used by HyperLogLog.
        sample_size: the size of sample to be kept.
        enable_word_vector_data: whether to build word embedding vector for 
            data values -- can be 10x more expensive.
    """
    def __init__(
        self,
        column_name,
        minhash_size=256,
        minhash_seed=43,
        hyperloglog_p=8,
        sample_size=100,
        enable_word_vector_data=False,
        model=WordVectorModel,
    ):
        self._column_name = column_name
        self._sample = set([])
        self._sample_size = sample_size
        self._count = 0
        self._empty_count = 0
        self._oov_count = 0
        self._numeric_count = 0
        self._minhash = MinHash(num_perm=minhash_size,
                                seed=minhash_seed,
                                hashfunc=self._hashfunc32)
        self._hhl = HyperLogLogPlusPlus(p=hyperloglog_p,
                                        hashfunc=self._hashfunc64)
        self._enabled_word_vec_data = enable_word_vector_data
        self._model = model
        self._sum_vector = self._model.get_empty_word_vector()

    def _hashfunc32(self, str_value):
        return farmhash.hash32(str_value)

    def _hashfunc64(self, str_value):
        return farmhash.hash64(str_value)

    @property
    def column_name(self):
        """The extracted column name.
        """
        return self._column_name

    @property
    def sample(self):
        """A sample (non-random) of the data values in the column as a list.
        """
        return list(self._sample)

    @property
    def count(self):
        """The total number of data values (i.e. rows) including
        the empty ones.
        """
        return self._count

    @property
    def empty_count(self):
        """The number of empty data values.
        """
        return self._empty_count

    @property
    def non_empty_count(self):
        """The number of non-empty data values.
        """
        return self._count - self._empty_count

    @property
    def out_of_vocabulary_count(self):
        """The number of data values that are non-empty and outside of
        the language model's vocabulary.
        """
        return self._oov_count

    @property
    def in_vocabulary_count(self):
        """The number of data values that are non-empty and in
        the language model's vocabulary.
        """
        return self._count - self._empty_count - self._oov_count

    @property
    def numeric_count(self):
        """The number of data values that are non-empty and numerical.
        """
        return self._numeric_count

    @property
    def is_numeric(self):
        """Whether the column is numeric, based on if at least 50% of rows
        are numeric.
        """
        if self.non_empty_count == 0:
            return False
        return (float(self._numeric_count) /
                float(self.non_empty_count)) >= 0.5

    @property
    def distinct_count(self):
        """The approximate distinct count made by the HyperLogLog.
        """
        if len(self._sample) < self._sample_size:
            return len(self._sample)
        return max(len(self._sample), self._hhl.count())

    @property
    def word_vector_column_name(self):
        """The word embedding vector of the column name as a list.
        """
        doc = self._model.process(self.column_name)
        vectors = [token.vector for token in doc if token.has_vector]
        if len(vectors) == 0:
            return None
        return list(float(v) for v in np.sum(vectors, axis=0))

    @property
    def word_vector_data(self):
        """The mean word embedding vector of all data values as a list.
        """
        if not self._enabled_word_vec_data:
            return None
        if self.in_vocabulary_count == 0:
            return None
        vector = self._sum_vector / np.float32(self.in_vocabulary_count)
        return list(float(v) for v in vector)

    @property
    def minhash(self):
        """The hash values in the MinHash.
        """
        return list(int(v) for v in self._minhash.digest())

    @property
    def seed(self):
        """The random seed used for MinHash.
        """
        return self._minhash.seed

    @property
    def hyperloglog(self):
        """The register values of the HyperLogLog counter.
        """
        return list(int(v) for v in self._hhl.digest())

    def update(self, value):
        """Add a data value into the sketch.
        """
        # Update counter.
        self._count += 1
        if not isinstance(value, str):
            value = json.dumps(value, sort_keys=True)
        # Clean the value
        value = value.strip().lower()
        # Skip if the value is empty string.
        if len(value) == 0:
            self._empty_count += 1
            return
        if _is_number(value):
            self._numeric_count += 1
        # Add to sample.
        if len(self._sample) < self._sample_size:
            self._sample.add(value)
        # Update the MinHash sketch.
        self._minhash.update(value)
        # Update the HyperLogLog sketch.
        self._hhl.update(value)
        # Skip word vector extraction if not enabled.
        if not self._enabled_word_vec_data:
            return
        # Update the sum of word embeddings.
        vectors = [
            token.vector for token in self._model.process(value)
            if token.has_vector
        ]
        if len(vectors) > 0:
            self._sum_vector += np.sum(vectors, axis=0)
        else:
            self._oov_count += 1
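ColumnSketch combines two datasketch primitives: a MinHash (for column overlap and containment search) and a HyperLogLog++ counter (for approximate distinct counts). Below is a minimal standalone sketch of just those two, using datasketch's default SHA1-based hashing instead of farmhash and a made-up column of values.

from datasketch import MinHash, HyperLogLogPlusPlus

values = ["Alice", "Bob", "alice ", "Carol"]

mh = MinHash(num_perm=256, seed=43)
hll = HyperLogLogPlusPlus(p=8)
for v in values:
    data = v.strip().lower().encode('utf8')     # same cleaning as update() above
    mh.update(data)
    hll.update(data)

print("Signature length:", len(mh.digest()))    # 256 hash values
print("Approximate distinct count:", hll.count())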
Code Example #18
        if args.header:
            next(f)
        #TODO test robustness
        #mycorpus=[(i,set(line.encode('utf8', 'ignore').lower().split())) for i,line in enumerate(f)]
        mycorpus=[(i,set(line.lower().split())) for i,line in enumerate(f)]

    print(("--- %s seconds ---" % (time.time() - start_time)))

    print('Calculate minhash signatures')
    start_time = time.time()

    #prepare dictionary of hashes
    hashcorp=dict.fromkeys([tup[0] for tup in mycorpus])
    #compute hashes
    for key,doc in mycorpus:
        #compute minhash signature
        m=MinHash(num_perm=num_permutations)
        for token in doc: m.digest(sha1(token))
        hashcorp[key]=m
    print(("--- %s seconds ---" % (time.time() - start_time)))
    if num_processes> 1:
        if len(thresholds)<num_processes:
            num_processes=len(thresholds)
        p=Pool(num_processes)
        assignment=[ (x,) for x in thresholds]
        p.map(compute_clusters,assignment)
    else:
        for x in thresholds:
            compute_clusters((x,))
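This snippet and Code Example #16 both use the pre-1.0 datasketch update call, m.digest(sha1(token)). A minimal sketch of the same hash-corpus construction against the current API, with an illustrative corpus:

from datasketch import MinHash

num_permutations = 128
mycorpus = [(0, {"some", "tokens"}), (1, {"other", "tokens"})]   # illustrative

hashcorp = {}
for key, doc in mycorpus:
    # One MinHash signature per document.
    m = MinHash(num_perm=num_permutations)
    for token in doc:
        m.update(token.encode('utf8', 'ignore'))
    hashcorp[key] = m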

Code Example #19
File: utils.py Project: EdwardBetts/undercrawler
def get_min_hash(text, too_common, num_perm=128):
    min_hash = MinHash(num_perm=num_perm)
    for shingle_h in shingle_hashes(text):
        if shingle_h.hexdigest() not in too_common:
            min_hash.digest(shingle_h)
    return min_hash
Code Example #20
File: HashCluster.py Project: lsz1994024/TagTree
def getHashSig(tagsListOfPep):
    minHash = MinHash(num_perm=NUM_PERMUTATION)
    for tag in tagsListOfPep:
        minHash.update(tag.encode('utf-8'))

    return minHash.digest()
Code Example #21
File: client.py Project: Steap/SIXEcho
class Client(object):
    """
    client class to control api with restful
    """

    def __init__(self, api_key=None, host_url=None, max_workers=1):
        """
        Initialize sixecho.
        Attributes:
            api_key(string)       - Optional : api_key generated from sixecho
            host_url(string)      - Optional : the sixecho domain
        """
        self.api_key = api_key
        deepcut.tokenize("Welcome")  # Load library
        if host_url is not None:
            if host_url.endswith("/"):
                host_url = host_url[:-1]
            self.host_url = host_url
        self.array_words = []
        self.min_hash = MinHash(num_perm=128)
        self.max_workers = max_workers
        self.sha256 = ""

    def digest(self):
        """Export the hash values, which is the internal state of the
        MinHash.

        Returns:
            numpy.array: The hash values which is a Numpy array.
        """
        return self.min_hash.digest()

    def generate(self, str=None, fpath=None):
        """Generate minhash with new value from string or file
        we use minhash from https://ekzhu.github.io/datasketch/_modules/datasketch/minhash.html#MinHash.update
        Args:
            str(string)     - Optional  :   string whose minhash to be computed.
            fpath(string)   - Optional  :   path file to be computed.
        """
        if fpath:
            self.load_file(fpath)
        else:
            sha256 = hashlib.sha256()
            sha256.update(str)
            self.sha256 = sha256.hexdigest()
            self.array_words = tokenize(str)
            for d in self.array_words:
                self.min_hash.update(d.encode('utf8'))

    def upload(self):
        """Upload digital conent to server

        """
        digest = ",".join([str(num) for num in self.digest()])
        if self.host_url is None or self.api_key is None:
            raise Exception("Require host_url and api_key")

        headers = {
            "x-api-key": self.api_key,
            'content-type': 'application/json'
        }
        response = requests.post((self.host_url + "/checker"),
                                 json={
                                     "digest": digest,
                                     "sha256": self.sha256
                                 },
                                 headers=headers)
        print("content:" + str(response.text))
        return json.loads(response.text)

    def load_file(self, fpath):
        """
        method load_file
        """
        sha256 = hashlib.sha256()
        f_count = open(fpath, "r")
        f = f_count.readlines()
        f_count.close()
        list_of_groups = None
        if self.max_workers != 1:
            l = f
            n = self.max_workers
            list_of_groups = [l[i:i + n] for i in range(0, len(l), n)]
            #  list_of_groups = zip(*(iter(f), ) * self.max_workers)

        fileSize = os.path.getsize(fpath)
        printProgressBar(0,
                         fileSize,
                         prefix='Progress:',
                         suffix='Complete',
                         length=50)
        progress = 0
        lines = []
        if self.max_workers == 1:
            for line in f:
                progress = progress + len(line)
                sha256.update(line)
                words = tokenize(line)
                if len(words) != 0:
                    for d in words:
                        self.min_hash.update(d.encode('utf8'))
                printProgressBar(progress,
                                 fileSize,
                                 prefix='Progress:',
                                 suffix='Complete',
                                 length=50)
        else:
            for line in f:
                sha256.update(line)
            for lines in list_of_groups:
                for line in lines:
                    progress = progress + len(line)
                    #  sha256.update(line)
                words = tokenize_mutiline(lines)
                if len(words) != 0:
                    for d in words:
                        self.min_hash.update(d.encode('utf8'))
                printProgressBar(progress,
                                 fileSize,
                                 prefix='Progress:',
                                 suffix='Complete',
                                 length=50)
        self.sha256 = sha256.hexdigest()
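upload() in this class serializes the signature as a comma-joined string of hash values. A hedged round-trip sketch follows, assuming datasketch's MinHash accepts precomputed hashvalues in its constructor; the tokens are illustrative.

from datasketch import MinHash

m = MinHash(num_perm=128)
for d in ["some", "tokenized", "words"]:
    m.update(d.encode('utf8'))

digest_str = ",".join(str(num) for num in m.digest())      # what gets POSTed to /checker
restored = MinHash(num_perm=128,
                   hashvalues=[int(s) for s in digest_str.split(",")])
print(m.jaccard(restored))                                  # -> 1.0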
Code Example #22
class PradoProjector(Projector):
    def __init__(
        self,
        feature_length: int = None,
        config: Optional[PradoProjectorConfig] = None,
    ):
        super().__init__()

        if config is None:
            config = PradoProjectorConfig(feature_length=feature_length)

        self._config = copy.deepcopy(config)
        self._hashobj = MinHash(num_perm=self.n_permutations,
                                hashfunc=farmhash.hash32)
        self._projection_operator = PradoProjectionOperator()

        self._vectorized_projection = np.vectorize(self.project,
                                                   signature="()->(n)")

    # region Properties
    @property
    def feature_length(self) -> int:
        return self._config.feature_length

    @property
    def B(self) -> int:
        return self.feature_length

    @property
    def n_permutations(self) -> int:
        return (2 * self.B + 32 - 1) // 32

    # endregion

    def project(self, x: str):
        self._hashobj.clear()
        self._hashobj.update(x)

        # (4 * n_permutations, )
        token_as_bytes = b"".join(
            int(x).to_bytes(4, "big") for x in self._hashobj.digest())

        # (32 * n_permutations, )
        token_as_bits = bitarray.bitarray()
        token_as_bits.frombytes(token_as_bytes)

        # (2B, ) - MinHash can give us more bits than we need.
        # It is recommended to choose B so this slicing doesn't
        # drop data; in other words, B should be a multiple of 16.
        return torch.tensor(token_as_bits[:2 * self.B], dtype=torch.float)

    def __call__(self, x: List) -> torch.Tensor:
        # Can be anything, (Any, N[str]) -> (Any, N, 2B)
        token_features = self._vectorized_projection(x)
        token_features = torch.tensor(token_features, dtype=torch.float)

        # (Any, N, 2B) -> (Any, N, B, 2)
        token_features = torch.reshape(token_features,
                                       (*token_features.shape[:-1], -1, 2))

        # (Any, N, B, 2) -> (Any, N, B, 1)
        fingerprint = self._projection_operator(token_features)

        # (Any, N, B, 1) -> (Any, N, B)
        fingerprint = torch.squeeze(fingerprint, dim=-1)

        return fingerprint
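A small standalone check of the digest-to-bits mapping used in project(): each MinHash value is packed into 4 bytes, so ceil(2B / 32) permutations cover the 2B-bit token feature. B and the token below are hypothetical.

import bitarray
from datasketch import MinHash

B = 64                                   # hypothetical feature_length
n_perm = (2 * B + 32 - 1) // 32          # 4 permutations for B = 64

m = MinHash(num_perm=n_perm)
m.update(b"some token")

token_bytes = b"".join(int(v).to_bytes(4, "big") for v in m.digest())
bits = bitarray.bitarray()
bits.frombytes(token_bytes)
print(len(bits[:2 * B]))                 # -> 128 == 2B bits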