Example #1
class DataSketch(BaseANN):
    def __init__(self, metric, n_perm, n_rep):
        if metric not in ('jaccard',):
            raise NotImplementedError("Datasketch doesn't support metric %s" % metric)
        self._n_perm = n_perm
        self._n_rep = n_rep
        self._metric = metric
        self.name = 'Datasketch(n_perm=%d, n_rep=%d)' % (n_perm, n_rep)

    def fit(self, X):
        self._index = MinHashLSHForest(num_perm = self._n_perm, l = self._n_rep)
        for i, x in enumerate(X):
            m = MinHash(num_perm = self._n_perm)
            for e in x:
                m.update(str(e).encode('utf8'))
            self._index.add(str(i), m)
        self._index.index()

    def query(self, v, n):
        m = MinHash(num_perm = self._n_perm)
        for e in v:
            m.update(str(e).encode('utf8'))
        return map(int, self._index.query(m, n))
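
A minimal smoke test for the wrapper above (not part of the original benchmark harness; the toy data and parameter values are assumptions):

if __name__ == '__main__':
    # Each element of X is a set of tokens; Jaccard similarity is taken over these sets.
    X = [set(str(j) for j in range(i, i + 5)) for i in range(100)]
    ann = DataSketch('jaccard', n_perm=128, n_rep=8)
    ann.fit(X)
    # The first item should show up among its own nearest neighbours.
    print(list(ann.query(X[0], 5)))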
Example #2
class DataSketch(BaseANN):
    def __init__(self, metric, n_perm, n_rep):
        if metric not in ('jaccard',):
            raise NotImplementedError("Datasketch doesn't support metric %s" %
                                      metric)
        self._n_perm = n_perm
        self._n_rep = n_rep
        self._metric = metric
        self.name = 'Datasketch(n_perm=%d, n_rep=%d)' % (n_perm, n_rep)

    def fit(self, X):
        self._index = MinHashLSHForest(num_perm=self._n_perm, l=self._n_rep)
        for i, x in enumerate(X):
            m = MinHash(num_perm=self._n_perm)
            for e in x:
                m.update(str(e).encode('utf8'))
            self._index.add(str(i), m)
        self._index.index()

    def query(self, v, n):
        m = MinHash(num_perm=self._n_perm)
        for e in v:
            m.update(str(e).encode('utf8'))
        return map(int, self._index.query(m, n))
Example #3
    def saver(self, i, q, retq, matq, l):
        print_start = t.time()
        save_start = t.time()
        global_time = t.time()
        chunk_size = 100
        count = 0
        forest = MinHashLSHForest(num_perm=self.numperm)

        taxstr = ''
        if self.tax_filter is None:
            taxstr = 'NoFilter'
        if self.tax_mask is None:
            taxstr += 'NoMask'
        else:
            taxstr = str(self.tax_filter)
        dataset_name = self.saving_name + '_' + taxstr
        self.errorfile = self.saving_path + 'errors.txt'
        with open(self.errorfile, 'w') as hashes_error_files:
            with h5py.File(self.hashes_path, 'w', libver='latest') as h5hashes:
                datasets = {}
                if dataset_name not in h5hashes.keys():
                    if self.verbose:
                        print('creating dataset')
                        print(dataset_name)
                        print('filtered at taxonomic level: ' + taxstr)
                    h5hashes.create_dataset(dataset_name + '_' + taxstr,
                                            (chunk_size, 0),
                                            maxshape=(None, None),
                                            dtype='int32')
                    datasets[dataset_name] = h5hashes[dataset_name + '_' +
                                                      taxstr]
                    if self.verbose:
                        print(datasets)
                    h5flush = h5hashes.flush
                print('saver init ' + str(i))
                while True:
                    this_dataframe = retq.get()
                    if this_dataframe is not None:
                        if not this_dataframe.empty:
                            hashes = this_dataframe['hash'].to_dict()
                            print(str(this_dataframe.Fam.max()) + ' fam num')
                            print(str(count) + ' done')
                            hashes = {
                                fam: hashes[fam]
                                for fam in hashes if hashes[fam]
                            }
                            for fam in hashes:
                                forest.add(str(fam), hashes[fam])
                            for fam in hashes:
                                if len(datasets[dataset_name]) < fam + 10:
                                    datasets[dataset_name].resize(
                                        (fam + chunk_size,
                                         len(hashes[fam].hashvalues.ravel())))
                                datasets[dataset_name][
                                    fam, :] = hashes[fam].hashvalues.ravel()
                                count += 1
                            if t.time() - save_start > 200:
                                print(t.time() - global_time)
                                forest.index()
                                print(forest.query(hashes[fam], k=10))
                                h5flush()
                                save_start = t.time()
                                with open(self.lshforestpath,
                                          'wb') as forestout:
                                    forestout.write(pickle.dumps(forest, -1))
                                if self.verbose:
                                    print('save done at ' +
                                          str(t.time() - global_time))
                        else:
                            print(this_dataframe)
                    else:
                        if self.verbose:
                            print('wrap it up')
                        with open(self.lshforestpath, 'wb') as forestout:
                            forestout.write(pickle.dumps(forest, -1))
                        h5flush()
                        if self.verbose:
                            print('DONE SAVER ' + str(i))
                        break
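
The worker above pickles the LSH forest and writes each family's MinHash values into an HDF5 dataset. A hedged sketch of how those artifacts could be read back for querying; the paths, dataset name and num_perm are assumptions taken from the attributes used above, and the MinHash is rebuilt directly from the stored hashvalues:

import pickle

import h5py
from datasketch import MinHash


def query_saved_forest(lshforestpath, hashes_path, dataset_name, fam, num_perm=128, k=10):
    # Load the pickled MinHashLSHForest written by saver().
    with open(lshforestpath, 'rb') as f:
        forest = pickle.load(f)
    # Read the stored hash values for one family and rebuild its MinHash.
    with h5py.File(hashes_path, 'r') as h5hashes:
        hashvalues = h5hashes[dataset_name][fam, :]
    query_minhash = MinHash(num_perm=num_perm, hashvalues=hashvalues)
    return forest.query(query_minhash, k)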
Example #4
class AutoTag():

    def __init__(self, num_permutation=60):
        self.__num_permutation = num_permutation
        self.__forest = MinHashLSHForest(self.__num_permutation)
        self.__lem = WordNetLemmatizer()
        stop_words = set(stopwords.words("english"))
        stop_words.add('—')
        stop_words.add('And')
        self.__stop_words = stop_words

    def fit(self, csv):
        df = pd.read_csv(csv)
        df.drop_duplicates(subset='webURL', keep=False, inplace=True)
        df.dropna(inplace=True)
        for index, row in df.iterrows():
            min_hash = self.make_min_hash(self.make_clean_words_list(row['Text']))
            self.__forest.add(row['webURL'], min_hash)
            if index % 100 == 0:
                print(index, end='\r', flush=True)
        self.__forest.index()


    def make_clean_words_list(self, text):
        # Keep letters only
        text = re.sub('[^a-zA-Z]', ' ', text)

        # Convert to lowercase
        text = text.lower()

        # Remove tags
        text = re.sub("&lt;/?.*?&gt;", " &lt;&gt; ", text)

        # Remove special characters and digits
        text = re.sub(r"(\d|\W)+", " ", text)

        # Lemmatisation
        text = text.split()
        text = [self.__lem.lemmatize(word) for word in text if word not in self.__stop_words]

        return text


    def predict(self, text, num_of_neighbors):
        # TODO: change results into tags
        query = self.make_min_hash(self.make_clean_words_list(text))
        return self.__forest.query(query, num_of_neighbors)




    def make_min_hash(self, words):
        min_hash = MinHash(self.__num_permutation)
        for word in words:
            min_hash.update(word.encode('utf8'))
        return min_hash


    def load_trained_model(self, trained_model_file_name, num_of_permutations):
        self.__forest = pickle.load(open(trained_model_file_name, 'rb'))
        self.__num_permutation = num_of_permutations

    def save_model(self, file_name):
        pickle.dump(self.__forest, open(file_name, 'wb'))
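
A hypothetical way to drive the AutoTag class above; the CSV path is a placeholder, the column names 'webURL' and 'Text' come from fit(), and the NLTK corpora the class relies on must be available first:

import nltk

# The class assumes NLTK stopwords and WordNet are present locally.
nltk.download('stopwords')
nltk.download('wordnet')

tagger = AutoTag(num_permutation=60)
tagger.fit('articles.csv')          # placeholder CSV with 'webURL' and 'Text' columns
tagger.save_model('autotag_forest.pkl')
print(tagger.predict('an article about neural networks and embeddings', 5))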
Example #5
class LshNN(ProgramNN):
	CACHE_DIR = 'cache/'

	def __init__(self, sampledDataPath, num_perm=128, top_k=1, evict_cache=False):
		"""
		An agent class to find rubric sampled nearest neighbour of a given
		program by using a MinHash LSH forest.

		"""
		self.sampledDataPath = sampledDataPath
		self.num_perm = num_perm
		self.top_k = top_k
		self.evict_cache = evict_cache
		self.rawProgramData, self.sampledData = self.loadSyntheticData()
		self.create_lsh_forest()


	def create_lsh_forest(self):
		cache_file = os.path.join(self.CACHE_DIR, 'lsh_forest.pkl')
		if not self.evict_cache and os.path.isfile(cache_file):
			# load precomputed
			print('Loading cached forest')
			self.forest = load_pickle(cache_file)
		else:
			sampledSets = self.processData(self.sampledData)
			self.sampledMinHashes = self.createMinHashSet(sampledSets)

			self.forest = MinHashLSHForest(num_perm=self.num_perm)
			for prog_idx, minHash in enumerate(self.sampledMinHashes):
				self.forest.add(prog_idx, minHash)

			self.forest.index()

			os.makedirs(self.CACHE_DIR, exist_ok=True)
			save_pickle(self.forest, cache_file)

	def minHash(self, code_tokens):
		minHash = MinHash(num_perm=self.num_perm)
		for d in code_tokens: # TODO modify this for n-grams
			minHash.update("".join(d).encode('utf-8'))

		return minHash

	# create minHash objects for every dataset
	def createMinHashSet(self, dataset):
		minHashes = []
		for code in tqdm(dataset):
			minHashes.append(self.minHash(code))
		return minHashes

	def multi_dict_get(self, key, all_dicts):
		for dic in all_dicts:
			if key in dic:
				return dic[key]
		raise ValueError('Key not in any of the dictionaries')

	def loadSyntheticData(self):
		cache_file = os.path.join(self.CACHE_DIR, 'lsh_programs.pkl')
		if not self.evict_cache and os.path.isfile(cache_file):
			data = load_json(cache_file)
			prog_items = data['raw_programs']
			anon_progs = data['anon_programs']
		else:
			standard_path = self.sampledDataPath + '/standard/train' + SYNTH_NAME
			uniform_path = self.sampledDataPath + '/uniform/train' + SYNTH_NAME
			tempered_path = self.sampledDataPath + '/tempered/train' + SYNTH_NAME
			standardDict = pickle.load(open(standard_path, "rb" ))
			uniformDict = pickle.load(open(uniform_path, "rb" ))
			temperedDict =  pickle.load(open(tempered_path, "rb" ))

			all_dicts = [standardDict, uniformDict, temperedDict]

			# this step is not stable across different runs if caching forest
			# so this needs to be cached too
			prog_items = list(standardDict.keys() | uniformDict.keys() | temperedDict.keys())
			anon_progs = [self.multi_dict_get(prog, all_dicts) for prog in prog_items]
			data = dict(raw_programs=prog_items, anon_programs=anon_progs)

			os.makedirs(self.CACHE_DIR, exist_ok=True)
			save_json(data, cache_file)

			# if we dont load cache here, we should regenerate forest too
			self.evict_cache = True

		return prog_items, anon_progs



	def transformCode(self, program):
		splitCode = program.split()
		return splitCode
		#return ngrams(splitCode, 3)

	# tokenize every sentence and return a list of sentences
	def processData(self, dataset):
		processed = []
		for datum in dataset:
			transformedCode = self.transformCode(datum)
			processed.append(transformedCode)
		return processed

	def findNearestNeighbours(self, studentProgram, **kwargs):
		minHash = self.minHash(self.transformCode(studentProgram))
		result = self.forest.query(minHash, self.top_k)
		top_k_programs_anon = [self.sampledData[idx] for idx in result]
		top_k_programs = [self.rawProgramData[idx] for idx in result]
		#return top_k_programs, top_k_programs_anon
		return top_k_programs
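
A hypothetical instantiation of LshNN; the directory layout (standard/uniform/tempered), the SYNTH_NAME constant and the load/save helper functions are assumptions the class above already makes, and the student program is a placeholder:

nn = LshNN('data/raw/liftoff', num_perm=128, top_k=3)
student_program = 'def run():\n    move()\n    turn_left()'
for neighbour in nn.findNearestNeighbours(student_program):
    print(neighbour)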
Example #6
    artist_shingle[artist].append(tokens)


from datasketch import MinHashLSHForest, MinHash
from sklearn.metrics import jaccard_similarity_score

g = []

listlsh = []
lsh = MinHashLSHForest(num_perm=128)
for artist,sets in artist_shingle.items():
    a = MinHash(num_perm=128)
    for d in sets[0]:
        a.update(d.encode('utf8'))
    listlsh.append(a)
    lsh.add(artist,a)

lsh.index()
tester = {}
with open('tester.json') as file:
    tester = json.loads(file.read().encode('latin-1'))
numcorrect_1 = 0
numcorrect_5 = 0
numcorrect_10 = 0
total = 0
for artist,songlist in tester.items():
    for song in songlist:
        m1 = MinHash(num_perm=128)
        songp = clean_text(song['lyrics'])
        for d in songp:
            m1.update(d.encode('utf8'))
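        # The fragment ends here; a hedged guess at how the counters above
        # (numcorrect_1/5/10, total) are presumably updated for each test song:
        result = lsh.query(m1, 10)
        if artist in result[:1]:
            numcorrect_1 += 1
        if artist in result[:5]:
            numcorrect_5 += 1
        if artist in result[:10]:
            numcorrect_10 += 1
        total += 1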
Example #7
# TODO: neither of these work well with puzzles...

df = pd.read_csv(
    '../chess-opening/csvs/lichess_db_standard_rated_2020-08_600+0.csv',
    nrows=1000000)


def create_min_hash(fens):
    min_hash = MinHash(num_perm=128)
    for fen in fens:
        min_hash.update(fen.encode('utf8'))
    return min_hash


user_df = df.groupby('username').agg({'fen': set, 'elo': 'mean'})
user_df['min_hash'] = user_df['fen'].apply(create_min_hash)

forest = MinHashLSHForest(num_perm=128)
for row in user_df.itertuples():
    forest.add(row.Index, row.min_hash)

forest.index()

for i in range(10):
    result = forest.query(user_df['min_hash'][i], 10)

    elos = []
    for username in result:
        elos.append(user_df.loc[username]['elo'])
    print(user_df['elo'][i], np.mean(elos))
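
One caveat with the loop above (not from the original script): the query user is itself in the forest, so it comes back in `result` and pulls the neighbour mean towards the user's own elo. A hedged variant that drops the query user before averaging:

for username, row in user_df.head(10).iterrows():
    # Ask for one extra neighbour, then drop the user itself.
    candidates = forest.query(row['min_hash'], 11)
    neighbour_elos = [user_df.loc[u, 'elo'] for u in candidates if u != username]
    print(row['elo'], np.mean(neighbour_elos))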
Example #8
data3 = ['minhash', 'is', 'probability', 'data', 'structure', 'for',
        'estimating', 'the', 'similarity', 'between', 'documents']

# Create MinHash objects
m1 = MinHash(num_perm=128)
m2 = MinHash(num_perm=128)
m3 = MinHash(num_perm=128)
for d in data1:
    m1.update(d.encode('utf8'))
for d in data2:
    m2.update(d.encode('utf8'))
for d in data3:
    m3.update(d.encode('utf8'))

# Create a MinHash LSH Forest with the same num_perm parameter
forest = MinHashLSHForest(num_perm=128)

# Add m2 and m3 into the index
forest.add("m2", m2)
forest.add("m3", m3)

# IMPORTANT: must call index() otherwise the keys won't be searchable
forest.index()

# Check for membership using the key
print("m2" in forest)
print("m3" in forest)

# Using m1 as the query, retrieve top 2 keys that have the highest Jaccard
result = forest.query(m1, 2)
print("Top 2 candidates", result)
Example #9
    string2hash = {}
    pbar = tqdm(total=len(string2label))
    for program, _ in string2label.items():
        tokens = program.split()
        minhash = MinHash()
        for token in tokens:
            minhash.update(token.encode('utf-8'))
        string2hash[program] = minhash
        pbar.update()
    pbar.close()

    forest = MinHashLSHForest()
    pbar = tqdm(total=len(string2hash))
    for program, minhash in string2hash.items():
        forest.add(program, minhash)
        pbar.update()
    pbar.close()
    forest.index()

    true_labels = []
    pred_labels = []
    zipf_labels = []
    for program, label in real_data.items():
        zipf = real_zipf[program]
        try:
            tokens = program.split()
        except:
            continue
        minhash = MinHash()
        for token in tokens:
Example #10
    return temp


# 3. Build the list of segmented documents
docment = []
for sentence in content_list:
    item_str = jieba_cut(sentence)
    docment.append(item_str)

# Build the MinHash structures
MinHashList = []
forest = MinHashLSHForest()
for i, line in enumerate(docment):
    hash_codes = get_minhash(line)
    MinHashList.append(hash_codes)
    forest.add(i, hash_codes)

# Index all keys so they can be searched
forest.index()

query = '国足输给叙利亚后,里皮坐不住了,直接辞职了'
print("query  str :", query)
# 4. Segment the query text
item_str = jieba_cut(query)
# Get the MinHash of item_str
minhash_query = get_minhash(item_str)

# 5. Query the forest for the Top-K neighbours most similar to the query
result = forest.query(minhash_query, 3)
for i in range(len(result)):
    print("vocab_id:", result[i], "jaccard :",
Example #11
def main():
    corpus = {}
    with open('corpus_data/preprocessedf_corpus.json') as file:
        corpus = json.loads(file.read().encode('Utf-8'))

    def processLyrics(lyrics):
        authors = {}
        for author in lyrics:
            for song in lyrics[author]:
                lyric = re.sub(r'\[[^>]+\]', '', song["lyrics"])
                lyric = re.sub(r'\([^>]+\)', '', lyric)
                lyric = re.sub(r'\{[^>]+\}', '', lyric)
                lyric = lyric.split(r'\s')
                for line in lyric:
                    line = re.sub(r'\n', ' ', line)
                    if author not in authors:
                        authors[author] = line
                    else:
                        authors[author] += line
        return authors

    import nltk
    from nltk.corpus import stopwords
    from collections import defaultdict
    from collections import Counter

    nltk.download('wordnet')
    from nltk.corpus import wordnet as wn

    def get_lemma(word):
        lemma = wn.morphy(word)
        if lemma is None:
            return word
        else:
            return lemma

    from nltk import word_tokenize

    def clean_text(text, ar):
        tokenized_text = word_tokenize(text.lower())
        tokenized_text = [token for token in tokenized_text if len(token) > 5]
        cleaned_text = [
            t for t in tokenized_text
            if re.match(r'[a-zA-Z\-][a-zA-Z\-]{2,}', t)
        ]
        if ar == 'sw':
            cleaned_text = [t for t in cleaned_text if t not in STOPWORDS]
        if ar == 'lm':
            cleaned_text = [get_lemma(token) for token in cleaned_text]
        if ar == 'rw':
            cleaned_text = [
                token for token in cleaned_text if token not in PROFANITY
            ]
        return cleaned_text

    STOPWORDS = set(stopwords.words('english'))
    with open('corpus_data/rapsw.txt') as infile:
        infile = infile.read()
        infile = infile.split()
        PROFANITY = set(infile)

    corpus = processLyrics(corpus)

    for author, text in corpus.items():
        corpus[author] = clean_text(text, sys.argv[1])

    artist_shingle = defaultdict(list)
    for artist, lyrics in corpus.items():
        #tokens = [w for w in tokens if not w in sw]
        #shingle3 = set([tuple(tokens[i:i+3]) for i in range(len(tokens) - 3 + 1) if len(tokens[i]) < 10])
        #shingle2 = set([tuple(tokens[i:i+2]) for i in range(len(tokens) - 2 + 1) if len(tokens[i]) < 10])
        shingle1 = lyrics
        # set([tokens[i] for i in range(len(tokens) - 1 + 1) if len(tokens[i]) < 4])
        artist_shingle[artist].append(shingle1)
        #artist_shingle[artist].append(shingle2)
        #artist_shingle[artist].append(shingle3)

    from datasketch import MinHashLSHForest, MinHash
    from sklearn.metrics import jaccard_similarity_score

    listlsh = []
    lsh = MinHashLSHForest(num_perm=128)
    for artist, sets in artist_shingle.items():
        a = MinHash(num_perm=128)
        for d in sets[0]:
            a.update(d.encode('utf8'))
        listlsh.append(a)
        lsh.add(artist, a)

    lsh.index()

    m1 = MinHash(num_perm=128)
    g = []
    with open(sys.argv[2]) as g:
        g = g.read()
        g = g.split()
    for d in g:
        m1.update(d.encode('utf8'))

    result = lsh.query(m1, 5)
    print(" (Up to) Top 5 candidates", result)
Example #12
    print(doc['title'])
    # filename = "result_minHashLSHforest.txt"
    # myfile = open(filename, 'a+')
    # myfile.write(doc['title'] + '\n')
    if result:
        for item in result:
            doc = docs_col.find_one({"_id": ObjectId(str(item))})
            print(doc['title'])
            # myfile.write(doc['title'] + '\n')
    # myfile.write("================" + '\n')
    print("=====================")


if __name__ == '__main__':
    forest = MinHashLSHForest(num_perm=128)
    documents_en = docs_col.find({"lang": 'english'})
    for item in documents_en:
        minhash = MinHash(num_perm=128)
        list_keyword = item["keyword"].split(",")
        for k in list_keyword:
            minhash.update(k.encode("utf-8"))
        forest.add(str(item["_id"]), minhash)
    forest.index()

    documents_vi = docs_col.find({"lang": 'vietnamese'})
    start_time = time.time()
    for doc in documents_vi:
        # pdb.set_trace()
        query_candidates(doc, 5)
    elapsed_time = time.time() - start_time
    print(elapsed_time)
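
query_candidates() is only partially visible at the top of this example; a hedged sketch of the missing part, consistent with how the forest is built above (a MinHash over the document's comma-separated keywords):

def query_candidates(doc, k):
    minhash = MinHash(num_perm=128)
    for keyword in doc["keyword"].split(","):
        minhash.update(keyword.encode("utf-8"))
    result = forest.query(minhash, k)
    # ... followed by the title printing shown at the top of this example
    return result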
Example #13
class DataSketch(BaseANN):
    def __init__(self, metric, n_perm, n_rep, n_leaves):
        if metric not in ('jaccard',):
            raise NotImplementedError("Datasketch doesn't support metric %s" %
                                      metric)
        self._n_perm = n_perm
        self._n_rep = n_rep
        self._n_leaves = n_leaves
        self._metric = metric
        self.name = 'Datasketch(n_perm=%d, n_rep=%d, n_leaves=%d)' % (
            n_perm, n_rep, n_leaves)

    def fit(self, X):
        self.index = numpy.empty([0, self._n_perm])  # MinHash digest length equals num_perm
        self._index_minhash = []
        self._ball_index = []
        self._index = MinHashLSHForest(num_perm=self._n_perm, l=self._n_rep)

        for i, x in enumerate(X):
            m = MinHash(num_perm=self._n_perm)
            for e in x:
                m.update(str(e).encode('utf-8'))
            self._index.add(str(i), m)
            #self.index.append(m.digest())
            self.index = numpy.vstack((self.index, m.digest()))
            self._ball_index.append(m.digest())
            self._index_minhash.append(m)
        self._index.index()
        self._X = X

        self.tree = BallTree(self.index, leaf_size=self._n_leaves)

        # self._annoy = annoy.AnnoyIndex(X.shape[1], metric='euclidean')
        # for i, x in enumerate(X):
        #     self._annoy.add_item(i, x.tolist())
        # self._annoy.build(100)

    def query(self, v, n):
        print("&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&")
        m = MinHash(num_perm=self._n_perm)
        for e in v:
            m.update(str(e).encode('utf-8'))

        # for i in self._annoy.get_nns_by_vector(v.tolist(), n, 100):
        #     print(self._index_minhash[int(i)].jaccard(m))

        dist, ind = self.tree.query([m.digest()], k=n)
        for i in ind[0]:
            # print(i)
            print(self._index_minhash[int(i)].jaccard(m))
        print("=======================")
        brute_indices = self.query_with_distances(m.digest(), n)
        for i in brute_indices:
            print(self._index_minhash[int(i)].jaccard(m))
        print("-----------------------")
        ind2 = self._index.query(m, n)
        for i in ind2:
            print(self._index_minhash[int(i)].jaccard(m))

        # return map(int, ind[0])
        return self.query_with_distances(m.digest(), n)

    popcount = []
    for i in range(256):
        popcount.append(bin(i).count("1"))

    def query_with_distances(self, v, n):
        """Find indices of `n` most similar vectors from the index to query vector `v`."""
        if self._metric == 'jaccard':
            dists = numpy.array(
                [pd[self._metric]['distance'](v, e) for e in self.index])
        else:
            assert False, "invalid metric"  # shouldn't get past the constructor!
        # partition-sort by distance, get `n` closest
        nearest_indices = dists.argsort()[-n:][::-1]

        return nearest_indices
Example #14
class duplicate_docs:
    def __init__(self):
        self.lsh = MinHashLSHForest(
            num_perm=config.LSH_CONFIG['num_permutation'])

    def load(self, model):
        print('loading %s ...' % (model))
        if os.path.isfile(model):
            return joblib.load(model)
        else:
            return None

    def save(self, model, path):
        print('saving %s ...' % (path))
        joblib.dump(model, path)
        return

    # load data from list documents
    def run(self, docs):
        count = 1
        for itemid, content in docs.items():
            try:
                doc = document(content)
                self.insert(doc, key=itemid)
                print('\rpushed %d items' % (count)),
                sys.stdout.flush()
                count += 1
            except:
                pass
        self.lsh.index()
        print('')

    def run_ex(self, itemid, content, call_index=True):
        try:
            doc = document(content)
            self.insert(doc, key=itemid)
            if call_index:
                self.lsh.index()
        except:
            pass

    def query(self, doc, topn=1000):
        try:
            doc = unicodedata.normalize('NFKC', doc)
            doc = document(doc)
            minhash = doc.get_minhash(doc.k_shingles,
                                      config.MINHASH_CONFIG['num_permutation'])
            return self.lsh.query(minhash, topn)
        except:
            return []

    # insert a document object
    # output: key if document does not exist duplicate item
    # otherwise return alert duplication.
    def insert(self, doc, key=None):
        if key is None:
            key = utils.id_generator()
        minhash = doc.get_minhash(doc.k_shingles,
                                  config.MINHASH_CONFIG['num_permutation'])
        if len(doc.k_shingles) == 0:
            return u'Does not insert this document to database.\nDocument\'s shingle = 0.\nDocument need to contain at least %d word' \
                   % (config.SHINGLE_CONFIG['k'])
        self.lsh.add(key, minhash)

    def load_model(self):
        self.lsh = self.load('model/lsh.pkl')
        self.docs = self.load('model/docs.pkl')
        self.docs_time = self.load('model/docs_time.pkl')
        if self.lsh is not None and self.docs is not None and self.docs_time is not None:
            return True
        return False

    def save_model(self):
        utils.mkdir('model')
        self.save(self.lsh, 'model/lsh.pkl')
        self.save(self.docs, 'model/docs.pkl')
        self.save(self.docs_time, 'model/docs_time.pkl')
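
Hypothetical usage of duplicate_docs; the `document`, `config` and `utils` modules are project-level dependencies assumed by the class above, and the sample texts are placeholders:

dedup = duplicate_docs()
dedup.run({
    'item-1': 'first news article text goes here',
    'item-2': 'second news article text goes here',
})
# Returns up to topn keys whose shingle MinHashes look similar to the query text.
print(dedup.query('first news article text goes here', topn=10))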
Example #15
# create a minhash for each row(product) by calling the 'create_minhash' function
df_tfidf['Minhash'] = df_tfidf.apply(create_minhash, axis=1)

# create a list with all the Minhash signatures
minhash_list = df_tfidf['Minhash']

# create a MinHashLSHForest object with num_perm parameter equal to sample_size(=128)
# num_perm: the number of permutation functions
forest = MinHashLSHForest(num_perm=128)

# add each Minhash signature into the index
for i, minhash in enumerate(minhash_list):
    # Add each MinHash signature into the index under key "m<i>"
    forest.add("m" + str(i), minhash)

# call index() in order to make the keys searchable
forest.index()

# create the recommendations by retrieving the top 10 keys that have the highest Jaccard for each product

def make_recs(doc_id, n_recs):
    """
    This function takes the id of the target product and returns the top n_recs(=10) keys that have the higest Jaccard
    :param doc_id: the id of the target product
    :param n_recs: the number of similar products to be returned
    :return: top n_recs keys that have the higest Jaccard for each product
    """
    query = minhash_list[doc_id]
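    # The snippet is cut off here; a hedged guess at the rest of make_recs(),
    # following the query pattern used in the other forest examples above:
    return forest.query(query, n_recs)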
Example #16
class HashJaccard(FilterProblem):
    """
  A class that does clustering based on hashes from the datasketch library.
  """
    @property
    def num_perm(self):
        return DATA_FILTERING["num_permutations"]

    @property
    def DataPointClass(self):
        return DataPoint

    # Find nearest medoid for a data point.
    def find_nearest_medoid(self, data_point, data_tag=""):
        nearest_medoid = self.forest.query(data_point.min_hash, 1)
        if not nearest_medoid:
            nearest_medoid = [
                random.randint(0, self.num_clusters[data_tag] - 1)
            ]
        return nearest_medoid[0]

    # Do the clustering of sources and targets.
    def clustering(self, data_tag):
        """
    Params:
      :data_tag: Whether it's source or target data.
    """

        # Create a min hash forest to quickly find nearest neighbours.
        self.forest = MinHashLSHForest(num_perm=self.num_perm)

        # Initialize clusters.
        medoids = random.sample(range(len(self.data_points[data_tag])),
                                self.num_clusters[data_tag])

        for i in range(self.num_clusters[data_tag]):
            cl = self.ClusterClass(self.data_points[data_tag][medoids[i]])
            self.clusters[data_tag].append(cl)

            # Put the medoids into the forest.
            self.forest.add(i, self.clusters[data_tag][-1].medoid.min_hash)
        self.forest.index()

        # For each data_point find a cluster.
        self.cluster_points(data_tag)

        # These will be needed for the stopping criterion.
        cluster_names = [
            self.clusters[data_tag][i].medoid.string
            for i in range(self.num_clusters[data_tag])
        ]
        cluster_names_old = list(cluster_names)
        count = 0
        counts = []
        exit = False

        # Clustering loop.
        while not exit:
            count += 1

            # Find the point that minimizes the mean distance within a cluster.
            self.find_medoid(data_tag)

            # Create new forest.
            self.forest = MinHashLSHForest(num_perm=self.num_perm)
            for i in range(self.num_clusters[data_tag]):
                self.forest.add(i, self.clusters[data_tag][i].medoid.min_hash)
            self.forest.index()

            # Assign each point to the new medoids.
            self.cluster_points(data_tag)

            # Check the stopping criteria.
            exit, cluster_names, cluster_names_old, counts = self.stop_clustering(
                data_tag, cluster_names, cluster_names_old, count, counts)
Example #17
stop = []
# Get the segmented documents
documents = []
for item_text in sentences:
    # Segment item_text
    item_str = get_item_str(item_text)
    documents.append(item_str)

# Create the LSH Forest and the MinHash objects
minhash_list = []
forest = MinHashLSHForest()
for i in range(len(documents)):
    # Get the MinHash of documents[i]
    temp = get_minhash(documents[i])
    minhash_list.append(temp)
    forest.add(i, temp)
# Index all keys so they can be searched
forest.index()

query = '00:01:36,2019天猫双11总成交额超100亿元'
# Segment the query text
item_str = get_item_str(query)
# Get the MinHash of item_str
minhash_query = get_minhash(item_str)

# Query the forest for the Top-K neighbours most similar to the query
result = forest.query(minhash_query, 3)
for i in range(len(result)):
    print(result[i], minhash_query.jaccard(minhash_list[result[i]]),
          documents[result[i]].replace(' ', ''))
print("Top 3 邻居", result)
Example #18
def printStats(json_filename):
    with open(json_filename) as json_data:
        d = json.load(json_data)

        # Query simple index: queryNum -> queryText
        queryIndex = {}

        # Index of queries as a LSH forest for top-k similar queries.
        queriesLSHIndex = MinHashLSHForest(num_perm=128)

        # You can grok the CSV from stdout by using cut, e.g.,
        #
        # $ python analyzer.py -i ../../data/queries_ASTs.json | grep "csv:" | cut -d':' -f2 > /tmp/out.csv
        print('csv:"queryNum","numExplicitJoins","referencedTables","groupByColumns","numGroupByClauses"')

        for queryNum, entry in enumerate(d):
            print('\n=> Stats for query number "%s":' % queryNum)

            # Group by clauses.
            groupByColumns = jmespath.search(
                'ast.statement[*].group.expression[*].name[]', entry)
            print('groupBy columns: %s' % groupByColumns)

            # Base tables when the query has no joins.
            baseTables = jmespath.search(
                'ast.statement[?from.variant == \'table\'].from.name[]', entry)
            print('baseTables: %s' % baseTables)

            # Base tables when the query has joins.
            baseTables += jmespath.search(
                'ast.statement[?from.variant == \'join\'].from.source.name[]',
                entry)
            print('baseTables (with joins): %s' % baseTables)

            # Join tables.
            joinTables = jmespath.search(
                'ast.statement[?from.variant == \'join\'].from.map[*].source.name[]',
                entry)
            print('joinTables: %s' % joinTables)

            # All tables mentioned in the query
            referencedTables = baseTables + joinTables

            # Joins.
            joinPathPrefix = 'ast.statement[*].from.map[*].constraint.on'
            joinsLeft = jmespath.search(joinPathPrefix + '.left.name', entry)
            joinsRight = jmespath.search(joinPathPrefix + '.right.name', entry)
            print('explicit joins (left-hand side): %s' % joinsLeft)
            print('explicit joins (right-hand side): %s' % joinsRight)

            # Text
            queryText = jmespath.search('queryText', entry)

            # Index it into an LSH forest for top-k textually similar queries.
            queryLSH = getQueryMinHash(queryText)
            queryIndex[queryNum] = {
                'queryText': queryText,
                'queryLSH': queryLSH
            }
            queriesLSHIndex.add(queryNum, queryLSH)

            # Sort for a prettier CSV dump.
            referencedTables.sort()
            groupByColumns.sort()
            # CSV header:
            # queryNum,numExplicitJoins,referencedTables,groupByColumns,numGroupByColumns
            print('queryNum = %s' % queryNum)
            print('csv:"%s","%s","%s","%s","%s"' % (
                queryNum, len(joinsLeft[0]) if len(joinsLeft) > 0 else 0,
                ','.join(referencedTables), ','.join(groupByColumns),
                len(groupByColumns)))

            # Populate a reverse index from table to script.
            tableToQuery = {}
            for referencedTable in referencedTables:
                if referencedTable not in tableToQuery:
                    tableToQuery[referencedTable] = [queryNum]
                else:
                    tableToQuery[referencedTable].append(queryNum)

        # Sample search on LSH forest index: top-3 most similar queries.
        queriesLSHIndex.index()
        k = 3
        queryNum = 10
        query = queryIndex[queryNum]
        print('\n\nTop %s queries similar to "%s":' % (k, query['queryText']))
        top_k = queriesLSHIndex.query(query['queryLSH'], k)
        for k in top_k:
            print('\n"%s"' % queryIndex[k]['queryText'])
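
getQueryMinHash() is referenced above but not defined in this fragment; a plausible implementation, assuming queries are minhashed over whitespace tokens with the same num_perm as the forest:

from datasketch import MinHash


def getQueryMinHash(queryText, num_perm=128):
    minhash = MinHash(num_perm=num_perm)
    for token in queryText.split():
        minhash.update(token.encode('utf-8'))
    return minhash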
Example #19
    'estimating', 'the', 'similarity', 'between', 'documents'
]
data3 = [
    'minhash', 'is', 'probability', 'data', 'structure', 'for', 'estimating',
    'the', 'similarity', 'between', 'documents'
]

dataset = [[0., 0., 0.], [0., 0., 1.], [0., 1., 0.], [0., 1., 1.]]
# Create a MinHash LSH Forest with the same num_perm parameter
forest = MinHashLSHForest(num_perm=128)

for i, data in enumerate(dataset):
    m = MinHash(num_perm=128)
    for d in data:
        m.update(str(d).encode('utf8'))
    forest.add(str(i), m)

# IMPORTANT: must call index() otherwise the keys won't be searchable

pickle.dump(forest, open('forest.lsh', 'wb'))
del forest
forest = pickle.load(open('forest.lsh', 'rb'))

forest.index()

# Check for membership using the key
print("1" in forest)
print("2" in forest)

m = MinHash(num_perm=128)
for d in dataset[0]:
    m.update(str(d).encode('utf8'))


class LshSamplesEval:

	'''STUDENT_PATH = '../studentData/liftoff/'
	STANDARD_PATH = '../data/raw/liftoff/standard/'
	UNIFORM_PATH = '../data/raw/liftoff/uniform/'
	TEMPERED_PATH = '../data/raw/liftoff/tempered/'''

	def __init__(self, studentDataPath, sampledDataPath):
		print('Loading data...')
		self.studentDataPath = studentDataPath
		print(self.studentDataPath)
		self.sampledDataPath = sampledDataPath
		print(self.sampledDataPath)
		self.sampledData = self.loadSyntheticData()
		self.studentData = self.loadStudentData()

	def loadStudentData(self):
		path = self.studentDataPath + STUDENT_NAME
		datadict = pickle.load(open(path, "rb" ))
		return list(datadict.keys())

	def loadSyntheticData(self):
		standard_path = self.sampledDataPath + '/standard/train' + SYNTH_NAME
		uniform_path = self.sampledDataPath + '/uniform/train' + SYNTH_NAME
		tempered_path = self.sampledDataPath + '/tempered/train' + SYNTH_NAME
		standardDict = pickle.load(open(standard_path, "rb" ))
		uniformDict = pickle.load(open(uniform_path, "rb" ))
		temperedDict =  pickle.load(open(tempered_path, "rb" ))
		#import pdb; pdb.set_trace()
		return list(standardDict.values()) + list(uniformDict.values()) + list(temperedDict.values())

	def computeLshNN(self):
		print('Processing sampled and student data...')
		sampledSets = self.processData(self.sampledData)
		studentSets = self.processData(self.studentData)

		print('Finding nearest neighbors from sampled data...')
		sampledScores = self.constructNNList(studentSets, sampledSets, self.studentData, self.sampledData)

		print('Found nearest neighbors for data!')

		# self.constructHistogram(sampledScores)

		return sampledScores

	# tokenize every sentence and return a list of sentences
	def processData(self, dataset):
		processed = []
		for datum in dataset:
			splitCode = datum.split()
			processed.append(splitCode)
		return processed

	# runs MinHashLsh
	def constructNNList(self, studentSets, sampledSets, studentData, sampledData):
		print('Creating min-hashes for student data')
		self.studentMinHashes = self.createMinHash(studentSets)
		print('Creating min-hashes for rubric data')
		self.sampledMinHashes = self.createMinHash(sampledSets)

		self.forest = MinHashLSHForest(num_perm = 128)
		i = 0
		for minHash in self.sampledMinHashes:
			self.forest.add(str(i), minHash)
			i += 1

		self.forest.index()

		print("calculating nearest neighbor")
		scores = []
		for i, query in enumerate(tqdm(self.studentMinHashes)):
			result = self.forest.query(query, 1)
			indexMatch = int(result[0])
			# Uncomment these to print examples of 
			# student code and their nearest neighbor!
			print(result)
			print('Student Code: \n')
			print(studentData[i])
			print('\n')
			print('Closest Sampled Code: \n')
			print(sampledData[indexMatch])
			print('\n')
			score = self.sampledMinHashes[indexMatch].jaccard(query)
			print('Score: \n')

			scores.append(score)

		return scores

	# create minHash objects for every dataset
	def createMinHash(self, dataset):
		minHashes = []
		for code in tqdm(dataset):
			minHash = MinHash(num_perm = 128)
			for d in code: # TODO modify this for n-grams
				minHash.update("".join(d).encode('utf-8'))
			minHashes.append(minHash)
		return minHashes

	def constructHistogram(self, scores):
		plt.hist(scores)
		plt.xlabel('Jaccard Similarity Score')
		plt.ylabel('Counts')
		plt.show()