def setUp(self):
    self.lsh = LSH(3, 2, 1)
    self.lsh_two_tables = LSH(3, 2, 2)
    # Overwrite randomly initialized planes with known values.
    self.lsh.planes = [np.array([[0.1, 0.2], [-0.1, -0.2], [-1.0, 1.0]])]
    self.lsh_two_tables.planes = [
        np.array([[0.1, 0.2], [-0.1, -0.2], [-1.0, 1.0]]),
        np.array([[-0.1, -0.2], [0.1, 0.2], [-2.0, 2.0]]),
    ]
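# Hedged illustration, not part of the original test: in hyperplane LSH the hash key of a
# vector is typically the sign pattern of its projections onto the planes, which is why the
# test pins `planes` to known values. A minimal sketch of that assumed hashing step:
def _hash_with_planes(planes, vector):
    # planes: shape (num_planes, dim); vector: shape (dim,)
    projections = np.dot(planes, vector)
    return ''.join('1' if p > 0 else '0' for p in projections)

# Under this assumed convention the first fixed table above hashes np.array([1.0, 2.0])
# to '101', since the projections are [0.5, -0.5, 1.0].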
def __init__(self, digest_length, num_hashtables, dTreshold):
    self._dTrehold = dTreshold
    self._lsh = LSH(digest_length, 2**20, storage_config=None, num_hashtables=num_hashtables)
    self._hashvect = HashVect()
def _build_lsh(self, minhashes):
    '''Builds the LSH using the given minhash vectors.'''
    self.lsh = LSH(minhashes)
    with open(self.LSH_FILE, 'wb') as f:
        pickle.dump(self.lsh.hash_tables, f)
def starter(image_id, m, k, l):
    query_results = Database().retrieve_many()
    id_vector_pairs = [(item["image_id"], item["vector"]) for item in query_results]
    search_results = LSH(l, k, id_vector_pairs).get_search_results(image_id, show=True)
    print("Original dataset size: {} | Reduced search space size: {} | Reduction by {} %".format(
        len(id_vector_pairs),
        len(search_results),
        float(len(id_vector_pairs) - len(search_results)) * 100 / len(id_vector_pairs),
    ))
    query_results = Database().retrieve_many(list(search_results))
    search_id_vector_pairs = [(item["image_id"], item["vector"]) for item in query_results]
    source_vector = Database().retrieve_one(image_id)["vector"]
    all_images = functions.find_similarity(source_vector, search_id_vector_pairs)
    similar_images = all_images[:m]
    print(similar_images)
    visualizer.visualize_lsh(image_id, similar_images)
    return similar_images, all_images
def cloud_main(self, file_count):
    # Note: `conn`, `cursor`, and `sql_select_Query` are assumed to be set up elsewhere;
    # see the variant of this method further below that creates them explicitly.
    with open('encrypted_vectors/feature_vectors.json') as data_file:
        feature_loaded = json.load(data_file)
    with open('encrypted_indices/indices.json') as data_file:
        indices_loaded = json.load(data_file)
    with open('encrypted_query/query_vectors.json') as data_file:
        query_loaded = json.load(data_file)
    d = Decrypt()
    feature_vectors = d.decrypt_indices_vector(bytes(feature_loaded))
    indices = d.decrypt_indices_vector(bytes(indices_loaded))
    query_vectors = d.decrypt_indices_vector(bytes(query_loaded))
    feature_vectors = np.frombuffer(feature_vectors, dtype=int)
    feature_vectors = np.reshape(feature_vectors, (file_count, -1))
    indices = json.loads(indices.decode())
    query_vectors = np.frombuffer(query_vectors, dtype=int)
    query_vectors = np.reshape(query_vectors, (-1, 6))
    print(feature_vectors.shape, indices, query_vectors.shape)
    l = LSH(feature_vectors, indices)
    n_neighbors, result = l.query(query_vectors, 6, 45)
    print(n_neighbors)
    cursor.execute(sql_select_Query)
    records = cursor.fetchall()
    for row in records:
        if row[0] - 1 in result:
            image_name = row[1]
            print(image_name)
            CloudAPISender().cloud_api_sender(image_name)
    # Closing the connection
    conn.close()
def loadOrTrainLSHModel(forceGenerate=False):
    lshModel = None
    if os.path.exists("./pickle_files/lshModel.pickle") and not forceGenerate:
        print("LSH model found on disk")
        pickleIn = open("./pickle_files/lshModel.pickle", "rb")
        lshModel = pickle.load(pickleIn)
    else:
        print("Training LSH model")
        trainAudioDataAndRateArray = loadAllFiles("train", '')
        trainingData = generateData(trainAudioDataAndRateArray)
        lshModel = LSH()
        for data in trainingData:
            print("fileName", data[1])
            validFrameList = extractValidFrames(data[0])
            for i in range(0, validFrameList.shape[0]):
                reshapedValidFrame = validFrameList[i].reshape(1, -1)
                # print("reshapedValidFrame", reshapedValidFrame)
                lshModel.train(reshapedValidFrame, {
                    "name": data[1] + "_" + str(i),
                    "frameIndex": i
                })
            # hr.train(validFrames[1:2], data[1])
        # print("lshModel", lshModel)
        pickleOut = open("./pickle_files/lshModel.pickle", "wb")
        pickle.dump(lshModel, pickleOut)
        pickleOut.close()
    return lshModel
def get_similar(num_layers, num_hashes, features, filepaths, query_image, num_results):
    lsh = LSH(num_layers, num_hashes, 10)
    lsh.fit(features)
    query_vec = get_cm_features_by_image_path(query_image)
    similar_indices = lsh.get_similar(query_vec, num_results)[:num_results]
    return [list(features[x]) for x in similar_indices], [filepaths[x] for x in similar_indices]
def load_lsh(self):
    '''Loads the buckets from the files and initializes an LSH object using this data.'''
    with open(self.LSH_FILE, 'rb') as f:
        data = pickle.load(f)
    self.lsh = LSH(table=data)
def __init__(self, N, D, K, L):
    self.lsh = LSH(SimHash(D, K, L), K, L)
    self.keys = np.zeros((N, D), dtype=np.float32)
    self.values = np.zeros((N, 1), dtype=np.float32)
    self.lru = np.zeros(N, dtype=np.float32)
    self.key2idx = dict()
    self.size = 0
    self.max_memory = N
    self.K = K
    self.L = L
def query(fname, key='key', topk=10, truncate=80):
    model = pickle.load(open(CONST.MODEL, 'rb'))
    dataframe = pd.read_csv(CONST.DATASET)
    corpus = TfidfVectorizer().fit_transform(dataframe['content'])
    lsh = LSH(corpus, model)
    index = dataframe[dataframe[key].apply(str) == str(fname)].index[0]
    dataframe['content'] = dataframe['content'].str[:int(truncate)]
    return (lsh.query(corpus[index, :], int(topk), 10)[0]
            .join(dataframe, on='id')
            .sort_values('distance')
            .iloc[:, 1:])
def __init__(self, movie_filename, rating_filename, k, m, c):
    # Hyperparameter
    self.c = c

    # read movie file and create dictionary _movie_names
    self._movie_names = {}
    f = open(movie_filename, "r", encoding="utf8")
    reader = csv.reader(f)
    next(reader)  # skips header line
    for line in reader:
        movieid = line[0]
        moviename = line[1]
        # ignore line[2], genre
        self._movie_names[movieid] = moviename
    f.close()

    # read rating file and create _movie_ratings (ratings for a movie)
    # and _user_ratings (ratings by a user) dicts
    self._movie_ratings, self._movie_time = {}, {}
    self._user_ratings, self._user_time = {}, {}
    f = open(rating_filename, "r", encoding="utf8")
    reader = csv.reader(f)
    next(reader)  # skips header line
    for line in reader:
        userid = line[0]
        movieid = line[1]
        rating = line[2]
        timestamp = line[3]
        if userid not in self._user_ratings:
            self._user_ratings[userid] = {}  # each user is a dict with movies and ratings
            self._user_time[userid] = {}
        self._user_ratings[userid][movieid] = float(rating)
        self._user_time[userid][movieid] = float(timestamp)
        if movieid not in self._movie_ratings:
            self._movie_ratings[movieid] = {}
            self._movie_time[movieid] = {}
        self._movie_ratings[movieid][userid] = float(rating)
        self._movie_time[movieid][userid] = float(timestamp)
    f.close()
    self.me = LSH(k, m, self._user_ratings, self._movie_ratings)
def run(dataPath):
    nrofBands = 25
    nrOfPerms = 132
    nrOfRows = int(nrOfPerms / nrofBands)

    # Load data
    data = np.load(dataPath)
    print("Data is loaded")

    # Create an empty file to add results
    with open('results.txt', 'w') as file:
        file.write("")

    # Create Signature Matrix
    signatureMatrix = minHashing(data, nrOfPerms)
    pairsFound = LSH(data, signatureMatrix, nrOfRows, nrofBands)
    print("Number of similar pairs found is:", len(pairsFound))
    return pairsFound
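# Hedged sketch, not this project's LSH implementation: the banding step implied by
# nrofBands/nrOfRows above typically hashes each band of `nrOfRows` signature rows per
# document into buckets and reports documents that collide in at least one band
# (assuming a (permutations x documents) signature matrix).
from collections import defaultdict

def lsh_banding_sketch(signatureMatrix, nrOfRows, nrofBands):
    candidate_pairs = set()
    n_docs = signatureMatrix.shape[1]
    for band in range(nrofBands):
        buckets = defaultdict(list)
        rows = signatureMatrix[band * nrOfRows:(band + 1) * nrOfRows, :]
        for doc in range(n_docs):
            buckets[tuple(rows[:, doc])].append(doc)
        for docs in buckets.values():
            for i in range(len(docs)):
                for j in range(i + 1, len(docs)):
                    candidate_pairs.add((docs[i], docs[j]))
    return candidate_pairs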
def cloud_main(self, file_count):
    # establishing the connection
    conn = mysql.connector.connect(user='******', password='******',
                                   host='127.0.0.1', database='ImageRetrieval')
    # Creating a cursor object using the cursor() method
    cursor = conn.cursor()
    # Preparing SQL query to select a record from the database.
    sql_select_Query = "select * from images"

    with open('encrypted_vectors/feature_vectors.json') as data_file:
        feature_loaded = json.load(data_file)
    with open('encrypted_indices/indices.json') as data_file:
        indices_loaded = json.load(data_file)
    with open('encrypted_query/query_vectors.json') as data_file:
        query_loaded = json.load(data_file)

    d = Decrypt()
    feature_vectors = d.decrypt_indices_vector(bytes(feature_loaded))
    indices = d.decrypt_indices_vector(bytes(indices_loaded))
    query_vectors = d.decrypt_indices_vector(bytes(query_loaded))

    feature_vectors = np.frombuffer(feature_vectors, dtype=int)
    feature_vectors = np.reshape(feature_vectors, (file_count, -1))
    indices = json.loads(indices.decode())
    query_vectors = np.frombuffer(query_vectors, dtype=int)
    query_vectors = np.reshape(query_vectors, (-1, 6))
    print(feature_vectors.shape, indices, query_vectors.shape)

    l = LSH(feature_vectors, indices)
    n_neighbors, result = l.query(query_vectors, 6, 45)
    print(n_neighbors)

    cursor.execute(sql_select_Query)
    records = cursor.fetchall()
    for row in records:
        if row[0] - 1 in result:
            image_name = row[1]
            print(image_name)
            CloudAPISender().cloud_api_sender(image_name)
    # Closing the connection
    conn.close()
from mrjob.job import MRJob
from mrjob.step import MRStep
import util
import os
from datastore import SQLiteDatastore
import config
import random

SPACE = u' '
PATH = os.path.dirname(os.path.abspath(__file__))

from lsh import LSH, DocumentTooShortError

datastore = SQLiteDatastore(config.SQLITE_PATH, False)
lsh = LSH(datastore)


class CandidatesMapReducer(MRJob):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.remaining = str()

    def steps(self):
        return [
            MRStep(
                mapper=self.mapper_paragraphs,
                mapper_final=self.mapper_paragraphs_final,
                reducer=self.reducer_minhash,
            ),
            MRStep(reducer=self.reducer_unique),
        ]
from create_vector import vectorize
from lsh import LSH
from parse_document import extract_from_pdf
import sys

if __name__ == "__main__":
    pdfs = sys.argv[1:]
    if pdfs:
        vector_list = [vectorize(extract_from_pdf(pdf)) for pdf in pdfs]
    else:
        print "Usage: python create_vector.py pdf1 [pdf2] [pdf3] .."
        sys.exit()
    lsh = LSH(300)
    [lsh.insert_document(title, vector) for (title, vector) in vector_list]
    # print lsh.get_similarities()
    print lsh.closest_match(vector_list[0][0])
    # return vector_list
import pandas as pd
import sqlite3

uniDB = UniprotDB("Uniprot_DB.sqlite")  # Construct the protein database
"""
uniDB.deleteProteins()
protManager = ProteinsManager()
uniDB.createTables()
protManager.loadProteins("Ecolx.xml", uniDB)
protManager.loadProteins("PseA7.xml", uniDB)
"""
minhash3 = LSH(0.3, 32)
minhash4 = LSH(0.3, 64)
minhash4b = LSH(0.3, 96)
minhash5 = LSH(0.3, 128)

# Create the minhashes
proteins = uniDB.extractProteins()
"""
minhashes3, lsh3 = minhash3.calculateLSH(proteins, 3)
minhashes4, lsh4 = minhash4.calculateLSH(proteins, 3)
minhashes4b, lsh4b = minhash5.calculateLSH(proteins, 3)
minhashes5, lsh5 = minhash6.calculateLSH(proteins, 3)
minhash3.saveLSH(32)
minhash4.saveLSH(64)
minhash4b.saveLSH(96)
from lsh import LSH
import cPickle as pickle

# define model
model = LSH(base_vec_num=3, iter_num=2, dimens=2)

# data
vec_dict = {
    "a": [0.1, 0.2],
    "b": [0.5, -0.2],
    "c": [-0.3, -0.1],
    "d": [1.0, 0.0],
    "e": [-3, 2],
    "f": [2, 2]
}

# build
model.build_lsh(vec_dict)

# get candidate set
for name in model.get_candidate_set([0.3, 0.9]):
    print name,
print ""

# test pickle
with open("model.pkl", 'wb') as fout:
    pickle.dump(model, fout)
with open("model.pkl", 'rb') as fin:
    dumped_model = pickle.load(fin)
print "dumped model:"
for name in dumped_model.get_candidate_set([0.3, 0.9]):
    print name,
with gzip.open(datasetPath + filenameIn + '_sample.warc.gz', mode='rb') as gzf:
    for record in warc.WARCFile(fileobj=gzf):
        record_id = record['WARC-Record-ID']
        payload = record.payload.read()
        doc_uri[record_id] = record['WARC-Target-URI']
        text = HTMLPreprocessing(payload).get_text()
        doc_dict[record_id] = text
        doc_count += 1

print 'create vectors'
tfidf = TFIDF(doc_dict)

vect_length = tfidf.vect_length  # length of the input vector
num_hashtables = 1  # number of iterations
digest_length = 0

print 'perform lsh'
lsh = LSH(digest_length, vect_length, num_hashtables=num_hashtables)
for i, k in enumerate(tfidf._id_list):
    vect = tfidf.get_vector(i)
    lsh.index(vect, extra_data=tfidf._id_list[i])

'''
Query documents
'''
dedup = set()
keys = lsh.hash_tables[0].keys()
i = 0
for key in keys:
    bucket = lsh.hash_tables[0].get_val(key)
    for query_object in bucket:
        candidates = lsh.query(query_object[0], distance_func='cosine')
        for c in candidates:
            candidate_key = c[0][1]  # warc id is appended as extra data in lsh.index()
            if candidate_key == query_object[1]:
def lsh_cluster(unlabel):
    return LSH(unlabel)
def train(device, data, schedule, mi_type, args):
    model = MI_Estimator(device, D=d, ED=ed, HD=256)
    model.to(device)
    model.train()
    optimizer = optim.Adam(model.parameters(), lr=5e-4)

    xs, ys = data
    xs = xs.to(device)
    ys = ys.to(device)

    lsh = LSH(SimHash(ed, K, L), K, L)
    estimates = []
    avg_estimate = []
    id_set = set()
    n_iters = num_iterations * batch_size
    for batch_idx in range(n_iters):
        iteration = batch_idx // batch_size
        MI = schedule[iteration]

        t = 10 if batch_idx <= 1000 else 100
        if batch_idx % t == 0:
            build(lsh, model, xs)

        optimizer.zero_grad()
        y = ys[batch_idx:batch_idx + 1]
        ey = model.embed_y(y)
        id_list = lsh.query(ey)
        id_set = id_set.union(set(id_list))

        indices = torch.LongTensor(id_list).to(device)
        nx = F.embedding(indices, xs)
        px = xs[batch_idx:batch_idx + 1]
        x = torch.cat([px, nx], dim=0)
        x = torch.unsqueeze(x, dim=0)

        mi = model(x, y, args)
        loss = -mi
        loss.backward()
        optimizer.step()

        avg_estimate.append(mi.item())
        if (batch_idx + 1) % 100 == 0:
            '''
            asim = model.cosine_similarity(x, y)
            true = torch.mean(torch.diag(asim))
            neye = 1. - torch.eye(batch_size).to(device)
            noise = torch.sum(torch.mul(asim, neye)).item() / (batch_size * (batch_size - 1))
            print("MI:{} true: {:.4f}, noise: {:.4f}".format(MI, true, noise))
            '''
            avg_mi = sum(avg_estimate) / float(len(avg_estimate))
            print('{} {} MI:{}, E_MI: {:.6f}'.format(mi_type.name, batch_idx + 1, MI, avg_mi))
            sys.stdout.flush()

        if (batch_idx + 1) % wsize == 0:
            print(len(id_set), len(id_set) // wsize)
            id_set.clear()
            avg_mi = sum(avg_estimate) / float(len(avg_estimate))
            estimates.append(avg_mi)
            avg_estimate.clear()

    lsh.stats()
    return estimates
))

if __name__ == "__main__":
    data, q = load_data()

    # Brute Force
    nn_brute, nn_brute_dist = brute_force_nn(q, data)

    # Ball Tree
    bt = BallTree(data, 10)
    nn_bt = bt.query_top_down(q)

    # LSH
    hash_fn_gen = lambda: guassian_hash_generator(150, data.shape[1])
    lsh = LSH(data, hash_fn_gen, 1, 10)
    nn_lsh, performace_limit = lsh.query(q)

    # Ball tree LSH
    print('balltree lsh performance limit', performace_limit)
    lsh = LSH(data, hash_fn_gen, 1, 3)
    bt_lsh = BallTreeLSH(bt, lsh)
    nn_bt_lsh = bt_lsh.query(q, performance_limit=performace_limit)

    # compare_results
    _, nn_lsh_to_q = brute_force_nn(nn_lsh, q)
    _, nn_bt_to_q = brute_force_nn(nn_bt, q)
    _, nn_bt_lsh_to_q = brute_force_nn(nn_bt_lsh, q)

    #########
    # Stats #
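# Hedged sketch: guassian_hash_generator is defined elsewhere in that project and its exact
# signature is assumed here. One common construction it could correspond to is a p-stable
# (Gaussian random projection) hash of the form h(x) = floor((a . x + b) / w):
import numpy as np

def gaussian_hash_generator_sketch(w, dim):
    a = np.random.normal(size=dim)   # random projection direction
    b = np.random.uniform(0, w)      # random offset within one bucket of width w
    return lambda x: int(np.floor((np.dot(a, x) + b) / w))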
# MySQL connection details need to be updated here
conn = mysql.connector.connect(
    user='******',
    password='******',
    host='127.0.0.1',
    database='ImageRetrieval')
# Creating a cursor object using the cursor() method
cursor = conn.cursor()
# Preparing SQL query to INSERT a record into the database.
insert_stmt = (
    "insert into images (path) "
    "values (%s)"
)
key = (0.1, 0.1)
num_of_random_vectors = 16
hc = HarrisCorner()
sb = SurfBow()
e = Encrypt()
l = LSH()
# Image directory path to be mentioned here
img_dir = "../images/"
feat_vec = []
if not os.path.exists("encrypted_images/"):
    cwd = os.getcwd()
    directory = "/encrypted_images"
    os.mkdir(cwd + directory)
if not os.path.exists("encrypted_indices/"):
    cwd = os.getcwd()
    directory = "/encrypted_indices"
    os.mkdir(cwd + directory)
if not os.path.exists("encrypted_vectors/"):
    cwd = os.getcwd()
    directory = "/encrypted_vectors"
    os.mkdir(cwd + directory)
print 'create tfidf vectors of documents'
tfidf = TFIDF(doc_dict)

'''
Perform lsh
'''
print time.asctime(time.localtime(time.time()))
digest_length = int(sys.argv[2])
vect_length = tfidf.vect_length
num_hashtables = 1
log += 'perform lsh with hash-length: ' + str(digest_length) + \
    ', vect-length: ' + str(vect_length) + \
    ', num-hashtables: ' + str(num_hashtables) + '\n'
print 'perform lsh with hash-length: ' + str(digest_length) + \
    ', vect-length: ' + str(vect_length) + \
    ', num-hashtables: ' + str(num_hashtables)
r = {"dict": None}
lsh = LSH(digest_length, vect_length, storage_config=r, num_hashtables=num_hashtables)
for i, k in enumerate(tfidf._id_list):
    vect = tfidf.get_vector(i)
    lsh.index(vect, extra_data=tfidf._id_list[i])

'''
Query documents
'''
log += str(time.asctime(time.localtime(time.time()))) + '\n'
log += 'query documents\n'
print time.asctime(time.localtime(time.time()))
print 'Query documents'
distance_func = "cosine"
corr = set()
for i, key in enumerate(tfidf._id_list):
    query_object = tfidf.get_vector(i)
def main():
    print("Importing the Dataset")
    # importing the dataset
    text = importing_the_dataset(args.na)
    documents_number = len(text)

    print("Shingling phase")
    # shingling all the documents
    texts_shingled = []
    for element in text:
        text_shingled = Shingling.shingle(text=element, k=args.shg, char=args.shg_char)
        texts_shingled.append(text_shingled)
    # flattening the list of shingles
    flat_texts_shingled = np.hstack(np.array(texts_shingled))
    print("number of shingles: ", flat_texts_shingled.shape[0])
    unique_flat_texts_shingled = np.unique(flat_texts_shingled)
    max_value = unique_flat_texts_shingled.shape[0]
    print("number of unique shingles: ", max_value)
    # building the dictionary with { unique shingle : number }
    shingle_dic = {unique_flat_texts_shingled[i]: i for i in range(0, unique_flat_texts_shingled.shape[0])}
    # shingles into integers
    text_num = texts_shingled
    for i, txt in enumerate(texts_shingled):
        for j, shg in enumerate(txt):
            text_num[i][j] = shingle_dic[shg]

    print("MinHashing phase")
    # number of hash functions
    hash_functions = args.hashes
    # For each of the hash functions, generate different coefficients 'a' and 'b'.
    a = MinHashing.random_coeff(hash_functions, max_value)
    b = MinHashing.random_coeff(hash_functions, max_value)
    # next prime after the greatest value
    c = MinHashing.next_prime(max_value)
    # min hashing - building the signature matrix of shape [hash_functions, number of documents]
    signatures_matrix = np.zeros((hash_functions, documents_number))
    for i, txt in enumerate(text_num):
        for j in range(0, hash_functions):
            min_hash = c + 1
            for element in txt:
                hashed_value = (a[j] * element + b[j]) % c
                if hashed_value < min_hash:
                    min_hash = hashed_value
            signatures_matrix[j][i] = min_hash
    print("min hashing matrix shape: ", signatures_matrix.shape)

    print("Similarity Matrix phase")
    # computing the similarity matrix (common signatures)
    similarities = np.zeros((documents_number, documents_number))
    print(similarities.shape)
    for i in range(0, documents_number):
        for j in range(i + 1, documents_number):
            similarities[i][j] = CompareSign.similarity(signatures_matrix[:, i], signatures_matrix[:, j], hash_functions)
    # mirroring the matrix (all similarity matrices are symmetric)
    similarities = similarities + similarities.T

    # picking just the most similar items based on a threshold
    threshold = args.thr
    # printing the most similar items
    for i in range(0, documents_number):
        for j in range(i + 1, documents_number):
            sim_tmp = similarities[i][j]
            if sim_tmp > threshold:
                print("text1: ", text[i][0:100])
                print("text2: ", text[j][0:100])
                print("common elements: ", sim_tmp)
                print("jaggard similarity: ", CompareSign.jaggard_similarity(text[i], text[j], args.shg))
                print()

    # lsh approximation
    if args.lsh:
        lsh = LSH(hash_functions=hash_functions, c=c, threshold=threshold, shg=args.shg,
                  documents_number=documents_number, signatures_matrix=signatures_matrix, text=text)
        lsh.compute_lsh()
from mrjob.job import MRJob
from mrjob.step import MRStep
from mrjob.protocol import TextProtocol
import util
import os
from datastore import SQLiteDatastore
import config
from lsh import LSH, DocumentTooShortError

datastore = SQLiteDatastore(config.SQLITE_PATH, False)
lsh = LSH(datastore, paragraphs=True)


class GeneratorMapReducer(MRJob):
    INPUT_PROTOCOL = TextProtocol

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.remaining = str()

    def steps(self):
        return [
            MRStep(
                mapper=self.mapper_articles,
                reducer=self.reducer_minhash,
            )
        ]

    def mapper_articles(self, article_id, article):
def main(args):
    # Get input params
    input_dir = args["dir"]
    th = args["th"]

    # Read all files contained in the input directory
    print("Loading documents...")
    onlyfiles = [f for f in listdir(input_dir) if isfile(join(input_dir, f))]
    docs = []
    for fname in onlyfiles:
        with open(join(input_dir, fname), "r") as file:
            docs += [file.read()]

    # Clean documents by removing trailing and duplicate blanks
    print("Cleaning documents...")
    docs = [re.sub(r'\W+', ' ', doc) for doc in docs]

    # Compute shingles of size n
    print("Computing shingles...")
    sh = Shingling(args["n"])
    shingles = sh.transform(docs)

    # Compute Jaccard similarities
    print("Jaccard similarities (on hashed shingles) > " + str(th) + ":")
    similarities = {(onlyfiles[i], onlyfiles[j]): compare_shingles(shingles[i], shingles[j])
                    for i in range(0, len(docs))
                    for j in range(i + 1, len(docs))}
    # Show similarities greater than the threshold
    print(sorted([(k, v) for k, v in similarities.items() if v > th], key=itemgetter(1), reverse=True))

    # Compute minHash signatures
    print("Computing signatures...")
    mh = MinHashing(args["k"])
    signatures = mh.transform(shingles)

    # Compute similarity estimations
    print("Similarity estimations using minHashing > " + str(th) + ":")
    estimations = {(onlyfiles[i], onlyfiles[j]): compare_signatures(signatures[:, i], signatures[:, j])
                   for i in range(0, len(docs))
                   for j in range(i + 1, len(docs))}
    # Show similarity estimations greater than the threshold
    print(sorted([(k, v) for k, v in estimations.items() if v > th], key=itemgetter(1), reverse=True))

    # Differences between estimations and real similarities
    errors = {(onlyfiles[i], onlyfiles[j]): abs(estimations[(onlyfiles[i], onlyfiles[j])] - similarities[(onlyfiles[i], onlyfiles[j])])
              for i in range(0, len(docs))
              for j in range(i + 1, len(docs))}
    # Show errors greater than 5%
    print("Estimations with error greater than 5%:")
    print(sorted([(k, v) for k, v in errors.items() if v > 0.05], key=itemgetter(1), reverse=True))

    # Apply LSH to find pairs of probable similar items
    lsh = LSH(signatures, th)
    lsh.index()
    candidates = lsh.get_pairs()
    # Show candidates
    print("Identified candidates with LSH:")
    print([(onlyfiles[t[0]], onlyfiles[t[1]]) for t in candidates])
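# Hedged sketch of the two helpers used above; their real definitions live elsewhere in the
# project and are assumed here: Jaccard similarity over (hashed) shingle sets, and the
# fraction of agreeing rows between two minhash signature columns.
def compare_shingles(shingles_a, shingles_b):
    a, b = set(shingles_a), set(shingles_b)
    return len(a & b) / len(a | b)

def compare_signatures(sig_a, sig_b):
    # sig_a, sig_b: 1-D numpy arrays holding one signature column each
    return float((sig_a == sig_b).sum()) / len(sig_a)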
if "-h" in sys.argv or "--help" in sys.argv: print( "Usage: ./nn.py [OPTION] \n\n" " -h | --help Show this help message and exit \n" " --fetch <plugin> Fetch new data with proprietary plugin \n" " --train Train LSH model \n" " --query <ID> Nearest Neighbor query \n") exit(0) if "--fetch" in sys.argv: pluginName = sys.argv[2].replace('.py', '') Dimport("%s" % pluginName, pluginName, FULLNAME('plugins'))(CONST.DATASET) if "--train" in sys.argv: dataframe = pd.read_csv(CONST.DATASET) corpus = TfidfVectorizer().fit_transform(dataframe['content']) lsh = LSH(corpus) model = lsh.train() pickle.dump(model, open(CONST.MODEL, 'wb')) if "--query" in sys.argv: print(query(sys.argv[2])) # eof
def train(device, data, schedule, mi_type, args):
    model = MI_Estimator(device, D=d, ED=ed, HD=256)
    model.to(device)
    model.train()
    optimizer = optim.Adam(model.parameters(), lr=5e-4)

    xs, ys = data
    xs = xs.to(device)
    ys = ys.to(device)
    zxs = torch.cat([xs, zerot], dim=0)

    lsh = LSH(SimHash(ed, K, L), K, L)
    estimates = []
    for batch_idx, MI in enumerate(schedule):
        optimizer.zero_grad()

        # randomly select data from data distribution
        sdx_iter = (batch_idx // mi_range) * mi_range
        sdx_offset = sdx_iter * batch_size
        sdx = torch.from_numpy(
            np.random.choice(mi_range * batch_size, batch_size, replace=False) + sdx_offset).to(device)

        t = 10 if batch_idx <= 1000 else 100
        if batch_idx % t == 0:
            # Load first section of desired size into lsh hash tables
            lxs = xs[:desired_size, :]
            assert (lxs.size(0) == desired_size)
            build(lsh, model, lxs)
            # lsh.stats()
            # Full - Load All Data
            # build(lsh, model, xs)

        # embed data
        y = F.embedding(sdx, ys).detach()
        ey = model.embed_y(y)

        # for each data sample, query lsh data structure and remove accidental hits;
        # find maximum number of samples, then create a matrix and pad appropriately
        np_indices = lsh.query_remove_matrix(ey, sdx, xs.size(0))
        indices = torch.from_numpy(np_indices).to(device)

        # create mask distinguishing between samples and padding
        mask = 1.0 - torch.eq(indices, xs.size(0)).float()
        mask = torch.cat([bs_onet, mask], dim=1).detach()

        px = torch.unsqueeze(F.embedding(sdx, xs), dim=1)
        nx = F.embedding(indices, zxs, padding_idx=xs.size(0))
        x = torch.cat([px, nx], dim=1).detach()

        mi = model(x, y, mask, args)
        loss = -mi
        loss.backward()
        optimizer.step()

        estimates.append(mi.item())
        if (batch_idx + 1) % 100 == 0:
            print('{} {} MI:{}, E_MI: {:.6f}'.format(mi_type.name, batch_idx + 1, MI, mi.item()))
            sys.stdout.flush()

    lsh.stats()
    return estimates
import pandas as pd
import sqlite3

uniDB = UniprotDB("Uniprot_DB_ec_pa_human.sqlite")  # Construct the protein database
"""
uniDB.deleteProteins()
protManager = ProteinsManager()
uniDB.createTables()
protManager.loadProteins("Ecolx.xml", uniDB)
protManager.loadProteins("PseA7.xml", uniDB)
"""
protManager = ProteinsManager()
protManager.loadProteins("Human.xml", uniDB)
minhash3 = LSH(0.3, 96)
minhash4 = LSH(0.5, 96)
minhash5 = LSH(0.5, 128)

# Create the minhashes
proteins = uniDB.extractProteins()
minhashes3, lsh3 = minhash3.calculateLSH(proteins, 3)
minhashes4, lsh4 = minhash4.calculateLSH(proteins, 3)
minhashes5, lsh5 = minhash5.calculateLSH(proteins, 3)
minhash3.saveLSH(963)
minhash4.saveLSH(965)
minhash5.saveLSH(1285)
"""
minhash3.loadLSH(963)
def run(self):
    print("""Locality-Sensitive Hashing-based protein similarity search.
Options: E[X]it, [L]oad Database, [D]elete Database, [C]alculate LSH,
[RC] Recalculate LSH, [LL] Load LSH, [S]ave LSH,
[Q]uery LSH, Query [A]ll LSH, Read [B]LAST, Compare [R]esults,
""")
    mode = input('Choose option:')
    uniDB = UniprotDB("Uniprot_DB.sqlite")
    minhash = LSH(0.5, 96)
    while (mode != 'Exit' and mode != 'X'):
        if (mode == 'Delete Database' or mode == 'D'):
            uniDB.deleteProteins()
        if (mode == 'Load Database' or mode == 'L'):
            protManager = ProteinsManager()
            uniDB.createTables()
            filename = input('XML filename (e.g. Ecolx.xml or PseA7.xml or Human.xml): ')
            protManager.loadProteins(filename, uniDB)
        if (mode == 'Calculate LSH' or mode == 'C'):
            uniDB = UniprotDB("Uniprot_DB.sqlite")
            proteins = uniDB.extractProteins()
            minhashes, lsh = minhash.calculateLSH(proteins, 3)
            print("Calculated")
        if (mode == 'Recalculate LSH' or mode == 'RC'):
            jaccardThreshold = float(input("Specify a Jaccard similarity threshold (default: 0.5): "))
            permutations = int(input("Specify the number of permutations (default: 96): "))
            shinglesize = int(input("Specify the shingle size (default: 3): "))
            minhash = LSH(jaccardThreshold, permutations)
            proteins = uniDB.extractProteins()
            minhashes, lsh = minhash.calculateLSH(proteins, shinglesize)
            print("Recalculated")
        if (mode == 'Query LSH' or mode == 'Q'):
            protein = input('Protein accession: ')
            start_time = time.time()
            result = minhash.queryProtein(protein)
            if result is not None:
                jaccResultsDict = minhash.checkJaccardResultsOfProtein(protein, result)
                # Return the results in sorted order, big to small Jaccard score
                sorted_jaccResultsDict = OrderedDict(sorted(jaccResultsDict.items(), key=lambda x: -x[1]))
                for jaccRes in sorted_jaccResultsDict.items():
                    print("\nMatch with Jaccard:", jaccRes[1])
                    information = uniDB.extractProteinInformation(jaccRes[0])
                    proteininfo = uniProtein(*information)
                    proteininfo.printUniProtein(printSeq=False)
            print("Runtime of query search: %s seconds " % (time.time() - start_time))
        if (mode == 'Calculate All' or mode == 'CA'):
            start_time = time.time()
            uniDB = UniprotDB("Uniprot_DB.sqlite")
            # uni_DB.close()
            proteins = uniDB.extractProteins()
            # minhash.calculateLSH([protein[1] for protein in proteins])
            minhashes, lsh = minhash.calculateLSH(proteins, 3)
            for protein in proteins:
                print("Protein ", protein[0])
                result = minhash.queryProtein(protein[0])
                if result is not None:
                    jaccResultsDict = minhash.checkJaccardResultsOfProtein(protein[0], result)
                    sorted_jaccResultsDict = OrderedDict(sorted(jaccResultsDict.items(), key=lambda x: -x[1]))
                    for jaccRes in sorted_jaccResultsDict.items():
                        print(jaccRes[0], " - Jaccard: ", jaccRes[1])
            print("Runtime of query all: %s seconds " % (time.time() - start_time))
        if (mode == 'Query All LSH' or mode == 'A'):
            resultsDB = ResultsDB("Results_DB.sqlite")
            resultsDB.createLSHtable("lshresults")
            resultsDB.deleteTable("lshresults")
            resultsDB.createLSHtable("lshresults")
            for query in minhash.minhashes.keys():
                matches = minhash.queryProtein(query)
                for match in matches:
                    # Filter self-matches
                    if query != match:
                        jaccard = minhash.estimateJaccard(query, match)
                        resultsDB.addLSHresult(query, match, jaccard, "lshresults")
            print(resultsDB.extractLSHresults("lshresults"))
        if (mode == 'Read BLAST Results' or mode == 'B'):
            filename = input('Filename: ')
            handle = open(filename, 'r')
            resultsDB = ResultsDB("Results_DB.sqlite")
            resultsDB.createBLASTtable()
            resultsDB.deleteBLASTresults()
            resultsDB.createBLASTtable()
            for line in handle:
                line = line[:-1].split('\t')
                # Extract accessions from 'sp|A0A0R6L508|MCR1_ECOLX'-like strings
                line[0] = line[0].split('|')[1]
                line[1] = line[1].split('|')[1]
                print(line)
                # Filter self-matches, add to the database
                if line[0] != line[1]:
                    resultsDB.addBLASTresult(line[0], line[1], line[2], line[3])
            print(resultsDB.extractBLASTresults())
        if (mode == 'Compare Results' or mode == 'R'):
            # Database with all LSH and BLASTp results
            resultsDB = ResultsDB("Results_DB.sqlite")
            identity_th, alignment_th, jaccard_th = 80.0, 100, 0.5
            precisions = []
            recalls = []
            # Load in all protein ids to loop over
            uniDB = UniprotDB("Uniprot_DB.sqlite")
            proteins = uniDB.extractProteins()
            # Store all precisions and recalls per query, to calculate the average
            for query in proteins:
                intersect = resultsDB.extractIntersectCountPerProtein(
                    query[0], 'lshresults', identity_th, alignment_th, jaccard_th)
                lshresults = resultsDB.extractLSHcountPerProtein(query[0], 'lshresults', jaccard_th)
                blastresults = resultsDB.extractBLASTcountPerProtein(query[0], identity_th, alignment_th)
                tp = intersect
                fp = lshresults - intersect
                fn = blastresults - intersect
                precision = tp / (tp + fp) if (tp + fp) != 0 else -1
                recall = tp / (tp + fn) if (tp + fn) != 0 else -1
                # Exclude results without any similar proteins / division by zero
                if precision != -1:
                    precisions.append(precision)
                if recall != -1:
                    recalls.append(recall)
            print("Comparison of BLAST and LSH results:\n Number of proteins queried: %i \n Average precision: %0.3f Average recall: %0.3f\n"
                  % (len(proteins), sum(precisions) / len(precisions), sum(recalls) / len(recalls)))
        if (mode == 'Save LSH' or mode == 'S'):
            number = int(input('Suffix number: '))
            minhash.saveLSH(number)
        if (mode == 'Load LSH' or mode == 'LL'):
            number = int(input('Suffix number: '))
            minhash.loadLSH(number)
        mode = input('Choose option: ')
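# Worked example (illustrative numbers only, not from the original data): if a query protein
# has 10 LSH matches, 12 BLAST matches, and 8 matches in common, then tp = 8, fp = 2, fn = 4,
# so precision = 8 / 10 = 0.80 and recall = 8 / 12 ≈ 0.67 under the scheme above.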