def setUp(self):
    self.lsh = LSH(3, 2, 1)
    self.lsh_two_tables = LSH(3, 2, 2)
    # Overwrite randomly initialized planes with known values.
    self.lsh.planes = [np.array([[0.1, 0.2], [-0.1, -0.2], [-1.0, 1.0]])]
    self.lsh_two_tables.planes = [
        np.array([[0.1, 0.2], [-0.1, -0.2], [-1.0, 1.0]]),
        np.array([[-0.1, -0.2], [0.1, 0.2], [-2.0, 2.0]]),
    ]
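# Hedged illustration, not part of the original test: in hyperplane LSH the hash key of a
# vector is typically the sign pattern of its projections onto the planes, which is why the
# test pins `planes` to known values. A minimal sketch of that assumed hashing step:
def _hash_with_planes(planes, vector):
    # planes: shape (num_planes, dim); vector: shape (dim,)
    projections = np.dot(planes, vector)
    return ''.join('1' if p > 0 else '0' for p in projections)

# Under this assumed convention the first fixed table above hashes np.array([1.0, 2.0])
# to '101', since the projections are [0.5, -0.5, 1.0].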
def __init__(self, digest_length, num_hashtables, dTreshold):
    self._dTrehold = dTreshold
    self._lsh = LSH(digest_length, 2**20, storage_config=None, num_hashtables=num_hashtables)
    self._hashvect = HashVect()
def _build_lsh(self, minhashes):
    '''Builds the LSH using the given minhash vectors.'''
    self.lsh = LSH(minhashes)
    with open(self.LSH_FILE, 'wb') as f:
        pickle.dump(self.lsh.hash_tables, f)
def starter(image_id, m, k, l):
    query_results = Database().retrieve_many()
    id_vector_pairs = [(item["image_id"], item["vector"]) for item in query_results]
    search_results = LSH(l, k, id_vector_pairs).get_search_results(image_id, show=True)
    print("Original dataset size: {} | Reduced search space size: {} | Reduction by {} %".format(
        len(id_vector_pairs),
        len(search_results),
        float(len(id_vector_pairs) - len(search_results)) * 100 / len(id_vector_pairs),
    ))
    query_results = Database().retrieve_many(list(search_results))
    search_id_vector_pairs = [(item["image_id"], item["vector"]) for item in query_results]
    source_vector = Database().retrieve_one(image_id)["vector"]
    all_images = functions.find_similarity(source_vector, search_id_vector_pairs)
    similar_images = all_images[:m]
    print(similar_images)
    visualizer.visualize_lsh(image_id, similar_images)
    return similar_images, all_images
def cloud_main(self, file_count):
    # Note: `conn`, `cursor`, and `sql_select_Query` are assumed to be set up elsewhere;
    # see the variant of this method further below that creates them explicitly.
    with open('encrypted_vectors/feature_vectors.json') as data_file:
        feature_loaded = json.load(data_file)
    with open('encrypted_indices/indices.json') as data_file:
        indices_loaded = json.load(data_file)
    with open('encrypted_query/query_vectors.json') as data_file:
        query_loaded = json.load(data_file)
    d = Decrypt()
    feature_vectors = d.decrypt_indices_vector(bytes(feature_loaded))
    indices = d.decrypt_indices_vector(bytes(indices_loaded))
    query_vectors = d.decrypt_indices_vector(bytes(query_loaded))
    feature_vectors = np.frombuffer(feature_vectors, dtype=int)
    feature_vectors = np.reshape(feature_vectors, (file_count, -1))
    indices = json.loads(indices.decode())
    query_vectors = np.frombuffer(query_vectors, dtype=int)
    query_vectors = np.reshape(query_vectors, (-1, 6))
    print(feature_vectors.shape, indices, query_vectors.shape)
    l = LSH(feature_vectors, indices)
    n_neighbors, result = l.query(query_vectors, 6, 45)
    print(n_neighbors)
    cursor.execute(sql_select_Query)
    records = cursor.fetchall()
    for row in records:
        if row[0] - 1 in result:
            image_name = row[1]
            print(image_name)
            CloudAPISender().cloud_api_sender(image_name)
    # Closing the connection
    conn.close()
def loadOrTrainLSHModel(forceGenerate=False):
    lshModel = None
    if os.path.exists("./pickle_files/lshModel.pickle") and not forceGenerate:
        print("LSH model found on disk")
        pickleIn = open("./pickle_files/lshModel.pickle", "rb")
        lshModel = pickle.load(pickleIn)
    else:
        print("Training LSH model")
        trainAudioDataAndRateArray = loadAllFiles("train", '')
        trainingData = generateData(trainAudioDataAndRateArray)
        lshModel = LSH()
        for data in trainingData:
            print("fileName", data[1])
            validFrameList = extractValidFrames(data[0])
            for i in range(0, validFrameList.shape[0]):
                reshapedValidFrame = validFrameList[i].reshape(1, -1)
                # print("reshapedValidFrame", reshapedValidFrame)
                lshModel.train(reshapedValidFrame, {
                    "name": data[1] + "_" + str(i),
                    "frameIndex": i
                })
            # hr.train(validFrames[1:2], data[1])
        # print("lshModel", lshModel)
        pickleOut = open("./pickle_files/lshModel.pickle", "wb")
        pickle.dump(lshModel, pickleOut)
        pickleOut.close()
    return lshModel
def get_similar(num_layers, num_hashes, features, filepaths, query_image, num_results):
    lsh = LSH(num_layers, num_hashes, 10)
    lsh.fit(features)
    query_vec = get_cm_features_by_image_path(query_image)
    similar_indices = lsh.get_similar(query_vec, num_results)[:num_results]
    return [list(features[x]) for x in similar_indices], [filepaths[x] for x in similar_indices]
def load_lsh(self):
    '''Loads the buckets from the files and initializes an LSH object using this data.'''
    with open(self.LSH_FILE, 'rb') as f:
        data = pickle.load(f)
    self.lsh = LSH(table=data)
def __init__(self, N, D, K, L):
    self.lsh = LSH(SimHash(D, K, L), K, L)
    self.keys = np.zeros((N, D), dtype=np.float32)
    self.values = np.zeros((N, 1), dtype=np.float32)
    self.lru = np.zeros(N, dtype=np.float32)
    self.key2idx = dict()
    self.size = 0
    self.max_memory = N
    self.K = K
    self.L = L
def query(fname, key='key', topk=10, truncate=80):
    model = pickle.load(open(CONST.MODEL, 'rb'))
    dataframe = pd.read_csv(CONST.DATASET)
    corpus = TfidfVectorizer().fit_transform(dataframe['content'])
    lsh = LSH(corpus, model)
    index = dataframe[dataframe[key].apply(str) == str(fname)].index[0]
    dataframe['content'] = dataframe['content'].str[:int(truncate)]
    return (lsh.query(corpus[index, :], int(topk), 10)[0]
            .join(dataframe, on='id')
            .sort_values('distance')
            .iloc[:, 1:])
def __init__(self, movie_filename, rating_filename, k, m, c):
    # Hyperparameter
    self.c = c

    # read movie file and create dictionary _movie_names
    self._movie_names = {}
    f = open(movie_filename, "r", encoding="utf8")
    reader = csv.reader(f)
    next(reader)  # skips header line
    for line in reader:
        movieid = line[0]
        moviename = line[1]
        # ignore line[2], genre
        self._movie_names[movieid] = moviename
    f.close()

    # read rating file and create _movie_ratings (ratings for a movie)
    # and _user_ratings (ratings by a user) dicts
    self._movie_ratings, self._movie_time = {}, {}
    self._user_ratings, self._user_time = {}, {}
    f = open(rating_filename, "r", encoding="utf8")
    reader = csv.reader(f)
    next(reader)  # skips header line
    for line in reader:
        userid = line[0]
        movieid = line[1]
        rating = line[2]
        timestamp = line[3]
        if userid not in self._user_ratings:
            self._user_ratings[userid] = {}  # each user is a dict with movies and ratings
            self._user_time[userid] = {}
        self._user_ratings[userid][movieid] = float(rating)
        self._user_time[userid][movieid] = float(timestamp)
        if movieid not in self._movie_ratings:
            self._movie_ratings[movieid] = {}
            self._movie_time[movieid] = {}
        self._movie_ratings[movieid][userid] = float(rating)
        self._movie_time[movieid][userid] = float(timestamp)
    f.close()
    self.me = LSH(k, m, self._user_ratings, self._movie_ratings)
def run(dataPath):
    nrofBands = 25
    nrOfPerms = 132
    nrOfRows = int(nrOfPerms / nrofBands)

    # Load data
    data = np.load(dataPath)
    print("Data is loaded")

    # Create an empty file to add results
    with open('results.txt', 'w') as file:
        file.write("")

    # Create Signature Matrix
    signatureMatrix = minHashing(data, nrOfPerms)
    pairsFound = LSH(data, signatureMatrix, nrOfRows, nrofBands)
    print("Number of similar pairs found is:", len(pairsFound))
    return pairsFound
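# Hedged sketch, not this project's LSH implementation: the banding step implied by
# nrofBands/nrOfRows above typically hashes each band of `nrOfRows` signature rows per
# document into buckets and reports documents that collide in at least one band
# (assuming a (permutations x documents) signature matrix).
from collections import defaultdict

def lsh_banding_sketch(signatureMatrix, nrOfRows, nrofBands):
    candidate_pairs = set()
    n_docs = signatureMatrix.shape[1]
    for band in range(nrofBands):
        buckets = defaultdict(list)
        rows = signatureMatrix[band * nrOfRows:(band + 1) * nrOfRows, :]
        for doc in range(n_docs):
            buckets[tuple(rows[:, doc])].append(doc)
        for docs in buckets.values():
            for i in range(len(docs)):
                for j in range(i + 1, len(docs)):
                    candidate_pairs.add((docs[i], docs[j]))
    return candidate_pairs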
def cloud_main(self, file_count):
    # establishing the connection
    conn = mysql.connector.connect(user='******', password='******',
                                   host='127.0.0.1', database='ImageRetrieval')
    # Creating a cursor object using the cursor() method
    cursor = conn.cursor()
    # Preparing SQL query to select a record from the database.
    sql_select_Query = "select * from images"

    with open('encrypted_vectors/feature_vectors.json') as data_file:
        feature_loaded = json.load(data_file)
    with open('encrypted_indices/indices.json') as data_file:
        indices_loaded = json.load(data_file)
    with open('encrypted_query/query_vectors.json') as data_file:
        query_loaded = json.load(data_file)

    d = Decrypt()
    feature_vectors = d.decrypt_indices_vector(bytes(feature_loaded))
    indices = d.decrypt_indices_vector(bytes(indices_loaded))
    query_vectors = d.decrypt_indices_vector(bytes(query_loaded))

    feature_vectors = np.frombuffer(feature_vectors, dtype=int)
    feature_vectors = np.reshape(feature_vectors, (file_count, -1))
    indices = json.loads(indices.decode())
    query_vectors = np.frombuffer(query_vectors, dtype=int)
    query_vectors = np.reshape(query_vectors, (-1, 6))
    print(feature_vectors.shape, indices, query_vectors.shape)

    l = LSH(feature_vectors, indices)
    n_neighbors, result = l.query(query_vectors, 6, 45)
    print(n_neighbors)

    cursor.execute(sql_select_Query)
    records = cursor.fetchall()
    for row in records:
        if row[0] - 1 in result:
            image_name = row[1]
            print(image_name)
            CloudAPISender().cloud_api_sender(image_name)
    # Closing the connection
    conn.close()
from mrjob.job import MRJob
from mrjob.step import MRStep
import util
import os
from datastore import SQLiteDatastore
import config
import random

SPACE = u' '
PATH = os.path.dirname(os.path.abspath(__file__))

from lsh import LSH, DocumentTooShortError

datastore = SQLiteDatastore(config.SQLITE_PATH, False)
lsh = LSH(datastore)


class CandidatesMapReducer(MRJob):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.remaining = str()

    def steps(self):
        return [
            MRStep(
                mapper=self.mapper_paragraphs,
                mapper_final=self.mapper_paragraphs_final,
                reducer=self.reducer_minhash,
            ),
            MRStep(reducer=self.reducer_unique),
        ]
from create_vector import vectorize
from lsh import LSH
from parse_document import extract_from_pdf
import sys

if __name__ == "__main__":
    pdfs = sys.argv[1:]
    if pdfs:
        vector_list = [vectorize(extract_from_pdf(pdf)) for pdf in pdfs]
    else:
        print "Usage: python create_vector.py pdf1 [pdf2] [pdf3] .."
        sys.exit()
    lsh = LSH(300)
    [lsh.insert_document(title, vector) for (title, vector) in vector_list]
    # print lsh.get_similarities()
    print lsh.closest_match(vector_list[0][0])
    # return vector_list
import pandas as pd
import sqlite3

uniDB = UniprotDB("Uniprot_DB.sqlite")  # Construct the protein database
"""
uniDB.deleteProteins()
protManager = ProteinsManager()
uniDB.createTables()
protManager.loadProteins("Ecolx.xml", uniDB)
protManager.loadProteins("PseA7.xml", uniDB)
"""
minhash3 = LSH(0.3, 32)
minhash4 = LSH(0.3, 64)
minhash4b = LSH(0.3, 96)
minhash5 = LSH(0.3, 128)

# Create the minhashes
proteins = uniDB.extractProteins()
"""
minhashes3, lsh3 = minhash3.calculateLSH(proteins, 3)
minhashes4, lsh4 = minhash4.calculateLSH(proteins, 3)
minhashes4b, lsh4b = minhash5.calculateLSH(proteins, 3)
minhashes5, lsh5 = minhash6.calculateLSH(proteins, 3)
minhash3.saveLSH(32)
minhash4.saveLSH(64)
minhash4b.saveLSH(96)
from lsh import LSH
import cPickle as pickle

# define model
model = LSH(base_vec_num=3, iter_num=2, dimens=2)

# data
vec_dict = {
    "a": [0.1, 0.2],
    "b": [0.5, -0.2],
    "c": [-0.3, -0.1],
    "d": [1.0, 0.0],
    "e": [-3, 2],
    "f": [2, 2]
}

# build
model.build_lsh(vec_dict)

# get candidate set
for name in model.get_candidate_set([0.3, 0.9]):
    print name,
print ""

# test pickle
with open("model.pkl", 'wb') as fout:
    pickle.dump(model, fout)
with open("model.pkl", 'rb') as fin:
    dumped_model = pickle.load(fin)
print "dumped model:"
for name in dumped_model.get_candidate_set([0.3, 0.9]):
    print name,
with gzip.open(datasetPath + filenameIn + '_sample.warc.gz', mode='rb') as gzf:
    for record in warc.WARCFile(fileobj=gzf):
        record_id = record['WARC-Record-ID']
        payload = record.payload.read()
        doc_uri[record_id] = record['WARC-Target-URI']
        text = HTMLPreprocessing(payload).get_text()
        doc_dict[record_id] = text
        doc_count += 1

print 'create vectors'
tfidf = TFIDF(doc_dict)

vect_length = tfidf.vect_length  # length of the input vector
num_hashtables = 1  # number of iterations
digest_length = 0

print 'perform lsh'
lsh = LSH(digest_length, vect_length, num_hashtables=num_hashtables)
for i, k in enumerate(tfidf._id_list):
    vect = tfidf.get_vector(i)
    lsh.index(vect, extra_data=tfidf._id_list[i])

'''
Query documents
'''
dedup = set()
keys = lsh.hash_tables[0].keys()
i = 0
for key in keys:
    bucket = lsh.hash_tables[0].get_val(key)
    for query_object in bucket:
        candidates = lsh.query(query_object[0], distance_func='cosine')
        for c in candidates:
            candidate_key = c[0][1]  # warc id is appended as extra data in lsh.index()
            if candidate_key == query_object[1]:
def lsh_cluster(unlabel):
    return LSH(unlabel)
def train(device, data, schedule, mi_type, args):
    model = MI_Estimator(device, D=d, ED=ed, HD=256)
    model.to(device)
    model.train()
    optimizer = optim.Adam(model.parameters(), lr=5e-4)

    xs, ys = data
    xs = xs.to(device)
    ys = ys.to(device)

    lsh = LSH(SimHash(ed, K, L), K, L)
    estimates = []
    avg_estimate = []
    id_set = set()
    n_iters = num_iterations * batch_size
    for batch_idx in range(n_iters):
        iteration = batch_idx // batch_size
        MI = schedule[iteration]

        t = 10 if batch_idx <= 1000 else 100
        if batch_idx % t == 0:
            build(lsh, model, xs)

        optimizer.zero_grad()
        y = ys[batch_idx:batch_idx + 1]
        ey = model.embed_y(y)
        id_list = lsh.query(ey)
        id_set = id_set.union(set(id_list))

        indices = torch.LongTensor(id_list).to(device)
        nx = F.embedding(indices, xs)
        px = xs[batch_idx:batch_idx + 1]
        x = torch.cat([px, nx], dim=0)
        x = torch.unsqueeze(x, dim=0)

        mi = model(x, y, args)
        loss = -mi
        loss.backward()
        optimizer.step()

        avg_estimate.append(mi.item())
        if (batch_idx + 1) % 100 == 0:
            '''
            asim = model.cosine_similarity(x, y)
            true = torch.mean(torch.diag(asim))
            neye = 1. - torch.eye(batch_size).to(device)
            noise = torch.sum(torch.mul(asim, neye)).item() / (batch_size * (batch_size - 1))
            print("MI:{} true: {:.4f}, noise: {:.4f}".format(MI, true, noise))
            '''
            avg_mi = sum(avg_estimate) / float(len(avg_estimate))
            print('{} {} MI:{}, E_MI: {:.6f}'.format(mi_type.name, batch_idx + 1, MI, avg_mi))
            sys.stdout.flush()

        if (batch_idx + 1) % wsize == 0:
            print(len(id_set), len(id_set) // wsize)
            id_set.clear()
            avg_mi = sum(avg_estimate) / float(len(avg_estimate))
            estimates.append(avg_mi)
            avg_estimate.clear()

    lsh.stats()
    return estimates
))

if __name__ == "__main__":
    data, q = load_data()

    # Brute Force
    nn_brute, nn_brute_dist = brute_force_nn(q, data)

    # Ball Tree
    bt = BallTree(data, 10)
    nn_bt = bt.query_top_down(q)

    # LSH
    hash_fn_gen = lambda: guassian_hash_generator(150, data.shape[1])
    lsh = LSH(data, hash_fn_gen, 1, 10)
    nn_lsh, performace_limit = lsh.query(q)

    # Ball tree LSH
    print('balltree lsh performance limit', performace_limit)
    lsh = LSH(data, hash_fn_gen, 1, 3)
    bt_lsh = BallTreeLSH(bt, lsh)
    nn_bt_lsh = bt_lsh.query(q, performance_limit=performace_limit)

    # compare_results
    _, nn_lsh_to_q = brute_force_nn(nn_lsh, q)
    _, nn_bt_to_q = brute_force_nn(nn_bt, q)
    _, nn_bt_lsh_to_q = brute_force_nn(nn_bt_lsh, q)

    #########
    # Stats #
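# Hedged sketch: guassian_hash_generator is defined elsewhere in that project and its exact
# signature is assumed here. One common construction it could correspond to is a p-stable
# (Gaussian random projection) hash of the form h(x) = floor((a . x + b) / w):
import numpy as np

def gaussian_hash_generator_sketch(w, dim):
    a = np.random.normal(size=dim)   # random projection direction
    b = np.random.uniform(0, w)      # random offset within one bucket of width w
    return lambda x: int(np.floor((np.dot(a, x) + b) / w))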
# MySQL connection details need to be updated here
conn = mysql.connector.connect(
    user='******',
    password='******',
    host='127.0.0.1',
    database='ImageRetrieval')
# Creating a cursor object using the cursor() method
cursor = conn.cursor()
# Preparing SQL query to INSERT a record into the database.
insert_stmt = (
    "insert into images (path) "
    "values (%s)"
)
key = (0.1, 0.1)
num_of_random_vectors = 16
hc = HarrisCorner()
sb = SurfBow()
e = Encrypt()
l = LSH()
# Image directory path to be mentioned here
img_dir = "../images/"
feat_vec = []
if not os.path.exists("encrypted_images/"):
    cwd = os.getcwd()
    directory = "/encrypted_images"
    os.mkdir(cwd + directory)
if not os.path.exists("encrypted_indices/"):
    cwd = os.getcwd()
    directory = "/encrypted_indices"
    os.mkdir(cwd + directory)
if not os.path.exists("encrypted_vectors/"):
    cwd = os.getcwd()
    directory = "/encrypted_vectors"
    os.mkdir(cwd + directory)
print 'create tfidf vectors of documents'
tfidf = TFIDF(doc_dict)

'''
Perform lsh
'''
print time.asctime(time.localtime(time.time()))
digest_length = int(sys.argv[2])
vect_length = tfidf.vect_length
num_hashtables = 1
log += 'perform lsh with hash-length: ' + str(digest_length) + \
    ', vect-length: ' + str(vect_length) + \
    ', num-hashtables: ' + str(num_hashtables) + '\n'
print 'perform lsh with hash-length: ' + str(digest_length) + \
    ', vect-length: ' + str(vect_length) + \
    ', num-hashtables: ' + str(num_hashtables)
r = {"dict": None}
lsh = LSH(digest_length, vect_length, storage_config=r, num_hashtables=num_hashtables)
for i, k in enumerate(tfidf._id_list):
    vect = tfidf.get_vector(i)
    lsh.index(vect, extra_data=tfidf._id_list[i])

'''
Query documents
'''
log += str(time.asctime(time.localtime(time.time()))) + '\n'
log += 'query documents\n'
print time.asctime(time.localtime(time.time()))
print 'Query documents'
distance_func = "cosine"
corr = set()
for i, key in enumerate(tfidf._id_list):
    query_object = tfidf.get_vector(i)
def main():
    print("Importing the Dataset")
    # importing the dataset
    text = importing_the_dataset(args.na)
    documents_number = len(text)

    print("Shingling phase")
    # shingling all the documents
    texts_shingled = []
    for element in text:
        text_shingled = Shingling.shingle(text=element, k=args.shg, char=args.shg_char)
        texts_shingled.append(text_shingled)
    # flattening the list of shingles
    flat_texts_shingled = np.hstack(np.array(texts_shingled))
    print("number of shingles: ", flat_texts_shingled.shape[0])
    unique_flat_texts_shingled = np.unique(flat_texts_shingled)
    max_value = unique_flat_texts_shingled.shape[0]
    print("number of unique shingles: ", max_value)
    # building the dictionary with { unique shingle : number }
    shingle_dic = {unique_flat_texts_shingled[i]: i for i in range(0, unique_flat_texts_shingled.shape[0])}
    # shingles into integers
    text_num = texts_shingled
    for i, txt in enumerate(texts_shingled):
        for j, shg in enumerate(txt):
            text_num[i][j] = shingle_dic[shg]

    print("MinHashing phase")
    # number of hash functions
    hash_functions = args.hashes
    # For each of the hash functions, generate different coefficients 'a' and 'b'.
    a = MinHashing.random_coeff(hash_functions, max_value)
    b = MinHashing.random_coeff(hash_functions, max_value)
    # next prime after the greatest value
    c = MinHashing.next_prime(max_value)
    # min hashing - building the signature matrix of shape [hash_functions, number of documents]
    signatures_matrix = np.zeros((hash_functions, documents_number))
    for i, txt in enumerate(text_num):
        for j in range(0, hash_functions):
            min_hash = c + 1
            for element in txt:
                hashed_value = (a[j] * element + b[j]) % c
                if hashed_value < min_hash:
                    min_hash = hashed_value
            signatures_matrix[j][i] = min_hash
    print("min hashing matrix shape: ", signatures_matrix.shape)

    print("Similarity Matrix phase")
    # computing the similarity matrix (common signatures)
    similarities = np.zeros((documents_number, documents_number))
    print(similarities.shape)
    for i in range(0, documents_number):
        for j in range(i + 1, documents_number):
            similarities[i][j] = CompareSign.similarity(signatures_matrix[:, i], signatures_matrix[:, j], hash_functions)
    # mirroring the matrix (all similarity matrices are symmetric)
    similarities = similarities + similarities.T

    # picking just the most similar items based on a threshold
    threshold = args.thr
    # printing the most similar items
    for i in range(0, documents_number):
        for j in range(i + 1, documents_number):
            sim_tmp = similarities[i][j]
            if sim_tmp > threshold:
                print("text1: ", text[i][0:100])
                print("text2: ", text[j][0:100])
                print("common elements: ", sim_tmp)
                print("jaggard similarity: ", CompareSign.jaggard_similarity(text[i], text[j], args.shg))
                print()

    # lsh approximation
    if args.lsh:
        lsh = LSH(hash_functions=hash_functions, c=c, threshold=threshold, shg=args.shg,
                  documents_number=documents_number, signatures_matrix=signatures_matrix, text=text)
        lsh.compute_lsh()
from mrjob.job import MRJob
from mrjob.step import MRStep
from mrjob.protocol import TextProtocol
import util
import os
from datastore import SQLiteDatastore
import config
from lsh import LSH, DocumentTooShortError

datastore = SQLiteDatastore(config.SQLITE_PATH, False)
lsh = LSH(datastore, paragraphs=True)


class GeneratorMapReducer(MRJob):
    INPUT_PROTOCOL = TextProtocol

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.remaining = str()

    def steps(self):
        return [
            MRStep(
                mapper=self.mapper_articles,
                reducer=self.reducer_minhash,
            )
        ]

    def mapper_articles(self, article_id, article):
def main(args):
    # Get input params
    input_dir = args["dir"]
    th = args["th"]

    # Read all files contained in the input directory
    print("Loading documents...")
    onlyfiles = [f for f in listdir(input_dir) if isfile(join(input_dir, f))]
    docs = []
    for fname in onlyfiles:
        with open(join(input_dir, fname), "r") as file:
            docs += [file.read()]

    # Clean documents by removing trailing and duplicate blanks
    print("Cleaning documents...")
    docs = [re.sub(r'\W+', ' ', doc) for doc in docs]

    # Compute shingles of size n
    print("Computing shingles...")
    sh = Shingling(args["n"])
    shingles = sh.transform(docs)

    # Compute Jaccard similarities
    print("Jaccard similarities (on hashed shingles) > " + str(th) + ":")
    similarities = {(onlyfiles[i], onlyfiles[j]): compare_shingles(shingles[i], shingles[j])
                    for i in range(0, len(docs))
                    for j in range(i + 1, len(docs))}
    # Show similarities greater than the threshold
    print(sorted([(k, v) for k, v in similarities.items() if v > th], key=itemgetter(1), reverse=True))

    # Compute minHash signatures
    print("Computing signatures...")
    mh = MinHashing(args["k"])
    signatures = mh.transform(shingles)

    # Compute similarity estimations
    print("Similarity estimations using minHashing > " + str(th) + ":")
    estimations = {(onlyfiles[i], onlyfiles[j]): compare_signatures(signatures[:, i], signatures[:, j])
                   for i in range(0, len(docs))
                   for j in range(i + 1, len(docs))}
    # Show similarity estimations greater than the threshold
    print(sorted([(k, v) for k, v in estimations.items() if v > th], key=itemgetter(1), reverse=True))

    # Differences between estimations and real similarities
    errors = {(onlyfiles[i], onlyfiles[j]): abs(estimations[(onlyfiles[i], onlyfiles[j])] - similarities[(onlyfiles[i], onlyfiles[j])])
              for i in range(0, len(docs))
              for j in range(i + 1, len(docs))}
    # Show errors greater than 5%
    print("Estimations with error greater than 5%:")
    print(sorted([(k, v) for k, v in errors.items() if v > 0.05], key=itemgetter(1), reverse=True))

    # Apply LSH to find pairs of probable similar items
    lsh = LSH(signatures, th)
    lsh.index()
    candidates = lsh.get_pairs()
    # Show candidates
    print("Identified candidates with LSH:")
    print([(onlyfiles[t[0]], onlyfiles[t[1]]) for t in candidates])
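# Hedged sketch of the two helpers used above; their real definitions live elsewhere in the
# project and are assumed here: Jaccard similarity over (hashed) shingle sets, and the
# fraction of agreeing rows between two minhash signature columns.
def compare_shingles(shingles_a, shingles_b):
    a, b = set(shingles_a), set(shingles_b)
    return len(a & b) / len(a | b)

def compare_signatures(sig_a, sig_b):
    # sig_a, sig_b: 1-D numpy arrays holding one signature column each
    return float((sig_a == sig_b).sum()) / len(sig_a)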
if "-h" in sys.argv or "--help" in sys.argv: print( "Usage: ./nn.py [OPTION] \n\n" " -h | --help Show this help message and exit \n" " --fetch <plugin> Fetch new data with proprietary plugin \n" " --train Train LSH model \n" " --query <ID> Nearest Neighbor query \n") exit(0) if "--fetch" in sys.argv: pluginName = sys.argv[2].replace('.py', '') Dimport("%s" % pluginName, pluginName, FULLNAME('plugins'))(CONST.DATASET) if "--train" in sys.argv: dataframe = pd.read_csv(CONST.DATASET) corpus = TfidfVectorizer().fit_transform(dataframe['content']) lsh = LSH(corpus) model = lsh.train() pickle.dump(model, open(CONST.MODEL, 'wb')) if "--query" in sys.argv: print(query(sys.argv[2])) # eof
def train(device, data, schedule, mi_type, args):
    model = MI_Estimator(device, D=d, ED=ed, HD=256)
    model.to(device)
    model.train()
    optimizer = optim.Adam(model.parameters(), lr=5e-4)

    xs, ys = data
    xs = xs.to(device)
    ys = ys.to(device)
    zxs = torch.cat([xs, zerot], dim=0)

    lsh = LSH(SimHash(ed, K, L), K, L)
    estimates = []
    for batch_idx, MI in enumerate(schedule):
        optimizer.zero_grad()

        # randomly select data from data distribution
        sdx_iter = (batch_idx // mi_range) * mi_range
        sdx_offset = sdx_iter * batch_size
        sdx = torch.from_numpy(
            np.random.choice(mi_range * batch_size, batch_size, replace=False) + sdx_offset).to(device)

        t = 10 if batch_idx <= 1000 else 100
        if batch_idx % t == 0:
            # Load first section of desired size into lsh hash tables
            lxs = xs[:desired_size, :]
            assert (lxs.size(0) == desired_size)
            build(lsh, model, lxs)
            # lsh.stats()
            # Full - Load All Data
            # build(lsh, model, xs)

        # embed data
        y = F.embedding(sdx, ys).detach()
        ey = model.embed_y(y)

        # for each data sample, query lsh data structure and remove accidental hits;
        # find maximum number of samples, then create a matrix and pad appropriately
        np_indices = lsh.query_remove_matrix(ey, sdx, xs.size(0))
        indices = torch.from_numpy(np_indices).to(device)

        # create mask distinguishing between samples and padding
        mask = 1.0 - torch.eq(indices, xs.size(0)).float()
        mask = torch.cat([bs_onet, mask], dim=1).detach()

        px = torch.unsqueeze(F.embedding(sdx, xs), dim=1)
        nx = F.embedding(indices, zxs, padding_idx=xs.size(0))
        x = torch.cat([px, nx], dim=1).detach()

        mi = model(x, y, mask, args)
        loss = -mi
        loss.backward()
        optimizer.step()

        estimates.append(mi.item())
        if (batch_idx + 1) % 100 == 0:
            print('{} {} MI:{}, E_MI: {:.6f}'.format(mi_type.name, batch_idx + 1, MI, mi.item()))
            sys.stdout.flush()

    lsh.stats()
    return estimates
import pandas as pd
import sqlite3

uniDB = UniprotDB("Uniprot_DB_ec_pa_human.sqlite")  # Construct the protein database
"""
uniDB.deleteProteins()
protManager = ProteinsManager()
uniDB.createTables()
protManager.loadProteins("Ecolx.xml", uniDB)
protManager.loadProteins("PseA7.xml", uniDB)
"""
protManager = ProteinsManager()
protManager.loadProteins("Human.xml", uniDB)
minhash3 = LSH(0.3, 96)
minhash4 = LSH(0.5, 96)
minhash5 = LSH(0.5, 128)

# Create the minhashes
proteins = uniDB.extractProteins()
minhashes3, lsh3 = minhash3.calculateLSH(proteins, 3)
minhashes4, lsh4 = minhash4.calculateLSH(proteins, 3)
minhashes5, lsh5 = minhash5.calculateLSH(proteins, 3)
minhash3.saveLSH(963)
minhash4.saveLSH(965)
minhash5.saveLSH(1285)
"""
minhash3.loadLSH(963)
def run(self):
    print("""Locality-Sensitive Hashing-based protein similarity search.
Options: E[X]it, [L]oad Database, [D]elete Database, [C]alculate LSH,
[RC] Recalculate LSH, [LL] Load LSH, [S]ave LSH,
[Q]uery LSH, Query [A]ll LSH, Read [B]LAST, Compare [R]esults,
""")
    mode = input('Choose option:')
    uniDB = UniprotDB("Uniprot_DB.sqlite")
    minhash = LSH(0.5, 96)
    while (mode != 'Exit' and mode != 'X'):
        if (mode == 'Delete Database' or mode == 'D'):
            uniDB.deleteProteins()
        if (mode == 'Load Database' or mode == 'L'):
            protManager = ProteinsManager()
            uniDB.createTables()
            filename = input('XML filename (e.g. Ecolx.xml or PseA7.xml or Human.xml): ')
            protManager.loadProteins(filename, uniDB)
        if (mode == 'Calculate LSH' or mode == 'C'):
            uniDB = UniprotDB("Uniprot_DB.sqlite")
            proteins = uniDB.extractProteins()
            minhashes, lsh = minhash.calculateLSH(proteins, 3)
            print("Calculated")
        if (mode == 'Recalculate LSH' or mode == 'RC'):
            jaccardThreshold = float(input("Specify a Jaccard similarity threshold (default: 0.5): "))
            permutations = int(input("Specify the number of permutations (default: 96): "))
            shinglesize = int(input("Specify the shingle size (default: 3): "))
            minhash = LSH(jaccardThreshold, permutations)
            proteins = uniDB.extractProteins()
            minhashes, lsh = minhash.calculateLSH(proteins, shinglesize)
            print("Recalculated")
        if (mode == 'Query LSH' or mode == 'Q'):
            protein = input('Protein accession: ')
            start_time = time.time()
            result = minhash.queryProtein(protein)
            if result is not None:
                jaccResultsDict = minhash.checkJaccardResultsOfProtein(protein, result)
                # Return the results in sorted order, big to small Jaccard score
                sorted_jaccResultsDict = OrderedDict(sorted(jaccResultsDict.items(), key=lambda x: -x[1]))
                for jaccRes in sorted_jaccResultsDict.items():
                    print("\nMatch with Jaccard:", jaccRes[1])
                    information = uniDB.extractProteinInformation(jaccRes[0])
                    proteininfo = uniProtein(*information)
                    proteininfo.printUniProtein(printSeq=False)
            print("Runtime of query search: %s seconds " % (time.time() - start_time))
        if (mode == 'Calculate All' or mode == 'CA'):
            start_time = time.time()
            uniDB = UniprotDB("Uniprot_DB.sqlite")
            # uni_DB.close()
            proteins = uniDB.extractProteins()
            # minhash.calculateLSH([protein[1] for protein in proteins])
            minhashes, lsh = minhash.calculateLSH(proteins, 3)
            for protein in proteins:
                print("Protein ", protein[0])
                result = minhash.queryProtein(protein[0])
                if result is not None:
                    jaccResultsDict = minhash.checkJaccardResultsOfProtein(protein[0], result)
                    sorted_jaccResultsDict = OrderedDict(sorted(jaccResultsDict.items(), key=lambda x: -x[1]))
                    for jaccRes in sorted_jaccResultsDict.items():
                        print(jaccRes[0], " - Jaccard: ", jaccRes[1])
            print("Runtime of query all: %s seconds " % (time.time() - start_time))
        if (mode == 'Query All LSH' or mode == 'A'):
            resultsDB = ResultsDB("Results_DB.sqlite")
            resultsDB.createLSHtable("lshresults")
            resultsDB.deleteTable("lshresults")
            resultsDB.createLSHtable("lshresults")
            for query in minhash.minhashes.keys():
                matches = minhash.queryProtein(query)
                for match in matches:
                    # Filter self-matches
                    if query != match:
                        jaccard = minhash.estimateJaccard(query, match)
                        resultsDB.addLSHresult(query, match, jaccard, "lshresults")
            print(resultsDB.extractLSHresults("lshresults"))
        if (mode == 'Read BLAST Results' or mode == 'B'):
            filename = input('Filename: ')
            handle = open(filename, 'r')
            resultsDB = ResultsDB("Results_DB.sqlite")
            resultsDB.createBLASTtable()
            resultsDB.deleteBLASTresults()
            resultsDB.createBLASTtable()
            for line in handle:
                line = line[:-1].split('\t')
                # Extract accessions from 'sp|A0A0R6L508|MCR1_ECOLX'-like strings
                line[0] = line[0].split('|')[1]
                line[1] = line[1].split('|')[1]
                print(line)
                # Filter self-matches, add to the database
                if line[0] != line[1]:
                    resultsDB.addBLASTresult(line[0], line[1], line[2], line[3])
            print(resultsDB.extractBLASTresults())
        if (mode == 'Compare Results' or mode == 'R'):
            # Database with all LSH and BLASTp results
            resultsDB = ResultsDB("Results_DB.sqlite")
            identity_th, alignment_th, jaccard_th = 80.0, 100, 0.5
            precisions = []
            recalls = []
            # Load in all protein ids to loop over
            uniDB = UniprotDB("Uniprot_DB.sqlite")
            proteins = uniDB.extractProteins()
            # Store all precisions and recalls per query, to calculate the average
            for query in proteins:
                intersect = resultsDB.extractIntersectCountPerProtein(
                    query[0], 'lshresults', identity_th, alignment_th, jaccard_th)
                lshresults = resultsDB.extractLSHcountPerProtein(query[0], 'lshresults', jaccard_th)
                blastresults = resultsDB.extractBLASTcountPerProtein(query[0], identity_th, alignment_th)
                tp = intersect
                fp = lshresults - intersect
                fn = blastresults - intersect
                precision = tp / (tp + fp) if (tp + fp) != 0 else -1
                recall = tp / (tp + fn) if (tp + fn) != 0 else -1
                # Exclude results without any similar proteins / division by zero
                if precision != -1:
                    precisions.append(precision)
                if recall != -1:
                    recalls.append(recall)
            print("Comparison of BLAST and LSH results:\n Number of proteins queried: %i \n Average precision: %0.3f Average recall: %0.3f\n"
                  % (len(proteins), sum(precisions) / len(precisions), sum(recalls) / len(recalls)))
        if (mode == 'Save LSH' or mode == 'S'):
            number = int(input('Suffix number: '))
            minhash.saveLSH(number)
        if (mode == 'Load LSH' or mode == 'LL'):
            number = int(input('Suffix number: '))
            minhash.loadLSH(number)
        mode = input('Choose option: ')
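# Worked example (illustrative numbers only, not from the original data): if a query protein
# has 10 LSH matches, 12 BLAST matches, and 8 matches in common, then tp = 8, fp = 2, fn = 4,
# so precision = 8 / 10 = 0.80 and recall = 8 / 12 ≈ 0.67 under the scheme above.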