Code Example #1
 def cloud_main(self, file_count):
     # Note: conn, cursor and sql_select_Query are initialized outside this excerpt;
     # Code Example #14 below shows the same method with the MySQL setup included.
     with open('encrypted_vectors/feature_vectors.json') as data_file:
         feature_loaded = json.load(data_file)
     with open('encrypted_indices/indices.json') as data_file:
         indices_loaded = json.load(data_file)
     with open('encrypted_query/query_vectors.json') as data_file:
         query_loaded = json.load(data_file)
     d = Decrypt()
     feature_vectors = d.decrypt_indices_vector(bytes(feature_loaded))
     indices = d.decrypt_indices_vector(bytes(indices_loaded))
     query_vectors = d.decrypt_indices_vector(bytes(query_loaded))
     feature_vectors = np.frombuffer(feature_vectors, dtype=int)
     feature_vectors = np.reshape(feature_vectors, (file_count, -1))
     indices = json.loads(indices.decode())
     query_vectors = np.frombuffer(query_vectors, dtype=int)
     query_vectors = np.reshape(query_vectors, (-1, 6))
     print(feature_vectors.shape, indices, query_vectors.shape)
     l = LSH(feature_vectors, indices)
     n_neighbors, result = l.query(query_vectors, 6, 45)
     print(n_neighbors)
     cursor.execute(sql_select_Query)
     records = cursor.fetchall()
     for row in records:
         if row[0] - 1 in result:
             image_name = row[1]
             print(image_name)
             CloudAPISender().cloud_api_sender(image_name)
     # Closing the connection
     conn.close()
Code Example #2
def loadOrTrainLSHModel(forceGenerate=False):
    lshModel = None
    if os.path.exists("./pickle_files/lshModel.pickle") and not forceGenerate:
        print("LSH model found on disk")
        pickleIn = open("./pickle_files/lshModel.pickle", "rb")
        lshModel = pickle.load(pickleIn)
    else:
        print("Training LSH model")
        trainAudioDataAndRateArray = loadAllFiles("train", '')
        trainingData = generateData(trainAudioDataAndRateArray)

        lshModel = LSH()
        for data in trainingData:
            print("fileName", data[1])
            validFrameList = extractValidFrames(data[0])

            for i in range(0, validFrameList.shape[0]):
                reshapedValidFrame = validFrameList[i].reshape(1, -1)
                # print("reshapedValidFrame", reshapedValidFrame)
                lshModel.train(reshapedValidFrame, {
                    "name": data[1] + "_" + str(i),
                    "frameIndex": i
                })
                # hr.train(validFrames[1:2], data[1])

        # print("lshModel", lshModel)

        pickleOut = open("./pickle_files/lshModel.pickle", "wb")
        pickle.dump(lshModel, pickleOut)
        pickleOut.close()

    return lshModel
Code Example #3
def get_similar(num_layers, num_hashes, features, filepaths, query_image, num_results):
    lsh = LSH(num_layers, num_hashes, 10)
    lsh.fit(features)

    query_vec = get_cm_features_by_image_path(query_image)
    similar_indices = lsh.get_similar(query_vec, num_results)[:num_results]

    return [list(features[x]) for x in similar_indices], [filepaths[x] for x in similar_indices]
Code Example #4
File: test_lsh.py Project: pdoyle5000/lsh
    def setUp(self):
        self.lsh = LSH(3, 2, 1)
        self.lsh_two_tables = LSH(3, 2, 2)

        # Overwrite randomly initialized planes with known values.
        self.lsh.planes = [np.array([[0.1, 0.2], [-0.1, -0.2], [-1.0, 1.0]])]
        self.lsh_two_tables.planes = [
            np.array([[0.1, 0.2], [-0.1, -0.2], [-1.0, 1.0]]),
            np.array([[-0.1, -0.2], [0.1, 0.2], [-2.0, 2.0]]),
        ]
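For context, random-hyperplane (SimHash-style) LSH of the kind this test exercises derives a bucket key from the signs of dot products against the plane normals. The snippet below is a minimal, self-contained sketch of that idea in plain NumPy; it is not the pdoyle5000/lsh implementation, and the function name hyperplane_hash is invented for illustration.

import numpy as np

def hyperplane_hash(vector, planes):
    """Illustrative sketch: one bit per hyperplane, set by the sign of the projection."""
    bits = (planes @ vector) >= 0          # boolean array, one entry per plane
    return ''.join('1' if b else '0' for b in bits)

# The same three 2-D planes the test above assigns to self.lsh.planes[0]
planes = np.array([[0.1, 0.2], [-0.1, -0.2], [-1.0, 1.0]])
print(hyperplane_hash(np.array([1.0, 0.5]), planes))   # prints '100'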
Code Example #5
    def __init__(self, N, D, K, L):
        self.lsh = LSH(SimHash(D, K, L), K, L)
        self.keys = np.zeros((N, D), dtype=np.float32)
        self.values = np.zeros((N, 1), dtype=np.float32)
        self.lru = np.zeros(N, dtype=np.float32)
        self.key2idx = dict()

        self.size = 0
        self.max_memory = N
        self.K = K
        self.L = L
Code Example #6
File: nn.py Project: mikaelbaymani/nn
def query(fname, key='key', topk=10, truncate=80):

    model = pickle.load(open(CONST.MODEL, 'rb'))

    dataframe = pd.read_csv(CONST.DATASET)
    corpus = TfidfVectorizer().fit_transform(dataframe['content'])

    lsh = LSH(corpus, model)
    index = dataframe[dataframe[key].apply(str) == str(fname)].index[0]

    dataframe['content'] = dataframe['content'].str[:int(truncate)]
    return lsh.query(corpus[index, :], int(topk),
                     10)[0].join(dataframe,
                                 on='id').sort_values('distance').iloc[:, 1:]
Code Example #7
 def _build_lsh(self, minhashes):
     '''
         Builds the LSH using the given minhash vectors
     '''
     self.lsh = LSH(minhashes)
     with open(self.LSH_FILE, 'wb') as f:
         pickle.dump(self.lsh.hash_tables, f)
Code Example #8
 def __init__(self, digest_length, num_hashtables, dTreshold):
     self._dTrehold = dTreshold
     self._lsh = LSH(digest_length,
                     2**20,
                     storage_config=None,
                     num_hashtables=num_hashtables)
     self._hashvect = HashVect()
Code Example #9
File: index.py Project: lunaryan/lsh
 def query(self, input_list, topk=10, key_dist=1, dist_func=LSH.cosine_dist):
     query_docinfo = ''
     if self.docinfo_lsh is None:
         query_docinfo = np.array(input_list, dtype='float32')
     else:
         query_docinfo = bitarray(''.join(self.docinfo_lsh.hash(input_list)))
         
     rs_dict = {}
     if key_dist >= 0:
         query_key_list = self.lsh.hash(input_list)
         for i, query_key in enumerate(query_key_list):
             for k in LSH.get_keys_str(query_key, key_dist):
                 k += ('_' + str(i))
                 if k in self.index_dict:
                     for idx in self.index_dict[k]:
                         if idx in rs_dict:
                             continue
                         dist = dist_func(self.docinfo_list[idx], query_docinfo)
                         docid = self.docid_list[idx]
                         rs_dict[docid] = dist
     else:
         # brute force search
         for docid, docinfo in zip(self.docid_list, self.docinfo_list):
             dist = dist_func(docinfo, query_docinfo)
             rs_dict[docid] = dist
     print 'Candidates: %d' % len(rs_dict)
     return (sorted(rs_dict.items(), key=itemgetter(1), reverse=False)[: topk], len(rs_dict))
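The LSH.get_keys_str(query_key, key_dist) call above appears to enumerate bucket keys within a small Hamming distance of the query's own key, so that near-miss buckets are probed as well. As a rough, self-contained illustration of that multi-probe idea (not the lunaryan/lsh code; the helper name keys_within_hamming is invented here and bit-string keys are assumed):

from itertools import combinations

def keys_within_hamming(key, max_dist):
    """Illustrative sketch: all bit-string keys within Hamming distance max_dist of key."""
    results = {key}
    for d in range(1, max_dist + 1):
        for positions in combinations(range(len(key)), d):
            flipped = list(key)
            for p in positions:
                flipped[p] = '1' if key[p] == '0' else '0'
            results.add(''.join(flipped))
    return results

print(sorted(keys_within_hamming('101', 1)))   # ['001', '100', '101', '111']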
Code Example #10
File: task5.py Project: satrajitrik/HandImages
def starter(image_id, m, k, l):
    query_results = Database().retrieve_many()
    id_vector_pairs = [(item["image_id"], item["vector"])
                       for item in query_results]

    search_results = LSH(l, k, id_vector_pairs).get_search_results(image_id,
                                                                   show=True)

    print(
        "Original dataset size: {} | Reduced search space size: {} | Reduction by {} %"
        .format(
            len(id_vector_pairs),
            len(search_results),
            float(len(id_vector_pairs) - len(search_results)) * 100 /
            len(id_vector_pairs),
        ))
    query_results = Database().retrieve_many(list(search_results))
    search_id_vector_pairs = [(item["image_id"], item["vector"])
                              for item in query_results]
    source_vector = Database().retrieve_one(image_id)["vector"]

    all_images = functions.find_similarity(source_vector,
                                           search_id_vector_pairs)
    similar_images = all_images[:m]

    print(similar_images)

    visualizer.visualize_lsh(image_id, similar_images)

    return similar_images, all_images
Code Example #11
File: klsh.py Project: JohPa8696/LSHiForest
	def __init__(self, nbits=3, kernel='rbf', kernel_kwds='range', para_p_max=300, para_p_exp=0.5, para_t_max=30, para_t_ratio=4, weight_pool_size=50):
		LSH.__init__(self, weight_pool_size)
		self._nbits = nbits
		self._kernel_func = None
		self._kernel_kwds = kernel_kwds
		self._check_kernel(kernel, kernel_kwds)
		self._train_data = None
		self._para_p = para_p_max
		self._para_p_exp = para_p_exp
		self._para_t = para_t_max
		self._para_t_ratio = para_t_ratio
		self._K_half = None
		self._weight_pool_size = weight_pool_size
		self._e_s_pool = None 
		self._weight_pool = None 
		self._k = None
Code Example #12
 def load_lsh(self):
     '''
         Loads the buckets from the files and initializes an LSH object using this
         data.
     '''
     with open(self.LSH_FILE, 'rb') as f:
         data = pickle.load(f)
         self.lsh = LSH(table=data)
Code Example #13
    def __init__(self, movie_filename, rating_filename, k, m, c):

        # Hyperparameter
        self.c = c

        # read movie file and create dictionary _movie_names
        self._movie_names = {}
        f = open(movie_filename, "r", encoding="utf8")
        reader = csv.reader(f)
        next(reader)  # skips header line
        for line in reader:
            movieid = line[0]
            moviename = line[1]
            # ignore line[2], genre
            self._movie_names[movieid] = moviename
        f.close()

        # read rating file and create _movie_ratings (ratings for a movie)
        # and _user_ratings (ratings by a user) dicts
        self._movie_ratings, self._movie_time = {}, {}
        self._user_ratings, self._user_time = {}, {}
        f = open(rating_filename, "r", encoding="utf8")
        reader = csv.reader(f)
        next(reader)  # skips header line
        for line in reader:
            userid = line[0]
            movieid = line[1]
            rating = line[2]
            timestamp = line[3]
            if userid not in self._user_ratings:
                self._user_ratings[userid] = {
                }  # each user is a dict with movies and ratings
                self._user_time[userid] = {}
            self._user_ratings[userid][movieid] = float(rating)
            self._user_time[userid][movieid] = float(timestamp)

            if movieid not in self._movie_ratings:
                self._movie_ratings[movieid] = {}
                self._movie_time[movieid] = {}
            self._movie_ratings[movieid][userid] = float(rating)
            self._movie_time[movieid][userid] = float(timestamp)
        f.close()

        self.me = LSH(k, m, self._user_ratings, self._movie_ratings)
Code Example #14
 def cloud_main(self, file_count):
     #establishing the connection
     conn = mysql.connector.connect(user='******',
                                    password='******',
                                    host='127.0.0.1',
                                    database='ImageRetrieval')
     #Creating a cursor object using the cursor() method
     cursor = conn.cursor()
     # Preparing SQL query to select a record from the database.
     sql_select_Query = "select * from images"
     with open('encrypted_vectors/feature_vectors.json') as data_file:
         feature_loaded = json.load(data_file)
     with open('encrypted_indices/indices.json') as data_file:
         indices_loaded = json.load(data_file)
     with open('encrypted_query/query_vectors.json') as data_file:
         query_loaded = json.load(data_file)
     d = Decrypt()
     feature_vectors = d.decrypt_indices_vector(bytes(feature_loaded))
     indices = d.decrypt_indices_vector(bytes(indices_loaded))
     query_vectors = d.decrypt_indices_vector(bytes(query_loaded))
     feature_vectors = np.frombuffer(feature_vectors, dtype=int)
     feature_vectors = np.reshape(feature_vectors, (file_count, -1))
     indices = json.loads(indices.decode())
     query_vectors = np.frombuffer(query_vectors, dtype=int)
     query_vectors = np.reshape(query_vectors, (-1, 6))
     print(feature_vectors.shape, indices, query_vectors.shape)
     l = LSH(feature_vectors, indices)
     n_neighbors, result = l.query(query_vectors, 6, 45)
     print(n_neighbors)
     cursor.execute(sql_select_Query)
     records = cursor.fetchall()
     for row in records:
         if row[0] - 1 in result:
             image_name = row[1]
             print(image_name)
             CloudAPISender().cloud_api_sender(image_name)
     # Closing the connection
     conn.close()
Code Example #15
def run(dataPath):
    nrofBands = 25
    nrOfPerms = 132
    nrOfRows = int(nrOfPerms / nrofBands)

    # Load data
    data = np.load(dataPath)
    print("Data is loaded")

    # Create an empty file to add results
    with open('results.txt', 'w') as file:
        file.write("")
        file.close()

    # Create Signature Matrix
    signatureMatrix = minHashing(data, nrOfPerms)
    pairsFound = LSH(data, signatureMatrix, nrOfRows, nrofBands)
    print(" Number of similar pairs found is : ", len(pairsFound))
    return (pairsFound)
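In Code Example #15 the MinHash signature matrix is split into nrofBands bands of nrOfRows rows each, and any two documents that collide in at least one band become a candidate pair. The sketch below shows that banding step in isolation; it is written from the general technique, not from this project's LSH function, and it assumes a signature matrix laid out as rows (permutations) by columns (documents).

from collections import defaultdict
from itertools import combinations
import numpy as np

def banded_candidates(signatures, n_bands, rows_per_band):
    """Illustrative sketch: bucket each band of each column; columns sharing a bucket become candidates."""
    n_docs = signatures.shape[1]
    candidates = set()
    for b in range(n_bands):
        band = signatures[b * rows_per_band:(b + 1) * rows_per_band, :]
        buckets = defaultdict(list)
        for doc in range(n_docs):
            buckets[tuple(band[:, doc])].append(doc)
        for docs in buckets.values():
            candidates.update(combinations(docs, 2))
    return candidates

# Toy signature matrix: 4 permutations (rows) x 3 documents (columns)
sig = np.array([[1, 1, 5],
                [2, 2, 6],
                [3, 9, 7],
                [4, 9, 8]])
print(banded_candidates(sig, n_bands=2, rows_per_band=2))   # {(0, 1)}: documents 0 and 1 collide in band 0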
Code Example #16
from create_vector import vectorize
from lsh import LSH
from parse_document import extract_from_pdf
import sys

if __name__ == "__main__":
    pdfs = sys.argv[1:]
    if pdfs:
        vector_list = [vectorize(extract_from_pdf(pdf)) for pdf in pdfs]
    else:
        print "Usage: python create_vector.py pdf1 [pdf2] [pdf3] .."
        sys.exit()
    lsh = LSH(300)
    [lsh.insert_document(title, vector) for (title, vector) in vector_list]
    # print lsh.get_similarities()
    print lsh.closest_match(vector_list[0][0])
    # return vector_list
Code Example #17
import pandas as pd
import sqlite3


uniDB = UniprotDB("Uniprot_DB.sqlite")
#Construct the protein database
"""
uniDB.deleteProteins()
protManager = ProteinsManager()
uniDB.createTables()
protManager.loadProteins("Ecolx.xml",uniDB)
protManager.loadProteins("PseA7.xml",uniDB)
"""


minhash3 = LSH(0.3,32)
minhash4 = LSH(0.3,64)
minhash4b = LSH(0.3,96)
minhash5 = LSH(0.3,128)

# Create the minhashes
proteins = uniDB.extractProteins()
"""
minhashes3, lsh3 = minhash3.calculateLSH(proteins, 3)
minhashes4, lsh4 = minhash4.calculateLSH(proteins, 3)
minhashes4b, lsh4b = minhash5.calculateLSH(proteins, 3)
minhashes5, lsh5 = minhash6.calculateLSH(proteins, 3)

minhash3.saveLSH(32)
minhash4.saveLSH(64)
minhash4b.saveLSH(96)
Code Example #18
from lsh import LSH
import cPickle as pickle

#define model
model = LSH(base_vec_num=3, iter_num=2, dimens=2)

#data
vec_dict = {
    "a": [0.1, 0.2],
    "b": [0.5, -0.2],
    "c": [-0.3, -0.1],
    "d": [1.0, 0.0],
    "e": [-3, 2],
    "f": [2, 2]
}

#build
model.build_lsh(vec_dict)

#get candidate set
for name in model.get_candidate_set([0.3, 0.9]):
    print name,
print ""

#test pickle
with open("model.pkl", 'wb') as fout:
    pickle.dump(model, fout)
with open("model.pkl", 'rb') as fin:
    dumped_model = pickle.load(fin)
    print "dumped model:"
    for name in dumped_model.get_candidate_set([0.3, 0.9]):
Code Example #19
#coding: utf-8

from lsh import LSH

lsh = LSH(L=10,k=5,d=11)
lsh.loadDataSet('training_set1.txt')
vect = [1,10,1,11,1,13,1,12,1,1,9]
knn_vects = lsh.knn(vect,3)
print knn_vects
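The constructor arguments here presumably set the number of hash tables (L), the number of hash functions per table (k) and the input dimensionality (d, matching the 11-component query vector), though the lsh module itself is not shown. For reference, the exact search that lsh.knn(vect, 3) approximates is a plain brute-force k-nearest-neighbour scan; a self-contained sketch with made-up data:

import numpy as np

def exact_knn(dataset, query, k):
    """Illustrative baseline: exact k nearest neighbours by Euclidean distance (what LSH approximates)."""
    dataset = np.asarray(dataset, dtype=float)
    dists = np.linalg.norm(dataset - np.asarray(query, dtype=float), axis=1)
    return dataset[np.argsort(dists)[:k]]

data = np.random.randint(0, 15, size=(100, 11))   # toy stand-in for training_set1.txt (11-dimensional rows)
print(exact_knn(data, [1, 10, 1, 11, 1, 13, 1, 12, 1, 1, 9], 3))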
Code Example #20
File: dedup.py Project: stteffen58/Seminarthesis
print 'create tfidf vectors of documents'
tfidf = TFIDF(doc_dict)
''' Perform lsh '''
print time.asctime(time.localtime(time.time()))
digest_length = int(sys.argv[2])
vect_length = tfidf.vect_length
num_hashtables = 1
log += 'perform lsh with hash-length: ' + str(
    digest_length) + ', vect-length: ' + str(
        vect_length) + ', num-hashtables: ' + str(num_hashtables) + '\n'
print 'perform lsh with hash-length: ' + str(
    digest_length) + ', vect-length: ' + str(
        vect_length) + ', num-hashtables: ' + str(num_hashtables)
r = {"dict": None}
lsh = LSH(digest_length,
          vect_length,
          storage_config=r,
          num_hashtables=num_hashtables)
for i, k in enumerate(tfidf._id_list):
    vect = tfidf.get_vector(i)
    lsh.index(vect, extra_data=tfidf._id_list[i])
''' Query documents '''
log += str(time.asctime(time.localtime(time.time()))) + '\n'
log += 'query documents\n'
print time.asctime(time.localtime(time.time()))
print 'Query documents'
distance_func = "cosine"

corr = set()

for i, key in enumerate(tfidf._id_list):
    query_object = tfidf.get_vector(i)
Code Example #21
#MySQL related information needs to be updated here
conn = mysql.connector.connect(
   user='******', password='******', host='127.0.0.1', database='ImageRetrieval')
#Creating a cursor object using the cursor() method
cursor = conn.cursor()
# Preparing SQL query to INSERT a record into the database.
insert_stmt = (
   "insert into images (path)"
   "values (%s)"
)
key = (0.1, 0.1)
num_of_random_vectors = 16
hc = HarrisCorner()
sb = SurfBow()
e = Encrypt()
l = LSH()
#Image directory path to be mentioned here
img_dir = "../images/"
feat_vec=[]

if not os.path.exists("encrypted_images/"):
        cwd = os.getcwd()
        directory = "/encrypted_images"
        os.mkdir(cwd+directory)
if not os.path.exists("encrypted_indices/"):
        cwd = os.getcwd()
        directory = "/encrypted_indices"
        os.mkdir(cwd+directory)
if not os.path.exists("encrypted_vectors/"):
        cwd = os.getcwd()
        directory = "/encrypted_vectors"
Code Example #22
def train(device, data, schedule, mi_type, args):
    model = MI_Estimator(device, D=d, ED=ed, HD=256)
    model.to(device)
    model.train()

    optimizer = optim.Adam(model.parameters(), lr=5e-4)

    xs, ys = data
    xs = xs.to(device)
    ys = ys.to(device)
    zxs = torch.cat([xs, zerot], dim=0)

    lsh = LSH(SimHash(ed, K, L), K, L)

    estimates = []
    for batch_idx, MI in enumerate(schedule):
        optimizer.zero_grad()

        # randomly select data from data distribution
        sdx_iter = (batch_idx // mi_range) * mi_range
        sdx_offset = sdx_iter * batch_size
        sdx = torch.from_numpy(
            np.random.choice(mi_range *
                             batch_size, batch_size, replace=False) +
            sdx_offset).to(device)

        t = 10 if batch_idx <= 1000 else 100
        if batch_idx % t == 0:
            # Load first section of desired size into lsh hash tables
            lxs = xs[:desired_size, :]
            assert (lxs.size(0) == desired_size)
            build(lsh, model, lxs)

            #lsh.stats()
            # Full - Load All Data
            #build(lsh, model, xs)

        # embed data
        y = F.embedding(sdx, ys).detach()
        ey = model.embed_y(y)

        # for each data sample, query lsh data structure, remove accidental hit
        # find maximum number of samples
        # create matrix and pad appropriately
        np_indices = lsh.query_remove_matrix(ey, sdx, xs.size(0))
        indices = torch.from_numpy(np_indices).to(device)

        # create mask distinguishing between samples and padding
        mask = 1.0 - torch.eq(indices, xs.size(0)).float()
        mask = torch.cat([bs_onet, mask], dim=1).detach()

        px = torch.unsqueeze(F.embedding(sdx, xs), dim=1)
        nx = F.embedding(indices, zxs, padding_idx=xs.size(0))
        x = torch.cat([px, nx], dim=1).detach()

        mi = model(x, y, mask, args)
        loss = -mi
        loss.backward()
        optimizer.step()

        estimates.append(mi.item())
        if (batch_idx + 1) % 100 == 0:
            print('{} {} MI:{}, E_MI: {:.6f}'.format(mi_type.name,
                                                     batch_idx + 1, MI,
                                                     mi.item()))
            sys.stdout.flush()
    lsh.stats()
    return estimates
Code Example #23
File: main.py Project: bgrana/DM_SimilarObjects
def main(args):
    # Get input params
    input_dir = args["dir"]
    th = args["th"]

    # Read all files contained in the input directory
    print("Loading documents...")
    onlyfiles = [f for f in listdir(input_dir) if isfile(join(input_dir, f))]
    docs = []
    for fname in onlyfiles:
        with open(join(input_dir, fname), "r") as file:
            docs += [file.read()]

    # Clean documents removing trailing and duplicate blanks
    print("Cleaning documents...")
    docs = [re.sub('\W+', ' ', doc) for doc in docs]

    # Compute shingles of size n
    print("Computing shingles...")
    sh = Shingling(args["n"])
    shingles = sh.transform(docs)

    # Compute jaccard similarities
    print("Jaccard similarities (on hashed shingles) > " + str(th) + ":")
    similarities = {(onlyfiles[i], onlyfiles[j]):
                    compare_shingles(shingles[i], shingles[j])
                    for i in range(0, len(docs))
                    for j in range(i + 1, len(docs))}
    # Show similarities greater than the threshold
    print(
        sorted([(k, v) for k, v in similarities.items() if v > th],
               key=itemgetter(1),
               reverse=True))

    # Compute minHash signatures
    print("Computing signatures...")
    mh = MinHashing(args["k"])
    signatures = mh.transform(shingles)

    # Compute similarity estimations
    print("Similarity estimations using minHashing > " + str(th) + ":")
    estimations = {(onlyfiles[i], onlyfiles[j]):
                   compare_signatures(signatures[:, i], signatures[:, j])
                   for i in range(0, len(docs))
                   for j in range(i + 1, len(docs))}
    # Show similarity estimations greater than a threshold
    print(
        sorted([(k, v) for k, v in estimations.items() if v > th],
               key=itemgetter(1),
               reverse=True))

    # Show Differences between estimations and real similarities
    errors = {(onlyfiles[i], onlyfiles[j]):
              abs(estimations[(onlyfiles[i], onlyfiles[j])] -
                  similarities[(onlyfiles[i], onlyfiles[j])])
              for i in range(0, len(docs)) for j in range(i + 1, len(docs))}
    # Show errors greater than 5%
    print("Estimations with error greater than 5%:")
    print(
        sorted([(k, v) for k, v in errors.items() if v > 0.05],
               key=itemgetter(1),
               reverse=True))

    # Apply LSH to find pairs of probable similar items
    lsh = LSH(signatures, th)
    lsh.index()
    candidates = lsh.get_pairs()

    # Show candidates
    print("Identified candidates with LSH:")
    print([(onlyfiles[t[0]], onlyfiles[t[1]]) for t in candidates])
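The compare_shingles and compare_signatures helpers used above are not shown in this excerpt; under the usual definitions they would return the exact Jaccard similarity of two shingle sets and the fraction of agreeing MinHash rows, respectively. The implementations below are hypothetical, offered only to make the example self-explanatory:

import numpy as np

def compare_shingles(a, b):
    """Assumed behaviour: exact Jaccard similarity of two shingle sets."""
    a, b = set(a), set(b)
    return len(a & b) / len(a | b) if a | b else 0.0

def compare_signatures(sig_a, sig_b):
    """Assumed behaviour: estimated Jaccard similarity = fraction of MinHash rows that agree."""
    sig_a, sig_b = np.asarray(sig_a), np.asarray(sig_b)
    return float(np.mean(sig_a == sig_b))

print(compare_shingles({1, 2, 3}, {2, 3, 4}))          # 0.5
print(compare_signatures([7, 4, 9, 1], [7, 5, 9, 1]))  # 0.75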
Code Example #24
from mrjob.job import MRJob
from mrjob.step import MRStep
import util
import os
from datastore import SQLiteDatastore
import config
import random

SPACE = u' '

PATH = os.path.dirname(os.path.abspath(__file__))

from lsh import LSH, DocumentTooShortError
datastore = SQLiteDatastore(config.SQLITE_PATH, False)
lsh = LSH(datastore)


class CandidatesMapReducer(MRJob):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.remaining = str()

    def steps(self):
        return [
            MRStep(
                mapper=self.mapper_paragraphs,
                mapper_final=self.mapper_paragraphs_final,
                reducer=self.reducer_minhash,
            ),
            MRStep(reducer=self.reducer_unique, ),
        ]
Code Example #25
File: test_lsh.py Project: windkeepblow/LSH
from lsh import LSH
import cPickle as pickle

#define model
model = LSH(base_vec_num=3, iter_num=2, dimens=2)

#data
vec_dict = {
    "a":[0.1,0.2],
    "b":[0.5,-0.2],
    "c":[-0.3,-0.1],
    "d":[1.0,0.0],
    "e":[-3,2],
    "f":[2,2]
}

#build
model.build_lsh(vec_dict)

#get candidate set
for name in model.get_candidate_set([0.3,0.9]):
    print name,
print ""

#test pickle 
with open("model.pkl", 'wb') as fout:
    pickle.dump(model, fout)
with open("model.pkl", 'rb') as fin:
    dumped_model = pickle.load(fin)
    print "dumped model:"
    for name in dumped_model.get_candidate_set([0.3,0.9]):
Code Example #26
class DND:
    MAX_SIZE = 25
    TM = 0.1

    def __init__(self, N, D, K, L):
        self.lsh = LSH(SimHash(D, K, L), K, L)
        self.keys = np.zeros((N, D), dtype=np.float32)
        self.values = np.zeros((N, 1), dtype=np.float32)
        self.lru = np.zeros(N, dtype=np.float32)
        self.key2idx = dict()

        self.size = 0
        self.max_memory = N
        self.K = K
        self.L = L

    def __contains__(self, key):
        return tuple(key) in self.key2idx

    def __getitem__(self, key):
        try:
            index = self.key2idx[tuple(key)]
            self.lru[index] += DND.TM
            return self.values[index]
        except:
            return None

    def __setitem__(self, key, value):
        item = tuple(key)
        try:
            # 1) Find memory index for key vector
            index = self.key2idx[item]
        except:
            # 2) Add key vector if not present
            if self.size >= self.max_memory:
                # 3) If memory is full, select LRU memory index and remove from LSH hash tables
                index = np.argmin(self.lru)
                self.lsh.erase(self.keys[index], index)
            else:
                index = self.size
                self.size += 1

            # Rehash key into LSH hash tables
            self.lsh.insert(key, index)
            self.key2idx[item] = index

            # Add new key to memory
            self.keys[index] = key
        finally:
            # Update memory value
            self.values[index] = value
            self.lru[index] += DND.TM

    def retrieve(self, query):
        # Collect memory indices from LSH hash tables
        indices, cL = self.lsh.query(query.data, DND.MAX_SIZE)

        # Gather keys and values from memory
        keys = self.keys[indices]
        values = self.values[indices]
        self.lru[indices] += DND.TM

        assert (keys.shape[0] == values.shape[0])
        return keys, values, indices, cL
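The eviction branch in __setitem__ above reuses the slot with the smallest lru counter once the memory is full. Stripped of the LSH bookkeeping, that policy amounts to the following self-contained sketch (the TinyMemory class is invented here purely to isolate the idea):

import numpy as np

class TinyMemory:
    """Illustrative sketch of the fixed-size, least-recently-used slot policy used by DND."""
    def __init__(self, capacity, dim):
        self.keys = np.zeros((capacity, dim), dtype=np.float32)
        self.values = np.zeros(capacity, dtype=np.float32)
        self.lru = np.zeros(capacity, dtype=np.float32)
        self.size = 0

    def write(self, key, value, tick=0.1):
        if self.size >= len(self.values):
            index = int(np.argmin(self.lru))   # reuse the least-recently-used slot
        else:
            index = self.size
            self.size += 1
        self.keys[index] = key
        self.values[index] = value
        self.lru[index] += tick                # touching a slot marks it as recently used

mem = TinyMemory(capacity=2, dim=3)
mem.write(np.ones(3), 1.0)
mem.write(np.zeros(3), 2.0)
mem.write(np.full(3, 5.0), 3.0)                # memory full: overwrites the least-recently-used slot
print(mem.values)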
Code Example #27
from mrjob.job import MRJob
from mrjob.step import MRStep
from mrjob.protocol import TextProtocol
import util
import os
from datastore import SQLiteDatastore
import config
from lsh import LSH, DocumentTooShortError

datastore = SQLiteDatastore(config.SQLITE_PATH, False)
lsh = LSH(datastore, paragraphs=True)

class GeneratorMapReducer(MRJob):

    INPUT_PROTOCOL = TextProtocol

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.remaining = str()


    def steps(self):
        return [
            MRStep(
                mapper=self.mapper_articles,
                reducer=self.reducer_minhash,
            )
        ]


    def mapper_articles(self, article_id, article):
Code Example #28
    def run(self):
        print(\
        """Local Sensitivity Hashing-based protein similarity search.
	Options: E[X]it, [L]oad Database, [D]elete Database,
	[C]alculate LSH, [RC] Recalculate LSH, [LL] Load LSH, [S]ave LSH
	[Q]uery LSH, Query [A]ll LSH, Read [B]LAST, Compare [R]esults,
		""")
        mode = input('Choose option:')

        uniDB = UniprotDB("Uniprot_DB.sqlite")
        minhash = LSH(0.5, 96)

        while (mode != 'Exit' and mode != 'X'):

            if (mode == 'Delete Database' or mode == 'D'):
                uniDB.deleteProteins()

            if (mode == 'Load Database' or mode == 'L'):
                protManager = ProteinsManager()
                uniDB.createTables()
                filename = input(
                    'XML filename (e.g. Ecolx.xml or PseA7.xml or Human.xml): '
                )
                protManager.loadProteins(filename, uniDB)

            if (mode == 'Calculate LSH' or mode == 'C'):
                uniDB = UniprotDB("Uniprot_DB.sqlite")
                proteins = uniDB.extractProteins()
                minhashes, lsh = minhash.calculateLSH(proteins, 3)
                print("Calculated")

            if (mode == 'Recalculate LSH' or mode == 'RC'):
                jaccardThreshold = float(
                    input(
                        "Specify a Jaccard similarity threshold (default: 0.5): "
                    ))
                permutations = int(
                    input(
                        "Specify the number of permutations(default: 96) : "))
                shinglesize = int(
                    input("Specify the shingle size (default: 3): "))
                minhash = LSH(jaccardThreshold, permutations)
                proteins = uniDB.extractProteins()
                minhashes, lsh = minhash.calculateLSH(proteins, shinglesize)
                print("Recalculated")

            if (mode == 'Query LSH' or mode == 'Q'):
                protein = input('Protein accession: ')
                start_time = time.time()
                result = minhash.queryProtein(protein)
                if result is not None:
                    jaccResultsDict = minhash.checkJaccardResultsOfProtein(
                        protein, result)
                    # Return the results in sorted order, big to small Jaccard score
                    sorted_jaccResultsDict = OrderedDict(
                        sorted(jaccResultsDict.items(), key=lambda x: -x[1]))
                    for jaccRes in sorted_jaccResultsDict.items():
                        print("\nMatch with Jaccard:", jaccRes[1])
                        information = uniDB.extractProteinInformation(
                            jaccRes[0])
                        proteininfo = uniProtein(*information)
                        proteininfo.printUniProtein(printSeq=False)
                print("Runtime of query search: %s seconds " %
                      (time.time() - start_time))

            if (mode == 'Calculate All' or mode == 'CA'):
                start_time = time.time()
                uniDB = UniprotDB("Uniprot_DB.sqlite")
                #uni_DB.close()
                proteins = uniDB.extractProteins()
                #minhash.calculateLSH([protein[1] for protein in proteins])
                minhashes, lsh = minhash.calculateLSH(proteins, 3)
                for protein in proteins:
                    print("Protein ", protein[0])
                    result = minhash.queryProtein(protein[0])
                    if result is not None:
                        jaccResultsDict = minhash.checkJaccardResultsOfProtein(
                            protein[0], result)
                        sorted_jaccResultsDict = OrderedDict(
                            sorted(jaccResultsDict.items(),
                                   key=lambda x: -x[1]))
                        for jaccRes in sorted_jaccResultsDict.items():
                            print(jaccRes[0], " - Jaccard: ", jaccRes[1])
                print("Runtime of query all: %s seconds " %
                      (time.time() - start_time))

            if (mode == 'Query All LSH' or mode == 'A'):
                resultsDB = ResultsDB("Results_DB.sqlite")
                resultsDB.createLSHtable("lshresults")
                resultsDB.deleteTable("lshresults")
                resultsDB.createLSHtable("lshresults")
                for query in minhash.minhashes.keys():
                    matches = minhash.queryProtein(query)
                    for match in matches:
                        # Filter self-matches
                        if query != match:
                            jaccard = minhash.estimateJaccard(query, match)
                            resultsDB.addLSHresult(query, match, jaccard,
                                                   "lshresults")
                print(resultsDB.extractLSHresults("lshresults"))

            if (mode == 'Read BLAST Results' or mode == 'B'):
                filename = input('Filename: ')
                handle = open(filename, 'r')
                resultsDB = ResultsDB("Results_DB.sqlite")
                resultsDB.createBLASTtable()
                resultsDB.deleteBLASTresults()
                resultsDB.createBLASTtable()
                for line in handle:
                    line = line[:-1].split('\t')
                    # Extract accessions from 'sp|A0A0R6L508|MCR1_ECOLX'-like string
                    line[0] = line[0].split('|')[1]
                    line[1] = line[1].split('|')[1]
                    print(line)
                    # Filter self-matches, add to the database
                    if line[0] != line[1]:
                        resultsDB.addBLASTresult(line[0], line[1], line[2],
                                                 line[3])
                print(resultsDB.extractBLASTresults())

            if (mode == 'Compare Results' or mode == 'R'):

                # Database with all LSH and BLASTp results
                resultsDB = ResultsDB("Results_DB.sqlite")
                identity_th, alignment_th, jaccard_th = 80.0, 100, 0.5
                precisions = []
                recalls = []

                # Load in all protein ids to loop over
                uniDB = UniprotDB("Uniprot_DB.sqlite")
                proteins = uniDB.extractProteins()
                # Store all precisions and recalls per query, to calculate the average
                for query in proteins:
                    intersect = resultsDB.extractIntersectCountPerProtein(
                        query[0], 'lshresults', identity_th, alignment_th,
                        jaccard_th)
                    lshresults = resultsDB.extractLSHcountPerProtein(
                        query[0], 'lshresults', jaccard_th)
                    blastresults = resultsDB.extractBLASTcountPerProtein(
                        query[0], identity_th, alignment_th)
                    tp = intersect
                    fp = lshresults - intersect
                    fn = blastresults - intersect
                    precision = tp / (tp + fp) if (tp + fp) != 0 else -1
                    recall = tp / (tp + fn) if (tp + fn) != 0 else -1
                    # Exclude results without any similar proteins / division by zero
                    if precision != -1:
                        precisions.append(precision)
                    if recall != -1:
                        recalls.append(recall)

                print("Comparison of BLAST and LSH results:\n Number of proteins queried: %i \n Average precision: %0.3f Average recall: %0.3f\n" \
                 % (len(proteins), sum(precisions)/len(precisions), sum(recalls)/len(recalls)))

            if (mode == 'Save LSH' or mode == 'S'):
                number = int(input('Suffix number: '))
                minhash.saveLSH(number)

            if (mode == 'Load LSH' or mode == 'LL'):
                number = int(input('Suffix number: '))
                minhash.loadLSH(number)

            mode = input('Choose option: ')
Code Example #29
File: nn.py Project: mikaelbaymani/nn
    if "-h" in sys.argv or "--help" in sys.argv:

        print(
            "Usage: ./nn.py [OPTION]                                    \n\n"
            "   -h | --help        Show this help message and exit        \n"
            "   --fetch <plugin>   Fetch new data with proprietary plugin \n"
            "   --train            Train LSH model                        \n"
            "   --query <ID>       Nearest Neighbor query                 \n")
        exit(0)

    if "--fetch" in sys.argv:

        pluginName = sys.argv[2].replace('.py', '')
        Dimport("%s" % pluginName, pluginName,
                FULLNAME('plugins'))(CONST.DATASET)

    if "--train" in sys.argv:

        dataframe = pd.read_csv(CONST.DATASET)
        corpus = TfidfVectorizer().fit_transform(dataframe['content'])

        lsh = LSH(corpus)
        model = lsh.train()

        pickle.dump(model, open(CONST.MODEL, 'wb'))

    if "--query" in sys.argv:
        print(query(sys.argv[2]))

# eof
Code Example #30
def lsh_cluster(unlabel):
    return LSH(unlabel)
Code Example #31
import pandas as pd
import sqlite3

uniDB = UniprotDB("Uniprot_DB_ec_pa_human.sqlite")
#Construct the protein database
"""
uniDB.deleteProteins()
protManager = ProteinsManager()
uniDB.createTables()
protManager.loadProteins("Ecolx.xml",uniDB)
protManager.loadProteins("PseA7.xml",uniDB)
"""
protManager = ProteinsManager()
protManager.loadProteins("Human.xml", uniDB)

minhash3 = LSH(0.3, 96)
minhash4 = LSH(0.5, 96)
minhash5 = LSH(0.5, 128)

# Create the minhashes
proteins = uniDB.extractProteins()

minhashes3, lsh3 = minhash3.calculateLSH(proteins, 3)
minhashes4, lsh4 = minhash4.calculateLSH(proteins, 3)
minhashes5, lsh5 = minhash5.calculateLSH(proteins, 3)

minhash3.saveLSH(963)
minhash4.saveLSH(965)
minhash5.saveLSH(1285)
"""
minhash3.loadLSH(963)
Code Example #32
with gzip.open(datasetPath + filenameIn + '_sample.warc.gz', mode='rb') as gzf:
    for record in warc.WARCFile(fileobj=gzf):
        record_id = record['WARC-Record-ID']
        payload = record.payload.read()
        doc_uri[record_id] = record['WARC-Target-URI']
        text = HTMLPreprocessing(payload).get_text()
        doc_dict[record_id] = text
        doc_count += 1

print 'create vectors'
tfidf = TFIDF(doc_dict)
vect_length = tfidf.vect_length  # length of the input vector
num_hashtables = 1  # number of iterations
digest_length = 0
print 'perform lsh'
lsh = LSH(digest_length, vect_length, num_hashtables=num_hashtables)
for i, k in enumerate(tfidf._id_list):
    vect = tfidf.get_vector(i)
    lsh.index(vect, extra_data=tfidf._id_list[i])
''' Query documents '''
dedup = set()
keys = lsh.hash_tables[0].keys()
i = 0
for key in keys:
    bucket = lsh.hash_tables[0].get_val(key)
    for query_object in bucket:
        candidates = lsh.query(query_object[0], distance_func='cosine')
        for c in candidates:
            candidate_key = c[0][
                1]  # warc id is appended as extra data in lsh.index()
            if candidate_key == query_object[1]: