def test_can_query_using_existing_row(self):
    X = csr_matrix([[3, 0, 0, 0, 0, 0, -1],
                    [0, 1, 0, 0, 0, 0, 1],
                    [1, 1, 1, 1, 1, 1, 1]])
    # One class label for each input point
    y = ["0", "blah", "kill"]
    # I've changed the last 0 to a 1
    X_sim = csr_matrix([[1, 1, 1, 1, 1, 1, 1]])
    lsh = LSH(4, X.shape[1], num_hashtables=1,
              storage_config={"dict": None})
    for ix in range(X.shape[0]):
        x = X.getrow(ix)
        c = y[ix]
        lsh.index(x, extra_data=c)
    # find the point in X nearest to X_sim
    points = lsh.query(X_sim, num_results=1)
    print(points[0])
    self.assertEqual(type(points[0][0][0]), csr_matrix)
    truth = points[0][0][0].todense() == X_sim.todense()
    self.assertTrue(truth.all())
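# For reference, the result layout that the snippets below rely on: each
# entry returned by lsh.query() unpacks as ((vector, extra_data), distance).
# That is why the test above reaches points[0][0][0] for the matched row,
# and why the scripts further down read the extra data at [i][0][1].
# A minimal sketch of unpacking one entry (names reuse the test's locals):
(matched_vector, extra_data), distance = points[0]
assert (matched_vector.todense() == X_sim.todense()).all()
assert extra_data == "kill"  # the label indexed with the matching row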
import csv
import pickle
from os.path import isfile

from sklearn.model_selection import ShuffleSplit
from sparselsh import LSH

# load_sparse_csr is assumed to be a project helper that loads a saved
# scipy CSR matrix


def find_nearest_neighbors(in_file, out_file, path_to_table, num_test,
                           random_seed, hash_size, num_neighbors):
    posts = load_sparse_csr(in_file)
    num_samples, num_dimensions = posts.shape

    # Train / test split
    splitter = ShuffleSplit(n_splits=1, test_size=num_test,
                            random_state=random_seed)
    train_index, test_index = next(splitter.split(posts))

    # Load or create the LSH table object
    if path_to_table:
        if isfile(path_to_table):
            lsh = pickle.load(open(path_to_table, 'rb'))
            print('LSH table loaded')
        else:
            raise Exception("LSH table doesn't exist")
    else:
        lsh = LSH(hash_size, num_dimensions, num_hashtables=1,
                  storage_config={"dict": None})
        # Both the train and test posts are indexed, so that saving the
        # output is easier and the stored table can be reused for any
        # train/test split
        for ix in range(num_samples):
            x = posts.getrow(ix)
            lsh.index(x, extra_data=ix)
        print('LSH table constructed')
        # Dump the table, so that it can be reused later
        pickle.dump(lsh, open('lsh_table', 'wb'))

    # Perform the nearest neighbor search for each sample in the test set
    nearest_neighbors = dict()
    for ix in test_index:
        neighbors = lsh.query(posts.getrow(ix))
        # The closest element is always the element itself, so iterate to
        # num_neighbors + 1; the element's own index doubles as the row id
        nearest_neighbors[ix] = [neighbors[i][0][1]
                                 for i in range(num_neighbors + 1)]
    print('Nearest neighbors found')

    # Save the results to csv
    nn_file = csv.writer(open(out_file, 'w', newline=''),
                         doublequote=False, escapechar='\\')
    nn_file.writerow(['id'] + ['n{}'.format(i)
                               for i in range(1, num_neighbors + 1)])
    for row in nearest_neighbors.values():
        nn_file.writerow(row)
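# The original body read its parameters from a global `args` object,
# suggesting the function was driven from the command line. A hypothetical
# argparse wrapper that would supply those parameters (the flag names and
# defaults here are assumptions, not taken from the original script):
import argparse

if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        description='Find nearest neighbors with an LSH table')
    parser.add_argument('in_file')
    parser.add_argument('out_file')
    parser.add_argument('--path-to-table', default=None)
    parser.add_argument('--num-test', type=int, default=1000)
    parser.add_argument('--random-seed', type=int, default=0)
    parser.add_argument('--hash-size', type=int, default=8)
    parser.add_argument('--num-neighbors', type=int, default=5)
    args = parser.parse_args()
    find_nearest_neighbors(args.in_file, args.out_file, args.path_to_table,
                           args.num_test, args.random_seed, args.hash_size,
                           args.num_neighbors)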
# vector representing the document (a NumPy array, so that the in-place
# normalisation below is valid; the original built a plain Python list)
vec = np.zeros(len(featTraj))
for word in tweet['tokens']:
    if word == "":
        continue
    vec[keys.index(word)] = featTraj[word][t]

# normalise
norm = np.linalg.norm(vec) + 1e-6
vec /= norm

# sparse vector for the LSH library
sparseVec = csr_matrix(vec)

# find the nearest tweet id, -1 if not found
if t != 0:
    min_dis, idTweet = fsd(hashTable, sparseVec)
    lshIds.append(idTweet)
    mDIS.append(min_dis)
    if 1 - min_dis < similarityThreshold:
        output = bucketTweet(idx, idTweet, min_dis)
        # print(output)
    else:
        bucketTweet(idx, 0, 0)

# insert the new tweet in the table
hashTable.index(sparseVec, extra_data=idx)

# check for burst
if idx % burstCheckFrequency == 0:
    checkBurst(idx)
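# fsd() above is an application helper, not part of SparseLSH. A minimal
# hypothetical sketch of it, assuming the ((vector, extra_data), distance)
# query result layout shown in the test earlier, and that the returned
# distances are cosine distances (the caller treats 1 - min_dis as a
# similarity):
def fsd(hash_table, sparse_vec):
    """Return (distance, tweet_id) of the nearest indexed tweet,
    or (1.0, -1) when the table yields no candidates."""
    results = hash_table.query(sparse_vec, num_results=1)
    if not results:
        return 1.0, -1
    (_, tweet_id), distance = results[0]
    return distance, tweet_id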
for a in t_vectors:
    t_vector_list[a] = v_standardise(t_vectors[a]['ti_table'])

if t_x > corpus_size:  # don't run for the first tweet
    y = [a for a in t_vector_list]
    X = lil_matrix([t_vector_list[a] for a in t_vector_list])
    X = X.tocsr()
    t_5 = datetime.datetime.now()
    lsh = LSH(8, X.shape[1], num_hashtables=1,
              storage_config={"dict": None})
    for ix in range(X.shape[0]):
        x = X.getrow(ix)
        _c = y[ix]
        lsh.index(x, extra_data=_c)
    t_2 = datetime.datetime.now()
    print("Time interval 5: " + str(t_2 - t_5))

# now implement tf-idf sorting
# print("Time interval 1: " + str(t_2 - t_1))

# idf:
for a in temp_vocab:
    try:
        temp_vector[a] = math.log(t_x / vocab[a])
    except ValueError:
        pass
# tf:
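# With the table built, finding the tweet closest to a new standardised
# vector is a single query call. A minimal sketch, assuming the
# ((vector, extra_data), distance) result layout from the test earlier and
# reusing y and lsh from inside the block above; new_vec is a hypothetical
# standardised query vector:
new_vec = csr_matrix([v_standardise(t_vectors[y[0]]['ti_table'])])
results = lsh.query(new_vec, num_results=1)
if results:
    (_, tweet_id), distance = results[0]
    print("nearest tweet: {} at distance {}".format(tweet_id, distance))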
class SparseLshClass:
    ID = 0
    SENTIMENT = 1
    TEXT = 2

    def __init__(self, text_data, num_text_entries, image_shape=(64, 64),
                 hash_size=64, num_hashtables=1, full_images=False,
                 gray_scale=False, color_images=False):
        '''
        Creates a Sparse LSH handler class to create, query, and generate
        binary codes from a SparseLSH object. This class can be used to
        perform manual querying of the constructed SparseLSH indexer, or it
        can be used to retrieve binary codes from a given set of text. It
        can also be used as a sentiment classifier based on the averaged
        sentiment returned from the SparseLSH Indexer.

        :param text_data: A list of strings with the text entries to index.
        :param num_text_entries: The number of entries that are to be
            stored in the SparseLSH Indexer.
        :param image_shape: The shape of the images that are generated
            from the text.
        :param hash_size: The size of the binary code that is generated by
            the SparseLSH Indexer.
        :param num_hashtables: The number of hash tables that are used in
            the SparseLSH Indexer.
        :param full_images: A boolean that specifies whether images will
            fill the image with a sample text until it has been completely
            filled.
        :param gray_scale: A boolean that specifies whether an image will
            be generated as gray_scale or smoothed. If color_images is
            True, the resulting image will be smoothed.
        :param color_images: A boolean to specify whether the generated
            image will be RGB or binary. If True, the image will be an RGB
            image.
        '''
        # initialise settings (instance attributes rather than shared,
        # class-level mutables)
        self.textData = text_data
        self._text_image_list = []
        self._query_val = None
        self._processed_data = np.zeros((1, 1))
        self._query_data = np.zeros((1, 1))
        self._data_csr_matrix = None
        self._lsh = None
        self._length_of_data = num_text_entries
        # image creation properties
        self._full_images = full_images
        self._gray_scale = gray_scale
        self._color_images = color_images
        self._image_shape = image_shape
        # process the raw text into the _processed_data array
        self.get_data(self.textData, num_data_rows=num_text_entries)
        # initialise the NumPy data array and configure the LSH matrix
        self.initialise_data_array()
        self.configure_lsh_matrix(hash_size=hash_size,
                                  num_hashtables=num_hashtables)

    def get_data(self, text_data, num_data_rows=5000):
        '''
        Store the text data as ImageClass objects.

        :param text_data: The text data that is to be added to the Indexer.
        :param num_data_rows: The number of entries to add to the Indexer.
        :return: None
        '''
        for i in range(num_data_rows):
            self._text_image_list.append(
                Ic(text_data[i], 0,
                   image_shape=self._image_shape,
                   is_colour=self._color_images,
                   gray_scale=self._gray_scale,
                   full_image=self._full_images))

    def get_data_row(self, index):
        '''
        Get a specific text string based upon the provided index.

        :param index: An index for the requested set of text.
        :return: A text string for the specified index.
        '''
        return self.textData[index]

    def initialise_data_array(self):
        '''
        Initialises the data array that will be fed to the SparseLSH Index.

        :return: None
        '''
        # initialise processed data array
        self._processed_data = np.zeros(
            (len(self._text_image_list), self._image_shape[1]))
        for i, row in enumerate(self._text_image_list):
            dat = row.get_image_vector()
            for j in range(row.arrayRows[1]):
                self._processed_data[i][j] = dat[0][j]

    def initialise_query_array(self):
        '''
        Initialises the query array that will be used to query the
        SparseLSH Index.

        :return: None
        '''
        # initialise query data array
        self._query_data = np.zeros((1, self._image_shape[1]))
        dat = self._query_val.get_image_vector()
        for j in range(self._query_val._image_shape[1]):
            self._query_data[0][j] = dat[0][j]

    def query(self, text, num_queries, id_val, image_shape,
              full_image=False, gray_scale=False, color_image=False):
        '''
        Queries the SparseLSH index with the specified text and image
        configurations.

        :param text: The text string used to query the SparseLSH Index.
        :param num_queries: The number of results returned by the query.
        :param id_val: The ID for the text.
        :param image_shape: The shape of the generated image.
        :param full_image: A boolean that specifies whether images will
            fill the image with a sample text until it has been completely
            filled.
        :param gray_scale: A boolean that specifies whether an image will
            be generated as gray_scale or smoothed. If color_image is True,
            the resulting image will be smoothed.
        :param color_image: A boolean to specify whether the generated
            image will be RGB or binary. If True, the image will be an RGB
            image.
        :return: The results vector from the SparseLSH Index.
        '''
        # initialise query object
        self._query_val = Ic(text, id_val=id_val, image_shape=image_shape,
                             full_image=full_image, gray_scale=gray_scale,
                             is_colour=color_image)
        self.initialise_query_array()
        # query the Index and return the results
        return self._lsh.query(csr_matrix(self._query_data),
                               num_results=num_queries)

    def configure_lsh_matrix(self, hash_size=4, num_hashtables=1):
        '''
        Configures the SparseLSH Indexer object with the hash sizes and
        hash tables.

        :param hash_size: The size of the binary codes that will be
            returned from the LSH hashing function.
        :param num_hashtables: The number of hash tables that will be used
            to perform index search querying.
        :return: None
        '''
        self._data_csr_matrix = csr_matrix(self._processed_data)
        self._lsh = LSH(hash_size, self._data_csr_matrix.shape[1],
                        num_hashtables=num_hashtables,
                        storage_config={"dict": None})
        # get the list of sentiments to index alongside the data for
        # querying
        sentiment_list = [row.sentiment for row in self._text_image_list]
        # build the index
        for ix in range(self._data_csr_matrix.shape[0]):
            data_vals = self._data_csr_matrix.getrow(ix)
            sentiment_vals = sentiment_list[ix]
            self._lsh.index(data_vals, extra_data=sentiment_vals)

    def configure_query_matrix(self):
        '''
        Configures the query matrix into a form that can be fed into the
        Index.

        :return: SciPy csr_matrix object of the query data.
        '''
        return csr_matrix(self._query_data)

    def get_sentiment_of_text(self, text, num_returned_results=5,
                              text_id=0, image_shape=10, threshold=0.5,
                              full_image=False, gray_scale=False,
                              color_image=False):
        '''
        Determines the sentiment of given text by computing the average
        sentiment of the queried results.

        :param text: A string of text used to query the Index to determine
            its sentiment.
        :param num_returned_results: The number of results a query returns.
        :param text_id: The ID of a given set of text. This is used if
            there is a specific ID associated with the text data.
        :param image_shape: The shape of the image the text will be
            converted to.
        :param threshold: The threshold used to decide whether the
            sentiment of the text is positive or negative. If the average
            sentiment of the returned results is greater than or equal to
            the threshold, the sentiment is positive (True); otherwise it
            is negative (False).
        :param full_image: A boolean that specifies whether images will
            fill the image with a sample text until it has been completely
            filled.
        :param gray_scale: A boolean that specifies whether an image will
            be generated as gray_scale or smoothed. If color_image is True,
            the resulting image will be smoothed.
        :param color_image: A boolean to specify whether the generated
            image will be RGB or binary. If True, the image will be an RGB
            image.
        :return: A boolean indicating whether the sentiment is positive
            (True) or negative (False).
        '''
        results = self.query(text, num_returned_results, text_id,
                             image_shape, full_image=full_image,
                             gray_scale=gray_scale, color_image=color_image)
        # classify from the average sentiment of the returned results
        average = float(self.get_average_sentiment(
            results, num_results=num_returned_results))
        return average >= threshold

    def get_query_code(self, text, image_shape, id_val=0, full_image=False,
                       gray_scale=False, color_image=False):
        '''
        Gets the binary code representation of a given text string that has
        been converted into an image based upon the given image
        configuration.

        :param text: A string of text used to get a binary code based on
            the image generated from the text.
        :param image_shape: The shape of the image the text will be
            converted to.
        :param id_val: The ID for the text.
        :param full_image: A boolean that specifies whether images will
            fill the image with a sample text until it has been completely
            filled.
        :param gray_scale: A boolean that specifies whether an image will
            be generated as gray_scale or smoothed. If color_image is True,
            the resulting image will be smoothed.
        :param color_image: A boolean to specify whether the generated
            image will be RGB or binary. If True, the image will be an RGB
            image.
        :return: The binary code representation of the provided text after
            it has been converted into an image.
        '''
        # convert the text to an image
        self._query_val = Ic(text, id_val=id_val, image_shape=image_shape,
                             gray_scale=gray_scale, full_image=full_image,
                             is_colour=color_image)
        # initialise the query data structure and hash the query
        self.initialise_query_array()
        return self._lsh.get_binary_code(self._query_data)

    @staticmethod
    def get_average_sentiment(results, num_results):
        '''
        Averages the sentiment of all of the returned results.

        :param results: An array of all results retrieved from the
            SparseLSH Index query.
        :param num_results: The number of returned results.
        :return: A float containing the average of the returned sentiments.
        '''
        # sum the extra_data (sentiment) field of each result, then divide;
        # the original divided the running sum on every iteration, which
        # under-weights the earlier results
        total = 0.0
        for i in range(num_results):
            total += results[i][0][1]
        return total / num_results

    @staticmethod
    def read_in_text_data(file_location):
        '''
        Reads in data from a specified text file.

        :param file_location: The location of the file to retrieve the
            data from.
        :return: A list with all of the data entries.
        '''
        text_data = []
        with open(file_location, 'r', newline='') as csvfile:
            reader = csv.reader(csvfile, delimiter=',', quotechar='|')
            for row in reader:
                text_data.append(row)
        return text_data
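# A minimal usage sketch of the class above. The CSV path, its column
# layout, the sample text, and the entry count are hypothetical; Ic (the
# text-to-image helper) is assumed to be importable as in the class itself:
rows = SparseLshClass.read_in_text_data('sentiment_data.csv')
texts = [row[SparseLshClass.TEXT] for row in rows]

handler = SparseLshClass(texts, num_text_entries=len(texts),
                         image_shape=(64, 64), hash_size=64)
is_positive = handler.get_sentiment_of_text(
    'great product, would buy again', image_shape=(64, 64))
print('positive' if is_positive else 'negative')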