Ejemplo n.º 1
0
##  retained clusters on the basis of closeness of the distance between the sample
##  and the cluster means.

##  Call syntax:
##
##         Clustering_with_LSH_with_sample_based_merging.py


from LocalitySensitiveHashing import *

# CSV file handed to the LSH constructor below.
datafile = "data_for_lsh.csv"

# r: number of rows in each band (r-wise AND inside a band).
# b: number of bands (b-wise OR over all b bands).
lsh = LocalitySensitiveHashing(
    datafile=datafile,
    dim=10,
    r=50,
    b=100,
    expected_num_of_clusters=10,
)

# Preparation calls, issued in the order the library demo uses them:
# read the CSV, show the data, set up the hash store, hash every record,
# and print the hash bins as they stand before LSH is applied.
lsh.get_data_from_csv()
lsh.show_data_for_lsh()
lsh.initialize_hash_store()
lsh.hash_all_data()
lsh.display_contents_of_all_hash_bins_pre_lsh()

# Each stage consumes the groups returned by the previous one:
# neighborhood clusters -> coalescence merge -> L2-norm sample-based merge.
similarity_groups = lsh.lsh_basic_for_neighborhood_clusters()
coalesced_similarity_groups = lsh.merge_similarity_groups_with_coalescence(
    similarity_groups
)
merged_similarity_groups = lsh.merge_similarity_groups_with_l2norm_sample_based(
    coalesced_similarity_groups
)

# Hand the final groups to the library's quality evaluation.
lsh.evaluate_quality_of_similarity_groups(merged_similarity_groups)
 def setUp(self):
     """Prepare ``self.lsh``: build the hasher, load the CSV, set up the hash store."""
     hasher = LocalitySensitiveHashing.LocalitySensitiveHashing(
         datafile="data_for_lsh.csv",
         dim=10,
         r=50,
         b=100,
     )
     # Publish the instance before the loading calls, as the fixture always has,
     # so it is reachable on `self` even if one of them raises.
     self.lsh = hasher
     hasher.get_data_from_csv()
     hasher.initialize_hash_store()
Ejemplo n.º 3
0
 def setUp(self):
     """Prepare ``self.lsh`` with the CSV loaded and expose its data dictionary."""
     hasher = LocalitySensitiveHashing.LocalitySensitiveHashing(
         datafile="data_for_lsh.csv",
         dim=10,
         r=50,
         b=100,
     )
     # Publish the instance before loading so it is set even if loading raises.
     self.lsh = hasher
     hasher.get_data_from_csv()
     # Keep a direct handle on the hasher's private `_data_dict` attribute,
     # read only after get_data_from_csv() has run.
     self.data_dict = hasher._data_dict
Ejemplo n.º 4
0
#!/usr/bin/env python

##  LSH_basic_for_demonstrating_nearest_neighbors.py

##  This script demonstrates the basic functioning of the LSH algorithm for finding
##  the nearest neighbors of given data elements.

##  Call syntax:
##
##         LSH_basic_for_demonstrating_nearest_neighbors.py
##
##  The last method called in this script, lsh_basic_for_nearest_neighbors(), places
##  the user in an interactive mode.  The script asks you to enter the name of a data
##  record from the file that was processed by the LSH algorithm.  It
##  subsequently returns the nearest neighbors of that data record.

from LocalitySensitiveHashing import *

# r: number of rows in each band (r-wise AND inside a band).
# b: number of bands (b-wise OR over all b bands).
lsh = LocalitySensitiveHashing(
    datafile="data_for_lsh.csv", dim=10, r=50, b=100
)

# Preparation calls, in the order the demo relies on: read the CSV, show the
# data, set up the hash store, hash every record, then print the hash bins as
# they stand before LSH is applied.
lsh.get_data_from_csv()
lsh.show_data_for_lsh()
lsh.initialize_hash_store()
lsh.hash_all_data()
lsh.display_contents_of_all_hash_bins_pre_lsh()

# Per the script header, this final call puts the user in an interactive mode
# that asks for a data record name and reports that record's nearest neighbors.
similarity_neighborhoods = lsh.lsh_basic_for_nearest_neighbors()
Ejemplo n.º 5
0
# save features
# Build the output path once, up front, so the writer below and any later
# reader of `feature_file` cannot drift apart.
feature_file = './tmp/' + str(params.dim) + '_dim_features.txt'

# exist_ok=True replaces the exists()/mkdir() pair, which could fail if the
# directory appeared between the check and the create.
os.makedirs('./tmp', exist_ok=True)

with open(feature_file, 'w') as fout:
    # One line per record: "<idx[0]>, <v0>, <v1>, ...".
    for idx, vector in zip(corpus, reduce_tfidf):
        # Cast to float16, render the resulting list as text, then drop any
        # newlines and the surrounding square brackets.
        vector_ = str(np.array(vector, np.float16).tolist()).replace(
            '\n', '').strip('[').strip(']')
        # NOTE(review): idx[0] is concatenated directly, so it is presumably
        # a string record label -- confirm against how `corpus` is built.
        fout.write(idx[0] + ', ' + vector_ + '\n')

# clustering
# `dim` follows params.dim instead of a fixed 200, so the hasher stays in step
# with the feature file, whose name is keyed on params.dim.
# NOTE(review): assumes params.dim is the integer dimensionality of the
# vectors written to `feature_file` -- confirm against the argument parser.
lsh = LocalitySensitiveHashing(
    datafile=feature_file,
    dim=params.dim,
    r=50,
    b=100,
    expected_num_of_clusters=params.expected,
)

# Load and hash the records before any grouping is requested.
lsh.get_data_from_csv()
lsh.initialize_hash_store()
lsh.hash_all_data()

# Each stage consumes the groups returned by the previous one:
# neighborhood clusters -> coalescence merge -> L2-norm sample-based merge.
similarity_groups = lsh.lsh_basic_for_neighborhood_clusters()
coalesced_similarity_groups = lsh.merge_similarity_groups_with_coalescence(
    similarity_groups)
merged_similarity_groups = lsh.merge_similarity_groups_with_l2norm_sample_based(
    coalesced_similarity_groups)

# saving results
# Create the ./results directory if it is not there yet; presumably the output
# is written into it by code beyond this excerpt.
if not os.path.exists('./results'):
    os.mkdir('results')