def test1():
    """Smoke-test simhash similarity estimation on the small 20newsgroup index.

    Loads a prebuilt hash index, estimates pairwise similarities, writes the
    sparse similarity matrix as a CLUTO graph file, and for a few example
    document pairs logs the estimated similarity next to the raw proportion
    of hash-table bucket matches and the cosine it implies.
    """
    from ilshclus import load_index
    from ilshclus import simhash_estimate
    log = get_ilshlogger()
    log.debug("working with 20newsgroup data...")
    ix = load_index('./hash_index_20newsgroup-small.bin')
    # Index attributes of interest: ix.total_docs, ix.input_dim,
    # ix.nr_of_bands, ix.band_size
    S = simhash_estimate(ix)
    txtcol.sparse_mat_to_cluto_graph(
        S, "20newsgtest_est{0}_{1}_sim.dat".format(ix.nr_of_bands, ix.band_size))
    # some example docs.
    for i, j in [(0, 1), (0, 2), (0, 4), (0, 5), (0, 6),
                 (0, 7), (0, 8), (0, 9), (1, 0), (1, 2)]:
        # Proportion of hash tables with a bucket holding both documents.
        # Count with a generator instead of len() over a throwaway list.
        match_prop = sum(
            1 for tab in ix.Index for buck_id in tab
            if i in tab[buck_id] and j in tab[buck_id]) / ix.nr_of_bands
        log.debug("Est. similarity between {0} and {1} is {2}".format(
            i, j, S[i, j]))
        log.debug(
            "Proportion of matches in the index was {0}".format(match_prop))
        log.debug("Thus, cos(0.5*pi*(1-{0})) = {1}".format(
            match_prop, np.cos((np.pi / 2) * (1 - match_prop))))
    log.debug("test finished!")
def test():
    """Smoke-test simhash similarity estimation on the DOE index.

    Loads a prebuilt hash index and, for a few example document pairs, logs
    the estimated similarity next to the raw proportion of hash-table bucket
    matches and the cosine it implies.
    """
    from ilshclus import load_index
    from ilshclus import simhash_estimate
    log = get_ilshlogger()
    log.debug("cargando el indice...")
    ix = load_index('./hash_index_DOE.data')
    log.debug("indice cargado")
    log.debug(ix.total_docs)
    # Index attributes of interest: ix.input_dim, ix.nr_of_bands, ix.band_size
    S = simhash_estimate(ix)
    # some example docs.
    for i, j in [(0, 1), (0, 2), (0, 4), (0, 5), (0, 6),
                 (0, 7), (0, 8), (0, 9), (1, 0), (1, 2)]:
        # Proportion of hash tables with a bucket holding both documents.
        # Count with a generator instead of len() over a throwaway list.
        match_prop = sum(
            1 for tab in ix.Index for buck_id in tab
            if i in tab[buck_id] and j in tab[buck_id]) / ix.nr_of_bands
        log.debug("Est. similarity between {0} and {1} is {2}".format(
            i, j, S[i, j]))
        log.debug(
            "Proportion of matches in the index was {0}".format(match_prop))
        log.debug("Thus, cos(0.5*pi*(1-{0})) = {1}".format(
            match_prop, np.cos((np.pi / 2) * (1 - match_prop))))
    log.debug("test finished!")
def index_to_cluto(index_file, output_mat):
    """Convert a stored hash index into a CLUTO sparse graph file.

    :param index_file: path of the serialized index to load.
    :param output_mat: destination path for the CLUTO-format matrix.
    """
    log = get_ilshlogger()
    log.debug("loading index {0}...".format(index_file))
    loaded_index = load_index(index_file)
    log.debug("calculating similarity...")
    similarity = simhash_estimate(loaded_index)
    log.debug("storing sparse distance mat to {0}...".format(output_mat))
    txtcol.sparse_mat_to_cluto_graph(data=similarity, outputfile=output_mat)
    log.debug("cluto matrix done...")
def index_text_data_FR(nr_of_bands=500, band_length=5, outputdir='.'):
    """Build a hashing-based LSH index over the FR corpus and save it.

    :param nr_of_bands: number of hash bands (tables) in the index.
    :param band_length: hash length per band.
    :param outputdir: directory where the serialized index is written.
    """
    log = get_ilshlogger()
    log.debug("Fetching corpus...")
    docterms, labels = txtcol.get_corpus_FR()
    log.debug("Indexing collection")
    # Bug fix: the original format string passed outputdir as argument 0 but
    # never referenced {0}, hard-coding './' and silently ignoring the
    # parameter. Join it properly (default '.' keeps the old behavior).
    outputpath = os.path.join(
        outputdir,
        'hash_index_FR_b{0}r{1}.data'.format(nr_of_bands, band_length))
    hI = ic.HashingBasedIndex(docterms.shape[1],
                              nr_of_bands=nr_of_bands,
                              band_length=band_length)
    hI.index_collection(docterms)
    log.debug("Saving index to disk...")
    ic.save_index(hI, outputpath)
    log.debug("Index written into {0}.".format(outputpath))
def build_ilsh_index(objTxtData: LabeledTextData, nr_of_bands=500,
                     band_length=5, outputdir='.', outputfname=None):
    """Create the index for a given LabeledTextData object.

    :param objTxtData: labeled text data exposing a .docterm matrix.
    :param nr_of_bands: number of hash bands (tables) in the index.
    :param band_length: hash length per band.
    :param outputdir: directory for the serialized index (only used when
        outputfname is given).
    :param outputfname: base file name; when None the index is not persisted.
    :return: the in-memory HashingBasedIndex.
    """
    log = get_ilshlogger()
    hI = ic.HashingBasedIndex(objTxtData.docterm.shape[1],
                              nr_of_bands=nr_of_bands,
                              band_length=band_length)
    hI.index_collection(objTxtData.docterm)
    # Idiom fix: 'x is not None' instead of 'not x is None'.
    if outputfname is not None:
        outputpath = os.path.join(
            outputdir,
            '{0}_b{1}r{2}.data'.format(outputfname, nr_of_bands, band_length))
        log.debug("Indexing collection into {0}".format(outputpath))
        log.debug("Saving index to disk...")
        ic.save_index(hI, outputpath)
        log.debug("Index written into {0}.".format(outputpath))
    return hI
def index_small_20ng(nr_of_bands=500, band_length=5, outputdir='.'):
    """Index the small 20newsgroup corpus (10 documents) for testing purposes.

    :param nr_of_bands: number of hash bands (tables) in the index.
    :param band_length: hash length per band.
    :param outputdir: directory where the serialized index is written.
    :return: None
    """
    log = get_ilshlogger()
    log.debug("Fetching corpus...")
    # corpusVectors = txtcol.get20ngCorpusData()
    corpusVectors = txtcol.getSmall20ngCorpusData()
    # matrix and column labels
    docterm, features = ic.get_vectors(corpusVectors)
    log.debug("Indexing collection")
    # Bug fix: the original format string passed outputdir as argument 0 but
    # never referenced {0}, hard-coding './' and silently ignoring the
    # parameter. Join it properly (default '.' keeps the old behavior).
    outputpath = os.path.join(
        outputdir,
        'hash_index_20newsgroup-small_b{0}r{1}.data'.format(
            nr_of_bands, band_length))
    hI = ic.HashingBasedIndex(len(features),
                              nr_of_bands=nr_of_bands,
                              band_length=band_length)
    hI.index_collection(docterm)
    log.debug("Saving index to disk...")
    ic.save_index(hI, outputpath)
    log.debug("Index written into {0}.".format(outputpath))
'/home/juan/incremental-clustering/incremental-clustering-out') end = time.time() log.debug("Elapsed time {0:.3f} secs.".format(end - start)) log.debug("Estimating similarity...") start = time.time() #initial_cos_sim = simhash_estimate(initialIndex) log.debug("#items:{0} #bands:{1} band_size:{2} ... {3}".format( initialIndex.total_docs, initialIndex.nr_of_bands, initialIndex.band_size, compute_index_properties(initialIndex))) end = time.time() log.debug("Elapsed time {0:.3f} secs.".format(end - start)) if __name__ == '__main__': log = get_ilshlogger() dataObj = None datadir = '/home/juan/datasets/text-data' TXTCOL = AvailableCollections.DOE log.debug("Loading data...") start = time.time() if TXTCOL is AvailableCollections.DOE: dataObj = DOEData(basedir=datadir) elif TXTCOL is AvailableCollections.FR: dataObj = FRData(basedir=datadir) elif TXTCOL is AvailableCollections.AP: dataObj = APData(basedir=datadir) elif TXTCOL is AvailableCollections.ZF: dataObj = ZFData(basedir=datadir) elif TXTCOL is AvailableCollections.WSJ: