Example #1
0
def test1():
    """Compare the simhash similarity estimate with raw bucket collisions.

    Loads the index stored at ``./hash_index_20newsgroup-small.bin``,
    estimates the pairwise similarity matrix, dumps that matrix as a CLUTO
    graph file and then, for a few hand-picked document pairs, logs:

    * the estimated similarity read from the matrix,
    * the fraction of buckets (relative to ``nr_of_bands``) in which both
      documents appear together,
    * the value ``cos(pi/2 * (1 - fraction))`` for that fraction.
    """
    from ilshclus import load_index
    from ilshclus import simhash_estimate

    logger = get_ilshlogger()
    logger.debug("working with 20newsgroup data...")
    index = load_index('./hash_index_20newsgroup-small.bin')

    sim = simhash_estimate(index)
    graph_file = "20newsgtest_est{0}_{1}_sim.dat".format(
        index.nr_of_bands, index.band_size)
    txtcol.sparse_mat_to_cluto_graph(sim, graph_file)

    # A handful of example document pairs: (0, k) for several k, then two
    # pairs starting at document 1.
    sample_pairs = [(0, k) for k in (1, 2, 4, 5, 6, 7, 8, 9)]
    sample_pairs += [(1, 0), (1, 2)]

    for doc_a, doc_b in sample_pairs:
        # Count the buckets, over every table of the index, that hold both
        # documents at once.
        shared_buckets = 0
        for table in index.Index:
            for bucket_key in table:
                if doc_a in table[bucket_key] and doc_b in table[bucket_key]:
                    shared_buckets += 1
        match_prop = shared_buckets / index.nr_of_bands

        logger.debug("Est. similarity between {0} and {1} is {2}".format(
            doc_a, doc_b, sim[doc_a, doc_b]))
        logger.debug(
            "Proportion of matches in the index was {0}".format(match_prop))

        cos_from_matches = np.cos((np.pi / 2) * (1 - match_prop))
        logger.debug("Thus, cos(0.5*pi*(1-{0})) = {1}".format(
            match_prop, cos_from_matches))

    logger.debug("test finished!")
Example #2
0
def test():
    """Compare the simhash similarity estimate with raw bucket collisions.

    Loads the index stored at ``./hash_index_DOE.data``, logs its document
    count, estimates the pairwise similarity matrix and, for a few
    hand-picked document pairs, logs the estimated similarity, the fraction
    of buckets (relative to ``nr_of_bands``) shared by both documents, and
    ``cos(pi/2 * (1 - fraction))``.
    """
    from ilshclus import load_index
    from ilshclus import simhash_estimate

    logger = get_ilshlogger()
    logger.debug("cargando el indice...")
    index = load_index('./hash_index_DOE.data')
    logger.debug("indice cargado")
    logger.debug(index.total_docs)

    sim = simhash_estimate(index)

    # Example document pairs to inspect.
    sample_pairs = (
        (0, 1), (0, 2), (0, 4), (0, 5), (0, 6),
        (0, 7), (0, 8), (0, 9), (1, 0), (1, 2),
    )

    for doc_a, doc_b in sample_pairs:
        # One hit per bucket, across all tables, that contains both documents.
        co_occurrences = sum(
            1
            for table in index.Index
            for bucket_key in table
            if doc_a in table[bucket_key] and doc_b in table[bucket_key]
        )
        match_prop = co_occurrences / index.nr_of_bands

        estimate_msg = "Est. similarity between {0} and {1} is {2}".format(
            doc_a, doc_b, sim[doc_a, doc_b])
        logger.debug(estimate_msg)

        proportion_msg = "Proportion of matches in the index was {0}".format(
            match_prop)
        logger.debug(proportion_msg)

        cos_from_matches = np.cos((np.pi / 2) * (1 - match_prop))
        logger.debug("Thus, cos(0.5*pi*(1-{0})) = {1}".format(
            match_prop, cos_from_matches))

    logger.debug("test finished!")
Example #3
0
def index_to_cluto(index_file, output_mat):
    """Turn a stored index into a CLUTO graph file.

    The index is loaded from ``index_file``, passed through
    ``simhash_estimate`` and the resulting matrix is written to
    ``output_mat`` via ``txtcol.sparse_mat_to_cluto_graph``. Progress is
    reported on the ilsh logger at debug level.

    :param index_file: path of the index to load.
    :param output_mat: path of the CLUTO file to write.
    """
    logger = get_ilshlogger()

    # Step 1: read the index back from disk.
    loading_msg = "loading index {0}...".format(index_file)
    logger.debug(loading_msg)
    index = load_index(index_file)

    # Step 2: estimate similarities from the index.
    logger.debug("calculating similarity...")
    similarities = simhash_estimate(index)

    # Step 3: export the estimate in CLUTO format.
    storing_msg = "storing sparse distance mat to {0}...".format(output_mat)
    logger.debug(storing_msg)
    txtcol.sparse_mat_to_cluto_graph(data=similarities,
                                     outputfile=output_mat)
    logger.debug("cluto matrix done...")
Example #4
0
def index_text_data_FR(nr_of_bands=500, band_length=5, outputdir='.'):
    """Index the FR corpus and save the resulting hashing index to disk.

    The document-term matrix is obtained from ``txtcol.get_corpus_FR()``,
    indexed with an ``ic.HashingBasedIndex`` whose input dimension is the
    number of columns of that matrix, and saved with ``ic.save_index``.

    :param nr_of_bands: number of bands handed to the index.
    :param band_length: band length handed to the index.
    :param outputdir: directory in which the index file is written. The file
        is named ``hash_index_FR_b<nr_of_bands>r<band_length>.data``.
    :return: None
    """
    # Function-scope import keeps this block self-contained.
    import os

    log = get_ilshlogger()
    log.debug("Fetching corpus...")
    # The labels returned with the corpus are not needed for indexing.
    docterms, _labels = txtcol.get_corpus_FR()
    log.debug("Indexing collection")
    # Previously ``outputdir`` was passed to ``format`` but never referenced
    # by the template, so the file always landed in the working directory.
    outputpath = os.path.join(
        outputdir,
        'hash_index_FR_b{0}r{1}.data'.format(nr_of_bands, band_length))
    hI = ic.HashingBasedIndex(docterms.shape[1],
                              nr_of_bands=nr_of_bands,
                              band_length=band_length)
    hI.index_collection(docterms)

    log.debug("Saving index to disk...")
    ic.save_index(hI, outputpath)
    log.debug("Index written into {0}.".format(outputpath))
Example #5
0
def build_ilsh_index(objTxtData: LabeledTextData,
                     nr_of_bands=500,
                     band_length=5,
                     outputdir='.',
                     outputfname=None):
    """
    Build a hashing-based index over the document-term matrix of
    ``objTxtData`` and optionally persist it.

    :param objTxtData: object exposing the matrix to index as ``docterm``.
    :param nr_of_bands: number of bands handed to the index.
    :param band_length: band length handed to the index.
    :param outputdir: directory for the saved index (used only when
        ``outputfname`` is given).
    :param outputfname: file-name prefix; when ``None`` nothing is written.
        Otherwise the index is saved as
        ``<outputfname>_b<nr_of_bands>r<band_length>.data`` in ``outputdir``.
    :return: the populated index object.
    """
    logger = get_ilshlogger()

    matrix = objTxtData.docterm
    index = ic.HashingBasedIndex(matrix.shape[1],
                                 nr_of_bands=nr_of_bands,
                                 band_length=band_length)
    index.index_collection(objTxtData.docterm)

    # No file name requested: hand back the in-memory index only.
    if outputfname is None:
        return index

    fname = '{0}_b{1}r{2}.data'.format(outputfname, nr_of_bands, band_length)
    outputpath = os.path.join(outputdir, fname)
    logger.debug("Indexing collection into {0}".format(outputpath))
    logger.debug("Saving index to disk...")
    ic.save_index(index, outputpath)
    logger.debug("Index written into {0}.".format(outputpath))
    return index
Example #6
0
def index_small_20ng(nr_of_bands=500, band_length=5, outputdir='.'):
    """
    Index the small 20 Newsgroups sample for testing purposes.

    The corpus comes from ``txtcol.getSmall20ngCorpusData()`` (the original
    note states that 10 documents are indexed -- not verifiable from here).
    It is vectorised with ``ic.get_vectors``, indexed with an
    ``ic.HashingBasedIndex`` sized by the number of features, and saved with
    ``ic.save_index``.

    :param nr_of_bands: number of bands handed to the index.
    :param band_length: band length handed to the index.
    :param outputdir: directory in which the index file is written. The file
        is named
        ``hash_index_20newsgroup-small_b<nr_of_bands>r<band_length>.data``.
    :return: None
    """
    # Function-scope import keeps this block self-contained.
    import os

    log = get_ilshlogger()
    log.debug("Fetching corpus...")
    corpusVectors = txtcol.getSmall20ngCorpusData()

    # Matrix and column labels.
    docterm, features = ic.get_vectors(corpusVectors)

    log.debug("Indexing collection")
    # Previously ``outputdir`` was passed to ``format`` but never referenced
    # by the template, so the file always landed in the working directory.
    outputpath = os.path.join(
        outputdir,
        'hash_index_20newsgroup-small_b{0}r{1}.data'.format(
            nr_of_bands, band_length))
    hI = ic.HashingBasedIndex(len(features),
                              nr_of_bands=nr_of_bands,
                              band_length=band_length)
    hI.index_collection(docterm)

    log.debug("Saving index to disk...")
    ic.save_index(hI, outputpath)
    log.debug("Index written into {0}.".format(outputpath))
Example #7
0
            '/home/juan/incremental-clustering/incremental-clustering-out')
        end = time.time()
        log.debug("Elapsed time {0:.3f} secs.".format(end - start))

        log.debug("Estimating similarity...")
        start = time.time()
        #initial_cos_sim = simhash_estimate(initialIndex)
        log.debug("#items:{0} #bands:{1} band_size:{2} ... {3}".format(
            initialIndex.total_docs, initialIndex.nr_of_bands,
            initialIndex.band_size, compute_index_properties(initialIndex)))
        end = time.time()
        log.debug("Elapsed time {0:.3f} secs.".format(end - start))


if __name__ == '__main__':
    log = get_ilshlogger()
    dataObj = None
    datadir = '/home/juan/datasets/text-data'
    TXTCOL = AvailableCollections.DOE

    log.debug("Loading data...")
    start = time.time()
    if TXTCOL is AvailableCollections.DOE:
        dataObj = DOEData(basedir=datadir)
    elif TXTCOL is AvailableCollections.FR:
        dataObj = FRData(basedir=datadir)
    elif TXTCOL is AvailableCollections.AP:
        dataObj = APData(basedir=datadir)
    elif TXTCOL is AvailableCollections.ZF:
        dataObj = ZFData(basedir=datadir)
    elif TXTCOL is AvailableCollections.WSJ: