Example #1
def test_deferred(self):
    # Requires: import libMHCUDA, numpy; from scipy.sparse import csr_matrix;
    # from datasketch import WeightedMinHashGenerator.
    v1 = [1, 0, 0, 0, 3, 4, 5, 0, 0, 0, 0, 6, 7, 8, 0, 0, 0, 0, 0, 0, 9, 10, 4]
    v2 = [2, 0, 0, 0, 4, 3, 8, 0, 0, 0, 0, 4, 7, 10, 0, 0, 0, 0, 0, 0, 9, 0, 0]
    # Draw the random variables on a normally initialized generator...
    gen = libMHCUDA.minhash_cuda_init(len(v1), 128, devices=1, verbosity=2)
    vars = libMHCUDA.minhash_cuda_retrieve_vars(gen)
    libMHCUDA.minhash_cuda_fini(gen)
    # ...then re-create the generator in deferred mode and inject them back.
    gen = libMHCUDA.minhash_cuda_init(
        len(v1), 128, devices=1, deferred=True, verbosity=2)
    libMHCUDA.minhash_cuda_assign_vars(gen, *vars)
    # Reference generator: bypass __init__ so datasketch reuses the same vars.
    bgen = WeightedMinHashGenerator.__new__(WeightedMinHashGenerator)
    bgen.dim = len(v1)
    bgen.rs, bgen.ln_cs, bgen.betas = vars
    bgen.sample_size = 128
    bgen.seed = None
    m = csr_matrix(numpy.array([v1, v2], dtype=numpy.float32))
    hashes = libMHCUDA.minhash_cuda_calc(gen, m)
    libMHCUDA.minhash_cuda_fini(gen)
    self.assertEqual(hashes.shape, (2, 128, 2))
    true_hashes = numpy.array([bgen.minhash(v1).hashvalues,
                               bgen.minhash(v2).hashvalues], dtype=numpy.uint32)
    self.assertEqual(true_hashes.shape, (2, 128, 2))
    try:
        self.assertTrue((hashes == true_hashes).all())
    except AssertionError as e:
        print("---- TRUE ----")
        print(true_hashes)
        print("---- FALSE ----")
        print(hashes)
        raise e from None
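
The assertions above pin down the output format: minhash_cuda_calc returns a uint32 array of shape (n_rows, sample_size, 2). Two signatures can then be compared by counting the samples whose (k, t) pair matches, which is the usual MinHash estimator of weighted Jaccard similarity. A minimal sketch, assuming a CUDA device is available; the estimated_jaccard helper is our own, not part of libMHCUDA:

import numpy
from scipy.sparse import csr_matrix
import libMHCUDA

def estimated_jaccard(h1, h2):
    # Fraction of samples whose (k, t) pair matches in both signatures.
    return numpy.count_nonzero((h1 == h2).all(axis=1)) / len(h1)

gen = libMHCUDA.minhash_cuda_init(4, 128, devices=1, seed=1)
try:
    m = csr_matrix(numpy.array([[1, 2, 0, 3],
                                [1, 2, 3, 0]], dtype=numpy.float32))
    hashes = libMHCUDA.minhash_cuda_calc(gen, m)  # shape (2, 128, 2)
    print(estimated_jaccard(hashes[0], hashes[1]))
finally:
    libMHCUDA.minhash_cuda_fini(gen)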
Example #2
def test_float(self):
    # Requires: import libMHCUDA, numpy; from scipy.sparse import csr_matrix;
    # from datasketch import WeightedMinHashGenerator.
    v1 = [
        0,          1.0497366,  0.8494359,  0.66231006, 0.66231006, 0.8494359,
        0,          0.66231006, 0.33652836, 0,          0,          0.5359344,
        0.8494359,  0.66231006, 1.0497366,  0.33652836, 0.66231006, 0.8494359,
        0.6800841,  0.33652836]
    gen = libMHCUDA.minhash_cuda_init(len(v1), 128, devices=1, seed=7, verbosity=2)
    vars = libMHCUDA.minhash_cuda_retrieve_vars(gen)
    # Reference generator: bypass __init__ so datasketch reuses the same vars.
    bgen = WeightedMinHashGenerator.__new__(WeightedMinHashGenerator)
    bgen.dim = len(v1)
    bgen.rs, bgen.ln_cs, bgen.betas = vars
    bgen.sample_size = 128
    bgen.seed = None
    m = csr_matrix(numpy.array(v1, dtype=numpy.float32))
    hashes = libMHCUDA.minhash_cuda_calc(gen, m).astype(numpy.int32)
    libMHCUDA.minhash_cuda_fini(gen)
    self.assertEqual(hashes.shape, (1, 128, 2))
    true_hashes = numpy.array([bgen.minhash(v1).hashvalues], dtype=numpy.int32)
    self.assertEqual(true_hashes.shape, (1, 128, 2))
    try:
        self.assertTrue((hashes == true_hashes).all())
    except AssertionError as e:
        print("---- TRUE ----")
        print(true_hashes)
        print("---- FALSE ----")
        print(hashes)
        raise e from None
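
Both tests construct the datasketch reference generator through __new__ rather than through the constructor, so that the rs, ln_cs and betas retrieved from libMHCUDA can be injected verbatim instead of letting datasketch draw its own. A small sketch of that pattern as a reusable helper (the helper itself is ours, not part of either library):

from datasketch import WeightedMinHashGenerator

def reference_generator(dim, sample_size, vars):
    # Bypass __init__: the constructor would draw fresh random variables,
    # while we want the exact ones retrieved from the CUDA generator.
    bgen = WeightedMinHashGenerator.__new__(WeightedMinHashGenerator)
    bgen.dim = dim
    bgen.rs, bgen.ln_cs, bgen.betas = vars
    bgen.sample_size = sample_size
    bgen.seed = None
    return bgen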
Example #3
def test_random_vars(self):
    # Requires: import libMHCUDA, numpy; from scipy.stats import gamma, uniform.
    gen = libMHCUDA.minhash_cuda_init(1000, 128, devices=1, verbosity=2)
    rs, ln_cs, betas = libMHCUDA.minhash_cuda_retrieve_vars(gen)
    libMHCUDA.minhash_cuda_fini(gen)
    cs = numpy.exp(ln_cs)
    # rs and cs must follow Gamma(2, 1); betas must follow Uniform(0, 1).
    a, loc, scale = gamma.fit(rs)
    self.assertTrue(1.97 < a < 2.03)
    self.assertTrue(-0.01 < loc < 0.01)
    self.assertTrue(0.98 < scale < 1.02)
    a, loc, scale = gamma.fit(cs)
    self.assertTrue(1.97 < a < 2.03)
    self.assertTrue(-0.01 < loc < 0.01)
    self.assertTrue(0.98 < scale < 1.02)
    # scipy's uniform.fit() returns (loc, scale), i.e. the minimum and the
    # width of the fitted interval, so "bmax" is really max - min here.
    bmin, bmax = uniform.fit(betas)
    self.assertTrue(0 <= bmin < 0.001)
    self.assertTrue(0.999 <= bmax <= 1)
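
These bounds encode the distributions behind Ioffe's weighted MinHash scheme: rs and cs follow Gamma(2, 1) and betas follow Uniform(0, 1). For illustration, a CPU sketch that draws variables with the same distributions; the (sample_size, dim) shapes follow datasketch's convention, and this is not libMHCUDA's actual sampling code:

import numpy

def reference_vars(dim, sample_size, seed=7):
    rnd = numpy.random.RandomState(seed)
    rs = rnd.gamma(2, 1, (sample_size, dim)).astype(numpy.float32)
    ln_cs = numpy.log(rnd.gamma(2, 1, (sample_size, dim))).astype(numpy.float32)
    betas = rnd.uniform(0, 1, (sample_size, dim)).astype(numpy.float32)
    return rs, ln_cs, betas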
Example #4
def hash_batches(args):
    # Depends on project-level helpers (BOWLoader, calc_hashtable_params,
    # cassandra_utils, create_spark, filter_kwargs, HashExploder,
    # WeightedMinHashParameters) plus logging, os, uuid4 and pyspark.sql.Row.
    log = logging.getLogger("hash")
    log.info("Loading files from %s", args.input)
    loader = BOWLoader(args.input)
    log.info("%d batches", len(loader))

    # Nothing to do if no batches were loaded
    if not loader:
        return

    htnum, band_size = calc_hashtable_params(
        args.threshold, args.size, args.false_positive_weight, args.false_negative_weight)
    log.info("Number of hash tables: %d", htnum)
    log.info("Band size: %d", band_size)
    cassandra_utils.configure(args)
    spark_args = filter_kwargs(args.__dict__, create_spark)
    spark = create_spark("hash-%s" % uuid4(), **spark_args).sparkContext
    import libMHCUDA  # delayed import which requires CUDA and friends
    tables = args.tables
    gen = voc_size = None
    try:
        for i, bow in enumerate(loader):
            if voc_size is None:
                voc_size = bow.matrix.shape[-1]
                log.info("Initializing the generator")
                deferred = os.path.isfile(args.params)
                gen = libMHCUDA.minhash_cuda_init(
                    voc_size, args.size, seed=args.seed, devices=args.devices,
                    verbosity=args.mhc_verbosity,
                    deferred=deferred)
                if deferred:
                    model = WeightedMinHashParameters().load(args.params)
                    libMHCUDA.minhash_cuda_assign_vars(gen, model.rs, model.ln_cs, model.betas)
                else:
                    log.info("Writing %s", args.params)
                    params = libMHCUDA.minhash_cuda_retrieve_vars(gen)
                    WeightedMinHashParameters().construct(*params).save(args.params)
            if bow.matrix.shape[-1] != voc_size:
                raise ValueError("The vocabulary sizes do not match: %d != %d"
                                 % (bow.matrix.shape[-1], voc_size))
            log.info("Processing batch %d / %d", i + 1, len(loader))
            # Modify features if needed
            # TODO(vmarkovtsev): port to the new structure
            # batches = modify_feature_weights(batches, args)
            hashes = libMHCUDA.minhash_cuda_calc(gen, bow.matrix)
            job = list(zip(bow.documents, hashes))
            log.info("Saving the hashtables")
            df = spark.parallelize(job).flatMap(HashExploder(htnum, band_size)).toDF()
            df.write \
                .format("org.apache.spark.sql.cassandra") \
                .mode("append") \
                .options(table=tables["hashtables"], keyspace=args.keyspace) \
                .save()
            df.write \
                .format("org.apache.spark.sql.cassandra") \
                .mode("append") \
                .options(table=tables["hashtables2"], keyspace=args.keyspace) \
                .save()
            log.info("Saving the hashes")
            spark.parallelize(job) \
                .map(lambda x: Row(sha1=x[0], value=bytearray(x[1].data))) \
                .toDF() \
                .write \
                .format("org.apache.spark.sql.cassandra") \
                .mode("append") \
                .options(table=tables["hashes"], keyspace=args.keyspace) \
                .save()
    finally:
        if gen is not None:
            # gen stays None if initialization fails before the first batch.
            libMHCUDA.minhash_cuda_fini(gen)
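
HashExploder expands every (sha1, hashes) pair into one row per hash table, which is standard LSH banding: the signature is cut into htnum bands of band_size samples each, and documents that collide in any band become candidate pairs. A simplified sketch of the expansion (our own illustration; apollo's HashExploder may emit a different row layout):

def explode_bands(sha1, hashes, htnum, band_size):
    # hashes: uint32 array of shape (sample_size, 2),
    # with sample_size >= htnum * band_size.
    for i in range(htnum):
        band = hashes[i * band_size:(i + 1) * band_size]
        # The band's bytes serve as the bucket key within hash table i.
        yield i, bytes(band.data), sha1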
Example #5
def hash_batches(args):
    # Same project-level dependencies as Example #4.
    log = logging.getLogger("hash")
    log.info("Loading files from %s", args.input)
    loader = BOWLoader(args.input)
    log.info("%d batches", len(loader))

    # Nothing to do if no batches were loaded
    if not loader:
        return

    htnum, band_size = calc_hashtable_params(args.threshold, args.size,
                                             args.false_positive_weight,
                                             args.false_negative_weight)
    log.info("Number of hash tables: %d", htnum)
    log.info("Band size: %d", band_size)
    cassandra_utils.configure(args)
    # Forwards the whole argument namespace to create_spark; Example #4
    # filters these keyword arguments with filter_kwargs before the call.
    spark = create_spark("hash-%s" % uuid4(), **args.__dict__).sparkContext
    import libMHCUDA  # delayed import which requires CUDA and friends
    tables = args.tables
    gen = voc_size = None
    try:
        for i, bow in enumerate(loader):
            if voc_size is None:
                voc_size = bow.matrix.shape[-1]
                log.info("Initializing the generator")
                deferred = os.path.isfile(args.params)
                gen = libMHCUDA.minhash_cuda_init(voc_size,
                                                  args.size,
                                                  seed=args.seed,
                                                  devices=args.devices,
                                                  verbosity=args.mhc_verbosity,
                                                  deferred=deferred)
                if deferred:
                    model = WeightedMinHashParameters().load(args.params)
                    libMHCUDA.minhash_cuda_assign_vars(gen, model.rs,
                                                       model.ln_cs,
                                                       model.betas)
                else:
                    log.info("Writing %s", args.params)
                    params = libMHCUDA.minhash_cuda_retrieve_vars(gen)
                    WeightedMinHashParameters().construct(*params).save(
                        args.params)
            if bow.matrix.shape[-1] != voc_size:
                raise ValueError(
                    "The vocabulary sizes do not match: %d != %d" %
                    (bow.matrix.shape[-1], voc_size))
            log.info("Processing batch %d / %d", i + 1, len(loader))
            # Modify features if needed
            # TODO(vmarkovtsev): port to the new structure
            # batches = modify_feature_weights(batches, args)
            hashes = libMHCUDA.minhash_cuda_calc(gen, bow.matrix)
            job = list(zip(bow.documents, hashes))
            log.info("Saving the hashtables")
            df = spark.parallelize(job).flatMap(HashExploder(
                htnum, band_size)).toDF()
            df.write \
                .format("org.apache.spark.sql.cassandra") \
                .mode("append") \
                .options(table=tables["hashtables"], keyspace=args.keyspace) \
                .save()
            df.write \
                .format("org.apache.spark.sql.cassandra") \
                .mode("append") \
                .options(table=tables["hashtables2"], keyspace=args.keyspace) \
                .save()
            log.info("Saving the hashes")
            spark.parallelize(job) \
                .map(lambda x: Row(sha1=x[0], value=bytearray(x[1].data))) \
                .toDF() \
                .write \
                .format("org.apache.spark.sql.cassandra") \
                .mode("append") \
                .options(table=tables["hashes"], keyspace=args.keyspace) \
                .save()
    finally:
        if gen is not None:
            # gen stays None if initialization fails before the first batch.
            libMHCUDA.minhash_cuda_fini(gen)
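
calc_hashtable_params derives htnum and band_size from the similarity threshold, the signature size and the false positive / false negative weights. A sketch of the standard parameter search, modeled on datasketch's _optimal_param; apollo's actual implementation may differ in its details:

from scipy.integrate import quad

def optimal_params(threshold, num_perm, fp_weight, fn_weight):
    # Two documents with similarity s collide in at least one of b bands
    # of r rows with probability 1 - (1 - s**r)**b; integrate the error
    # mass on each side of the threshold and pick the (b, r) pair that
    # minimizes the weighted sum.
    def error(b, r):
        fp = quad(lambda s: 1 - (1 - s ** r) ** b, 0.0, threshold)[0]
        fn = quad(lambda s: (1 - s ** r) ** b, threshold, 1.0)[0]
        return fp_weight * fp + fn_weight * fn
    return min(((b, r) for b in range(1, num_perm + 1)
                for r in range(1, num_perm // b + 1)),
               key=lambda br: error(*br))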