def test_deferred(self):
    """Verify that a deferred-mode generator, fed vars retrieved from a
    normally initialized one, yields exactly the hashes computed by the
    datasketch reference implementation."""
    v1 = [1, 0, 0, 0, 3, 4, 5, 0, 0, 0, 0, 6, 7, 8, 0, 0, 0, 0, 0, 0, 9, 10, 4]
    v2 = [2, 0, 0, 0, 4, 3, 8, 0, 0, 0, 0, 4, 7, 10, 0, 0, 0, 0, 0, 0, 9, 0, 0]
    # Grab the random projection vars from a regular generator, then drop it.
    gen = libMHCUDA.minhash_cuda_init(len(v1), 128, devices=1, verbosity=2)
    params = libMHCUDA.minhash_cuda_retrieve_vars(gen)
    libMHCUDA.minhash_cuda_fini(gen)
    # Recreate the generator deferred and assign the very same vars.
    gen = libMHCUDA.minhash_cuda_init(
        len(v1), 128, devices=1, deferred=True, verbosity=2)
    libMHCUDA.minhash_cuda_assign_vars(gen, *params)
    # Build the reference generator bypassing __init__ so that no fresh
    # random vars are sampled; inject the retrieved ones instead.
    bgen = WeightedMinHashGenerator.__new__(WeightedMinHashGenerator)
    bgen.dim = len(v1)
    bgen.rs, bgen.ln_cs, bgen.betas = params
    bgen.sample_size = 128
    bgen.seed = None
    m = csr_matrix(numpy.array([v1, v2], dtype=numpy.float32))
    hashes = libMHCUDA.minhash_cuda_calc(gen, m)
    libMHCUDA.minhash_cuda_fini(gen)
    self.assertEqual(hashes.shape, (2, 128, 2))
    true_hashes = numpy.array(
        [bgen.minhash(v1).hashvalues, bgen.minhash(v2).hashvalues],
        dtype=numpy.uint32)
    self.assertEqual(true_hashes.shape, (2, 128, 2))
    try:
        self.assertTrue((hashes == true_hashes).all())
    except AssertionError as err:
        # Dump both sides to ease debugging the mismatch.
        print("---- TRUE ----")
        print(true_hashes)
        print("---- FALSE ----")
        print(hashes)
        raise err from None
def test_float(self):
    """Verify that a single float32 weighted vector hashes identically on the
    GPU and in the datasketch reference implementation."""
    v1 = [
        0, 1.0497366, 0.8494359, 0.66231006, 0.66231006, 0.8494359, 0,
        0.66231006, 0.33652836, 0, 0, 0.5359344, 0.8494359, 0.66231006,
        1.0497366, 0.33652836, 0.66231006, 0.8494359, 0.6800841, 0.33652836]
    gen = libMHCUDA.minhash_cuda_init(len(v1), 128, devices=1, seed=7, verbosity=2)
    params = libMHCUDA.minhash_cuda_retrieve_vars(gen)
    # Reference generator built without __init__ so that it reuses the
    # exact vars sampled by the CUDA generator above.
    bgen = WeightedMinHashGenerator.__new__(WeightedMinHashGenerator)
    bgen.dim = len(v1)
    bgen.rs, bgen.ln_cs, bgen.betas = params
    bgen.sample_size = 128
    bgen.seed = None
    m = csr_matrix(numpy.array(v1, dtype=numpy.float32))
    hashes = libMHCUDA.minhash_cuda_calc(gen, m).astype(numpy.int32)
    libMHCUDA.minhash_cuda_fini(gen)
    self.assertEqual(hashes.shape, (1, 128, 2))
    true_hashes = numpy.array([bgen.minhash(v1).hashvalues], dtype=numpy.int32)
    self.assertEqual(true_hashes.shape, (1, 128, 2))
    try:
        self.assertTrue((hashes == true_hashes).all())
    except AssertionError as err:
        # Dump both sides to ease debugging the mismatch.
        print("---- TRUE ----")
        print(true_hashes)
        print("---- FALSE ----")
        print(hashes)
        raise err from None
def test_random_vars(self):
    """Check the distributions of the sampled MinHash vars: rs and cs must
    fit Gamma(2, 1) and betas must fit Uniform(0, 1)."""
    gen = libMHCUDA.minhash_cuda_init(1000, 128, devices=1, verbosity=2)
    rs, ln_cs, betas = libMHCUDA.minhash_cuda_retrieve_vars(gen)
    libMHCUDA.minhash_cuda_fini(gen)
    # Both rs and cs (= exp(ln_cs)) are expected to follow the same
    # Gamma(2, 1) law, so run one assertion group over each sample.
    for sample in (rs, numpy.exp(ln_cs)):
        shape, loc, scale = gamma.fit(sample)
        self.assertTrue(1.97 < shape < 2.03)
        self.assertTrue(-0.01 < loc < 0.01)
        self.assertTrue(0.98 < scale < 1.02)
    bmin, bmax = uniform.fit(betas)
    self.assertTrue(0 <= bmin < 0.001)
    self.assertTrue(0.999 <= bmax <= 1)
def hash_batches(args):
    """Computes MinHash signatures for every BOW batch on the GPU and stores
    the resulting hashtable bands and raw hashes in Cassandra through Spark.

    :param args: parsed CLI namespace; reads input, threshold, size,
        false_positive_weight, false_negative_weight, params, seed, devices,
        mhc_verbosity, tables and keyspace, plus the Spark session settings
        consumed by :func:`create_spark`.
    :return: None. Side effects only (GPU work, file write of the MinHash
        params, Cassandra table appends).
    """
    log = logging.getLogger("hash")
    log.info("Loading files from %s", args.input)
    loader = BOWLoader(args.input)
    log.info("%d batches", len(loader))
    # Check batches
    if not loader:
        return
    htnum, band_size = calc_hashtable_params(
        args.threshold, args.size, args.false_positive_weight,
        args.false_negative_weight)
    log.info("Number of hash tables: %d", htnum)
    log.info("Band size: %d", band_size)
    cassandra_utils.configure(args)
    # Only forward the args create_spark() actually accepts.
    spark_args = filter_kwargs(args.__dict__, create_spark)
    spark = create_spark("hash-%s" % uuid4(), **spark_args).sparkContext
    import libMHCUDA  # delayed import which requires CUDA and friends
    tables = args.tables
    gen = voc_size = None
    try:
        for i, bow in enumerate(loader):
            if voc_size is None:
                # Lazy one-time initialization on the first batch, when the
                # vocabulary size becomes known.
                voc_size = bow.matrix.shape[-1]
                log.info("Initializing the generator")
                # Reuse previously saved vars when the params file exists,
                # otherwise sample fresh ones and persist them.
                deferred = os.path.isfile(args.params)
                gen = libMHCUDA.minhash_cuda_init(
                    voc_size, args.size, seed=args.seed, devices=args.devices,
                    verbosity=args.mhc_verbosity, deferred=deferred)
                if deferred:
                    model = WeightedMinHashParameters().load(args.params)
                    libMHCUDA.minhash_cuda_assign_vars(
                        gen, model.rs, model.ln_cs, model.betas)
                else:
                    log.info("Writing %s", args.params)
                    params = libMHCUDA.minhash_cuda_retrieve_vars(gen)
                    WeightedMinHashParameters().construct(*params).save(args.params)
            if bow.matrix.shape[-1] != voc_size:
                raise ValueError("The vocabulary sizes do not match: %d != %d"
                                 % (bow.matrix.shape[-1], voc_size))
            log.info("Processing batch %d / %d", i + 1, len(loader))
            # Modify features if needed
            # TODO(vmarkovtsev): port to the new structure
            # batches = modify_feature_weights(batches, args)
            hashes = libMHCUDA.minhash_cuda_calc(gen, bow.matrix)
            job = [(k, h) for k, h in zip(bow.documents, hashes)]
            log.info("Saving the hashtables")
            df = spark.parallelize(job).flatMap(
                HashExploder(htnum, band_size)).toDF()
            df.write \
                .format("org.apache.spark.sql.cassandra") \
                .mode("append") \
                .options(table=tables["hashtables"], keyspace=args.keyspace) \
                .save()
            df.write \
                .format("org.apache.spark.sql.cassandra") \
                .mode("append") \
                .options(table=tables["hashtables2"], keyspace=args.keyspace) \
                .save()
            log.info("Saving the hashes")
            spark.parallelize(job) \
                .map(lambda x: Row(sha1=x[0], value=bytearray(x[1].data))) \
                .toDF() \
                .write \
                .format("org.apache.spark.sql.cassandra") \
                .mode("append") \
                .options(table=tables["hashes"], keyspace=args.keyspace) \
                .save()
    finally:
        # gen stays None when initialization itself failed; calling fini on
        # None would raise inside finally and mask the original exception.
        if gen is not None:
            libMHCUDA.minhash_cuda_fini(gen)
def hash_batches(args):
    """Computes MinHash signatures for every BOW batch on the GPU and stores
    the resulting hashtable bands and raw hashes in Cassandra through Spark.

    :param args: parsed CLI namespace; reads input, threshold, size,
        false_positive_weight, false_negative_weight, params, seed, devices,
        mhc_verbosity, tables and keyspace; the whole ``args.__dict__`` is
        forwarded to :func:`create_spark`.
    :return: None. Side effects only (GPU work, file write of the MinHash
        params, Cassandra table appends).
    """
    log = logging.getLogger("hash")
    log.info("Loading files from %s", args.input)
    loader = BOWLoader(args.input)
    log.info("%d batches", len(loader))
    # Check batches
    if not loader:
        return
    htnum, band_size = calc_hashtable_params(
        args.threshold, args.size, args.false_positive_weight,
        args.false_negative_weight)
    log.info("Number of hash tables: %d", htnum)
    log.info("Band size: %d", band_size)
    cassandra_utils.configure(args)
    spark = create_spark("hash-%s" % uuid4(), **args.__dict__).sparkContext
    import libMHCUDA  # delayed import which requires CUDA and friends
    tables = args.tables
    gen = voc_size = None
    try:
        for i, bow in enumerate(loader):
            if voc_size is None:
                # Lazy one-time initialization on the first batch, when the
                # vocabulary size becomes known.
                voc_size = bow.matrix.shape[-1]
                log.info("Initializing the generator")
                # Reuse previously saved vars when the params file exists,
                # otherwise sample fresh ones and persist them.
                deferred = os.path.isfile(args.params)
                gen = libMHCUDA.minhash_cuda_init(
                    voc_size, args.size, seed=args.seed, devices=args.devices,
                    verbosity=args.mhc_verbosity, deferred=deferred)
                if deferred:
                    model = WeightedMinHashParameters().load(args.params)
                    libMHCUDA.minhash_cuda_assign_vars(
                        gen, model.rs, model.ln_cs, model.betas)
                else:
                    log.info("Writing %s", args.params)
                    params = libMHCUDA.minhash_cuda_retrieve_vars(gen)
                    WeightedMinHashParameters().construct(*params).save(
                        args.params)
            if bow.matrix.shape[-1] != voc_size:
                raise ValueError(
                    "The vocabulary sizes do not match: %d != %d"
                    % (bow.matrix.shape[-1], voc_size))
            log.info("Processing batch %d / %d", i + 1, len(loader))
            # Modify features if needed
            # TODO(vmarkovtsev): port to the new structure
            # batches = modify_feature_weights(batches, args)
            hashes = libMHCUDA.minhash_cuda_calc(gen, bow.matrix)
            job = [(k, h) for k, h in zip(bow.documents, hashes)]
            log.info("Saving the hashtables")
            df = spark.parallelize(job).flatMap(HashExploder(
                htnum, band_size)).toDF()
            df.write \
                .format("org.apache.spark.sql.cassandra") \
                .mode("append") \
                .options(table=tables["hashtables"], keyspace=args.keyspace) \
                .save()
            df.write \
                .format("org.apache.spark.sql.cassandra") \
                .mode("append") \
                .options(table=tables["hashtables2"], keyspace=args.keyspace) \
                .save()
            log.info("Saving the hashes")
            spark.parallelize(job) \
                .map(lambda x: Row(sha1=x[0], value=bytearray(x[1].data))) \
                .toDF() \
                .write \
                .format("org.apache.spark.sql.cassandra") \
                .mode("append") \
                .options(table=tables["hashes"], keyspace=args.keyspace) \
                .save()
    finally:
        # gen stays None when initialization itself failed; calling fini on
        # None would raise inside finally and mask the original exception.
        if gen is not None:
            libMHCUDA.minhash_cuda_fini(gen)