def _test_calc_big(self, devices):
    """Compare CUDA MinHash output with datasketch on a large random matrix."""
    numpy.random.seed(0)
    dense = numpy.random.randint(0, 100, (6400, 130))
    # Zero out roughly 80% of the entries so the matrix is genuinely sparse.
    keep = numpy.random.randint(0, 5, dense.shape) >= 4
    dense *= keep
    del keep
    reference = WeightedMinHashGenerator(dense.shape[-1])
    cuda_gen = libMHCUDA.minhash_cuda_init(
        dense.shape[-1], 128, devices=devices, verbosity=2)
    # Force both implementations to share the same random variables.
    libMHCUDA.minhash_cuda_assign_vars(
        cuda_gen, reference.rs, reference.ln_cs, reference.betas)
    matrix = csr_matrix(dense, dtype=numpy.float32)
    print(matrix.nnz / (matrix.shape[0] * matrix.shape[1]))
    started = time()
    hashes = libMHCUDA.minhash_cuda_calc(cuda_gen, matrix)
    print("libMHCUDA:", time() - started)
    libMHCUDA.minhash_cuda_fini(cuda_gen)
    self.assertEqual(hashes.shape, (len(dense), 128, 2))
    started = time()
    true_hashes = numpy.array(
        [reference.minhash(row).hashvalues for row in dense],
        dtype=numpy.uint32)
    print("datasketch:", time() - started)
    self.assertEqual(true_hashes.shape, (len(dense), 128, 2))
    try:
        self.assertTrue((hashes == true_hashes).all())
    except AssertionError as e:
        # Dump the first mismatching row to ease debugging, then re-raise.
        for r, (got, expected) in enumerate(zip(hashes, true_hashes)):
            if (got != expected).any():
                print("first invalid row:", r)
                print(got)
                print(expected)
                break
        raise e from None
def test_deferred(self):
    """Variables assigned after a deferred init must reproduce a normal init."""
    v1 = [1, 0, 0, 0, 3, 4, 5, 0, 0, 0, 0, 6, 7, 8, 0, 0, 0, 0, 0, 0, 9, 10, 4]
    v2 = [2, 0, 0, 0, 4, 3, 8, 0, 0, 0, 0, 4, 7, 10, 0, 0, 0, 0, 0, 0, 9, 0, 0]
    # Harvest the random variables from a regularly initialized generator.
    gen = libMHCUDA.minhash_cuda_init(len(v1), 128, devices=1, verbosity=2)
    saved_vars = libMHCUDA.minhash_cuda_retrieve_vars(gen)
    libMHCUDA.minhash_cuda_fini(gen)
    # Re-create the generator in deferred mode and feed the variables back in.
    gen = libMHCUDA.minhash_cuda_init(
        len(v1), 128, devices=1, deferred=True, verbosity=2)
    libMHCUDA.minhash_cuda_assign_vars(gen, *saved_vars)
    # Hand-build a datasketch generator sharing exactly the same variables;
    # __new__ skips __init__ so no fresh randomness is drawn.
    bgen = WeightedMinHashGenerator.__new__(WeightedMinHashGenerator)
    bgen.dim = len(v1)
    bgen.rs, bgen.ln_cs, bgen.betas = saved_vars
    bgen.sample_size = 128
    bgen.seed = None
    hashes = libMHCUDA.minhash_cuda_calc(
        gen, csr_matrix(numpy.array([v1, v2], dtype=numpy.float32)))
    libMHCUDA.minhash_cuda_fini(gen)
    self.assertEqual(hashes.shape, (2, 128, 2))
    true_hashes = numpy.array(
        [bgen.minhash(v1).hashvalues, bgen.minhash(v2).hashvalues],
        dtype=numpy.uint32)
    self.assertEqual(true_hashes.shape, (2, 128, 2))
    try:
        self.assertTrue((hashes == true_hashes).all())
    except AssertionError as e:
        print("---- TRUE ----")
        print(true_hashes)
        print("---- FALSE ----")
        print(hashes)
        raise e from None
def test_calc_tiny(self):
    """Hashes of two short sparse vectors must match datasketch exactly."""
    v1 = [1, 0, 0, 0, 3, 4, 5, 0, 0, 0, 0, 6, 7, 8, 0, 0, 0, 0, 0, 0, 9, 10, 4]
    v2 = [2, 0, 0, 0, 4, 3, 8, 0, 0, 0, 0, 4, 7, 10, 0, 0, 0, 0, 0, 0, 9, 0, 0]
    bgen = WeightedMinHashGenerator(len(v1))
    gen = libMHCUDA.minhash_cuda_init(len(v1), 128, devices=1, verbosity=2)
    # Share the reference generator's random variables with the CUDA one.
    libMHCUDA.minhash_cuda_assign_vars(gen, bgen.rs, bgen.ln_cs, bgen.betas)
    sparse = csr_matrix(numpy.array([v1, v2], dtype=numpy.float32))
    hashes = libMHCUDA.minhash_cuda_calc(gen, sparse)
    libMHCUDA.minhash_cuda_fini(gen)
    self.assertEqual(hashes.shape, (2, 128, 2))
    true_hashes = numpy.array(
        [bgen.minhash(v1).hashvalues, bgen.minhash(v2).hashvalues],
        dtype=numpy.uint32)
    self.assertEqual(true_hashes.shape, (2, 128, 2))
    try:
        self.assertTrue((hashes == true_hashes).all())
    except AssertionError as e:
        print("---- TRUE ----")
        print(true_hashes)
        print("---- FALSE ----")
        print(hashes)
        raise e from None
def run_test(v):
    """Hash the rows assembled from the parts in ``v`` across 4 devices.

    ``v`` is a list of row fragments; the feature dimension is the total
    number of elements over all fragments.  ``self`` is captured from the
    enclosing test method's closure.
    NOTE(review): the shape assertion expects exactly one resulting row —
    presumably ``v`` holds fragments of a single row; confirm with callers.
    """
    # BUG FIX: sum(...) already yields the integer dimension; the original
    # called len() on that int (twice), which raises TypeError.
    dim = sum(len(part) for part in v)
    bgen = WeightedMinHashGenerator(dim)
    gen = libMHCUDA.minhash_cuda_init(dim, 128, devices=4, verbosity=2)
    try:
        libMHCUDA.minhash_cuda_assign_vars(gen, bgen.rs, bgen.ln_cs, bgen.betas)
        m = csr_matrix(numpy.array(v, dtype=numpy.float32))
        hashes = libMHCUDA.minhash_cuda_calc(gen, m)
    finally:
        # Always release the GPU generator, even when the calculation fails.
        # The original put the assertions inside ``finally``, which both
        # masked the real exception and skipped minhash_cuda_fini on error.
        libMHCUDA.minhash_cuda_fini(gen)
    self.assertIsNotNone(hashes)
    self.assertEqual(hashes.shape, (1, 128, 2))
def hash_batches(args):
    """Compute weighted MinHashes for every BOW batch and persist them to Cassandra.

    Loads BOW batches from ``args.input``, initializes a libMHCUDA generator
    lazily on the first batch (loading MinHash parameters from ``args.params``
    when that file exists, otherwise saving freshly generated ones there),
    hashes each batch on the GPU and writes the exploded hashtable bands plus
    the raw hashes through Spark to Cassandra tables.

    :param args: parsed CLI namespace; reads input, threshold, size,
        false_positive_weight, false_negative_weight, params, seed, devices,
        mhc_verbosity, tables, keyspace and the Spark-related options.
    :return: None.
    """
    log = logging.getLogger("hash")
    log.info("Loading files from %s", args.input)
    loader = BOWLoader(args.input)
    log.info("%d batches", len(loader))
    # Check batches
    if not loader:
        return
    # LSH banding parameters derived from the similarity threshold and the
    # relative costs of false positives vs. false negatives.
    htnum, band_size = calc_hashtable_params(
        args.threshold, args.size, args.false_positive_weight,
        args.false_negative_weight)
    log.info("Number of hash tables: %d", htnum)
    log.info("Band size: %d", band_size)
    cassandra_utils.configure(args)
    # Only pass the CLI options that create_spark() actually accepts.
    spark_args = filter_kwargs(args.__dict__, create_spark)
    spark = create_spark("hash-%s" % uuid4(), **spark_args).sparkContext
    import libMHCUDA  # delayed import which requires CUDA and friends
    tables = args.tables
    gen = voc_size = None
    try:
        for i, bow in enumerate(loader):
            if voc_size is None:
                # First batch: fix the vocabulary size and set up the GPU
                # generator once for the whole run.
                voc_size = bow.matrix.shape[-1]
                log.info("Initializing the generator")
                # If the parameters file already exists, defer variable
                # generation and load the saved variables instead, so repeated
                # runs produce identical hashes.
                deferred = os.path.isfile(args.params)
                gen = libMHCUDA.minhash_cuda_init(
                    voc_size, args.size, seed=args.seed, devices=args.devices,
                    verbosity=args.mhc_verbosity, deferred=deferred)
                if deferred:
                    model = WeightedMinHashParameters().load(args.params)
                    libMHCUDA.minhash_cuda_assign_vars(
                        gen, model.rs, model.ln_cs, model.betas)
                else:
                    log.info("Writing %s", args.params)
                    params = libMHCUDA.minhash_cuda_retrieve_vars(gen)
                    WeightedMinHashParameters().construct(*params).save(args.params)
            if bow.matrix.shape[-1] != voc_size:
                # All batches must share the vocabulary the generator was
                # initialized with.
                raise ValueError("The vocabulary sizes do not match: %d != %d"
                                 % (bow.matrix.shape[-1], voc_size))
            log.info("Processing batch %d / %d", i + 1, len(loader))
            # Modify features if needed
            # TODO(vmarkovtsev): port to the new structure
            # batches = modify_feature_weights(batches, args)
            hashes = libMHCUDA.minhash_cuda_calc(gen, bow.matrix)
            # Pair each document identifier with its hash matrix.
            job = [(k, h) for k, h in zip(bow.documents, hashes)]
            log.info("Saving the hashtables")
            df = spark.parallelize(job).flatMap(HashExploder(htnum, band_size)).toDF()
            df.write \
                .format("org.apache.spark.sql.cassandra") \
                .mode("append") \
                .options(table=tables["hashtables"], keyspace=args.keyspace) \
                .save()
            # NOTE(review): the same exploded rows are appended to a second
            # table — presumably an alternate clustering of the same data;
            # confirm "hashtables2" is intentional.
            df.write \
                .format("org.apache.spark.sql.cassandra") \
                .mode("append") \
                .options(table=tables["hashtables2"], keyspace=args.keyspace) \
                .save()
            log.info("Saving the hashes")
            # Raw hash values are stored as binary blobs keyed by sha1.
            spark.parallelize(job) \
                .map(lambda x: Row(sha1=x[0], value=bytearray(x[1].data))) \
                .toDF() \
                .write \
                .format("org.apache.spark.sql.cassandra") \
                .mode("append") \
                .options(table=tables["hashes"], keyspace=args.keyspace) \
                .save()
    finally:
        # Release the GPU generator even on failure; fini(None) is reached
        # only when the loader was non-empty, since gen starts as None.
        libMHCUDA.minhash_cuda_fini(gen)
def hash_batches(args):
    """Compute weighted MinHashes for every BOW batch and persist them to Cassandra.

    NOTE(review): this duplicates the earlier ``hash_batches`` definition;
    the only substantive difference is that this version forwards the whole
    ``args.__dict__`` to ``create_spark`` instead of filtering the accepted
    keyword arguments — if both live in one module, this later definition
    shadows the earlier one. Confirm which version is intended.

    :param args: parsed CLI namespace; reads input, threshold, size,
        false_positive_weight, false_negative_weight, params, seed, devices,
        mhc_verbosity, tables, keyspace and the Spark-related options.
    :return: None.
    """
    log = logging.getLogger("hash")
    log.info("Loading files from %s", args.input)
    loader = BOWLoader(args.input)
    log.info("%d batches", len(loader))
    # Check batches
    if not loader:
        return
    # LSH banding parameters derived from the similarity threshold and the
    # false positive / false negative weighting.
    htnum, band_size = calc_hashtable_params(args.threshold, args.size,
                                             args.false_positive_weight,
                                             args.false_negative_weight)
    log.info("Number of hash tables: %d", htnum)
    log.info("Band size: %d", band_size)
    cassandra_utils.configure(args)
    # NOTE(review): passes every CLI option straight through; create_spark
    # must tolerate unknown keyword arguments for this to work.
    spark = create_spark("hash-%s" % uuid4(), **args.__dict__).sparkContext
    import libMHCUDA  # delayed import which requires CUDA and friends
    tables = args.tables
    gen = voc_size = None
    try:
        for i, bow in enumerate(loader):
            if voc_size is None:
                # First batch: fix the vocabulary size and set up the GPU
                # generator once for the whole run.
                voc_size = bow.matrix.shape[-1]
                log.info("Initializing the generator")
                # Reuse saved MinHash parameters when the file exists so
                # repeated runs produce identical hashes.
                deferred = os.path.isfile(args.params)
                gen = libMHCUDA.minhash_cuda_init(voc_size, args.size,
                                                  seed=args.seed,
                                                  devices=args.devices,
                                                  verbosity=args.mhc_verbosity,
                                                  deferred=deferred)
                if deferred:
                    model = WeightedMinHashParameters().load(args.params)
                    libMHCUDA.minhash_cuda_assign_vars(gen, model.rs,
                                                       model.ln_cs,
                                                       model.betas)
                else:
                    log.info("Writing %s", args.params)
                    params = libMHCUDA.minhash_cuda_retrieve_vars(gen)
                    WeightedMinHashParameters().construct(*params).save(
                        args.params)
            if bow.matrix.shape[-1] != voc_size:
                # All batches must share the vocabulary the generator was
                # initialized with.
                raise ValueError(
                    "The vocabulary sizes do not match: %d != %d"
                    % (bow.matrix.shape[-1], voc_size))
            log.info("Processing batch %d / %d", i + 1, len(loader))
            # Modify features if needed
            # TODO(vmarkovtsev): port to the new structure
            # batches = modify_feature_weights(batches, args)
            hashes = libMHCUDA.minhash_cuda_calc(gen, bow.matrix)
            # Pair each document identifier with its hash matrix.
            job = [(k, h) for k, h in zip(bow.documents, hashes)]
            log.info("Saving the hashtables")
            df = spark.parallelize(job).flatMap(HashExploder(
                htnum, band_size)).toDF()
            df.write \
                .format("org.apache.spark.sql.cassandra") \
                .mode("append") \
                .options(table=tables["hashtables"], keyspace=args.keyspace) \
                .save()
            # NOTE(review): the same exploded rows are appended to a second
            # table — confirm "hashtables2" is intentional.
            df.write \
                .format("org.apache.spark.sql.cassandra") \
                .mode("append") \
                .options(table=tables["hashtables2"], keyspace=args.keyspace) \
                .save()
            log.info("Saving the hashes")
            # Raw hash values are stored as binary blobs keyed by sha1.
            spark.parallelize(job) \
                .map(lambda x: Row(sha1=x[0], value=bytearray(x[1].data))) \
                .toDF() \
                .write \
                .format("org.apache.spark.sql.cassandra") \
                .mode("append") \
                .options(table=tables["hashes"], keyspace=args.keyspace) \
                .save()
    finally:
        # Release the GPU generator even on failure.
        libMHCUDA.minhash_cuda_fini(gen)