def evaluate_communities(args):
    log = logging.getLogger("evalcc")
    model = CommunitiesModel().load(args.input)
    patch_tables(args)
    spark = create_spark("evalcc-%s" % uuid4(), **args.__dict__)
    log.info("Preparing the communities' RDD")
    # Map every community member index back to its sha1 identifier.
    items = []
    for i, c in progress_bar(enumerate(model.communities), log,
                             expected_size=len(model.communities)):
        for m in c:
            if m < len(model.id_to_element):
                items.append(Row(sha1=model.id_to_element[m], community=i))
    log.info("Running")
    items_in_spark = spark.sparkContext.parallelize(items).toDF()
    bags = spark \
        .read \
        .format("org.apache.spark.sql.cassandra") \
        .options(table=args.tables["bags"], keyspace=args.keyspace) \
        .load()
    log.info("Loaded the bags, calculating the vocabulary")
    vocabulary = bags.drop("sha1", "value").distinct().rdd.map(lambda x: x.item).collect()
    vocabulary = {v: i for i, v in enumerate(vocabulary)}
    log.info("Vocabulary size: %d", len(vocabulary))
    element_to_id = {e: i for i, e in enumerate(model.id_to_element)}
    # Evaluate each community and sum the four partial metrics element-wise.
    metrics = items_in_spark.join(bags, "sha1").rdd \
        .map(lambda r: (r.community, (element_to_id[r.sha1], vocabulary[r.item], r.value))) \
        .groupByKey() \
        .map(CommunityEvaluator(args.threshold, len(vocabulary))) \
        .reduce(lambda v1, v2: [v1[i] + v2[i] for i in range(4)])
    log.info("Total misses: %d", metrics[0])
    log.info("Average normalized misses: %f", metrics[1] / len(model.communities))
    log.info("Total loss: %f", metrics[2])
    log.info("Average normalized loss: %f", numpy.sqrt(metrics[3] / len(model.communities)))
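# A minimal standalone sketch (not part of the original code) of the element-wise
# reduction used above: CommunityEvaluator is assumed to emit a fixed-length list of
# four partial metrics per community, which the .reduce() sums pairwise across the RDD.
# The numbers below are hypothetical and only illustrate the aggregation.
from functools import reduce


def sum_metrics(v1, v2):
    # Combine two per-community metric vectors element-wise, as in the .reduce() above.
    return [v1[i] + v2[i] for i in range(4)]


per_community = [[2, 0.1, 3.5, 0.25], [1, 0.05, 1.0, 0.04]]  # hypothetical evaluator outputs
totals = reduce(sum_metrics, per_community)
# totals[0] and totals[2] correspond to the "total" figures logged above; dividing by the
# number of communities (and taking sqrt for the last entry) gives the averaged ones.
print(totals)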
def setUp(self):
    data = [Row(to_index="to_index%d" % i, value=i) for i in range(10)]
    self.data = data
    self.sc = create_spark("test")
    self.data_rdd = self.sc.sparkContext \
        .parallelize(range(len(data))) \
        .map(lambda x: data[x])
def create_engine(session_name, repositories,
                  repository_format=EngineDefault.REPOSITORY_FORMAT,
                  bblfsh=EngineDefault.BBLFSH,
                  engine=EngineDefault.VERSION,
                  config=SparkDefault.CONFIG,
                  packages=SparkDefault.JAR_PACKAGES,
                  spark=SparkDefault.MASTER_ADDRESS,
                  spark_local_dir=SparkDefault.LOCAL_DIR,
                  spark_log_level=SparkDefault.LOG_LEVEL,
                  dep_zip=SparkDefault.DEP_ZIP,
                  memory=SparkDefault.MEMORY):
    config += (get_bblfsh_dependency(bblfsh), )
    packages += (get_engine_package(engine), )
    session = create_spark(session_name, spark=spark, spark_local_dir=spark_local_dir,
                           config=config, packages=packages,
                           spark_log_level=spark_log_level, dep_zip=dep_zip, memory=memory)
    logging.getLogger("engine").info("Initializing engine on %s", repositories)
    return Engine(session, repositories, repository_format)
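# A hedged usage sketch of the helper above: the repository path is hypothetical and the
# remaining arguments fall back to the EngineDefault / SparkDefault values in the signature.
# Guarded so it only runs when executed directly.
if __name__ == "__main__":
    engine = create_engine("example-session", "/path/to/siva/repos")  # hypothetical inputs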
def create_spark_for_test(name="test"):
    config = []
    packages = []
    bblfsh = "localhost"
    engine = get_engine_version()
    add_engine_dependencies(engine=engine, config=config, packages=packages)
    add_bblfsh_dependencies(bblfsh=bblfsh, config=config)
    return create_spark(name, config=config, packages=packages)
def test_error(self):
    with self.assertRaises(ValueError):
        create_or_load_ordered_df(argparse.Namespace(docfreq_in=None), 10, None)
    with self.assertRaises(ValueError):
        session = create_spark("test_df_util")
        uast_extractor = ParquetLoader(session, paths.PARQUET_DIR) \
            .link(Moder("file")) \
            .link(UastRow2Document()) \
            .link(UastDeserializer()) \
            .link(Uast2BagFeatures(IdentifiersBagExtractor()))
        create_or_load_ordered_df(argparse.Namespace(docfreq_in=None), None, uast_extractor)
def test_create(self):
    session = create_spark("test_df_util")
    uast_extractor = ParquetLoader(session, paths.PARQUET_DIR) \
        .link(UastRow2Document())
    ndocs = uast_extractor.link(Counter()).execute()
    uast_extractor = uast_extractor.link(UastDeserializer()) \
        .link(Uast2BagFeatures([IdentifiersBagExtractor()]))
    with tempfile.TemporaryDirectory() as tmpdir:
        tmp_path = os.path.join(tmpdir, "df.asdf")
        args = argparse.Namespace(docfreq_in=None, docfreq_out=tmp_path, min_docfreq=1,
                                  vocabulary_size=1000)
        df_model = create_or_load_ordered_df(args, ndocs, uast_extractor)
        self.assertEqual(df_model.docs, ndocs)
        self.assertTrue(os.path.exists(tmp_path))
def create_parquet_loader(session_name, repositories,
                          config=SparkDefault.CONFIG,
                          packages=SparkDefault.PACKAGES,
                          spark=SparkDefault.MASTER_ADDRESS,
                          spark_local_dir=SparkDefault.LOCAL_DIR,
                          spark_log_level=SparkDefault.LOG_LEVEL,
                          memory=SparkDefault.MEMORY,
                          dep_zip=False):
    config = assemble_spark_config(config=config, memory=memory)
    session = create_spark(session_name, spark=spark, spark_local_dir=spark_local_dir,
                           config=config, packages=packages,
                           spark_log_level=spark_log_level, dep_zip=dep_zip)
    log = logging.getLogger("parquet")
    log.info("Initializing on %s", repositories)
    parquet = ParquetLoader(session, repositories)
    return parquet
def setUp(self):
    self.sc = create_spark("test")
    df = DocumentFrequencies().construct(10, {str(i): i for i in range(1, 5)})
    self.tfidf = TFIDF(df=df)

    class Columns:
        """
        Stores column names for return value.
        """
        token = "t"
        document = "d"
        value = "v"

    self.tfidf.Columns = Columns
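# For reference only: the exact weighting inside TFIDF is not shown in this snippet.
# Below is a common log-scaled TF-IDF variant, sketched under that assumption; `docs`
# mirrors the 10 documents passed to DocumentFrequencies().construct above and the
# numbers in the call are hypothetical.
import math


def tfidf_weight(tf, df, docs=10):
    # Assumed formula: sublinear term frequency times inverse document frequency.
    return math.log(1 + tf) * math.log(docs / df)


print(tfidf_weight(tf=3, df=2))  # a token seen 3 times in one doc and present in 2 of 10 docs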
def test_create(self):
    session = create_spark("test_quant_util")
    extractor = ChildrenBagExtractor()
    with tempfile.NamedTemporaryFile(mode="r+b", suffix="-quant.asdf") as tmp:
        path = tmp.name
        uast_extractor = ParquetLoader(session, paths.PARQUET_DIR) \
            .link(Moder("file")) \
            .link(UastRow2Document()) \
            .link(UastDeserializer())
        create_or_apply_quant(path, [extractor], uast_extractor)
        self.assertIsNotNone(extractor.levels)
        self.assertTrue(os.path.exists(path))
        model_levels = QuantizationLevels().load(source=path)._levels["children"]
        for key in model_levels:
            self.assertListEqual(list(model_levels[key]), list(extractor.levels[key]))
def detect_communities(args):
    log = logging.getLogger("cmd")
    ccsmodel = ConnectedComponentsModel().load(args.input)
    log.info("Building the connected components")
    # Group element indices by their connected component id.
    ccs = defaultdict(list)
    for i, c in enumerate(ccsmodel.id_to_cc):
        ccs[c].append(i)
    buckmat = ccsmodel.id_to_buckets
    buckindices = buckmat.indices
    buckindptr = buckmat.indptr
    total_nvertices = buckmat.shape[0]
    linear = args.edges in ("linear", "1")
    graphs = []
    communities = []
    if not linear:
        log.info("Transposing the matrix")
        buckmat_csc = buckmat.T.tocsr()
    # Singleton components need no detection; pairs are already communities.
    fat_ccs = []
    for vertices in ccs.values():
        if len(vertices) == 1:
            continue
        if len(vertices) == 2:
            communities.append(vertices)
            continue
        fat_ccs.append(vertices)
    log.info("Building %d graphs", len(fat_ccs))
    for vertices in progress_bar(fat_ccs, log, expected_size=len(fat_ccs)):
        if linear:
            # Bipartite element-bucket graph: bucket vertices are offset by total_nvertices
            # so they never collide with element vertices.
            edges = []
            weights = []
            bucket_weights = buckmat.sum(axis=0)
            buckets = set()
            for i in vertices:
                for j in range(buckindptr[i], buckindptr[i + 1]):
                    bucket = buckindices[j]
                    weights.append(bucket_weights[0, bucket])
                    bucket += total_nvertices
                    buckets.add(bucket)
                    edges.append((str(i), str(bucket)))
        else:
            # Clique expansion: connect every pair of elements sharing a hash bucket.
            edges = set()
            weights = None
            buckets = set()
            for i in vertices:
                for j in range(buckindptr[i], buckindptr[i + 1]):
                    buckets.add(buckindices[j])
            for bucket in buckets:
                buckverts = \
                    buckmat_csc.indices[buckmat_csc.indptr[bucket]:buckmat_csc.indptr[bucket + 1]]
                for i, x in enumerate(buckverts):
                    for y in buckverts:
                        if x < y:
                            edges.add((str(x), str(y)))
            buckets.clear()
            edges = list(edges)
        graph = Graph(directed=False)
        graph.add_vertices(list(map(str, vertices + list(buckets))))
        graph.add_edges(edges)
        graph.edge_weights = weights
        graphs.append(graph)
    log.info("Launching the community detection")
    detector = CommunityDetector(algorithm=args.algorithm, config=args.params)
    if not args.no_spark:
        spark = create_spark("cmd-%s" % uuid4(), **args.__dict__).sparkContext
        communities.extend(spark.parallelize(graphs).flatMap(detector).collect())
    else:
        communities.extend(chain.from_iterable(progress_bar(
            (detector(g) for g in graphs), log, expected_size=len(graphs))))
    log.info("Overall communities: %d", len(communities))
    log.info("Average community size: %.1f", numpy.mean([len(c) for c in communities]))
    log.info("Median community size: %.1f", numpy.median([len(c) for c in communities]))
    log.info("Max community size: %d", max(map(len, communities)))
    log.info("Writing %s", args.output)
    CommunitiesModel().construct(communities, ccsmodel.id_to_element).save(args.output)
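# A minimal sketch (hypothetical numbers) of the vertex-numbering trick used in the
# "linear" branch above: bucket nodes are shifted by total_nvertices so that element
# vertices in [0, total_nvertices) and bucket vertices never collide in one graph.
total_nvertices = 5          # assumed number of element vertices
element_vertex, bucket = 3, 2  # hypothetical indices
bucket_vertex = bucket + total_nvertices  # bucket 2 becomes graph vertex 7
edge = (str(element_vertex), str(bucket_vertex))  # vertex names are strings, as above
print(edge)  # ('3', '7')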
def setUp(self):
    self.sc = create_spark("test")
    self.bag2tf = BagFeatures2TermFreq()
def hash_batches(args):
    log = logging.getLogger("hash")
    log.info("Loading files from %s", args.input)
    loader = BOWLoader(args.input)
    log.info("%d batches", len(loader))
    # Check batches
    if not loader:
        return
    htnum, band_size = calc_hashtable_params(
        args.threshold, args.size, args.false_positive_weight, args.false_negative_weight)
    log.info("Number of hash tables: %d", htnum)
    log.info("Band size: %d", band_size)
    cassandra_utils.configure(args)
    spark_args = filter_kwargs(args.__dict__, create_spark)
    spark = create_spark("hash-%s" % uuid4(), **spark_args).sparkContext
    import libMHCUDA  # delayed import which requires CUDA and friends
    tables = args.tables
    gen = voc_size = None
    try:
        for i, bow in enumerate(loader):
            if voc_size is None:
                # The generator is created lazily from the first batch's vocabulary size.
                voc_size = bow.matrix.shape[-1]
                log.info("Initializing the generator")
                deferred = os.path.isfile(args.params)
                gen = libMHCUDA.minhash_cuda_init(
                    voc_size, args.size, seed=args.seed, devices=args.devices,
                    verbosity=args.mhc_verbosity, deferred=deferred)
                if deferred:
                    model = WeightedMinHashParameters().load(args.params)
                    libMHCUDA.minhash_cuda_assign_vars(gen, model.rs, model.ln_cs, model.betas)
                else:
                    log.info("Writing %s", args.params)
                    params = libMHCUDA.minhash_cuda_retrieve_vars(gen)
                    WeightedMinHashParameters().construct(*params).save(args.params)
            if bow.matrix.shape[-1] != voc_size:
                raise ValueError("The vocabulary sizes do not match: %d != %d"
                                 % (bow.matrix.shape[-1], voc_size))
            log.info("Processing batch %d / %d", i + 1, len(loader))
            # Modify features if needed
            # TODO(vmarkovtsev): port to the new structure
            # batches = modify_feature_weights(batches, args)
            hashes = libMHCUDA.minhash_cuda_calc(gen, bow.matrix)
            job = [(k, h) for k, h in zip(bow.documents, hashes)]
            log.info("Saving the hashtables")
            df = spark.parallelize(job).flatMap(HashExploder(htnum, band_size)).toDF()
            df.write \
                .format("org.apache.spark.sql.cassandra") \
                .mode("append") \
                .options(table=tables["hashtables"], keyspace=args.keyspace) \
                .save()
            df.write \
                .format("org.apache.spark.sql.cassandra") \
                .mode("append") \
                .options(table=tables["hashtables2"], keyspace=args.keyspace) \
                .save()
            log.info("Saving the hashes")
            spark.parallelize(job) \
                .map(lambda x: Row(sha1=x[0], value=bytearray(x[1].data))) \
                .toDF() \
                .write \
                .format("org.apache.spark.sql.cassandra") \
                .mode("append") \
                .options(table=tables["hashes"], keyspace=args.keyspace) \
                .save()
    finally:
        libMHCUDA.minhash_cuda_fini(gen)
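# calc_hashtable_params is not shown here. For reference, the standard MinHash LSH
# banding relation is sketched below under the assumption that `htnum` plays the role of
# the number of bands b and `band_size` the rows per band r (so b * r covers the hash size).
def candidate_probability(similarity, bands, band_size):
    # Probability that two documents with the given (weighted) Jaccard similarity share
    # at least one identical band and therefore become a candidate pair.
    return 1.0 - (1.0 - similarity ** band_size) ** bands


# Hypothetical setup: 128 hashes split into 16 bands of 8 rows each.
for s in (0.5, 0.8, 0.9):
    print(s, round(candidate_probability(s, bands=16, band_size=8), 3))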
def create_spark_for_test(name="test"):
    packages = (get_engine_package(get_engine_version()), )
    config = (get_bblfsh_dependency("localhost"), )
    return create_spark(name, config=config, packages=packages)
def hash_batches(args):
    log = logging.getLogger("hash")
    log.info("Loading files from %s", args.input)
    loader = BOWLoader(args.input)
    log.info("%d batches", len(loader))
    # Check batches
    if not loader:
        return
    htnum, band_size = calc_hashtable_params(
        args.threshold, args.size, args.false_positive_weight, args.false_negative_weight)
    log.info("Number of hash tables: %d", htnum)
    log.info("Band size: %d", band_size)
    cassandra_utils.configure(args)
    spark = create_spark("hash-%s" % uuid4(), **args.__dict__).sparkContext
    import libMHCUDA  # delayed import which requires CUDA and friends
    tables = args.tables
    gen = voc_size = None
    try:
        for i, bow in enumerate(loader):
            if voc_size is None:
                # The generator is created lazily from the first batch's vocabulary size.
                voc_size = bow.matrix.shape[-1]
                log.info("Initializing the generator")
                deferred = os.path.isfile(args.params)
                gen = libMHCUDA.minhash_cuda_init(
                    voc_size, args.size, seed=args.seed, devices=args.devices,
                    verbosity=args.mhc_verbosity, deferred=deferred)
                if deferred:
                    model = WeightedMinHashParameters().load(args.params)
                    libMHCUDA.minhash_cuda_assign_vars(gen, model.rs, model.ln_cs, model.betas)
                else:
                    log.info("Writing %s", args.params)
                    params = libMHCUDA.minhash_cuda_retrieve_vars(gen)
                    WeightedMinHashParameters().construct(*params).save(args.params)
            if bow.matrix.shape[-1] != voc_size:
                raise ValueError("The vocabulary sizes do not match: %d != %d"
                                 % (bow.matrix.shape[-1], voc_size))
            log.info("Processing batch %d / %d", i + 1, len(loader))
            # Modify features if needed
            # TODO(vmarkovtsev): port to the new structure
            # batches = modify_feature_weights(batches, args)
            hashes = libMHCUDA.minhash_cuda_calc(gen, bow.matrix)
            job = [(k, h) for k, h in zip(bow.documents, hashes)]
            log.info("Saving the hashtables")
            df = spark.parallelize(job).flatMap(HashExploder(htnum, band_size)).toDF()
            df.write \
                .format("org.apache.spark.sql.cassandra") \
                .mode("append") \
                .options(table=tables["hashtables"], keyspace=args.keyspace) \
                .save()
            df.write \
                .format("org.apache.spark.sql.cassandra") \
                .mode("append") \
                .options(table=tables["hashtables2"], keyspace=args.keyspace) \
                .save()
            log.info("Saving the hashes")
            spark.parallelize(job) \
                .map(lambda x: Row(sha1=x[0], value=bytearray(x[1].data))) \
                .toDF() \
                .write \
                .format("org.apache.spark.sql.cassandra") \
                .mode("append") \
                .options(table=tables["hashes"], keyspace=args.keyspace) \
                .save()
    finally:
        libMHCUDA.minhash_cuda_fini(gen)
def setUp(self):
    self.sc = create_spark("test")
def setUp(self):
    self.sc = create_spark("test")
    self.bag2df = BagFeatures2DocFreq()
def create_spark_for_test(name="test"):
    if sys.version_info >= (3, 7):
        raise SkipTest("Python 3.7 is not yet supported.")
    packages = (get_engine_package(get_engine_version()), )
    config = (get_bblfsh_dependency("localhost"), )
    return create_spark(name, config=config, packages=packages)