def source2bags(args):
    cassandra_utils.configure(args)
    return repos2bow_entry_template(
        args, select=lambda: DzhigurdaFiles(args.dzhigurda),
        before_deserialize=lambda: MetadataSaver(args.keyspace, args.tables["meta"]))
def source2bags(args):
    cassandra_utils.configure(args)
    return repos2bow_entry_template(
        args, select=lambda: DzhigurdaFiles(args.dzhigurda),
        cache_hook=lambda: MetadataSaver(args.keyspace, args.tables["meta"]),
        save_hook=lambda: BagsSaver(args.keyspace, args.tables["bags"]))
def source2bags(args):
    cassandra_utils.configure(args)
    if not args.skip_metadata:
        cache_hook = lambda: MetadataSaver(args.keyspace, args.tables["meta"])
    else:
        cache_hook = None
    if not args.skip_bags_to_db:
        save_hook = lambda: BagsSaver(args.keyspace, args.tables["bags"])
    else:
        save_hook = None
    return repos2bow_template(args,
                              select=lambda: DzhigurdaFiles(args.dzhigurda),
                              cache_hook=cache_hook,
                              save_hook=save_hook)
def source2bags(args): log = logging.getLogger("bags") if os.path.exists(args.batches): log.critical("%s must not exist", args.batches) return 1 if not args.config: args.config = [] try: cassandra_utils.configure(args) engine = create_engine("source2bags-%s" % uuid4(), **args.__dict__) extractors = [ __extractors__[s](args.min_docfreq, **__extractors__[s].get_kwargs_fromcmdline(args)) for s in args.feature ] pipeline = Engine(engine, explain=args.explain).link( DzhigurdaFiles(args.dzhigurda)) uasts = pipeline.link(UastExtractor(languages=[args.language])) if args.persist is not None: uasts = uasts.link(Cacher(args.persist)) uasts.link(MetadataSaver(args.keyspace, args.tables["meta"])) uasts = uasts.link(UastDeserializer()) uasts.link(Repo2Quant(extractors, args.nb_partitions)) uasts.link(Repo2DocFreq(extractors)) pipeline.explode() bags = uasts.link(Repo2WeightedSet(extractors)) if args.persist is not None: bags = bags.link(Cacher(args.persist)) batcher = bags.link(BagsBatcher(extractors)) batcher.link(BagsBatchSaver(args.batches, batcher)) bags.link(BagsSaver(args.keyspace, args.tables["bags"])) bags.explode() log.info("Writing %s", args.docfreq) batcher.model.save(args.docfreq) if args.graph: log.info("Dumping the graph to %s", args.graph) with open(args.graph, "w") as f: pipeline.graph(stream=f) finally: if args.pause: input("Press Enter to exit...")
def evaluate_communities(args): log = logging.getLogger("evalcc") model = CommunitiesModel().load(args.input) configure(args) spark = create_spark("evalcc-%s" % uuid4(), **filter_kwargs(args.__dict__, create_spark)) log.info("Preparing the communities' RDD") items = [] for i, c in progress_bar(enumerate(model.communities), log, expected_size=len(model.communities)): for m in c: if m < len(model.id_to_element): items.append(Row(sha1=model.id_to_element[m], community=i)) log.info("Running") items_in_spark = spark.sparkContext.parallelize(items).toDF() bags = spark \ .read \ .format("org.apache.spark.sql.cassandra") \ .options(table=args.tables["bags"], keyspace=args.keyspace) \ .load() log.info("Loaded the bags, calculating the vocabulary") vocabulary = bags.drop( "sha1", "value").distinct().rdd.map(lambda x: x.item).collect() vocabulary = {v: i for i, v in enumerate(vocabulary)} log.info("Vocabulary size: %d", len(vocabulary)) element_to_id = {e: i for i, e in enumerate(model.id_to_element)} metrics = items_in_spark.join(bags, "sha1").rdd \ .map(lambda r: (r.community, (element_to_id[r.sha1], vocabulary[r.item], r.value))) \ .groupByKey() \ .map(CommunityEvaluator(args.threshold, len(vocabulary))) \ .reduce(lambda v1, v2: [v1[i] + v2[i] for i in range(4)]) log.info("Total misses: %d", metrics[0]) log.info("Average normalized misses: %f", metrics[1] / len(model.communities)) log.info("Total loss: %f", metrics[2]) log.info("Average normalized loss: %f", numpy.sqrt(metrics[3] / len(model.communities)))
def hash_batches(args): log = logging.getLogger("hash") log.info("Loading files from %s", args.input) loader = BOWLoader(args.input) log.info("%d batches", len(loader)) # Check batches if not loader: return htnum, band_size = calc_hashtable_params( args.threshold, args.size, args.false_positive_weight, args.false_negative_weight) log.info("Number of hash tables: %d", htnum) log.info("Band size: %d", band_size) cassandra_utils.configure(args) spark_args = filter_kwargs(args.__dict__, create_spark) spark = create_spark("hash-%s" % uuid4(), **spark_args).sparkContext import libMHCUDA # delayed import which requires CUDA and friends tables = args.tables gen = voc_size = None try: for i, bow in enumerate(loader): if voc_size is None: voc_size = bow.matrix.shape[-1] log.info("Initializing the generator") deferred = os.path.isfile(args.params) gen = libMHCUDA.minhash_cuda_init( voc_size, args.size, seed=args.seed, devices=args.devices, verbosity=args.mhc_verbosity, deferred=deferred) if deferred: model = WeightedMinHashParameters().load(args.params) libMHCUDA.minhash_cuda_assign_vars(gen, model.rs, model.ln_cs, model.betas) else: log.info("Writing %s", args.params) params = libMHCUDA.minhash_cuda_retrieve_vars(gen) WeightedMinHashParameters().construct(*params).save(args.params) if bow.matrix.shape[-1] != voc_size: raise ValueError("The vocabulary sizes do not match: %d != %d" % (bow.matrix.shape[-1], voc_size)) log.info("Processing batch %d / %d", i + 1, len(loader)) # Modify features if needed # TODO(vmarkovtsev): port to the new structure # batches = modify_feature_weights(batches, args) hashes = libMHCUDA.minhash_cuda_calc(gen, bow.matrix) job = [(k, h) for k, h in zip(bow.documents, hashes)] log.info("Saving the hashtables") df = spark.parallelize(job).flatMap(HashExploder(htnum, band_size)).toDF() df.write \ .format("org.apache.spark.sql.cassandra") \ .mode("append") \ .options(table=tables["hashtables"], keyspace=args.keyspace) \ .save() df.write \ .format("org.apache.spark.sql.cassandra") \ .mode("append") \ .options(table=tables["hashtables2"], keyspace=args.keyspace) \ .save() log.info("Saving the hashes") spark.parallelize(job) \ .map(lambda x: Row(sha1=x[0], value=bytearray(x[1].data))) \ .toDF() \ .write \ .format("org.apache.spark.sql.cassandra") \ .mode("append") \ .options(table=tables["hashes"], keyspace=args.keyspace) \ .save() finally: libMHCUDA.minhash_cuda_fini(gen)
def hash_batches(args): log = logging.getLogger("hash") log.info("Loading files from %s", args.input) loader = BOWLoader(args.input) log.info("%d batches", len(loader)) # Check batches if not loader: return htnum, band_size = calc_hashtable_params(args.threshold, args.size, args.false_positive_weight, args.false_negative_weight) log.info("Number of hash tables: %d", htnum) log.info("Band size: %d", band_size) cassandra_utils.configure(args) spark = create_spark("hash-%s" % uuid4(), **args.__dict__).sparkContext import libMHCUDA # delayed import which requires CUDA and friends tables = args.tables gen = voc_size = None try: for i, bow in enumerate(loader): if voc_size is None: voc_size = bow.matrix.shape[-1] log.info("Initializing the generator") deferred = os.path.isfile(args.params) gen = libMHCUDA.minhash_cuda_init(voc_size, args.size, seed=args.seed, devices=args.devices, verbosity=args.mhc_verbosity, deferred=deferred) if deferred: model = WeightedMinHashParameters().load(args.params) libMHCUDA.minhash_cuda_assign_vars(gen, model.rs, model.ln_cs, model.betas) else: log.info("Writing %s", args.params) params = libMHCUDA.minhash_cuda_retrieve_vars(gen) WeightedMinHashParameters().construct(*params).save( args.params) if bow.matrix.shape[-1] != voc_size: raise ValueError( "The vocabulary sizes do not match: %d != %d" % (bow.matrix.shape[-1], voc_size)) log.info("Processing batch %d / %d", i + 1, len(loader)) # Modify features if needed # TODO(vmarkovtsev): port to the new structure # batches = modify_feature_weights(batches, args) hashes = libMHCUDA.minhash_cuda_calc(gen, bow.matrix) job = [(k, h) for k, h in zip(bow.documents, hashes)] log.info("Saving the hashtables") df = spark.parallelize(job).flatMap(HashExploder( htnum, band_size)).toDF() df.write \ .format("org.apache.spark.sql.cassandra") \ .mode("append") \ .options(table=tables["hashtables"], keyspace=args.keyspace) \ .save() df.write \ .format("org.apache.spark.sql.cassandra") \ .mode("append") \ .options(table=tables["hashtables2"], keyspace=args.keyspace) \ .save() log.info("Saving the hashes") spark.parallelize(job) \ .map(lambda x: Row(sha1=x[0], value=bytearray(x[1].data))) \ .toDF() \ .write \ .format("org.apache.spark.sql.cassandra") \ .mode("append") \ .options(table=tables["hashes"], keyspace=args.keyspace) \ .save() finally: libMHCUDA.minhash_cuda_fini(gen)