def exp2():
    """Run the alg-fpgrowth experiment on the configured input dataset.

    Builds a local 4-core SparkContext, loads the dataset RDD, runs
    alg_fp_growth with an absolute support of 50000 (expressed as a
    fraction of the dataset size), times the run, and pickles the
    result to OUTPUT_PATH_RAND.
    """
    conf = pyspark.SparkConf()
    conf.setMaster('local[4]')
    sc = pyspark.SparkContext(conf=conf)
    try:
        data = get_dataset_rdd(sc, INPUT_DATASET_PATH)
        data.cache()
        data_set_size = data.count()
        # Convert the absolute support count (50000) into the relative
        # threshold the algorithm expects; float() avoids Py2 integer division.
        threshold = 50000 / float(data_set_size)
        print('Starting alg-fpgrowth test')
        start = time()
        res = algfpgrowth.alg_fp_growth(data, threshold, 4)
        end = time()
        print('alg-fp-growth test ended and took %d seconds' % int(end - start))
        # Pickle streams are binary: open in 'wb' (text mode corrupts the
        # stream on some platforms) and use a context manager so the handle
        # is closed and the dump is flushed even on error.
        with open(OUTPUT_PATH_RAND, 'wb') as out_file:
            pickle.dump(res, out_file)
    finally:
        # Always release the SparkContext, even if the algorithm raised.
        sc.stop()
def run_spark(data_set_rdd, threshold, num_of_partitions):
    """Execute alg-fpgrowth over *data_set_rdd*.

    Thin pass-through to ``algfpgrowth.alg_fp_growth``; returns whatever
    the algorithm returns for the given relative support *threshold*
    and partition count.
    """
    result = algfpgrowth.alg_fp_growth(data_set_rdd, threshold, num_of_partitions)
    return result
def test_alg(self):
    """Smoke-test alg_fp_growth on the fixture RDD with 2/5 support.

    BUG FIX: the original used ``threshold = 2 / 5``, which under
    Python 2 is integer division and evaluates to 0 -- the test ran
    with a zero support threshold instead of the intended 0.4.
    A float literal matches the ``50000 / float(...)`` convention
    used elsewhere in this file and yields 0.4 on both Py2 and Py3.
    """
    threshold = 2 / 5.0
    res = algfpgrowth.alg_fp_growth(self.rdd, threshold, 2)