def test_unpersist():
    """Make sure that cached RDDs are unpersisted
    """
    sm.create("test")
    sql = SQLContext.getOrCreate(sm.sc)
    rdd1 = sm.parallelize(range(10000)).cache()
    rdd1.count()
    df1 = sql.createDataFrame([('Foo', 1)]).cache()
    df1.count()
    before = set(r.id() for r in sm.sc._jsc.getPersistentRDDs().values())
    with sm.clean_cache():
        rdd2 = sm.parallelize(range(0, 10000, 2))
        rdd2.cache()
        df2 = sql.createDataFrame([('Bar', 2)])
        df2.cache()
    assert before == set(r.id() for r in sm.sc._jsc.getPersistentRDDs().values())
    assert rdd1.getStorageLevel().useMemory is True
    assert rdd2.getStorageLevel().useMemory is False
    # FIXME Does not currently work!
    # assert df1.rdd.getStorageLevel().useMemory is True
    assert df2.rdd.getStorageLevel().useMemory is False
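
# Illustrative sketch (an assumption, not the actual spark-manager code): the
# ``sm.clean_cache()`` context manager used above behaves as if it snapshots
# the ids of already-persisted RDDs on entry and unpersists anything cached
# inside the block on exit, which is what the assertions rely on.
from contextlib import contextmanager

@contextmanager
def _clean_cache_sketch(sc):
    # Remember which RDDs were already cached before entering the block
    before = set(sc._jsc.getPersistentRDDs().keys())
    try:
        yield
    finally:
        # Unpersist only the RDDs that were cached inside the block
        for rdd_id, jrdd in list(sc._jsc.getPersistentRDDs().items()):
            if rdd_id not in before:
                jrdd.unpersist()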
def test_bench():
    """Test the benchmarking
    """
    sm.create("test")
    with sm.benchmark():
        rdd = sm.parallelize(range(10000))
        rdd.count()
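
# Illustrative sketch (an assumption): ``sm.benchmark()`` is used above as a
# context manager; a minimal equivalent would simply time the Spark actions
# executed inside the block and report the elapsed wall-clock time.
import time
from contextlib import contextmanager

@contextmanager
def _benchmark_sketch(label="benchmark"):
    start = time.time()
    try:
        yield
    finally:
        print("{}: {:.3f} s elapsed".format(label, time.time() - start))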
def test_deco():
    """Test the decorator
    """
    sm.create("test")

    @sm.assign_to_jobgroup
    def some_function():
        rdd = sm.parallelize(range(10000))
        rdd.count()

    some_function()
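
# Illustrative sketch (an assumption): an ``assign_to_jobgroup``-style
# decorator can tag every Spark job triggered by the wrapped function with
# the function's name via SparkContext.setJobGroup(), so the jobs are easy
# to identify in the Spark UI.
import functools

def _assign_to_jobgroup_sketch(sc):
    def decorator(func):
        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            # All jobs launched by this call are grouped under the function name
            sc.setJobGroup(func.__name__, func.__doc__ or func.__name__)
            return func(*args, **kwargs)
        return wrapper
    return decorator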
def test_report(tmpdir):
    """Test the report generation
    """
    filename = tmpdir.join("report")
    sm.create("test", report=str(filename), reset=True)

    @sm.assign_to_jobgroup
    def some_function():
        rdd = sm.parallelize(range(10000))
        rdd.count()

    some_function()

    with open(str(filename), 'r') as fd:
        data = json.load(fd)
    assert len(data['timing']) == 1
def test_reset():
    """Make sure that all caches are reset
    """
    sm.create("test")
    sm.reset_cache()
    sql = SQLContext.getOrCreate(sm.sc)
    assert len(sm.sc._jsc.getPersistentRDDs()) == 0
    rdd1 = sm.parallelize(range(10000))
    rdd1.count()
    rdd1.persist()
    df1 = sql.createDataFrame([('Foo', 1)])
    df1.count()
    df1.persist()
    assert len(sm.sc._jsc.getPersistentRDDs()) > 0
    sm.reset_cache()
    assert len(sm.sc._jsc.getPersistentRDDs()) == 0
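
# Illustrative sketch (an assumption): a ``reset_cache``-style helper can
# unpersist every RDD still registered on the JVM side and clear the SQL
# table cache, which is exactly what the assertions above verify.
def _reset_cache_sketch(sc, sql_context=None):
    for jrdd in list(sc._jsc.getPersistentRDDs().values()):
        jrdd.unpersist()
    if sql_context is not None:
        sql_context.clearCache()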
def run(file1, file2, output=True, spark_options=None, **opts):
    # type: (str, str, object, object, **object) -> dict
    # ====== Init Spark and dataframes ======
    sm.create("fsumcheck", spark_config, spark_options)
    options = _DEFAULTS.copy()
    options.update(opts)
    df1 = sm.spark.read.schema(SCHEMA).csv(file1, sep=options["delimiter"])
    df2 = sm.spark.read.schema(SCHEMA).csv(file2, sep=options["delimiter"])

    # ====== Optimization ======
    n_partitions = df1.rdd.getNumPartitions()
    shuffle_partitions = ((n_partitions - 1) // 50 + 1) * 50
    if options["verbosity"]:
        print("Processing {} partitions (shuffle partitions: {})".format(
            n_partitions, shuffle_partitions))
    sm.conf.set("spark.sql.shuffle.partitions", shuffle_partitions)
    df1 = df1.repartition("filename").persist(StorageLevel.MEMORY_AND_DISK)
    df2 = df2.repartition("filename").persist(StorageLevel.MEMORY_AND_DISK)

    # ====== Checks ======
    # 1. Entries present only on one side (only left / only right)
    only_left = (df1.join(df2, "filename", how="left_anti")
                 .select(df1.filename)
                 .where(df1.filename.isNotNull()))
    only_right = (df2.join(df1, "filename", how="left_anti")
                  .select(df2.filename)
                  .where(df2.filename.isNotNull()))
    # 2. Different checksum
    different_checksum = (df1.join(df2, "filename")
                          .where(df1.checksum != df2.checksum)
                          .select(df1.filename))
    # 3. Missing field
    problematic_left = (
        df1.where("filename is NULL OR checksum is NULL").select(
            F.when(df1.filename.isNull(), df1.checksum)
             .otherwise(df1.filename).alias("entry")))
    problematic_right = (
        df2.where("filename is NULL OR checksum is NULL").select(
            F.when(df2.filename.isNull(), df2.checksum)
             .otherwise(df2.filename).alias("entry")))

    # ====== Results gathering ======
    all_dfs = OrderedDict([("only_left", only_left),
                           ("only_right", only_right),
                           ("different_checksum", different_checksum),
                           ("problematic_left", problematic_left),
                           ("problematic_right", problematic_right)])

    if output:
        if output is True:
            output = "fscheck_output"
        if not os.path.exists(output):
            os.makedirs(output)
        for name, df in all_dfs.items():
            df = df.cache()
            out_filepath = os.path.join(output, name + ".csv.dir")
            if options["verbosity"]:
                print(" - Creating " + out_filepath)
            df.write.csv(out_filepath, mode="overwrite")
            # Quick merge of the partitioned output into a single CSV
            os.system("cat {}/*.csv > {}".format(
                out_filepath, os.path.join(output, name + ".csv")))
            os.system("rm -rf {}".format(out_filepath))
            print("  Total entries: {}".format(df.count()))
            df.unpersist()

    return all_dfs
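
# Example invocation (hypothetical file names and output directory): compare
# two checksum manifests and write one CSV report per check category.
# ``delimiter`` and ``verbosity`` are the option keys consumed above; any key
# left out falls back to ``_DEFAULTS``.
if __name__ == "__main__":
    results = run("manifest_left.csv", "manifest_right.csv",
                  output="diff_out", verbosity=1)
    for check_name, check_df in results.items():
        print(check_name, check_df.columns)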