def portable_hash(x):
    """
    (Copied from pyspark.rdd)

    Return a hash code that is consistent for builtin types, in particular
    for ``None`` and for tuples that contain ``None``. The tuple algorithm
    is similar to the one used by CPython 2.7.

    Raises ``Exception`` when running on Python >= 3.2.3 without the
    ``PYTHONHASHSEED`` environment variable being set.

    >>> portable_hash(None)
    0
    >>> portable_hash((None, 1)) & 0xffffffff
    219750521
    """
    seed_missing = 'PYTHONHASHSEED' not in os.environ
    if seed_missing and sys.version_info >= (3, 2, 3):
        raise Exception("Randomness of hash of string should be disabled via PYTHONHASHSEED")

    # None gets a fixed value so that it hashes the same everywhere.
    if x is None:
        return 0

    # Anything that is not a tuple is delegated to the builtin hash.
    if not isinstance(x, tuple):
        return hash(x)

    # Tuples: fold the element hashes together, keeping the accumulator
    # within sys.maxsize after every step.
    acc = 0x345678
    for item in x:
        acc = ((acc ^ portable_hash(item)) * 1000003) & sys.maxsize
    acc ^= len(x)
    return -2 if acc == -1 else int(acc)
def portable_hash(x):
    """
    (Copied from pyspark.rdd)

    Compute a hash code that stays consistent for builtin types; the cases
    of interest are ``None`` and tuples containing ``None``. The mixing
    scheme for tuples is similar to the one used by CPython 2.7.

    An ``Exception`` is raised on Python >= 3.2.3 when the
    ``PYTHONHASHSEED`` environment variable is not set.

    >>> portable_hash(None)
    0
    >>> portable_hash((None, 1)) & 0xffffffff
    219750521
    """
    if sys.version_info >= (3, 2, 3):
        if 'PYTHONHASHSEED' not in os.environ:
            raise Exception(
                "Randomness of hash of string should be disabled via PYTHONHASHSEED"
            )

    if isinstance(x, tuple):
        result = 0x345678
        for member in x:
            result ^= portable_hash(member)
            # Multiply, then clamp back into the range of sys.maxsize.
            result = (result * 1000003) & sys.maxsize
        result ^= len(x)
        if result == -1:
            result = -2
        return int(result)

    # None maps to a fixed 0; everything else uses the builtin hash.
    return 0 if x is None else hash(x)
def main():
    """Run the "BatteryTemp" Spark job and write its results to ``results``.

    Steps, in order:

    1. Create a SparkContext with ``spark.default.parallelism`` set to 32.
    2. Load log lines from ``data_files`` through ``ll_mapper``.
    3. Keep ``KernelPrintk`` lines, and ``Kernel-Trace`` lines whose
       ``trace_event`` equals ``thermal_temp``.
    4. Key each line by ``(boot_id, timestamp[:10], line_num)``, route it to
       a partition using the first two key fields, and sort within each
       partition by ``(boot_id, line_num)``.
    5. Run ``process`` over every partition, collect the stringified
       output on the driver and write one entry per line to the file
       ``results`` in the current working directory.
    """
    conf = SparkConf()
    conf.set("spark.default.parallelism", "32")
    sc = SparkContext(appName="BatteryTemp", conf=conf)

    def tag_filter(ll):
        """Return True when the log line carries a tag this job cares about."""
        if ll.tag == 'KernelPrintk':
            return True
        if ll.tag == 'Kernel-Trace':
            try:
                return ll.trace_event == 'thermal_temp'
            except Exception:
                # Best effort: a trace line whose event cannot be read is
                # dropped. Catching Exception (rather than a bare except)
                # lets KeyboardInterrupt / SystemExit propagate.
                return False
        return False

    # Load LogLine tuples
    all_logs = sc.textFile(data_files, use_unicode=False).flatMap(ll_mapper)

    # Filter out any tags we don't care about
    filtered = all_logs.filter(tag_filter)

    # Group by (boot_id, date) so we have smaller chunks to work with.
    # line_num is part of the key only so it can be used for sorting.
    keyed = filtered.keyBy(lambda ll: (ll.boot_id, ll.timestamp[:10], ll.line_num))
    partitioned = keyed.repartitionAndSortWithinPartitions(
        partitionFunc=lambda x: portable_hash(x[:2]),
        keyfunc=lambda x: (x[0], x[2]),
    )

    intervals = partitioned.mapPartitions(process).map(str)
    results = intervals.collect()

    # The context manager closes the file even if a write fails.
    with open("results", 'w') as fd:
        for res in results:
            fd.write('{}\n'.format(res))
def partition_by_first_value(key):
    """Return ``portable_hash`` of the first element of ``key``.

    ``key`` must support indexing; only ``key[0]`` is hashed, so keys that
    share a first element produce the same value.
    """
    first_value = key[0]
    return portable_hash(first_value)
with SparkContext("local[2]") as sc: rdd = sc.parallelize(nums) \ .map(lambda el: (el, el)) \ .partitionBy(3) \ .persist() print("Number of partitions: {}".format(rdd.getNumPartitions())) print("Partitioner: {}".format(rdd.partitioner)) print("Partitions structure: {}".format(rdd.glom().collect())) from pyspark.rdd import portable_hash num_partitions = 2 for el in nums: print("Element: [{}]: {} % {} = partition {}".format( el, portable_hash(el), num_partitions, portable_hash(el) % num_partitions)) # @get_expend_time def test_6(): transactions = [ {'name': 'Bob', 'amount': 100, 'country': 'United Kingdom'}, {'name': 'James', 'amount': 15, 'country': 'United Kingdom'}, {'name': 'Marek', 'amount': 51, 'country': 'Poland'}, {'name': 'Johannes', 'amount': 200, 'country': 'Germany'}, {'name': 'Paul', 'amount': 75, 'country': 'Poland'}, ] def country_partitioner(country): return hash(country) # Validate results num_partitions = 5
def partition_func(key):
    """Hash only the leading component of an indexable ``key``.

    The remaining components of ``key`` are ignored; the result is
    whatever ``portable_hash`` returns for ``key[0]``.
    """
    leading = key[0]
    hashed = portable_hash(leading)
    return hashed
def partitioner_(x):
    """Map ``x`` to ``portable_hash(x[0]) % n``.

    ``n`` is not a parameter: it is looked up in the surrounding scope
    when the function is called (presumably the partition count -- verify
    against the enclosing code).
    """
    head_hash = portable_hash(x[0])
    return head_hash % n
i = i + 1 df = spark.createDataFrame(data) df.show() print_partitions(df) '''Repartition data to 3 partition by Country''' '''We expect each country data to be in each partition but thats not possible as partitoning is done based on hash partitioning so different country code might fall in same partition''' numPartitions = 3 df = df.repartition(numPartitions, "Country") print_partitions(df) '''We can verify the hash value of each partition based on which it repartitions ''' udf_portable_hash = udf(lambda str: portable_hash(str)) df = df.withColumn("Hash#", udf_portable_hash(df.Country)) df = df.withColumn("Partition#", df["Hash#"] % numPartitions) df.show() '''Increasing partition to 5 puts data for each country in different partition but thats not confirmed in all scenario''' numPartitions = 5 df = df.repartition(numPartitions, "Country") print_partitions(df) udf_portable_hash = udf(lambda str: portable_hash(str)) df = df.withColumn("Hash#", udf_portable_hash(df.Country)) df = df.withColumn("Partition#", df["Hash#"] % numPartitions) df.show() '''To address this issue we use customised partition''' '''udf below'''
def hashedPartitioner(k):
    """Return ``portable_hash(keyfunc(k)) % numPartitions``.

    Both ``keyfunc`` and ``numPartitions`` are free variables resolved
    from the surrounding scope at call time; ``keyfunc`` is applied to
    ``k`` first and its result is what gets hashed.
    """
    derived_key = keyfunc(k)
    bucket = portable_hash(derived_key) % numPartitions
    return bucket