import os
import sys


def portable_hash(x):
    """
    (Copied from pyspark.rdd)

    This function returns a consistent hash code for builtin types, especially
    for None and for tuples containing None.

    The algorithm is similar to the one used by CPython 2.7.

    >>> portable_hash(None)
    0
    >>> portable_hash((None, 1)) & 0xffffffff
    219750521
    """

    if sys.version_info >= (3, 2, 3) and 'PYTHONHASHSEED' not in os.environ:
        raise Exception("Randomness of hash of string should be disabled via PYTHONHASHSEED")

    if x is None:
        return 0
    if isinstance(x, tuple):
        # Combine element hashes the same way CPython 2.7 hashes tuples.
        h = 0x345678
        for i in x:
            h ^= portable_hash(i)
            h *= 1000003
            h &= sys.maxsize
        h ^= len(x)
        if h == -1:
            h = -2
        return int(h)
    return hash(x)
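# Hypothetical usage sketch (not part of the original snippets): portable_hash can be
# passed as the partitionFunc of RDD.partitionBy so that keys hash the same way on the
# driver and on every executor. PYTHONHASHSEED must be set to a fixed value (e.g. 0)
# for all Python processes, otherwise string hashes differ between runs.
from pyspark import SparkContext

with SparkContext("local[2]") as sc:
    pairs = sc.parallelize([("a", 1), ("b", 2), ("a", 3)])
    partitioned = pairs.partitionBy(4, partitionFunc=portable_hash)
    print(partitioned.glom().collect())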
Example #2
from pyspark import SparkConf, SparkContext


def main():
    # data_files, ll_mapper and process are defined elsewhere in the original script.
    conf = SparkConf()
    conf.set("spark.default.parallelism", "32")
    #conf.set("spark.executor.memory", "1g")
    sc = SparkContext(appName="BatteryTemp", conf=conf)

    def tag_filter(ll):
        # Keep kernel printk lines and thermal trace events only.
        if ll.tag == 'KernelPrintk':
            return True
        elif ll.tag == 'Kernel-Trace':
            try:
                return ll.trace_event == 'thermal_temp'
            except AttributeError:
                return False
        return False

    # Load LogLine tuples
    #all_logs = sc.textFile(processed_dir, use_unicode=False).map(lambda x: lib.logline_from_json(x))
    all_logs = sc.textFile(data_files, use_unicode=False).flatMap(ll_mapper)

    # Filter out any tags we don't care about
    #filtered = all_logs.filter(lambda line: line.tag in tags)
    filtered = all_logs.filter(tag_filter)

    # Key by (boot_id, date, line_num) so we have smaller chunks to work with
    keyed = filtered.keyBy(lambda ll: (ll.boot_id, ll.timestamp[:10], ll.line_num))

    # Partition on (boot_id, date) only, then sort each partition by (boot_id, line_num)
    partitioned = keyed.repartitionAndSortWithinPartitions(
        partitionFunc=lambda x: portable_hash(x[:2]),
        keyfunc=lambda x: (x[0], x[2]))
    intervals = partitioned.mapPartitions(process).map(lambda x: str(x))
    results = intervals.collect()

    with open("results", 'w') as fd:
        for res in results:
            fd.write('{}\n'.format(res))
def partition_by_first_value(key):
    # Partition on the first element of a composite key (e.g. the country in a
    # (country, customer) pair), so related records land in the same partition.
    return portable_hash(key[0])
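# Hypothetical usage sketch (data and names are illustrative, not from the original;
# assumes an active SparkContext `sc`). partitionBy applies
# partition_by_first_value(key) % numPartitions, so every record sharing the same
# key[0] lands in the same partition.
records = sc.parallelize([
    (("Poland", "Marek"), 51),
    (("Poland", "Paul"), 75),
    (("Germany", "Johannes"), 200),
])
by_first = records.partitionBy(2, partition_by_first_value)
print(by_first.glom().collect())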
Example #5
from pyspark import SparkContext

# `nums` is assumed to be an iterable of integers defined elsewhere, e.g. range(0, 10).
with SparkContext("local[2]") as sc:
    rdd = sc.parallelize(nums) \
        .map(lambda el: (el, el)) \
        .partitionBy(3) \
        .persist()

    print("Number of partitions: {}".format(rdd.getNumPartitions()))
    print("Partitioner: {}".format(rdd.partitioner))
    print("Partitions structure: {}".format(rdd.glom().collect()))


# Predict which partition each element lands in under hash partitioning:
# partition = portable_hash(element) % num_partitions.
from pyspark.rdd import portable_hash

num_partitions = 2
for el in nums:
    print("Element: [{}]: {} % {} = partition {}".format(
        el, portable_hash(el), num_partitions, portable_hash(el) % num_partitions))
# get_expend_time is a timing decorator defined elsewhere in the original example.
@get_expend_time
def test_6():
    transactions = [
        {'name': 'Bob', 'amount': 100, 'country': 'United Kingdom'},
        {'name': 'James', 'amount': 15, 'country': 'United Kingdom'},
        {'name': 'Marek', 'amount': 51, 'country': 'Poland'},
        {'name': 'Johannes', 'amount': 200, 'country': 'Germany'},
        {'name': 'Paul', 'amount': 75, 'country': 'Poland'},
    ]

    def country_partitioner(country):
        # Uses the built-in hash(); like portable_hash, string hashes are only
        # stable across processes when PYTHONHASHSEED is fixed.
        return hash(country)

    # Validate results: see the sketch below for applying country_partitioner.
    num_partitions = 5
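    # Hedged completion sketch (not in the original, which breaks off here): key each
    # transaction by country, partition with country_partitioner, and inspect the
    # layout. Assumes an active SparkContext named `sc`.
    rdd = sc.parallelize(transactions) \
        .map(lambda t: (t['country'], t)) \
        .partitionBy(num_partitions, country_partitioner)
    print("Partitions structure: {}".format(rdd.glom().collect()))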
def partition_func(key):
    return portable_hash(key[0])
Example #7
def partitioner_(x):
    # n (the number of partitions) is captured from the enclosing scope in the
    # original example; keys are hashed on their first element.
    return portable_hash(x[0]) % n
Example #8


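# The snippet below calls print_partitions(), which is not included in the original.
# A minimal sketch of what such a helper might look like (assumed, not the author's code):
def print_partitions(df):
    num_partitions = df.rdd.getNumPartitions()
    print("Total partitions: {}".format(num_partitions))
    parts = df.rdd.glom().collect()
    for i, part in enumerate(parts):
        print("Partition {} has {} rows".format(i, len(part)))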
# `spark` (a SparkSession) and `data` (rows containing a Country column) are defined
# elsewhere in the original example.
df = spark.createDataFrame(data)
df.show()
print_partitions(df)
# Repartition the data into 3 partitions by Country.
# Ideally each country would get its own partition, but that is not guaranteed:
# repartition() uses hash partitioning, so different countries can hash into the
# same partition.
numPartitions = 3

df = df.repartition(numPartitions, "Country")

print_partitions(df)
# Verify the hash value of Country on which each row is repartitioned.
from pyspark.sql.functions import udf
from pyspark.rdd import portable_hash

udf_portable_hash = udf(lambda country: portable_hash(country))
df = df.withColumn("Hash#", udf_portable_hash(df.Country))
df = df.withColumn("Partition#", df["Hash#"] % numPartitions)
df.show()
# Increasing the partition count to 5 happens to put each country in a different
# partition, but that is not guaranteed in every scenario.
numPartitions = 5
df = df.repartition(numPartitions, "Country")
print_partitions(df)
udf_portable_hash = udf(lambda country: portable_hash(country))
df = df.withColumn("Hash#", udf_portable_hash(df.Country))
df = df.withColumn("Partition#", df["Hash#"] % numPartitions)
df.show()
# To address this, a custom partitioning scheme can be used (see the sketch below).
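# Hedged sketch of the custom approach (mapping and names are illustrative, not from
# the original): DataFrames do not accept arbitrary partition functions, so one
# workaround is to key the underlying RDD by Country and use RDD.partitionBy with an
# explicit country-to-partition mapping, guaranteeing each listed country its own partition.
country_to_partition = {"United Kingdom": 0, "Poland": 1, "Germany": 2}

def customer_country_partitioner(country):
    # Fall back to hashing for countries not covered by the explicit mapping.
    return country_to_partition.get(country, portable_hash(country))

partitioned_rdd = df.rdd \
    .keyBy(lambda row: row["Country"]) \
    .partitionBy(numPartitions, customer_country_partitioner)
print("Partitions structure: {}".format(partitioned_rdd.glom().collect()))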

Example #9
def hashedPartitioner(k):
    # keyfunc and numPartitions are captured from the enclosing scope in the original
    # example; only the part of the key selected by keyfunc drives the placement.
    return portable_hash(keyfunc(k)) % numPartitions
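# Sketch of how a keyfunc-based partitioner like hashedPartitioner is typically built
# and used (the factory below is illustrative, not from the original snippets): the
# closure captures keyfunc and numPartitions so that only the part of the key selected
# by keyfunc drives the partition placement.
def make_hashed_partitioner(keyfunc, numPartitions):
    def hashedPartitioner(k):
        return portable_hash(keyfunc(k)) % numPartitions
    return hashedPartitioner

# e.g. partition (boot_id, date, line_num) keys on (boot_id, date) only:
# partitioner = make_hashed_partitioner(lambda k: k[:2], 32)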