Exemples Python de portable_hash (pyspark.rdd.portable_hash)

Exemple #1

0

Afficher le fichier

Fichier : rddtools.py Projet : janelia-flyem/DVIDSparkServices

 def portable_hash(x):
     """
     Return a hash code that is consistent for builtin types, in particular
     for None and for tuples that contain None.

     (Copied from pyspark.rdd.) The tuple algorithm is similar to the one
     used by CPython 2.7. Non-tuple, non-None values are passed to the
     builtin hash().

     Raises Exception on Python >= 3.2.3 when PYTHONHASHSEED is not present
     in the environment.

     >>> portable_hash(None)
     0
     >>> portable_hash((None, 1)) & 0xffffffff
     219750521
     """
     seed_missing = 'PYTHONHASHSEED' not in os.environ
     if sys.version_info >= (3, 2, 3) and seed_missing:
         raise Exception("Randomness of hash of string should be disabled via PYTHONHASHSEED")

     # Guard clauses for the two non-tuple cases.
     if x is None:
         return 0
     if not isinstance(x, tuple):
         return hash(x)

     # Fold every element into the accumulator: xor, multiply, then mask.
     acc = 0x345678
     for item in x:
         acc = ((acc ^ portable_hash(item)) * 1000003) & sys.maxsize
     acc ^= len(x)
     if acc == -1:
         acc = -2
     return int(acc)

Exemple #2

0

Afficher le fichier

Fichier : rddtools.py Projet : janelia-flyem/flyemflows

    def portable_hash(x):
        """
        Compute a consistent hash code for builtin types.

        (Copied from pyspark.rdd.) None always hashes to 0, tuples are hashed
        element by element with an algorithm similar to the one used by
        CPython 2.7, and every other value is delegated to the builtin hash().

        Raises Exception on Python >= 3.2.3 when PYTHONHASHSEED is not present
        in the environment.

        >>> portable_hash(None)
        0
        >>> portable_hash((None, 1)) & 0xffffffff
        219750521
        """
        if 'PYTHONHASHSEED' not in os.environ and sys.version_info >= (3, 2, 3):
            raise Exception(
                "Randomness of hash of string should be disabled via PYTHONHASHSEED"
            )

        if x is None:
            result = 0
        elif isinstance(x, tuple):
            result = 0x345678
            for element in x:
                # Elements are hashed recursively, so nested tuples work too.
                result ^= portable_hash(element)
                result *= 1000003
                result &= sys.maxsize
            result ^= len(x)
            result = int(-2 if result == -1 else result)
        else:
            result = hash(x)
        return result

Exemple #3

0

Afficher le fichier

Fichier : batterytemp.py Projet : gurupras/phonelab-postprocessing

def main():
	"""Run the BatteryTemp Spark job and write its results to the ``results`` file.

	Log lines are loaded from ``data_files`` through ``ll_mapper``, filtered down
	to 'KernelPrintk' entries and 'Kernel-Trace' entries whose trace event is
	'thermal_temp', partitioned by the first two key components, sorted within
	each partition, handed to ``process``, and finally collected and written
	one result per line.
	"""
	conf = SparkConf()
	conf.set("spark.default.parallelism", "32")
	#conf.set("spark.executor.memory", "1g")
	sc = SparkContext(appName="BatteryTemp", conf=conf)

	def tag_filter(ll):
		"""Return True only for the log lines this job needs."""
		if ll.tag == 'KernelPrintk':
			return True
		if ll.tag == 'Kernel-Trace':
			try:
				return ll.trace_event == 'thermal_temp'
			except Exception:
				# Best effort: a line whose trace_event cannot be read is dropped.
				# Unlike a bare ``except:``, this does not swallow
				# KeyboardInterrupt or SystemExit.
				return False
		return False

	# Load LogLine tuples
	all_logs = sc.textFile(data_files, use_unicode=False).flatMap(ll_mapper)

	# Filter out any tags we don't care about
	filtered = all_logs.filter(tag_filter)

	# Key each line by (boot_id, first 10 chars of the timestamp, line_num) so
	# we have smaller chunks to work with.
	# NOTE(review): timestamp[:10] is presumably the date part -- confirm.
	keyed = filtered.keyBy(lambda ll: (ll.boot_id, ll.timestamp[:10], ll.line_num))

	# Partition on the first two key components; within each partition sort by
	# (boot_id, line_num).
	partitioned = keyed.repartitionAndSortWithinPartitions(partitionFunc=lambda x: portable_hash(x[:2]), keyfunc=lambda x: (x[0],x[2]))
	intervals = partitioned.mapPartitions(process).map(lambda x: str(x))
	results = intervals.collect()

	# The context manager closes the file even if a write raises.
	with open("results", 'w') as fd:
		for res in results:
			fd.write('{}\n'.format(str(res)))

Exemple #4

0

Afficher le fichier

Fichier : ccf_pyspark.py Projet : PFMassiani/ccf-pyspark

def partition_by_first_value(key):
    """Return the portable hash of the first component of *key*."""
    first_value = key[0]
    return portable_hash(first_value)

Exemple #5

0

Afficher le fichier

    with SparkContext("local[2]") as sc:
        rdd = sc.parallelize(nums) \
            .map(lambda el: (el, el)) \
            .partitionBy(3) \
            .persist()

        print("Number of partitions: {}".format(rdd.getNumPartitions()))
        print("Partitioner: {}".format(rdd.partitioner))
        print("Partitions structure: {}".format(rdd.glom().collect()))


from pyspark.rdd import portable_hash
# For every element, print its portable hash and the partition index that
# results from taking that hash modulo the number of partitions.
num_partitions = 2
for el in nums:
    # Hash once and reuse the value for both the display and the modulo.
    el_hash = portable_hash(el)
    print("Element: [{}]: {} % {} = partition {}".format(
        el, el_hash, num_partitions, el_hash % num_partitions))
#
@get_expend_time
def test_6():
    """Set up sample transaction records and a country-based partitioner.

    NOTE(review): the visible excerpt ends right after the setup; none of
    ``transactions``, ``country_partitioner`` or ``num_partitions`` is used in
    the lines shown here -- confirm against the full source.
    """
    # Sample records; every dict carries 'name', 'amount' and 'country' keys.
    transactions = [
        {'name': 'Bob', 'amount': 100, 'country': 'United Kingdom'},
        {'name': 'James', 'amount': 15, 'country': 'United Kingdom'},
        {'name': 'Marek', 'amount': 51, 'country': 'Poland'},
        {'name': 'Johannes', 'amount': 200, 'country': 'Germany'},
        {'name': 'Paul', 'amount': 75, 'country': 'Poland'},
    ]

    def country_partitioner(country):
        # Returns the builtin hash() of its argument.
        # NOTE(review): builtin string hashes can differ between interpreter
        # processes unless PYTHONHASHSEED is fixed; portable_hash may be the
        # safer choice here -- confirm.
        return hash(country)
    # Partition count, presumably used by the (not visible) validation step.
    num_partitions = 5

Exemple #6

0

Afficher le fichier

Fichier : in-shuffle-secondary-sort-print.py Projet : sebrestin/spark-secondary-sort

def partition_func(key):
    """Return the portable hash of ``key[0]``; the rest of the key is ignored."""
    leading = key[0]
    return portable_hash(leading)

Exemple #7

0

Afficher le fichier

 def partitioner_(x):
     """Return the portable hash of ``x[0]`` modulo ``n``."""
     first = x[0]
     return portable_hash(first) % n

Exemple #8

0

Afficher le fichier

        i = i + 1


df = spark.createDataFrame(data)
df.show()
print_partitions(df)

# Repartition the data into 3 partitions by Country.
# Ideally each country would end up in its own partition, but partitioning is
# hash-based, so different country codes may fall into the same partition.
numPartitions = 3

df = df.repartition(numPartitions, "Country")

print_partitions(df)

# Show the hash of each Country value and the partition number derived from it.
# The UDF declares a "long" return type: without one the UDF yields a string
# column, and the modulo below would run on an implicitly cast value instead
# of on the exact integer hash.
# NOTE(review): DataFrame.repartition hashes columns with Spark's own hash
# function, which may not match portable_hash -- confirm that this comparison
# reflects the real partition assignment.
udf_portable_hash = udf(lambda country: portable_hash(country), "long")
df = df.withColumn("Hash#", udf_portable_hash(df.Country))
df = df.withColumn("Partition#", df["Hash#"] % numPartitions)
df.show()

# Increasing the partition count to 5 puts each country's data in a different
# partition here, but that is not guaranteed in every scenario.
numPartitions = 5
df = df.repartition(numPartitions, "Country")
print_partitions(df)
df = df.withColumn("Hash#", udf_portable_hash(df.Country))
df = df.withColumn("Partition#", df["Hash#"] % numPartitions)
df.show()

# To address this issue we use a customised partitioner (UDF below).

Exemple #9

0

Afficher le fichier

Fichier : sort.py Projet : Brandonage/HiBench

 def hashedPartitioner(k):
     """Return the portable hash of ``keyfunc(k)`` modulo ``numPartitions``."""
     sort_key = keyfunc(k)
     return portable_hash(sort_key) % numPartitions

Exemple #10

0

Afficher le fichier

Fichier : sort.py Projet : liupc/fst-bench

 def hashedPartitioner(k):
     """Hash ``keyfunc(k)`` with portable_hash and reduce it modulo ``numPartitions``."""
     hashed = portable_hash(keyfunc(k))
     return hashed % numPartitions