import dpark


def make_filter(feature, feature_point, data, np):
    """Filter `data` on a single feature.

    `np` is a 0/1 polarity flag: 1 keeps items that match the feature point,
    0 keeps items that do not.
    """
    feature_point = unicode(feature_point.decode('utf8'))
    rdd = dpark.parallelize(data, numSlices=1)

    def _has_feature(item):
        if item[feature] is None:
            return False
        return feature_point in item[feature]

    def _is_feature(item):
        return item[feature] == feature_point

    def _compare_feature(item):
        try:
            return float(item[feature]) >= float(feature_point)
        except Exception:
            return False

    def _has_not_feature(item):
        if item[feature] is None:
            return True
        return feature_point not in item[feature]

    def _is_not_feature(item):
        return item[feature] != feature_point

    def _not_compare_feature(item):
        try:
            return float(item[feature]) < float(feature_point)
        except Exception:
            return True

    np_map = {
        0: set([_has_not_feature, _is_not_feature, _not_compare_feature]),
        1: set([_has_feature, _is_feature, _compare_feature]),
    }
    feature_map = {
        'language': set([_has_feature, _has_not_feature]),
        'countries': set([_has_feature, _has_not_feature]),
        'tags': set([_has_feature, _has_not_feature]),
        'rate': set([_compare_feature, _not_compare_feature]),
        'people': set([_compare_feature, _not_compare_feature]),
        'editors': set([_has_feature, _has_not_feature]),
        'directors': set([_has_feature, _has_not_feature]),
        'actors': set([_has_feature, _has_not_feature]),
        'year': set([_compare_feature, _not_compare_feature]),
        'length': set([_compare_feature, _not_compare_feature]),
        'types': set([_has_feature, _has_not_feature]),
    }
    # Intersecting the polarity set with the per-feature set leaves exactly one
    # predicate, which is used to filter the RDD.
    decision = list(np_map[np] & feature_map[feature])[0]
    return rdd.filter(decision).collect()
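# A minimal usage sketch for make_filter. The movie records below are made-up
# illustrations (the real records come from the surrounding project); `np` is
# the 0/1 polarity flag described in the docstring above.
if __name__ == '__main__':
    movies = [
        {'language': u'English', 'rate': u'8.2'},
        {'language': u'French', 'rate': u'6.9'},
        {'language': None, 'rate': u'7.5'},
    ]
    # keep movies rated at least 7.0
    print make_filter('rate', '7.0', movies, 1)
    # keep movies whose language field does not contain "English"
    print make_filter('language', 'English', movies, 0)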
import sys
from functools import reduce  # a builtin on Python 2; needed explicitly on Python 3


def pixelStats(urls, variable, nPartitions,
               timeFromFilename=TimeFromFilenameDOY,
               groupByKeys=GroupByKeys,
               accumulators=Accumulators,
               cachePath=CachePath,
               mode='dpark',
               modes=Modes):
    '''Compute a global (or regional) pixel mean field in parallel, given a list
    of URLs pointing to netCDF files.'''
    baseKey = groupByKeys[0]
    if baseKey == 'month':
        urlsByKey = splitByMonth(urls, timeFromFilename)
    else:
        print('pixelStats: Unrecognized groupByKey "%s". Must be in %s'
              % (baseKey, str(groupByKeys)), file=sys.stderr)
        sys.exit(1)

    if mode == 'sequential':
        accum = [accumulate(u, variable, accumulators) for u in urlsByKey]
        merged = reduce(combine, accum)
        stats = statsFromAccumulators(merged)

    elif mode == 'dpark':
        import dpark
        urls = dpark.parallelize(urlsByKey, nPartitions)    # RDD of URL lists
        accum = urls.map(lambda urls: accumulate(urls, variable, accumulators))  # RDD of stats accumulators
        merged = accum.reduce(combine)                      # merge accumulators on the head node
        stats = statsFromAccumulators(merged)               # compute final stats from accumulators

    elif mode == 'spark':
        from pyspark import SparkContext
        sc = SparkContext(appName="PixelStats")
        urls = sc.parallelize(urlsByKey, nPartitions)       # RDD of URL lists
        accum = urls.map(lambda urls: accumulate(urls, variable, accumulators))  # RDD of stats accumulators
        merged = accum.reduce(combine)                      # merge accumulators on the head node
        stats = statsFromAccumulators(merged)               # compute final stats from accumulators

    else:
        stats = None

    if mode not in modes:
        print('pixelStats: Unrecognized mode "%s". Must be in %s'
              % (mode, str(modes)), file=sys.stderr)
        sys.exit(1)

    return stats
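# A minimal invocation sketch for pixelStats. The file names, variable name and
# partition count below are made up, and the module-level defaults
# (TimeFromFilenameDOY, GroupByKeys, Accumulators, CachePath, Modes) plus the
# helpers splitByMonth / accumulate / combine / statsFromAccumulators are
# assumed to come from the surrounding module, so this stays a sketch:
#
#     urls = ['sst.day.2010001.nc', 'sst.day.2010002.nc', 'sst.day.2010032.nc']
#     stats = pixelStats(urls, 'sst', nPartitions=2, mode='dpark')
#     # `stats` is whatever statsFromAccumulators returns for the merged
#     # accumulators (the pixel mean field described in the docstring).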
import dpark


def get_phidias_point(feature_set, citerion=CrossEncropyCiterion):
    """Scan the whole feature_set and score every candidate split point with
    the given criterion."""
    feature_rdd = dpark.parallelize(feature_set)
    total_count = feature_rdd.count()

    def _label_count(item):
        return (item, 1)

    def _count_stat(item):
        return (item[0], sum(item[1]))

    def _compute_criterion(item):
        return (item[0], citerion()(item[1], total_count))

    def _max_criterion(item1, item2):
        # defined but not used in this variant
        return item1 if item1[1] > item2[1] else item2

    return feature_rdd.map(_label_count).groupByKey() \
        .map(_count_stat).map(_compute_criterion).collect()
def get_phidias_point(feature_set, citerion=CrossEncropyCiterion):
    """Variant of the above: score every candidate split point with the given
    criterion and return the 20 best-scoring ones."""
    feature_rdd = dpark.parallelize(feature_set, numSlices=1)
    total_count = feature_rdd.count()

    def _label_count(item):
        return (item, 1)

    def _count_stat(item):
        return (item[0], sum(item[1]))

    def _compute_criterion(item):
        return (item[0], citerion()(item[1], total_count))

    def _max_criterion(item1, item2):
        # defined but not used in this variant
        return item1 if item1[1] > item2[1] else item2

    return feature_rdd.map(_label_count).groupByKey() \
        .map(_count_stat).map(_compute_criterion) \
        .sort(key=lambda x: x[1], reverse=True).take(20)
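# A minimal usage sketch for get_phidias_point, assuming feature_set is a flat
# list of hashable label values and CrossEncropyCiterion is a class whose
# instances are called as criterion(count, total_count) and return a score
# (neither is defined in these snippets, so this stays a sketch):
#
#     labels = ['drama', 'comedy', 'drama', 'action', 'drama']
#     scored = get_phidias_point(labels)
#     # the take(20) variant returns up to 20 (label, score) pairs,
#     # best-scoring first; the collect() variant returns all pairs unordered.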
# Method of a class holding a sequence `self.xs` and a window size `self.d`.
def _expand_xs(self):
    # Look at the last d observations, most recent first.
    past_xs = self.xs[-self.d:][::-1]

    def n_to_b(n):
        # Binary digits of n, least significant bit first.
        string = bin(n)[2:]
        b_s = [int(s) for s in string][::-1]
        return b_s

    def add(n):
        b_s = n_to_b(n)
        xs = [past_x for past_x, i in zip(past_xs, b_s)]
        if xs[-1] == '*':
            return 0
        else:
            for i, x in zip(b_s, xs)[:-1]:  # Python 2: zip() returns a list
                if i < (x != '*'):
                    return 0
            return xs[-1]

    rdd = dpark.parallelize([i for i in range(2 ** self.d - 1)], 5)
    rdd = rdd.map(add)
    self.expand_xs = rdd.collect()
# coding: utf-8
import dpark


def set_diff(rdd1, rdd2):
    """Return an RDD with the elements that are in rdd1 but not in rdd2."""
    pair_rdd1 = rdd1.map(lambda x: (x, None))
    pair_rdd2 = rdd2.map(lambda x: (x, 1))
    return pair_rdd1.leftOuterJoin(pair_rdd2)\
        .filter(lambda x: not x[1][1])\
        .map(lambda x: x[0])


if __name__ == '__main__':
    rdd1 = dpark.parallelize([1, 2, 3, 4])
    rdd2 = dpark.parallelize([3, 4, 5, 6])
    diff = set_diff(rdd1, rdd2)
    rs = diff.collect()
    assert sorted(rs) == [1, 2]  # DPark does not guarantee ordering
import math
import random
import os, sys
from pprint import pprint

sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
import dpark

# range
nums = dpark.parallelize(range(100), 4)
print nums.count()
print nums.reduce(lambda x, y: x + y)

# text search
f = dpark.textFile("./", ext='py').map(lambda x: x.strip())
log = f.filter(lambda line: 'logging' in line).cache()
print 'logging', log.count()
print 'error', log.filter(lambda line: 'error' in line).count()
for line in log.filter(lambda line: 'error' in line).collect():
    print line

# word count
counts = f.flatMap(lambda x: x.split()).map(lambda x: (x, 1)).reduceByKey(lambda x, y: x + y).cache()
pprint(counts.filter(lambda (_, v): v > 50).collectAsMap())
pprint(sorted(counts.filter(lambda (_, v): v > 20).map(lambda (x, y): (y, x)).groupByKey().collect()))
pprint(counts.map(lambda v: "%s:%s" % v).saveAsTextFile("wc/"))

# Pi (the original snippet broke off after the two random draws; the lines
# below complete the standard Monte Carlo estimate)
import random
def rand(i):
    x = random.random()
    y = random.random()
    return 1 if x * x + y * y < 1 else 0

N = 100000
count = dpark.parallelize(range(N), 4).map(rand).reduce(lambda x, y: x + y)
print 'pi is', 4.0 * count / N