import sys, os

# plugin_support lives in the project root, located via the PROJ4001 env var
sys.path.append(os.environ['PROJ4001'])
from plugin_support import getSparkContext

sc = getSparkContext('count')
fin, fout = sys.argv[1:3]

regions = sc.mixin.inputData(fin, 'regions')
locations = sc.mixin.inputData(fin, 'locations')

# The orders relation is split across six part files; read them all and union.
_orders = [sc.mixin.inputData(fin, relation)
           for relation in ('orders-part-%05d' % i for i in xrange(6))]
orders = reduce(lambda a, b: a.union(b), _orders)
orders.cache()  # reused below for both count() and the package sum

result = {
    '#locations': locations.count(),
    '#regions': regions.count(),
    '#orders': orders.count(),
    # The fifth tab-separated field of an order line holds its
    # ';'-separated package list.
    '#packages': orders.map(lambda order: len(order.split('\t')[4].split(';')))
                       .reduce(lambda a, b: a + b),
}

output = """Number of locations: %(#locations)s
Number of regions: %(#regions)s
Number of orders: %(#orders)s
Number of packages: %(#packages)s
""" % result

if sc.mixin.outputData(fout, (line for line in output.split('\n') if line)):
    print fout
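# A hedged, Spark-free illustration of the '#packages' aggregation above: each
# order line has five tab-separated fields, the fifth being a ';'-separated
# package list (this layout is inferred from the regex in the companion 'top'
# script below; the sample lines themselves are invented).
sample_orders = ['A\tB\t10\treq\t1,2,3,4;5,6,7,8',
                 'B\tC\t20\t\t9,9,9,9']
assert sum(len(o.split('\t')[4].split(';')) for o in sample_orders) == 3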
import sys, os, re, operator

sys.path.append(os.environ['PROJ4001'])
from plugin_support import getSparkContext

sc = getSparkContext('top')
fin, fout = sys.argv[1:3]

regions = sc.mixin.inputData(fin, 'regions')
locations = sc.mixin.inputData(fin, 'locations')

# Location -> region lookup, built on the driver from the small
# locations relation.
lr_mapping = {location: region
              for (location, region) in (line.split('\t')
                                         for line in locations.collect())}

_orders = (sc.mixin.inputData(fin, relation)
           for relation in ('orders-part-%05d' % i for i in xrange(6)))

# Compiled once at module level rather than on every deserialize() call.
ORDER_PATTERN = re.compile(r'^(\w+)\t(\w+)\t(\d+)\t([\w,]*)\t([\d,;]+)$')

def deserialize(line):
    """Parse an order line into ((from, to), (duration, requirements, packages))."""
    fro, to, duration, requirements, str_packages = \
        ORDER_PATTERN.match(line).groups()
    packages = tuple(tuple(int(j) for j in i.split(','))
                     for i in str_packages.split(';'))
    return ((fro, to), (duration, requirements, packages))

def rmap(((fro, to), (duration, requirements, packages))):
    # Re-key each order by the concatenated region codes of its endpoints.
    return (lr_mapping[fro] + lr_mapping[to],
            (fro, to, duration, requirements, packages))

orders = reduce(lambda a, b: a.union(b), _orders).map(deserialize).map(rmap)
print orders.takeSample(False, 2)

def rstat1():
    def sum_weight((key, (fro, to, duration, requirements, packages))):
        # Index 3 of each package tuple is treated as its weight
        # (as the function name suggests).
        return key, sum(i[3] for i in packages)
    def prepare((key, s)):
        def _key(r):
            r1, r2 = r[:2], r[2:]
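# A hedged, self-contained check of the order format implied by ORDER_PATTERN
# above, runnable without Spark. The sample values and the two-letter region
# codes are invented; the two-letter assumption is suggested only by _key's
# r[:2], r[2:] split in rstat1.
import re

PATTERN = re.compile(r'^(\w+)\t(\w+)\t(\d+)\t([\w,]*)\t([\d,;]+)$')
line = 'LOC1\tLOC2\t42\treq1,req2\t1,2,3,10;4,5,6,20'  # invented sample
fro, to, duration, requirements, str_packages = PATTERN.match(line).groups()
packages = tuple(tuple(int(j) for j in i.split(','))
                 for i in str_packages.split(';'))
assert (fro, to, duration) == ('LOC1', 'LOC2', '42')  # duration stays a string
assert packages == ((1, 2, 3, 10), (4, 5, 6, 20))

# rmap keys each order by the concatenated region codes of its endpoints,
# which _key can then split back apart.
lr = {'LOC1': 'NA', 'LOC2': 'EU'}  # hypothetical location -> region map
key = lr[fro] + lr[to]
assert (key[:2], key[2:]) == ('NA', 'EU')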