Example #1
0
import sys, os
# sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
sys.path.append(os.environ['PROJ4001'])
from plugin_support import getSparkContext

sc=getSparkContext('count')
fin, fout=sys.argv[1:3]
regions=sc.mixin.inputData(fin,'regions')
locations=sc.mixin.inputData(fin,'locations')
_orders=[sc.mixin.inputData(fin, relation) for relation in ('orders-part-%05d'%i for i in xrange(6))]
orders=reduce(lambda a,b:a.union(b), _orders)
orders.cache()

result={
    '#locations': locations.count(),
    '#regions': regions.count(),
    '#orders': orders.count(),
    '#packages': orders.map(lambda order:len(order.split('\t')[4].split(';'))).reduce(lambda a,b:a+b),
}

output="""Number of locations: %(#locations)s
Number of regions: %(#regions)s
Number of orders: %(#orders)s
Number of packages: %(#packages)s
""" % (result)

if sc.mixin.outputData(fout, (line for line in output.split('\n') if line)):
    print fout

Example #2
0
import sys, os, re, operator
sys.path.append(os.environ['PROJ4001'])
from plugin_support import getSparkContext

# Job setup: obtain the context, read the CLI arguments and load the inputs.
sc = getSparkContext('top')
fin, fout = sys.argv[1:3]  # input location, output location
regions = sc.mixin.inputData(fin, 'regions')
locations = sc.mixin.inputData(fin, 'locations')

# location -> region lookup table.  Every collected line must split into
# exactly two tab-separated fields (location, region); anything else raises
# ValueError.
lr_mapping = dict(line.split('\t') for line in locations.collect())

# Lazily yields the six order parts, orders-part-00000 .. orders-part-00005.
# This is a generator, so it can be consumed only once.
_orders = (
    sc.mixin.inputData(fin, 'orders-part-%05d' % part)
    for part in xrange(6)
)

def deserialize(line):
    """Parse one tab-separated order line into a ``(key, value)`` pair.

    The line must consist of five tab-separated fields:

    1. ``fro``          -- one or more word characters
    2. ``to``           -- one or more word characters
    3. ``duration``     -- one or more digits
    4. ``requirements`` -- word characters and commas, may be empty
    5. packages         -- ';'-separated groups of ','-separated integers

    Returns:
        ``((fro, to), (duration, requirements, packages))``.  ``duration``
        and ``requirements`` are returned as the matched strings;
        ``packages`` is a tuple of tuples of ints, one inner tuple per
        ';'-separated group.

    Raises:
        ValueError: if the line does not match the layout above, or if a
            package group contains an empty item (e.g. ``"1,,2"``).
    """
    # re caches compiled patterns, so compiling here on every call is cheap
    # and keeps the function self-contained.
    pattern = re.compile(r'^(\w+)\t(\w+)\t(\d+)\t([\w,]*)\t([\d,;]+)$')
    match = pattern.match(line)
    if match is None:
        # Without this guard a bad record surfaces as an opaque
        # "'NoneType' object has no attribute 'groups'" error.
        raise ValueError('malformed order line: %r' % (line,))
    fro, to, duration, requirements, str_packages = match.groups()
    packages = tuple(
        tuple(int(item) for item in group.split(','))
        for group in str_packages.split(';')
    )
    return ((fro, to), (duration, requirements, packages))

def rmap(record):
    """Re-key a deserialized order by the regions of its two endpoints.

    Args:
        record: a pair ``((fro, to), (duration, requirements, packages))``,
            i.e. the shape returned by ``deserialize``.

    Returns:
        ``(lr_mapping[fro] + lr_mapping[to],
        (fro, to, duration, requirements, packages))`` -- the key is the
        region of ``fro`` concatenated with the region of ``to``.

    Raises:
        KeyError: if ``fro`` or ``to`` is not a key of ``lr_mapping``.
    """
    # Unpack inside the body instead of in the signature: tuple parameters
    # are Python-2-only syntax (removed by PEP 3113).  The function is still
    # called with exactly one positional argument.
    (fro, to), (duration, requirements, packages) = record
    region_key = lr_mapping[fro] + lr_mapping[to]
    return (region_key, (fro, to, duration, requirements, packages))

orders=reduce(lambda a,b:a.union(b), _orders).map(deserialize).map(rmap)
print orders.takeSample(False, 2)

def rstat1():
    def sum_weight((key, (fro, to, duration, requirements, packages))):
        return key, sum(i[3] for i in packages)
    def prepare((key, s)):
        def _key(r):
            r1, r2=r[:2], r[2:]
Example #3
0
import sys, os, re, operator
sys.path.append(os.environ['PROJ4001'])
from plugin_support import getSparkContext

# Job setup: context, CLI arguments (input location, output location) and
# the two small relations.
sc = getSparkContext('top')
fin, fout = sys.argv[1:3]
regions = sc.mixin.inputData(fin, 'regions')
locations = sc.mixin.inputData(fin, 'locations')

# Lookup table from location to region, built from the collected lines.
# Each line has to split into exactly two tab-separated fields, otherwise
# ValueError is raised.
lr_mapping = dict(line.split('\t') for line in locations.collect())

# One-shot generator over the six order parts
# (orders-part-00000 .. orders-part-00005); nothing is loaded until it is
# iterated.
_orders = (
    sc.mixin.inputData(fin, 'orders-part-%05d' % part)
    for part in xrange(6)
)


def deserialize(line):
    """Parse one tab-separated order line into a ``(key, value)`` pair.

    Expected layout (five tab-separated fields): ``fro`` and ``to`` (word
    characters), ``duration`` (digits), ``requirements`` (word characters
    and commas, may be empty) and a package field made of ';'-separated
    groups of ','-separated integers.

    Returns:
        ``((fro, to), (duration, requirements, packages))``.  ``duration``
        and ``requirements`` stay strings; ``packages`` is a tuple holding
        one tuple of ints per ';'-separated group.

    Raises:
        ValueError: if the line does not match the expected layout, or if a
            package group contains an empty item (e.g. ``"1,,2"``).
    """
    # re caches compiled patterns, so this per-call compile is inexpensive.
    pattern = re.compile(r'^(\w+)\t(\w+)\t(\d+)\t([\w,]*)\t([\d,;]+)$')
    match = pattern.match(line)
    if match is None:
        # Fail with the offending line instead of an AttributeError on None.
        raise ValueError('malformed order line: %r' % (line,))
    fro, to, duration, requirements, str_packages = match.groups()
    packages = tuple(
        tuple(int(item) for item in group.split(','))
        for group in str_packages.split(';')
    )
    return ((fro, to), (duration, requirements, packages))


def rmap(record):
    """Re-key a deserialized order by the regions of its two endpoints.

    Args:
        record: a pair ``((fro, to), (duration, requirements, packages))``,
            i.e. the shape returned by ``deserialize``.

    Returns:
        ``(lr_mapping[fro] + lr_mapping[to],
        (fro, to, duration, requirements, packages))`` -- the key is the
        region of ``fro`` concatenated with the region of ``to``.

    Raises:
        KeyError: if ``fro`` or ``to`` is not a key of ``lr_mapping``.
    """
    # Tuple parameters in a signature are Python-2-only syntax (removed by
    # PEP 3113); unpacking here behaves the same and the function still
    # takes exactly one positional argument.
    (fro, to), (duration, requirements, packages) = record
    region_key = lr_mapping[fro] + lr_mapping[to]
    return (region_key, (fro, to, duration, requirements, packages))