Example #1
0
File: rdd.py Project: helfer/py-rdd
  def unserialize_action(blob):
    """Build the action that loads one hash partition from a text file.

    blob is a 4-tuple (filename, function, hash_function, multivalue).
    function and hash_function arrive in the encoded form understood by
    util.decode_function and are decoded here.

    The returned action(data, hash_num) ignores data. It reads filename line
    by line, calls function(line) to obtain a (key, value) pair and keeps the
    pair only when hash_function(key) == hash_num. It returns a
    collections.defaultdict(list):
      * multivalue true:  every kept value is appended to output[key]
      * multivalue false: output[key] is the last kept value for that key
    """
    filename, function, hash_function, multivalue = blob
    function = util.decode_function(function)
    hash_function = util.decode_function(hash_function)
    if multivalue:
      def action(data, hash_num):
        output = collections.defaultdict(list)
        ## "with" guarantees the file is closed even if function or
        ## hash_function raises part-way through the file.
        with open(filename) as f:
          for line in f:
            key, value = function(line.strip('\n'))
            if hash_function(key) == hash_num:
              output[key].append(value)
        return output
    else:
      def action(data, hash_num):
        output = collections.defaultdict(list)
        with open(filename) as f:
          ## NOTE(review): unlike the multivalue branch, the line is handed to
          ## function with its trailing newline intact -- confirm this
          ## difference is intended.
          for line in f:
            key, value = function(line)
            if hash_function(key) == hash_num:
              output[key] = value
        return output

    return action
Example #2
0
 def worker_reduce(self, args):
   """Fold the keys or the values of one stored partition into a single value.

   args is unpacked with util.pls into
   (rdd_id, hash_num, func, initializer, target). func is decoded with
   util.decode_function and used as the binary reducer, starting from
   initializer. target selects what is reduced: 'keys' or 'values' of
   self.data[(rdd_id, hash_num)]. Any other target yields None.
   """
   rdd_id, hash_num, func, initializer, target = util.pls(args)
   reducer = util.decode_function(func)
   if target not in ('keys', 'values'):
     ## Unknown target: nothing is looked up, the result is None.
     return None
   partition = self.data[(rdd_id, hash_num)]
   if target == 'keys':
     elements = partition.keys()
   else:
     elements = partition.values()
   return reduce(reducer, elements, initializer)
Example #3
0
File: rdd.py Project: helfer/py-rdd
 def unserialize_action(blob):
   """Build a map-values action from an encoded function.

   blob is decoded with util.decode_function. The returned
   action(data, hash_num) produces a new dict with the same keys as data,
   each value replaced by the decoded function applied to it. hash_num is
   not used by this action.
   """
   transform = util.decode_function(blob)
   def action(data, hash_num):
     return dict((key, transform(value)) for key, value in data.items())
   return action
Example #4
0
File: rdd.py Project: helfer/py-rdd
 def unserialize_action(blob):
   """Build a per-key reduce action from (encoded function, initializer).

   blob[0] is decoded with util.decode_function and used as the binary
   reducer; blob[1] is the starting value of every reduction. The returned
   action(data, hash_num) maps each key of data to
   reduce(function, values, initializer) over that key's values. hash_num is
   not used by this action.
   """
   reducer = util.decode_function(blob[0])
   start = blob[1]
   def action(data, hash_num):
     return dict((key, reduce(reducer, values, start))
                 for key, values in data.items())
   return action
Example #5
0
File: rdd.py Project: helfer/py-rdd
 def unserialize_action(blob):
   """Build a key/value map action from an encoded function.

   blob is decoded with util.decode_function. The returned
   action(data, hash_num) calls the decoded function as function(key, value)
   for every item of data; each call must return an (out_key, out_value)
   pair. The pairs are collected into a new dict, so a later pair replaces an
   earlier one with the same out_key. hash_num is not used by this action.
   """
   remap = util.decode_function(blob)
   def action(data, hash_num):
     ## note: out_key must have type string in order to be sent through
     ## RPC
     produced = (remap(key, value) for key, value in data.items())
     return dict((out_key, out_value) for out_key, out_value in produced)
   return action
Example #6
0
File: rdd.py Project: helfer/py-rdd
 def unserialize_action(blob):
   """Build a flat-map action from an encoded function.

   blob is decoded with util.decode_function. The returned
   action(data, hash_num) applies the decoded function to every value of
   data; each call must yield (out_key, out_value) pairs. All pairs are
   grouped into a collections.defaultdict(list) keyed by out_key. The input
   keys and hash_num are not used by this action.
   """
   expand = util.decode_function(blob)
   def action(data, hash_num):
     grouped = collections.defaultdict(list)
     ## Lazily chain the pairs produced for each input value.
     emitted = (pair for _key, val in data.items() for pair in expand(val))
     for out_key, out_value in emitted:
       ## note: out_key must have type string in order to be sent through
       ## RPC
       grouped[out_key].append(out_value)
     return grouped
   return action
Example #7
0
  def run_task(self, pickled_args):
    """Compute one partition of an RDD and store the result in self.data.

    pickled_args is unpacked with util.pls into
    (rdd_id, hash_num, rdd_type, action, data_src, parents, hash_func, peers):
      rdd_id, hash_num -- together form the key the result is stored under
      rdd_type         -- pickled object; compared against rdd.* classes and
                          used to deserialize action
      action           -- serialized action, rebuilt through
                          rdd_type.unserialize_action
      data_src         -- per-parent server addresses to fetch input from
      parents          -- ids of the parent RDDs
      hash_func        -- encoded key-hash function (util.decode_function)
      peers            -- server addresses queried for PartitionByRDD

    Returns "OK" on success. When a remote fetch fails, the address that was
    being queried (an entry of data_src, or a peer) is returned instead.
    """
    (rdd_id, hash_num, rdd_type, action, data_src, parents, hash_func,
        peers) = util.pls(pickled_args)
    ## NOTE(review): pickle.loads must only see trusted input -- confirm that
    ## pickled_args cannot come from an untrusted peer.
    rdd_type = pickle.loads(rdd_type)
    action = rdd_type.unserialize_action(action)
    hash_func = util.decode_function(hash_func)
    ## NOTE(review): filter_func is never referenced again in this method.
    filter_func = util.encode_function(lambda key: hash_func(key) == hash_num)

    ## Join: needs the hash_num partition of both parents, then returns early.
    if rdd_type == rdd.JoinRDD:
      working_data = [{}, {}]
      for index in [0, 1]:
        parent_uid = parents[index]
        assignment = data_src[index]
        key = (parent_uid, hash_num)
        with self.lock:
          data_is_local = self.data.has_key(key)
        if not data_is_local:
#          print "Join: Querying remote server"
          proxy = xmlrpclib.ServerProxy(assignment,transport=self.transport)
          try:
            working_data[index] = self.query_remote(key,proxy)
          except (socket.timeout,KeyError):
#            print "timeout or key error"
            ## Report which server could not supply the parent partition.
            return assignment
        else:
          with self.lock:
            working_data[index] = self.data[key]
      with self.lock:
        ## For a join the action receives the two parent partitions.
        self.data[(rdd_id, hash_num)] = action(working_data[0], working_data[1])
      return "OK"

    if rdd_type == rdd.PartitionByRDD:
      ## Gather the hash_num partition of every parent from every peer and
      ## merge all values per key into one list.
      working_data = collections.defaultdict(list)
      for peer in peers:
        if peer != self.uri:
          proxy = xmlrpclib.ServerProxy(peer,transport=self.transport)
        else:
          ## This worker is itself one of the peers: pass self as the proxy.
          proxy = self
        for parent_uid in parents:
          key = (parent_uid, hash_num)
          try:
            queried_data = self.query_remote(key,proxy,{})
          except socket.timeout:
            ## Only a timeout is handled here; KeyError is not caught in
            ## this branch.
            return peer
          #print queried_data
          try:
            for k, v in queried_data.items():
              ## List values are flattened in, anything else is appended.
              if type(v) == list:
                working_data[k].extend(v)
              else:
                working_data[k].append(v)
          except ValueError as e:
            ## Print the offending key and payload, then re-raise.
            print key,queried_data
            raise e
    elif len(parents) > 0:
      ## number of parents should be 1
      parent_uid = parents[0]
      assignment = data_src[0]
      key = (parent_uid, hash_num)
      with self.lock:
        data_is_local = self.data.has_key(key)
      if not data_is_local:
#        print "Querying remote server"
        ## The transport is passed positionally here (by keyword elsewhere in
        ## this method).
        proxy = xmlrpclib.ServerProxy(assignment,self.transport)
        try:
          working_data = self.query_remote(key,proxy)
        except (socket.timeout , KeyError):
#          print "fetch timeout or KeyError"
          return assignment
      else:
        with self.lock:
          working_data = self.data[key]
    else:
      ## No parents: the action starts from an empty input.
      working_data = {}
    output = action(working_data, hash_num)
    if (rdd_type == rdd.IntermediateFlatMapRDD or
        rdd_type == rdd.IntermediateMapRDD):
      ## Split output into partial partitions
      for k, v in output.items():
        ## v should be a list
        key = (rdd_id, hash_func(k))
        with self.lock:
          ## NOTE(review): self.data[key] is indexed without being created
          ## here -- presumably self.data creates missing entries or they are
          ## pre-populated elsewhere; confirm.
          if self.data[key].has_key(k):
            self.data[key][k].extend(v)
          else:
            self.data[key][k] = v
    else:
      with self.lock:
        self.data[(rdd_id, hash_num)] = output

    return "OK"