コード例 #1
0
ファイル: rdd.py プロジェクト: helfer/py-rdd
 def reduce(self, function, initializer, target='keys'):
   """Reduce this RDD by combining one partial result per hash partition.

   For every partition number in ``range(self.hash_grain)`` the worker
   assigned to that partition is asked, over XML-RPC (``worker_reduce``),
   to reduce its share of the data.  The partial results are then folded
   locally with ``function``, starting from ``initializer``.

   Args:
     function: callable used for the reduction.  It is encoded with
       ``util.encode_function`` for the workers and also applied locally
       to fold the partial results.
     initializer: start value; sent to every worker and used again for
       the final local fold.
     target: forwarded unchanged to the workers.  'keys' reduces across
       keys, 'values' reduces across values.

   Returns:
     The result of folding all partial results with ``function``.
   """
   encoded_func = util.encode_function(function)
   intermediate_results = []
   for num in range(self.hash_grain):
     try:
       assignment = self.worker_assignment[num]
     except KeyError:
       # No worker recorded for this partition yet.
       # NOTE(review): presumably execute() materializes the RDD and fills
       # worker_assignment -- confirm.  The lookup is retried exactly once.
       self.execute()
       assignment = self.worker_assignment[num]
     proxy = xmlrpclib.ServerProxy(assignment.uri)
     args = util.pds(self.uid, num, encoded_func, initializer, target)
     intermediate_results.append(proxy.worker_reduce(args))
   # Inside the method body ``reduce`` resolves to the module/builtin name,
   # not to this method.
   return reduce(function, intermediate_results, initializer)
コード例 #2
0
ファイル: rdd.py プロジェクト: helfer/py-rdd
 def serialize_action(self):
   """Return ``self.function`` as encoded by ``util.encode_function``."""
   encoded_function = util.encode_function(self.function)
   return encoded_function
コード例 #3
0
ファイル: rdd.py プロジェクト: helfer/py-rdd
 def serialize_action(self):
   """Return the 4-tuple describing this action.

   The tuple is ``(filename, encoded function, encoded hash function,
   multivalue)``; both callables are encoded with ``util.encode_function``.
   """
   filename = self.filename
   encoded_function = util.encode_function(self.function)
   encoded_hash_function = util.encode_function(self.hash_function)
   multivalue = self.multivalue
   return (filename, encoded_function, encoded_hash_function, multivalue)
コード例 #4
0
ファイル: rdd.py プロジェクト: helfer/py-rdd
 def serialize_action(self):
   """Return the pair ``(encoded function, initializer)``.

   The function is encoded with ``util.encode_function``; the initializer
   is passed through unchanged.
   """
   encoded_function = util.encode_function(self.function)
   return (encoded_function, self.initializer)
コード例 #5
0
ファイル: worker.py プロジェクト: helfer/py-rdd
  def run_task(self, pickled_args):
    """Run the task for one hash partition and store its result in self.data.

    ``pickled_args`` is decoded with ``util.pls`` into the tuple
    ``(rdd_id, hash_num, rdd_type, action, data_src, parents, hash_func,
    peers)``.  Depending on ``rdd_type`` the input is gathered differently:

    * ``rdd.JoinRDD``: the partitions ``(parents[i], hash_num)`` for
      i in (0, 1) are loaded and ``action(left, right)`` is stored under
      ``(rdd_id, hash_num)``.
    * ``rdd.PartitionByRDD``: every peer is queried for every parent's
      ``(parent_uid, hash_num)`` data and the values are merged into
      per-key lists before ``action`` is applied.
    * otherwise: the single parent partition (if any) is loaded, or an
      empty dict is used when there are no parents.

    For ``rdd.IntermediateFlatMapRDD`` / ``rdd.IntermediateMapRDD`` the
    output is scattered into ``self.data[(rdd_id, hash_func(k))]`` per
    output key; for all other non-join types it is stored under
    ``(rdd_id, hash_num)``.

    Returns:
      ``"OK"`` once the result has been stored.  If gathering input fails
      (``socket.timeout``, or ``KeyError`` when loading a parent
      partition), the URI that was being queried is returned instead and
      nothing is stored.
    """
    (rdd_id, hash_num, rdd_type, action, data_src, parents, hash_func,
        peers) = util.pls(pickled_args)
    # NOTE(review): pickle.loads runs on data supplied by the caller; this is
    # only safe if every sender is trusted -- confirm.
    rdd_type = pickle.loads(rdd_type)
    action = rdd_type.unserialize_action(action)
    hash_func = util.decode_function(hash_func)

    if rdd_type == rdd.JoinRDD:
      working_data = [{}, {}]
      for index in [0, 1]:
        key = (parents[index], hash_num)
        found, partition = self._load_parent_partition(key, data_src[index])
        if not found:
          return data_src[index]
        working_data[index] = partition
      with self.lock:
        self.data[(rdd_id, hash_num)] = action(working_data[0], working_data[1])
      return "OK"

    if rdd_type == rdd.PartitionByRDD:
      working_data = collections.defaultdict(list)
      for peer in peers:
        if peer != self.uri:
          proxy = xmlrpclib.ServerProxy(peer, transport=self.transport)
        else:
          # Our own share goes through the same query_remote call, with this
          # worker object standing in for the proxy.
          proxy = self
        for parent_uid in parents:
          key = (parent_uid, hash_num)
          try:
            queried_data = self.query_remote(key, proxy, {})
          except socket.timeout:
            return peer
          try:
            for k, v in queried_data.items():
              # List values are flattened into the per-key list; anything
              # else is appended as a single element.
              if isinstance(v, list):
                working_data[k].extend(v)
              else:
                working_data[k].append(v)
          except ValueError:
            # Dump the offending payload, then re-raise with the original
            # traceback intact (bare raise).  Same output as the old print
            # statement, but also valid syntax on Python 3.
            print("%s %s" % (key, queried_data))
            raise
    elif len(parents) > 0:
      ## number of parents should be 1
      key = (parents[0], hash_num)
      found, working_data = self._load_parent_partition(key, data_src[0])
      if not found:
        return data_src[0]
    else:
      working_data = {}

    output = action(working_data, hash_num)
    if rdd_type in (rdd.IntermediateFlatMapRDD, rdd.IntermediateMapRDD):
      ## Split output into partial partitions
      for k, v in output.items():
        ## v should be a list
        key = (rdd_id, hash_func(k))
        with self.lock:
          bucket = self.data[key]
          if k in bucket:
            bucket[k].extend(v)
          else:
            bucket[k] = v
    else:
      with self.lock:
        self.data[(rdd_id, hash_num)] = output

    return "OK"

  def _load_parent_partition(self, key, source_uri):
    """Return ``(found, data)`` for partition ``key``, local copy first.

    If ``key`` is present in ``self.data`` that entry is returned; the
    membership test and the read share one lock acquisition so the entry
    cannot disappear between them.  Otherwise the server at ``source_uri``
    is queried through ``self.query_remote``.  A ``socket.timeout`` or
    ``KeyError`` from that query yields ``(False, None)``.
    """
    with self.lock:
      if key in self.data:
        return True, self.data[key]
    proxy = xmlrpclib.ServerProxy(source_uri, transport=self.transport)
    try:
      return True, self.query_remote(key, proxy)
    except (socket.timeout, KeyError):
      return False, None