def reduce(self, function, initializer, target='keys'):
    # target = 'keys':   reduce across keys
    # target = 'values': reduce across values
    intermediate_results = []
    encoded_func = util.encode_function(function)
    for num in range(self.hash_grain):
        try:
            assignment = self.worker_assignment[num]
        except KeyError:
            # No worker owns this partition yet: materialize the RDD first.
            self.execute()
        proxy = xmlrpclib.ServerProxy(self.worker_assignment[num].uri)
        args = util.pds(self.uid, num, encoded_func, initializer, target)
        intermediate_result = proxy.worker_reduce(args)
        intermediate_results.append(intermediate_result)
    # Combine the per-partition results locally with the same function.
    return reduce(function, intermediate_results, initializer)
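# The util.pds / util.pls pair used above (and in run_task below) is not shown
# in this section. A minimal sketch, assuming the names stand for "pickle
# dumps" / "pickle loads" of an argument tuple wrapped for XML-RPC transport
# (the Binary wrapper and protocol choice are assumptions, not taken from util):

import pickle
import xmlrpclib

def pds(*args):
    # Pack positional arguments into a single XML-RPC-safe binary payload.
    return xmlrpclib.Binary(pickle.dumps(args, pickle.HIGHEST_PROTOCOL))

def pls(payload):
    # Inverse of pds(): recover the original argument tuple.
    return pickle.loads(payload.data)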
def serialize_action(self):
    return util.encode_function(self.function)
def serialize_action(self):
    return (self.filename, util.encode_function(self.function),
            util.encode_function(self.hash_function), self.multivalue)
def serialize_action(self):
    return (util.encode_function(self.function), self.initializer)
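# util.encode_function / util.decode_function, relied on by every
# serialize_action above and by run_task below, are likewise not shown here.
# A minimal sketch, assuming functions are shipped by marshalling their code
# objects (an assumption; this variant drops closures and default arguments,
# so it only covers self-contained functions and lambdas over globals):

import base64
import marshal
import types

def encode_function(func):
    # Marshal only the code object and make it text-safe for XML-RPC.
    return base64.b64encode(marshal.dumps(func.func_code))

def decode_function(encoded):
    # Rebuild a callable from the marshalled code object.
    code = marshal.loads(base64.b64decode(encoded))
    return types.FunctionType(code, globals())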
def run_task(self, pickled_args):
    (rdd_id, hash_num, rdd_type, action, data_src,
     parents, hash_func, peers) = util.pls(pickled_args)
    rdd_type = pickle.loads(rdd_type)
    action = rdd_type.unserialize_action(action)
    hash_func = util.decode_function(hash_func)
    filter_func = util.encode_function(lambda key: hash_func(key) == hash_num)

    if rdd_type == rdd.JoinRDD:
        # A join needs the matching partition from both parents; use the local
        # copy if we hold it, otherwise fetch it from the assigned worker.
        working_data = [{}, {}]
        for index in [0, 1]:
            parent_uid = parents[index]
            assignment = data_src[index]
            key = (parent_uid, hash_num)
            with self.lock:
                data_is_local = key in self.data
            if not data_is_local:
                proxy = xmlrpclib.ServerProxy(assignment, transport=self.transport)
                try:
                    working_data[index] = self.query_remote(key, proxy)
                except (socket.timeout, KeyError):
                    # Report the unreachable data source back to the caller.
                    return assignment
            else:
                with self.lock:
                    working_data[index] = self.data[key]
        with self.lock:
            self.data[(rdd_id, hash_num)] = action(working_data[0], working_data[1])
        return "OK"

    if rdd_type == rdd.PartitionByRDD:
        # Gather this partition's keys from every peer and merge their values.
        working_data = collections.defaultdict(list)
        for peer in peers:
            if peer != self.uri:
                proxy = xmlrpclib.ServerProxy(peer, transport=self.transport)
            else:
                proxy = self
            for parent_uid in parents:
                key = (parent_uid, hash_num)
                try:
                    queried_data = self.query_remote(key, proxy, {})
                except socket.timeout:
                    return peer
                try:
                    for k, v in queried_data.items():
                        if isinstance(v, list):
                            working_data[k].extend(v)
                        else:
                            working_data[k].append(v)
                except ValueError as e:
                    print key, queried_data
                    raise e
    elif len(parents) > 0:
        # Narrow dependency: exactly one parent partition feeds this task.
        parent_uid = parents[0]
        assignment = data_src[0]
        key = (parent_uid, hash_num)
        with self.lock:
            data_is_local = key in self.data
        if not data_is_local:
            proxy = xmlrpclib.ServerProxy(assignment, transport=self.transport)
            try:
                working_data = self.query_remote(key, proxy)
            except (socket.timeout, KeyError):
                return assignment
        else:
            with self.lock:
                working_data = self.data[key]
    else:
        # No parent partitions: start from an empty dict.
        working_data = {}

    output = action(working_data, hash_num)

    if rdd_type in (rdd.IntermediateFlatMapRDD, rdd.IntermediateMapRDD):
        # Split the output into partial partitions keyed by hash_func(k), so a
        # later PartitionByRDD stage can collect them by hash bucket.
        for k, v in output.items():
            key = (rdd_id, hash_func(k))
            with self.lock:
                if k in self.data[key]:
                    self.data[key][k].extend(v)  # v should be a list
                else:
                    self.data[key][k] = v
    else:
        with self.lock:
            self.data[(rdd_id, hash_num)] = output
    return "OK"
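# run_task and worker_reduce are invoked through xmlrpclib.ServerProxy objects,
# so each worker has to be reachable as an XML-RPC server. A minimal sketch of
# that wiring, assuming a standard SimpleXMLRPCServer (the host/port and the
# serve_worker helper are illustrative, not taken from this repo):

from SimpleXMLRPCServer import SimpleXMLRPCServer

def serve_worker(worker, host='localhost', port=8000):
    # Expose every public method of the worker (run_task, worker_reduce, ...)
    # as an RPC endpoint; allow_none lets None values round-trip.
    server = SimpleXMLRPCServer((host, port), allow_none=True, logRequests=False)
    server.register_instance(worker)
    server.serve_forever()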