def shuffle(self, input_source):
    results = []
    # debug_print("[Wide-RDD] {0} InputSource is {1}".format(self.id, input_source))
    for source in input_source:
        # debug_print("[Shuffle] {0} Shuffling from source {1}".format(self.id, source))
        if not isinstance(source, list):
            source = [source]
        for p in source:
            # Look up the worker that owns this partition, then poll it
            # until the upstream result has been materialized.
            result = None
            while result is None:
                task_node_table = p["task_node_table"]
                worker_address = task_node_table["{0}_{1}".format(p['job_id'], p['task_id'])]["address"]
                client = get_client(worker_address)
                # debug_print("[Shuffle] {0} got None from {1}, at Part {2}, retrying".format(self.id, p['task_id'], p['partition_id']))
                result = execute_command(client, client.get_rdd_result,
                                         p['job_id'], p['task_id'], p['partition_id'])
                # debug_print("[Shuffle] {0} got result={1} from {2}, at Part {3}".format(self.id, result, p['task_id'], p['partition_id']))
            results += result
    return results
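# The while-loop above implements a pull-based shuffle: the downstream reader
# polls the owning worker until the upstream partition is ready, treating None
# as "not ready yet". A minimal sketch of the worker-side counterpart, assuming
# finished partitions are cached in a dict keyed by (job_id, task_id,
# partition_id) -- the dict name and keying are illustrative assumptions, not
# the actual worker internals:
def get_rdd_result(self, job_id, task_id, partition_id):
    # Returns None while the task is still running, which makes the
    # shuffle() retry loop spin until the partition appears.
    return self.rdd_results.get((job_id, task_id, partition_id))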
RDD._config = {'num_partition_RBK': 2,
               'num_partition_GBK': 2,
               'split_size': 128}
t = rdd.TextFile(self.filename)
m = rdd.Map(t, lambda urls: parseNeighbors(urls))
links = rdd.GroupByKey(m)
ranks = rdd.Map(links, lambda url_neighbors: (url_neighbors[0], 1.0))
for iteration in range(5):
    # After the join, each element is (url, (neighbors, rank)).
    joins = rdd.Join([links, ranks])
    contribs = rdd.FlatMap(
        joins,
        lambda url_urls_rank: computeContribs(url_urls_rank[1][0],
                                              url_urls_rank[1][1]))
    rbk = rdd.ReduceByKey(contribs, lambda a, b: a + b)
    ranks = rdd.MapValue(rbk, lambda rank: rank * 0.85 + 0.15)
ranks.collect(driver)

if __name__ == '__main__':
    master_address = sys.argv[1]
    self_address = sys.argv[2]
    filepath = sys.argv[3]
    page_rank_client = PageRankClient(filepath)
    # page_rank_client = PageRankClient(sys.argv[1])
    client = get_client(master_address)
    execute_command(client, client.get_job, pickle_object(page_rank_client), self_address)
    print "[Client] Job Submitted..."
    page_rank_client.start_server(self_address)
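# parseNeighbors and computeContribs are called above but not defined in this
# excerpt. A minimal sketch consistent with how they are invoked, modeled on
# the classic Spark PageRank example; the whitespace-delimited "<url> <neighbor>"
# input format is an assumption:
def parseNeighbors(urls):
    # Split a "url neighbor" line into a (url, neighbor) pair.
    parts = urls.split(' ')
    return parts[0], parts[1]

def computeContribs(urls, rank):
    # Spread a page's rank evenly across its outgoing links.
    num_urls = len(urls)
    for url in urls:
        yield (url, rank / num_urls)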
RDD._config = {'num_partition_RBK': 2,
               'num_partition_GBK': 2,
               'split_size': 128}
lines = rdd.TextFile(self.filename)
warnings = rdd.Filter(lines, lambda l: l.startswith("Warning"))
# Python strings have no .contains(); a membership test does the same job.
worker0_warnings = rdd.Filter(warnings, lambda x: "worker0" in x)
worker0_down_info = rdd.Map(worker0_warnings, lambda w: parse_warning(w))
worker0_down_info.collect(driver)

if __name__ == '__main__':
    master_address = sys.argv[1]
    self_address = sys.argv[2]
    filepath = sys.argv[3]
    word_count_client = LogQueryClient(filepath)
    # word_count_client = LogQueryClient("../../files/logquery.txt")
    # Round-trip the client through pickling as a serialization sanity check.
    new_rdd = unpickle_object(pickle_object(word_count_client))
    client = get_client(master_address)
    print "====="
    obj = pickle_object(word_count_client)
    print "====="
    execute_command(client, client.get_job, obj, self_address)
    print "====="
    word_count_client.start_server("0.0.0.0:" + self_address.split(":")[1])
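# parse_warning is used above but not defined in this excerpt. A minimal
# sketch under the assumption that a warning line looks like
# "Warning <timestamp> <worker> <message>" -- this log format is hypothetical,
# chosen only to match the startswith("Warning") filter above:
def parse_warning(line):
    fields = line.split(' ', 3)
    # fields[0] is the literal "Warning" tag matched by the Filter above.
    return {'timestamp': fields[1], 'worker': fields[2], 'message': fields[3]}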