Esempio n. 1
0
 def shuffle(self, input_source):
     """Gather partition results from upstream workers.

     Each entry in ``input_source`` describes one or more upstream
     partitions. For every partition descriptor, the owning worker is
     looked up in the descriptor's task-node table and polled until it
     hands back a non-None result; all results are concatenated.

     NOTE(review): this loops forever if a worker never produces a
     result — presumably the scheduler guarantees eventual completion.
     """
     gathered = []
     for entry in input_source:
         # A source may be a single descriptor or a list of them.
         partitions = entry if isinstance(entry, list) else [entry]
         for part in partitions:
             fetched = None
             # Poll until the upstream partition result is available.
             while fetched is None:
                 node_table = part["task_node_table"]
                 task_key = "{0}_{1}".format(part['job_id'], part['task_id'])
                 address = node_table[task_key]["address"]
                 worker = get_client(address)
                 fetched = execute_command(worker, worker.get_rdd_result,
                                           part['job_id'],
                                           part['task_id'],
                                           part['partition_id'])
             gathered += fetched
     return gathered
Esempio n. 2
0
            'num_partition_GBK': 2,
            'split_size': 128,
        }
        # Build the RDD lineage: text file -> (url, neighbor) pairs
        # -> neighbor adjacency lists grouped by url.
        t = rdd.TextFile(self.filename)
        m = rdd.Map(t, (lambda urls: parseNeighbors(urls)))
        links = rdd.GroupByKey(m)
        # Initialize every page's rank to 1.0.
        ranks = rdd.Map(links, lambda url_neighbors: (url_neighbors[0], 1.0))
        # Fixed 5 PageRank iterations: join links with current ranks,
        # spread each page's rank over its neighbors, sum contributions,
        # then apply the 0.85 damping factor.
        for iteration in range(5):
            joins = rdd.Join([links, ranks])
            contribs = rdd.FlatMap(
                joins, lambda url_urls_rank: computeContribs(
                    url_urls_rank[1][0], url_urls_rank[1][1]))
            rbk = rdd.ReduceByKey(contribs, lambda a, b: a + b)
            ranks = rdd.MapValue(rbk, lambda rank: rank * 0.85 + 0.15)
        # Trigger execution and ship the final ranks back to the driver.
        ranks.collect(driver)


if __name__ == '__main__':

    master_address = sys.argv[1]
    self_address = sys.argv[2]
    filepath = sys.argv[3]

    page_rank_client = PageRankClient(filepath)
    # page_rank_client = PageRankClient(sys.argv[1])
    client = get_client(master_address)
    execute_command(client, client.get_job, pickle_object(page_rank_client),
                    self_address)
    print "[Client]Job Submited...."
    page_rank_client.start_server(self_address)
Esempio n. 3
0
        # Partitioning knobs: shuffle widths for ReduceByKey/GroupByKey
        # and the input split size (presumably in MB — TODO confirm).
        RDD._config = {'num_partition_RBK': 2,
                   'num_partition_GBK': 2,
                   'split_size': 128,
                   }
        # Build the RDD lineage: text file -> (url, neighbor) pairs
        # -> neighbor adjacency lists grouped by url.
        t = rdd.TextFile(self.filename)
        m = rdd.Map(t, (lambda urls: parseNeighbors(urls)))
        links = rdd.GroupByKey(m)
        # Initialize every page's rank to 1.0.
        ranks = rdd.Map(links, lambda url_neighbors: (url_neighbors[0], 1.0))
        # Fixed 5 PageRank iterations: join links with current ranks,
        # spread each page's rank over its neighbors, sum contributions,
        # then apply the 0.85 damping factor.
        for iteration in range(5):
            joins = rdd.Join([links, ranks])
            contribs = rdd.FlatMap(joins,
                                   lambda url_urls_rank: computeContribs(url_urls_rank[1][0], url_urls_rank[1][1]))
            rbk = rdd.ReduceByKey(contribs, lambda a, b: a + b)
            ranks = rdd.MapValue(rbk, lambda rank: rank * 0.85 + 0.15)
        # Trigger execution and ship the final ranks back to the driver.
        ranks.collect(driver)


if __name__ == '__main__':

    master_address = sys.argv[1]
    self_address = sys.argv[2]
    filepath = sys.argv[3]

    page_rank_client = PageRankClient(filepath)
    # page_rank_client = PageRankClient(sys.argv[1])
    client = get_client(master_address)
    execute_command(client, client.get_job, pickle_object(page_rank_client), self_address)
    print "[Client]Job Submited...."
    page_rank_client.start_server(self_address)