Code example #1
File: rdd.py  Project: YuanyuanZh/MiniSpark
 def shuffle(self, input_source):
     results = []
     #debug_print("[Wide-RDD] {0} InputSource is {1}".format(self.id, input_source))
     for source in input_source:
         # debug_print("[Shuffle] {0} Shuffling from source {1}".format(self.id, source))
         if not isinstance(source, list):
             source = [source]
         for p in source:
             result = None
             while result is None:
                 task_node_table = p["task_node_table"]
                 worker_address = task_node_table["{0}_{1}".format(p['job_id'], p['task_id'])]["address"]
                 client = get_client(worker_address)
                 # debug_print("[Shuffle] {0} get a None from {1}, at Part {2}, retrying".format(self.id, p['task_id'],p['partition_id']))
                 result = execute_command(client, client.get_rdd_result,
                                          p['job_id'],
                                          p['task_id'],
                                          p['partition_id'])
             # debug_print("[Shuffle] {0} get a result={1} from {2}, at Part {3}".format(self.id, result,  p['task_id'],p['partition_id']))
             results += result
     return results
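
For context, shuffle expects input_source to be a list (or list of lists) of task descriptors; the sketch below shows the shape one descriptor is assumed to have, using only the dictionary keys that appear in the snippet above. The job/task IDs and the worker address are made up for illustration.

# Assumed shape of one entry in input_source (keys taken from shuffle above;
# the concrete values here are illustrative only):
parent_task = {
    'job_id': 7,
    'task_id': 3,
    'partition_id': 0,
    'task_node_table': {
        '7_3': {'address': '192.168.0.12:8081'},
    },
}

# shuffle() looks up the worker that ran task 7_3, connects with get_client,
# and polls get_rdd_result until that partition's result is available:
# results = self.shuffle([parent_task])
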
Code example #2
File: pagerank.py  Project: YuanyuanZh/MiniSpark
        RDD._config = {
            'num_partition_RBK': 2,
            'num_partition_GBK': 2,
            'split_size': 128,
        }
        t = rdd.TextFile(self.filename)
        m = rdd.Map(t, (lambda urls: parseNeighbors(urls)))
        links = rdd.GroupByKey(m)
        ranks = rdd.Map(links, lambda url_neighbors: (url_neighbors[0], 1.0))
        for iteration in range(5):
            joins = rdd.Join([links, ranks])
            contribs = rdd.FlatMap(
                joins, lambda url_urls_rank: computeContribs(
                    url_urls_rank[1][0], url_urls_rank[1][1]))
            rbk = rdd.ReduceByKey(contribs, lambda a, b: a + b)
            ranks = rdd.MapValue(rbk, lambda rank: rank * 0.85 + 0.15)
        ranks.collect(driver)


if __name__ == '__main__':

    master_address = sys.argv[1]
    self_address = sys.argv[2]
    filepath = sys.argv[3]

    page_rank_client = PageRankClient(filepath)
    # page_rank_client = PageRankClient(sys.argv[1])
    client = get_client(master_address)
    execute_command(client, client.get_job, pickle_object(page_rank_client),
                    self_address)
    print "[Client]Job Submited...."
    page_rank_client.start_server(self_address)
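
The snippet calls parseNeighbors and computeContribs without showing them. A minimal sketch of what they could look like, modeled on the standard Spark PageRank example, is below; the exact definitions in MiniSpark may differ.

def parseNeighbors(urls):
    # One input line like "url1 url2" becomes the pair (url1, url2).
    parts = urls.split()
    return parts[0], parts[1]


def computeContribs(urls, rank):
    # Spread this page's rank evenly over the pages it links to.
    num_urls = len(urls)
    return [(url, rank / num_urls) for url in urls]
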
Code example #3
File: pagerank.py  Project: YuanyuanZh/MiniSpark
        RDD._config = {'num_partition_RBK': 2,
                       'num_partition_GBK': 2,
                       'split_size': 128,
                       }
        t = rdd.TextFile(self.filename)
        m = rdd.Map(t, (lambda urls: parseNeighbors(urls)))
        links = rdd.GroupByKey(m)
        ranks = rdd.Map(links, lambda url_neighbors: (url_neighbors[0], 1.0))
        for iteration in range(5):
            joins = rdd.Join([links, ranks])
            contribs = rdd.FlatMap(joins,
                                   lambda url_urls_rank: computeContribs(url_urls_rank[1][0], url_urls_rank[1][1]))
            rbk = rdd.ReduceByKey(contribs, lambda a, b: a + b)
            ranks = rdd.MapValue(rbk, lambda rank: rank * 0.85 + 0.15)
        ranks.collect(driver)


if __name__ == '__main__':

    master_address = sys.argv[1]
    self_address = sys.argv[2]
    filepath = sys.argv[3]

    page_rank_client = PageRankClient(filepath)
    # page_rank_client = PageRankClient(sys.argv[1])
    client = get_client(master_address)
    execute_command(client, client.get_job, pickle_object(page_rank_client), self_address)
    print "[Client]Job Submited...."
    page_rank_client.start_server(self_address)
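
As a sanity check of the update rule rank = 0.85 * contribution + 0.15 used above, here is one iteration worked out by hand in plain Python for a two-page graph where each page links to the other (no MiniSpark involved):

links = {'a': ['b'], 'b': ['a']}
ranks = {'a': 1.0, 'b': 1.0}

# Each page passes its whole rank to its single neighbour...
contribs = {'b': ranks['a'] / len(links['a']),   # 1.0
            'a': ranks['b'] / len(links['b'])}   # 1.0

# ...and the damping step keeps every rank at 1.0, the fixed point here.
ranks = {url: 0.85 * c + 0.15 for url, c in contribs.items()}
print(ranks)   # {'a': 1.0, 'b': 1.0}
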

Code example #4
File: logquery.py  Project: YuanyuanZh/MiniSpark
        RDD._config = {
            'num_partition_RBK': 2,
            'num_partition_GBK': 2,
            'split_size': 128
        }
        lines = rdd.TextFile(self.filename)
        warnings = rdd.Filter(lines, lambda l: l.startswith("Warning"))
        worker0_warnings = rdd.Filter(warnings,
                                      lambda x: "worker0" in x)
        worker0_down_info = rdd.Map(worker0_warnings,
                                    lambda w: parse_warning(w))
        worker0_down_info.collect(driver)


if __name__ == '__main__':

    master_address = sys.argv[1]
    self_address = sys.argv[2]
    filepath = sys.argv[3]

    word_count_client = LogQueryClient(filepath)

    # word_count_client = LogQueryClient("../../files/logquery.txt")
    new_rdd = unpickle_object(pickle_object(word_count_client))

    client = get_client(master_address)
    print "====="
    obj = pickle_object(word_count_client)
    print "====="
    execute_command(client, client.get_job, obj, self_address)
    print "====="
    word_count_client.start_server("0.0.0.0:" + self_address.split(":")[1])
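
parse_warning is not shown in the snippet. A hypothetical sketch of such a helper, assuming log lines of the form "Warning <timestamp> worker0 <message>", might be:

def parse_warning(line):
    # Split "Warning <timestamp> <worker> <message...>" into a small record.
    # This field layout is an assumption, not the project's actual format.
    parts = line.split(None, 3)
    return {'timestamp': parts[1], 'worker': parts[2], 'message': parts[3]}
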
Code example #5
File: logquery.py  Project: YuanyuanZh/MiniSpark
        RDD._config = {'num_partition_RBK': 2,
                       'num_partition_GBK': 2,
                       'split_size': 128}
        lines = rdd.TextFile(self.filename)
        warnings = rdd.Filter(lines, lambda l: l.startswith("Warning"))
        worker0_warnings = rdd.Filter(warnings, lambda x: "worker0" in x)
        worker0_down_info = rdd.Map(worker0_warnings, lambda w: parse_warning(w))
        worker0_down_info.collect(driver)


if __name__ == '__main__':

    master_address = sys.argv[1]
    self_address = sys.argv[2]
    filepath = sys.argv[3]

    word_count_client = LogQueryClient(filepath)

    # word_count_client = LogQueryClient("../../files/logquery.txt")
    new_rdd = unpickle_object(pickle_object(word_count_client))


    client = get_client(master_address)
    print "====="
    obj = pickle_object(word_count_client)
    print "====="
    execute_command(client, client.get_job, obj, self_address)
    print "====="
    word_count_client.start_server("0.0.0.0:" + self_address.split(":")[1])
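
pickle_object and unpickle_object are used here only to round-trip the client object before submitting it to the master. Assuming they are thin wrappers around the standard pickle module, an equivalent sketch would be:

import pickle


def pickle_object(obj):
    # Serialise the job object so it can be shipped to the master over RPC.
    return pickle.dumps(obj)


def unpickle_object(data):
    # Rebuild the object on the other side; the round trip in the snippet
    # above simply checks that the client is picklable before submission.
    return pickle.loads(data)
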