Example #1
# Requires project-level helpers (Topk, Bucket, rows_mapper, combiner, merge_values,
# merge_combiners, join_enum, spark_utils) to be in scope.
from pyspark import SparkContext


def parallel_process(all_features, predictions, loss, sc, debug, alpha, k, w, loss_type):
    top_k = Topk(k)
    cur_lvl = 0
    cur_lvl_nodes = list(all_features)
    pred_pandas = predictions.toPandas()
    x_size = len(pred_pandas)
    b_topk = SparkContext.broadcast(sc, top_k)
    b_cur_lvl = SparkContext.broadcast(sc, cur_lvl)
    buckets = {}
    for node in cur_lvl_nodes:
        bucket = Bucket(node, cur_lvl, w, x_size, loss)
        buckets[bucket.name] = bucket
    b_buckets = SparkContext.broadcast(sc, buckets)
    # Keep only the active one-hot feature indices and the per-row loss for each prediction.
    rows = predictions.rdd.map(lambda row: (row[1].indices, row[2]))\
        .map(lambda item: list(item))
    mapped = rows.map(lambda row: rows_mapper(row, b_buckets.value, loss_type))
    flattened = mapped.flatMap(lambda line: (line.items()))
    reduced = flattened.combineByKey(combiner, merge_values, merge_combiners)
    cur_lvl_nodes = reduced.values()\
        .map(lambda bucket: spark_utils.calc_bucket_metrics(bucket, loss, w, x_size, b_cur_lvl.value))
    if debug:
        cur_lvl_nodes.map(lambda bucket: bucket.print_debug(b_topk.value)).collect()
    cur_lvl = 1
    prev_level = cur_lvl_nodes.collect()
    top_k = top_k.buckets_top_k(prev_level, x_size, alpha, 1)
    while len(prev_level) > 0:
        b_cur_lvl_nodes = SparkContext.broadcast(sc, prev_level)
        b_topk = SparkContext.broadcast(sc, top_k)
        cur_min = top_k.min_score
        b_cur_lvl = SparkContext.broadcast(sc, cur_lvl)
        top_k.print_topk()
        buckets = join_enum(prev_level, cur_lvl, x_size, alpha, top_k, w, loss)
        b_buckets = SparkContext.broadcast(sc, buckets)
        to_slice = dict(filter(lambda bucket: bucket[1].check_bounds(x_size, alpha, top_k), buckets.items()))
        b_to_slice = SparkContext.broadcast(sc, to_slice)
        mapped = rows.map(lambda row: rows_mapper(row, b_to_slice.value, loss_type))
        flattened = mapped.flatMap(lambda line: (line.items()))
        to_process = flattened.combineByKey(combiner, merge_values, merge_combiners)
        if debug:
            to_process.values().map(lambda bucket: bucket.print_debug(b_topk.value)).collect()
        prev_level = to_process\
            .map(lambda bucket: spark_utils.calc_bucket_metrics(bucket[1], loss, w, x_size, b_cur_lvl.value))\
            .collect()
        cur_lvl += 1
        top_k = top_k.buckets_top_k(prev_level, x_size, alpha, cur_min)
        print("Level " + str(cur_lvl) + " had " + str(
            len(b_cur_lvl_nodes.value * (len(prev_level) - 1)))+" candidates but after pruning only " +
              str(len(prev_level)) + " go to the next level")
        top_k.print_topk()
    print()
    print("Program stopped at level " + str(cur_lvl - 1))
    print("Selected slices are: ")
    top_k.print_topk()
    return None
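
Both examples lean on a combiner / merge_values / merge_combiners trio that is not shown here; the functions are passed straight to PySpark's combineByKey. The snippet below is a minimal, self-contained sketch of that aggregation pattern using a toy (sum, count) accumulator in place of the project's Bucket objects, just to illustrate the role each of the three callbacks plays.

# Illustrative only: a (sum, count) accumulator stands in for the project's Bucket logic.
from pyspark import SparkContext

sc = SparkContext.getOrCreate()
pairs = sc.parallelize([("a", 1.0), ("a", 3.0), ("b", 2.0)])

def create_combiner(value):
    # Called the first time a key is seen within a partition.
    return (value, 1)

def merge_value(acc, value):
    # Folds another value for the same key into the partition-local accumulator.
    return (acc[0] + value, acc[1] + 1)

def merge_combiners(acc_a, acc_b):
    # Merges accumulators for the same key coming from different partitions.
    return (acc_a[0] + acc_b[0], acc_a[1] + acc_b[1])

averages = pairs.combineByKey(create_combiner, merge_value, merge_combiners) \
    .mapValues(lambda acc: acc[0] / acc[1])
print(averages.collect())  # [('a', 2.0), ('b', 2.0)] in some order
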
Example #2
# Requires the same project-level helpers as Example #1, plus union_enum and the
# join_data_parallel module.
from pyspark import SparkContext


def parallel_process(all_features, predictions, loss, sc, debug, alpha, k, w, loss_type):
    top_k = Topk(k)
    cur_lvl = 0
    levels = []
    cur_lvl_nodes = list(all_features)
    pred_pandas = predictions.toPandas()
    x_size = len(pred_pandas)
    b_topk = SparkContext.broadcast(sc, top_k)
    b_cur_lvl = SparkContext.broadcast(sc, cur_lvl)
    buckets = {}
    for node in cur_lvl_nodes:
        bucket = Bucket(node, cur_lvl, w, x_size, loss)
        buckets[bucket.name] = bucket
    b_buckets = SparkContext.broadcast(sc, buckets)
    # rows = predictions.rdd.map(lambda row: (row[0], row[1].indices, row[2])) \
    #     .map(lambda item: (item[0], item[1].tolist(), item[2]))
    # Each row is reduced to the list of indices of its active one-hot features.
    rows = predictions.rdd.map(lambda row: row[1].indices) \
        .map(lambda item: list(item))
    mapped = rows.map(lambda row: rows_mapper(row, b_buckets.value, loss_type))
    flattened = mapped.flatMap(lambda line: (line.items()))
    reduced = flattened.combineByKey(combiner, join_data_parallel.merge_values, join_data_parallel.merge_combiners)
    cur_lvl_nodes = reduced.values() \
        .map(lambda bucket: spark_utils.calc_bucket_metrics(bucket, loss, w, x_size, b_cur_lvl.value))
    if debug:
        cur_lvl_nodes.map(lambda bucket: bucket.print_debug(b_topk.value)).collect()
    cur_lvl = 1
    prev_level = cur_lvl_nodes.collect()
    b_cur_lvl_nodes = SparkContext.broadcast(sc, prev_level)
    levels.append(b_cur_lvl_nodes)
    top_k = top_k.buckets_top_k(prev_level, x_size, alpha, 1)
    while len(prev_level) > 0:
        b_topk = SparkContext.broadcast(sc, top_k)
        cur_min = top_k.min_score
        b_cur_lvl = SparkContext.broadcast(sc, cur_lvl)
        top_k.print_topk()
        buckets = []
        # Enumerate this level's candidates by combining nodes from pairs of earlier levels.
        for left in range(int(cur_lvl / 2) + 1):
            right = cur_lvl - left - 1
            nodes = union_enum(levels[left].value, levels[right].value, x_size, alpha, top_k, w, loss, cur_lvl)
            buckets.append(nodes)
        b_buckets = sc.parallelize(buckets)
        all_buckets = b_buckets.flatMap(lambda line: (line.items()))
        combined = dict(all_buckets.combineByKey(combiner, merge_values, merge_combiners).collect())
        b_buckets = SparkContext.broadcast(sc, combined)
        to_slice = dict(filter(lambda bucket: bucket[1].check_bounds(x_size, alpha, top_k), combined.items()))
        b_to_slice = SparkContext.broadcast(sc, to_slice)
        mapped = rows.map(lambda row: rows_mapper(row, b_to_slice.value, loss_type))
        flattened = mapped.flatMap(lambda line: (line.items()))
        partial = flattened.combineByKey(combiner, join_data_parallel.merge_values, join_data_parallel.merge_combiners)
        prev_level = partial\
            .map(lambda bucket: spark_utils.calc_bucket_metrics(bucket[1], loss, w, x_size, b_cur_lvl.value)).collect()
        top_k = top_k.buckets_top_k(prev_level, x_size, alpha, cur_min)
        b_topk = SparkContext.broadcast(sc, top_k)
        if debug:
            partial.values().map(lambda bucket: bucket.print_debug(b_topk.value)).collect()
        print("Level " + str(cur_lvl) + " had " + str(
            len(levels[cur_lvl - 1].value) * (len(levels[cur_lvl - 1].value) - 1)) +
              " candidates but after pruning only " + str(len(prev_level)) + " go to the next level")
        print("Program stopped at level " + str(cur_lvl))
        b_cur_lvl_nodes = SparkContext.broadcast(sc, prev_level)
        levels.append(b_cur_lvl_nodes)
        cur_lvl += 1
    print()
    print("Program stopped at level " + str(cur_lvl - 1))
    print("Selected slices are: ")
    top_k.print_topk()
    return None
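
Both variants expect predictions to be a DataFrame whose second column is a one-hot encoded feature vector (read via row[1].indices) and, at least in Example #1, whose third column holds a per-row error. The driver sketch below shows one plausible way to call the function under those assumptions; the schema, feature count, and hyperparameter values are illustrative placeholders, not taken from the project.

# Hypothetical driver; column layout, feature count, and hyperparameters are assumptions.
from pyspark.sql import SparkSession
from pyspark.ml.linalg import Vectors

spark = SparkSession.builder.appName("slice-search-example").getOrCreate()
sc = spark.sparkContext

# (label, one-hot feature vector, per-row error): the code reads row[1].indices and row[2].
predictions = spark.createDataFrame(
    [
        (0, Vectors.sparse(4, [0, 2], [1.0, 1.0]), 0.7),
        (1, Vectors.sparse(4, [1, 3], [1.0, 1.0]), 0.1),
    ],
    ["label", "features", "error"],
)

all_features = list(range(4))  # assumed: one candidate node per one-hot feature column
total_loss = sum(r["error"] for r in predictions.collect())

parallel_process(all_features, predictions, total_loss, sc,
                 debug=True, alpha=4, k=10, w=0.5, loss_type=0)
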