Example #1
def add_c0_pct_and_outcomes(
    tree_name,
    balanced_h5,
    component_df,
    tiebreak_node,
    weighted_status_flag,
    output_type,
):
    df_transpose = component_df.T
    start_status = datetime.now()
    c0_pct = calc_weighted_status(component_df, df_transpose, tiebreak_node,
                                  weighted_status_flag)
    print_timing_output("CALC_STATUS_TIME: (hh:mm:ss.ms)",
                        datetime.now() - start_status, output_type)
    df_dict_temp = {"% C0": c0_pct}

    mapping = list(balanced_h5["mapping_to_original"])
    assert len(mapping) == len(component_df.index)
    df_dict_temp["Vert ID"] = [mapping[i] for i in component_df.index]
    if "tree_bias_score" in balanced_h5.attrs:
        outcomes = list(balanced_h5["outcomes"])
        assert len(mapping) == len(outcomes)
        df_dict_temp["outcome"] = outcomes
    # Build the per-vertex dataframe with status, vertex ID, and (optional) outcomes.
    df = pd.DataFrame(df_dict_temp)
    TimerManager.stopTimerX(0)
    print_timing_output("TOTAL_TIME: (hh:mm:ss.ms)",
                        str(TimerManager.getTimerXElapsed(0)), output_type)
    return df
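
The function above accumulates equal-length column lists in a dict and materializes them with a single pd.DataFrame call. A minimal, self-contained sketch of that pattern (toy values; the column names mirror the ones used above):

import pandas as pd

df_dict = {
    "% C0": [0.75, 0.40, 0.91],   # per-vertex status percentages
    "Vert ID": [10, 42, 7],       # IDs mapped back to the original graph
    "outcome": [1, 0, 1],         # optional label column
}
df = pd.DataFrame(df_dict)        # columns must be equal length, as the asserts enforce
print(df.head())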
Example #2
def postprocess_locally(config_obj):
    postprocess_start = datetime.now()
    output_type = config_obj["machine"]
    #print("-------------------- Config Object --------------------")
    #print(config_obj)
    #print("-------------------- End Config Object --------------------")
    if config_obj["postprocess"]:
        #print("Creating at least one thing.")
        if config_obj["dataframes"]["vertex_df"]:
            postprocess_vertex_df(config_obj)
        if config_obj["plots"]["tree_bias_vs_c0"]:
            if config_obj["has_labels"]:
                postprocess_tree_bias_vs_c0(config_obj)
            else:
                print(
                    "Cannot create the tree bias vs. c0 plot: the data has no labels."
                )
        if config_obj["plots"]["vertex_status_vs_id"]:
            postprocess_vertex_status_vs_id(config_obj)

    print_timing_output(
        "TOTAL_POSTPROCESS_TIME: (hh:mm:ss.ms)",
        datetime.now() - postprocess_start,
        output_type,
    )
    if output_type == "current":
        output_file = [get_output_file_path(config_obj)]
        timing_filename = create_write_filename(output_file)
        create_timing_results(output_file, timing_filename)
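
The functions in these examples all drive behavior from a shared config_obj dict. A minimal illustrative config, using only keys that appear in the examples (the values are placeholders, not project defaults):

config_obj = {
    "machine": "current",        # selects the timing-output destination
    "preprocess": True,
    "postprocess": True,
    "has_labels": True,
    "parallelism": "serial",     # "serial", "parallel", or "spark"
    "tree_type": {"breadth": True, "depth": False,
                  "random": False, "random_MST": False},
    "dataframes": {"vertex_df": True},
    "plots": {"tree_bias_vs_c0": True, "vertex_status_vs_id": False},
}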
Example #3
def write_balanced_h5(
    config_obj,
    mapping_to_original,
    b_csr_unbal_adj_matrix,
    csr_st_adj_matrix,
    nx_st_graph,
    csr_bal_adj_matrix,
    changed_edge_list,
    tree_name,
):
    bal_adj_path = get_balanced_h5_path(config_obj) + tree_name + ".h5"
    try:
        output_type = config_obj["machine"]
        start_component_list = datetime.now()
        component_list = get_balance_components(nx_st_graph)
        print_timing_output(
            "COMPONENT_LIST_GEN_TIME: Component List Acquired, took: (hh:mm:ss.ms)",
            datetime.now() - start_component_list,
            output_type,
        )

        unique_components = np.unique(component_list)
        if len(unique_components) > 1:
            with h5py.File(bal_adj_path, "w") as f:
                g = f.create_group("csr_st_adj_matrix")
                create_matrix_h5_file(g, csr_st_adj_matrix)
                h = f.create_group("csr_bal_adj_matrix")
                create_matrix_h5_file(h, csr_bal_adj_matrix)
                f.create_dataset("changed_edge_list", data=changed_edge_list)
                f.create_dataset("mapping_to_original", data=mapping_to_original)
                component_stats_start = datetime.now()
                write_component_stats(f, csr_bal_adj_matrix, csr_st_adj_matrix,
                                      component_list, config_obj)
                print_timing_output(
                    "COMPONENT_STATS_TIME: (hh:mm:ss.ms)",
                    datetime.now() - component_stats_start,
                    output_type,
                )
                write_vertex_degree_stats(f, csr_bal_adj_matrix,
                                          csr_st_adj_matrix, component_list)
                write_edge_stats(f, b_csr_unbal_adj_matrix, csr_bal_adj_matrix)
                write_balanced_attributes(f, config_obj, tree_name)
                if config_obj["has_labels"]:
                    # Writes c0 and c1 wins and losses to the existing h5.
                    write_outcome_stats(f, config_obj, csr_bal_adj_matrix,
                                        component_list)
    except Exception as e:
        print("Error writing balanced h5:", e)
Example #4
def get_full_unsym_csr_adj_matrix_and_possibly_outcomes_from_csv(config_obj):
    output_type = config_obj["machine"]
    edges_csv_path = (get_raw_dataset_csv_path(config_obj) +
                      "_edges.csv")  # Expects: From Node ID, To Node ID, Edge Weight
    users_csv_path = (get_raw_dataset_csv_path(config_obj) +
                      "_users.csv")  # Expects: Node ID, User ID, Label (optional)
    print("Attempting to read the following CSV files:")
    print("Edges: ", edges_csv_path)
    print("Users: ", users_csv_path)
    csr_adj_matrix = None
    users_df = None
    if os.path.isfile(edges_csv_path) and os.path.isfile(users_csv_path):
        data_df = pd.read_csv(edges_csv_path)
        start_matrix = datetime.now()
        row_ind = data_df.iloc[:, 0]
        col_ind = data_df.iloc[:, 1]
        data = data_df.iloc[:, 2]

        # Node IDs are 0-based, so the matrix dimension is the largest ID + 1.
        max_vertices = int(max(row_ind.max(), col_ind.max())) + 1

        csr_adj_matrix = sp.csr_matrix(
            (data.values, (row_ind.values, col_ind.values)),
            shape=(max_vertices, max_vertices),
        )
        print_timing_output(
            "MATRIX_CREATE_TIME: (hh:mm:ss.ms)",
            datetime.now() - start_matrix,
            output_type,
        )
        users_df = pd.read_csv(users_csv_path)
    else:
        print("Error creating matrices from the edges and users files. "
              "Please check that the paths and filenames are correct.")
    return csr_adj_matrix, users_df
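
A minimal sketch of the COO-style scipy.sparse.csr_matrix construction used above, with a toy edge list (note that duplicate (row, col) pairs would be summed):

import numpy as np
import scipy.sparse as sp

rows = np.array([0, 1, 2])          # From Node ID
cols = np.array([1, 2, 0])          # To Node ID
vals = np.array([1.0, -1.0, 1.0])   # Edge Weight (signed)
n = int(max(rows.max(), cols.max())) + 1  # IDs are 0-based, hence the +1

adj = sp.csr_matrix((vals, (rows, cols)), shape=(n, n))
print(adj.toarray())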
Example #5
def preprocess_locally(config_obj):
    preprocess_start = datetime.now()
    output_type = config_obj["machine"]
    print("Preprocessing on: ", config_obj["machine"])
    if config_obj["preprocess"]:
        print("Making a symmetric full H5.")
        sym_start_time = datetime.now()
        create_full_h5(config_obj)
        print_timing_output(
            "SYM_MATRIX_CREATE_TIME: (hh:mm:ss.ms)",
            datetime.now() - sym_start_time,
            output_type,
        )
    else:
        print(
            "Preprocess option not selected. Not making a symmetric full H5.")
    print("---------- Done making symmetric full H5 (if one was made)")
    print_timing_output(
        "TOTAL_PREPROCESS_TIME: (hh:mm:ss.ms)",
        datetime.now() - preprocess_start,
        output_type,
    )
    return 0
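
The "(hh:mm:ss.ms)" labels in these examples match the default string form of a datetime.timedelta. A minimal sketch of the timing pattern:

from datetime import datetime
import time

start = datetime.now()
time.sleep(0.25)                  # stand-in for real work
elapsed = datetime.now() - start  # a datetime.timedelta
print("ELAPSED: (hh:mm:ss.ms)", elapsed)  # prints e.g. 0:00:00.250123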
Example #6
def process_locally(config_obj):
    process_start = datetime.now()
    output_type = config_obj["machine"]
    # First enabled tree type in the config (random, random_MST, breadth, or depth).
    tree_type = str(next(item for item in config_obj["tree_type"]
                         if config_obj["tree_type"][item]))
    tree_names_to_create = generate_tree_names(config_obj)
    balanced_tuple_list = None
    unbalanced_csr_adj_matrix = get_full_csr_adj(config_obj, True)
    mapping_to_original = get_mapping_to_original(config_obj, True)
    num_edges_in_graph = int(unbalanced_csr_adj_matrix.count_nonzero() / 2)
    random_weights_dict = {}
    for tree_name in tree_names_to_create:
        random_weights_dict[tree_name] = np.random.uniform(
            0, 1, num_edges_in_graph)
    if config_obj["parallelism"] == "spark":  # Use Spark
        #print("Paralellism: Spark")
        from pyspark import SparkContext, SparkConf, broadcast

        # https://datascience.stackexchange.com/questions/8549/how-do-i-set-get-heap-size-for-spark-via-python-notebook
        # https://stackoverflow.com/questions/50865093/modulenotfounderror-in-pyspark-worker-on-rdd-collect
        # None of these settings are actually getting set for the spark session, settings had to be moved to process_general_spark.sh
        # Spark settings are not able to be set from inside the application when the application is already running
        conf = SparkConf().setAppName("Balancer")
        sc = SparkContext(conf=conf)

        broadcast.Broadcast.dump = broadcast_dump
        tree_names_to_create_rdd = sc.parallelize(tree_names_to_create)
        ### Build a list of start-node indices for the breadth-first trees. If more
        ### trees are requested than there are nodes in the graph, wrap around and
        ### repeat the indices.
        num_times_to_repeat_index = (len(tree_names_to_create) //
                                     len(mapping_to_original))
        node_index_list = list(range(len(mapping_to_original)))
        if num_times_to_repeat_index >= 1:
            node_index_list = node_index_list * num_times_to_repeat_index
            remainder = len(tree_names_to_create) - len(node_index_list)
            for num in range(remainder):
                node_index_list.append(num)
        ### pass in the value of node_index_list at the index of the tree name being run.
        ### allows parallel operation while still guaranteeing breadth trees are run the same as serial.
        balanced_tuple_list = tree_names_to_create_rdd.map(
            lambda tree_name: balance_graph(
                unbalanced_csr_adj_matrix,
                tree_name,
                config_obj,
                mapping_to_original,
                random_weights_dict,
                node_index_list[tree_names_to_create.index(tree_name)],
            ))
        balanced_tuple_list = balanced_tuple_list.collect()
    elif config_obj["parallelism"] == "serial":
        balanced_tuple_list = list()
        node_index = 0
        ### Increment node_index for breadth-first tree generation so trees start at
        ### the highest-degree node and work downward.
        for tree_name in tree_names_to_create:
            if node_index == len(mapping_to_original):
                # Fewer nodes than requested breadth trees; wrap the start index.
                node_index = 0
            balanced_tuple_list.append(
                balance_graph(
                    unbalanced_csr_adj_matrix,
                    tree_name,
                    config_obj,
                    mapping_to_original,
                    random_weights_dict,
                    node_index,
                ))
            node_index += 1
    print_timing_output("TOTAL_PROCESS_TIME: (hh:mm:ss.ms)",
                        datetime.now() - process_start, output_type)
    return 0
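
A minimal sketch of the Spark pattern above: parallelize a list of task names, map a function over the resulting RDD, and collect the results on the driver (assumes a local pyspark install; the lambda body is a stand-in for balance_graph):

from pyspark import SparkConf, SparkContext

conf = SparkConf().setAppName("BalancerSketch").setMaster("local[*]")
sc = SparkContext(conf=conf)

tree_names = ["tree_0", "tree_1", "tree_2"]
results = (sc.parallelize(tree_names)
             .map(lambda name: (name, len(name)))  # stand-in for balance_graph
             .collect())
print(results)
sc.stop()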
Example #7
def balance_graph(
    b_unbalanced_csr_adj_matrix,
    tree_name,
    config_obj,
    mapping_to_original,
    rand_weights,
    node_index,
):
    output_type = config_obj["machine"]
    start_balance = datetime.now()
    print_timing_output("Begin graph balancing:", str(datetime.now()),
                        output_type)
    changed_edge_list = []
    tree_type = get_tree_type(config_obj)
    print_timing_output("Tree type: -------------------->", tree_type,
                        output_type)
    csr_st_adj_matrix = get_spanning_tree(
        b_unbalanced_csr_adj_matrix, tree_type, tree_name, rand_weights,
        node_index)  # types: random, random_MST, breadth, depth
    tree_time = datetime.now() - start_balance
    print_timing_output("TREE_TIME: Random Tree Acquired, took: (hh:mm:ss.ms)",
                        tree_time, output_type)
    non_st_edges_list = get_non_st_edges(b_unbalanced_csr_adj_matrix,
                                         csr_st_adj_matrix)
    # Note: networkx >= 3.0 renames this to nx.from_scipy_sparse_array.
    nx_st_graph = nx.from_scipy_sparse_matrix(csr_st_adj_matrix)
    begin_get_edges_to_change = datetime.now()
    unique_paths = get_unique_paths(nx_st_graph, non_st_edges_list)
    # For each non-tree edge, decide whether it should be flipped; drop the Nones.
    changed_edge_list = [
        edge_to_change for edge_to_change in (
            should_change_edge(nx_st_graph, unique_paths, coord_triple)
            for coord_triple in non_st_edges_list)
        if edge_to_change is not None
    ]
    print_timing_output(
        "Got changed edges, took: (hh:mm:ss.ms)",
        datetime.now() - begin_get_edges_to_change,
        output_type,
    )
    csr_bal_adj_matrix = reverse_edges(b_unbalanced_csr_adj_matrix,
                                       changed_edge_list)
    print_timing_output(
        "BALANCE_TIME: (hh:mm:ss:ms)",
        (datetime.now() - start_balance - tree_time),
        output_type,
    )
    print_timing_output(
        "TOTAL_BALANCE_TIME: Balancing time elapsed: (hh:mm:ss.ms)",
        datetime.now() - start_balance,
        output_type,
    )
    write_balanced_h5(
        config_obj,
        mapping_to_original,
        b_unbalanced_csr_adj_matrix,
        csr_st_adj_matrix,
        nx_st_graph,
        csr_bal_adj_matrix,
        changed_edge_list,
        tree_name,
    )
    del csr_st_adj_matrix
    del csr_bal_adj_matrix
    del changed_edge_list
    gc.collect()
    return tree_name
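
A minimal sketch of the spanning-tree bookkeeping above, in plain networkx: for a non-tree edge (u, v), the unique tree path between u and v is the cycle that edge closes (toy graph; the real code computes these paths via get_unique_paths):

import networkx as nx

g = nx.Graph([(0, 1), (1, 2), (2, 3), (3, 0)])  # a 4-cycle
st = nx.minimum_spanning_tree(g)                # spanning tree drops one edge
non_tree_edges = [e for e in g.edges if not st.has_edge(*e)]
for u, v in non_tree_edges:
    path = nx.shortest_path(st, u, v)           # the unique path in a tree
    print((u, v), "closes the cycle through", path)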
Example #8
def postprocess_vertex_df(config_obj, dont_remake_override=False):
    (
        dataset,
        data_subset_type,
        matrix_name,
        num_trees,
        tree_type,
        parallelism,
    ) = get_common_config_details(config_obj)
    output_folder = get_postprocess_folder_general(config_obj)
    output_type = config_obj["machine"]
    file_tag = get_file_tag(config_obj)
    FULL_DF_PATH = output_folder + file_tag + "_vertex_df.pkl"
    vertex_df = None
    #print("-------- Entering Post-Process Vertex DF --------")
    if (not isfile(FULL_DF_PATH)
            or config_obj["postprocess"]) and not dont_remake_override:
        #print("-------- Creating Vertex DF --------")
        # if (it's not a file or we want to remake it) AND we're not calling this function from a plotting fn, table fn, etc...
        to_be_df_dict = None
        trees_list = get_balanced_file_list(config_obj, True)
        vertex_df_start = datetime.now()
        if parallelism in ("parallel", "spark"):
            num_cores = mp.cpu_count()
            df_tup_list = Parallel(n_jobs=num_cores)(
                delayed(get_per_tree_vertex_dict_tup)(tree, config_obj)
                for tree in trees_list)
            to_be_df_dict = {tup[0]: tup[1] for tup in df_tup_list}
        elif parallelism == "serial":
            to_be_df_dict = {
                tree: get_per_tree_vertex_dict(tree, config_obj)
                for tree in trees_list
            }
        vertex_df = create_vertex_df_from_vertex_dict(to_be_df_dict,
                                                      config_obj, trees_list,
                                                      FULL_DF_PATH)
        print_timing_output(
            "VERTEX_DF_TIME: (hh:mm:ss.ms)",
            datetime.now() - vertex_df_start,
            output_type,
        )
    else:
        #print("-------- Reading Vertex DF --------")
        vertex_df = pd.read_pickle(FULL_DF_PATH)
    return vertex_df
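
A minimal sketch of the joblib fan-out used above: evaluate a function over a list of inputs on all cores and gather (key, value) tuples into a dict (per_item is a stand-in for get_per_tree_vertex_dict_tup):

import multiprocessing as mp
from joblib import Parallel, delayed

def per_item(name):                 # stand-in for get_per_tree_vertex_dict_tup
    return name, len(name)

items = ["tree_0", "tree_1", "tree_2"]
tups = Parallel(n_jobs=mp.cpu_count())(delayed(per_item)(i) for i in items)
print({k: v for k, v in tups})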