def add_c0_pct_and_outcomes(
    tree_name,
    balanced_h5,
    component_df,
    tiebreak_node,
    weighted_status_flag,
    output_type,
):
    df_transpose = component_df.T
    start_status = datetime.now()
    c0_pct = calc_weighted_status(
        component_df, df_transpose, tiebreak_node, weighted_status_flag
    )
    print_timing_output(
        "CALC_STATUS_TIME: (hh:mm:ss.ms)",
        datetime.now() - start_status,
        output_type,
    )

    df_dict_temp = {"% C0": c0_pct}
    mapping = list(balanced_h5["mapping_to_original"])
    assert len(mapping) == len(component_df.index)
    df_dict_temp["Vert ID"] = [mapping[i] for i in component_df.index]

    # Attach outcomes only when the h5 was written from labeled data.
    if "tree_bias_score" in balanced_h5.attrs:
        outcomes = list(balanced_h5["outcomes"])
        assert len(mapping) == len(outcomes)
        df_dict_temp["outcome"] = outcomes

    # DataFrame with status (% C0), original vertex ID, and optional outcomes.
    df = pd.DataFrame(df_dict_temp)

    TimerManager.stopTimerX(0)
    print_timing_output(
        "TOTAL_TIME: (hh:mm:ss.ms)",
        str(TimerManager.getTimerXElapsed(0)),
        output_type,
    )
    # Echo the total to stdout regardless of the configured output type.
    print("TOTAL_TIME: (hh:mm:ss.ms)", str(TimerManager.getTimerXElapsed(0)))
    return df
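
def _demo_vertex_status_frame():
    """Illustrative shape of the frame returned above; all values are made up.

    This is a sketch for reference only, not output from a real run.
    """
    import pandas as pd

    return pd.DataFrame({
        "% C0": [0.92, 0.41, 0.67],   # weighted status per vertex
        "Vert ID": [10, 42, 7],       # original IDs via mapping_to_original
        "outcome": [1, 0, 1],         # present only when the data has labels
    })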
def postprocess_locally(config_obj):
    postprocess_start = datetime.now()
    output_type = config_obj["machine"]

    if config_obj["postprocess"]:
        if config_obj["dataframes"]["vertex_df"]:
            postprocess_vertex_df(config_obj)
        if config_obj["plots"]["tree_bias_vs_c0"]:
            if config_obj["has_labels"]:
                postprocess_tree_bias_vs_c0(config_obj)
            else:
                print(
                    "Cannot create a tree bias vs. % C0 plot: the data has no labels."
                )
        if config_obj["plots"]["vertex_status_vs_id"]:
            postprocess_vertex_status_vs_id(config_obj)

    print_timing_output(
        "TOTAL_POSTPROCESS_TIME: (hh:mm:ss.ms)",
        datetime.now() - postprocess_start,
        output_type,
    )
    if output_type == "current":
        output_file = [get_output_file_path(config_obj)]
        timing_filename = create_write_filename(output_file)
        create_timing_results(output_file, timing_filename)
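
# Illustrative sketch of the config_obj keys read by the functions in this
# file. The key names come from the code here; the values are placeholders,
# not a complete or authoritative schema.
_EXAMPLE_CONFIG = {
    "machine": "current",            # also used as the timing-output type
    "preprocess": True,
    "postprocess": True,
    "has_labels": True,              # required for the tree_bias_vs_c0 plot
    "parallelism": "serial",         # "serial" or "spark" (see process_locally)
    "tree_type": {"breadth": True},  # random, random_MST, breadth, depth
    "dataframes": {"vertex_df": True},
    "plots": {"tree_bias_vs_c0": True, "vertex_status_vs_id": False},
}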
def write_balanced_h5(
    config_obj,
    mapping_to_original,
    b_csr_unbal_adj_matrix,
    csr_st_adj_matrix,
    nx_st_graph,
    csr_bal_adj_matrix,
    changed_edge_list,
    tree_name,
):
    bal_adj_path = get_balanced_h5_path(config_obj) + tree_name + ".h5"
    try:
        output_type = config_obj["machine"]
        start_component_list = datetime.now()
        component_list = get_balance_components(nx_st_graph)
        print_timing_output(
            "COMPONENT_LIST_GEN_TIME: Component List Acquired, took: (hh:mm:ss.ms)",
            datetime.now() - start_component_list,
            output_type,
        )
        component_labels = np.unique(component_list)
        if len(component_labels) > 1:
            with h5py.File(bal_adj_path, "w") as f:
                g = f.create_group("csr_st_adj_matrix")
                create_matrix_h5_file(g, csr_st_adj_matrix)
                h = f.create_group("csr_bal_adj_matrix")
                create_matrix_h5_file(h, csr_bal_adj_matrix)
                f.create_dataset("changed_edge_list", data=changed_edge_list)
                f.create_dataset("mapping_to_original", data=mapping_to_original)

                component_stats_start = datetime.now()
                write_component_stats(
                    f, csr_bal_adj_matrix, csr_st_adj_matrix, component_list, config_obj
                )
                print_timing_output(
                    "COMPONENT_STATS_TIME: (hh:mm:ss.ms)",
                    datetime.now() - component_stats_start,
                    output_type,
                )
                write_vertex_degree_stats(
                    f, csr_bal_adj_matrix, csr_st_adj_matrix, component_list
                )
                write_edge_stats(f, b_csr_unbal_adj_matrix, csr_bal_adj_matrix)
                write_balanced_attributes(f, config_obj, tree_name)
                if config_obj["has_labels"]:
                    # Writes C0 and C1 wins/losses to the open h5 file.
                    write_outcome_stats(f, config_obj, csr_bal_adj_matrix, component_list)
    except Exception as e:
        print("Error writing balanced h5:", e)
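
def _demo_read_balanced_h5(path):
    """Minimal sketch of reading back a balanced h5 written above.

    Assumes create_matrix_h5_file stores the CSR arrays (data/indices/indptr)
    inside each group; adjust the dataset names to match that helper.
    """
    import h5py

    with h5py.File(path, "r") as f:
        mapping = f["mapping_to_original"][()]   # per-vertex original IDs
        changed = f["changed_edge_list"][()]     # edges flipped during balancing
        print(len(mapping), "vertices,", len(changed), "changed edges")
        print("spanning-tree datasets:", list(f["csr_st_adj_matrix"].keys()))
        print("balanced datasets:", list(f["csr_bal_adj_matrix"].keys()))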
def get_full_unsym_csr_adj_matrix_and_possibly_outcomes_from_csv(config_obj):
    output_type = config_obj["machine"]
    # Expects: From Node ID, To Node ID, Edge Weight
    edges_csv_path = get_raw_dataset_csv_path(config_obj) + "_edges.csv"
    # Expects: Node ID, User ID, Label (optional)
    users_csv_path = get_raw_dataset_csv_path(config_obj) + "_users.csv"
    print("Attempting to read the following CSVs:")
    print("Edges: ", edges_csv_path)
    print("Users: ", users_csv_path)

    csr_adj_matrix = None
    users_df = None
    if os.path.isfile(edges_csv_path) and os.path.isfile(users_csv_path):
        data_df = pd.read_csv(edges_csv_path)
        start_matrix = datetime.now()
        row_ind = data_df.iloc[:, 0]
        col_ind = data_df.iloc[:, 1]
        data = data_df.iloc[:, 2]
        # Vertex IDs are zero-based, so the matrix is one larger than the max ID.
        max_vertices = int(max(row_ind.max(), col_ind.max())) + 1
        csr_adj_matrix = sp.csr_matrix(
            (data.values, (row_ind.values, col_ind.values)),
            shape=(max_vertices, max_vertices),
        )
        print_timing_output(
            "MATRIX_CREATE_TIME: (hh:mm:ss.ms)",
            datetime.now() - start_matrix,
            output_type,
        )
        users_df = pd.read_csv(users_csv_path)
    else:
        print(
            "Error creating matrices from edges and users files. "
            "Please check that the paths and filenames are correct."
        )
    return csr_adj_matrix, users_df
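
def _demo_edges_csv_to_csr():
    """Tiny worked example of the CSV-to-CSR construction above.

    The three edges below are made up; a signed graph would typically use
    +1/-1 weights.
    """
    from io import StringIO
    import pandas as pd
    import scipy.sparse as sp

    edges = pd.read_csv(StringIO("from,to,weight\n0,1,1\n1,2,-1\n0,2,1\n"))
    n = int(max(edges["from"].max(), edges["to"].max())) + 1
    m = sp.csr_matrix(
        (edges["weight"], (edges["from"], edges["to"])), shape=(n, n)
    )
    print(m.toarray())  # 3x3 adjacency, unsymmetrized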
def preprocess_locally(config_obj):
    preprocess_start = datetime.now()
    output_type = config_obj["machine"]
    print("Preprocessing on: ", config_obj["machine"])
    if config_obj["preprocess"]:
        print("Making a symmetric full H5.")
        sym_start_time = datetime.now()
        create_full_h5(config_obj)
        print_timing_output(
            "SYM_MATRIX_CREATE_TIME: (hh:mm:ss.ms)",
            datetime.now() - sym_start_time,
            output_type,
        )
    else:
        print("Preprocess option not selected. Not making a symmetric full H5.")
    print("---------- Done making symmetric full H5 (if one was made)")
    print_timing_output(
        "TOTAL_PREPROCESS_TIME: (hh:mm:ss.ms)",
        datetime.now() - preprocess_start,
        output_type,
    )
    return 0
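
# Sketch of the intended local pipeline order (function names from this file;
# illustrative only, using an _EXAMPLE_CONFIG-style dict rather than a real run):
#
#     preprocess_locally(config_obj)   # build the symmetric full H5
#     process_locally(config_obj)      # balance one graph per requested tree
#     postprocess_locally(config_obj)  # dataframes, plots, timing summaries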
def process_locally(config_obj):
    process_start = datetime.now()
    output_type = config_obj["machine"]
    tree_type = str(
        [item for item in config_obj["tree_type"] if config_obj["tree_type"][item]][0]
    )
    tree_names_to_create = generate_tree_names(config_obj)
    balanced_tuple_list = None
    unbalanced_csr_adj_matrix = get_full_csr_adj(config_obj, True)
    mapping_to_original = get_mapping_to_original(config_obj, True)
    num_edges_in_graph = int(unbalanced_csr_adj_matrix.count_nonzero() / 2)

    # One fixed vector of random edge weights per tree, shared by the serial
    # and Spark paths.
    random_weights_dict = {}
    for tree_name in tree_names_to_create:
        random_weights_dict[tree_name] = np.random.uniform(0, 1, num_edges_in_graph)

    if config_obj["parallelism"] == "spark":
        from pyspark import SparkContext, SparkConf, broadcast

        # Spark settings cannot be changed from inside an already-running
        # application, so they were moved to process_general_spark.sh.
        # See: https://datascience.stackexchange.com/questions/8549/how-do-i-set-get-heap-size-for-spark-via-python-notebook
        # and: https://stackoverflow.com/questions/50865093/modulenotfounderror-in-pyspark-worker-on-rdd-collect
        conf = SparkConf().setAppName("Balancer")
        sc = SparkContext(conf=conf)
        broadcast.Broadcast.dump = broadcast_dump
        tree_names_to_create_rdd = sc.parallelize(tree_names_to_create)

        # Build a list of root-node indices for breadth-first trees. If more
        # trees are requested than there are nodes in the graph, wrap around
        # the indices and repeat.
        num_times_to_repeat_index = int(
            len(tree_names_to_create) / len(mapping_to_original)
        )
        node_index_list = list(range(len(mapping_to_original)))
        if num_times_to_repeat_index >= 1:
            node_index_list = node_index_list * num_times_to_repeat_index
            remainder = len(tree_names_to_create) - len(node_index_list)
            for num in range(remainder):
                node_index_list.append(num)

        # Pass each tree its own root index so the parallel run produces the
        # same breadth-first trees as the serial run.
        balanced_tuple_list = tree_names_to_create_rdd.map(
            lambda tree_name: balance_graph(
                unbalanced_csr_adj_matrix,
                tree_name,
                config_obj,
                mapping_to_original,
                random_weights_dict,
                node_index_list[tree_names_to_create.index(tree_name)],
            )
        )
        balanced_tuple_list = balanced_tuple_list.collect()
    elif config_obj["parallelism"] == "serial":
        balanced_tuple_list = list()
        # Increment node_index for breadth-first tree generation: start at the
        # highest-degree node and descend, resetting when there are fewer
        # nodes than requested breadth trees.
        node_index = 0
        for tree_name in tree_names_to_create:
            if node_index == len(mapping_to_original):
                node_index = 0
            balanced_tuple_list.append(
                balance_graph(
                    unbalanced_csr_adj_matrix,
                    tree_name,
                    config_obj,
                    mapping_to_original,
                    random_weights_dict,
                    node_index,
                )
            )
            node_index += 1

    print_timing_output(
        "TOTAL_PROCESS_TIME: (hh:mm:ss.ms)",
        datetime.now() - process_start,
        output_type,
    )
    return 0
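
def _demo_breadth_root_indices(num_trees=7, num_nodes=3):
    """Worked example of the root-index wraparound above (made-up sizes)."""
    repeats = num_trees // num_nodes
    roots = list(range(num_nodes))
    if repeats >= 1:
        roots = roots * repeats
        roots += list(range(num_trees - len(roots)))
    print(roots)  # [0, 1, 2, 0, 1, 2, 0] for 7 trees over 3 nodes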
def balance_graph(
    b_unbalanced_csr_adj_matrix,
    tree_name,
    config_obj,
    mapping_to_original,
    rand_weights,
    node_index,
):
    output_type = config_obj["machine"]
    start_balance = datetime.now()
    print_timing_output("Begin graph balancing:", str(datetime.now()), output_type)
    changed_edge_list = []
    tree_type = get_tree_type(config_obj)
    print_timing_output("Tree type: -------------------->", tree_type, output_type)

    # Tree types: random, random_MST, breadth, depth.
    csr_st_adj_matrix = get_spanning_tree(
        b_unbalanced_csr_adj_matrix, tree_type, tree_name, rand_weights, node_index
    )
    tree_time = datetime.now() - start_balance
    print_timing_output(
        "TREE_TIME: Random Tree Acquired, took: (hh:mm:ss.ms)",
        tree_time,
        output_type,
    )

    non_st_edges_list = get_non_st_edges(b_unbalanced_csr_adj_matrix, csr_st_adj_matrix)
    nx_st_graph = nx.from_scipy_sparse_matrix(csr_st_adj_matrix)

    begin_get_edges_to_change = datetime.now()
    unique_paths = get_unique_paths(nx_st_graph, non_st_edges_list)
    changed_edge_list = [
        edge_to_change
        for edge_to_change in (
            should_change_edge(nx_st_graph, unique_paths, coord_triple)
            for coord_triple in non_st_edges_list
        )
        if edge_to_change is not None
    ]
    print_timing_output(
        "Got changed edges, took: (hh:mm:ss.ms)",
        datetime.now() - begin_get_edges_to_change,
        output_type,
    )

    csr_bal_adj_matrix = reverse_edges(b_unbalanced_csr_adj_matrix, changed_edge_list)
    print_timing_output(
        "BALANCE_TIME: (hh:mm:ss.ms)",
        datetime.now() - start_balance - tree_time,
        output_type,
    )
    print_timing_output(
        "TOTAL_BALANCE_TIME: Balancing time elapsed: (hh:mm:ss.ms)",
        datetime.now() - start_balance,
        output_type,
    )
    write_balanced_h5(
        config_obj,
        mapping_to_original,
        b_unbalanced_csr_adj_matrix,
        csr_st_adj_matrix,
        nx_st_graph,
        csr_bal_adj_matrix,
        changed_edge_list,
        tree_name,
    )

    # Free the per-tree matrices before the next tree is built.
    del csr_st_adj_matrix
    del csr_bal_adj_matrix
    del changed_edge_list
    gc.collect()
    return tree_name
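
# Note: nx.from_scipy_sparse_matrix was removed in NetworkX 3.0 in favor of
# nx.from_scipy_sparse_array. A minimal compatibility sketch, if this code
# ever needs to run on newer NetworkX (an assumption; this repo's pins may
# make it unnecessary):
#
#     if hasattr(nx, "from_scipy_sparse_matrix"):
#         nx_st_graph = nx.from_scipy_sparse_matrix(csr_st_adj_matrix)
#     else:
#         nx_st_graph = nx.from_scipy_sparse_array(csr_st_adj_matrix)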
def postprocess_vertex_df(config_obj, dont_remake_override=False):
    (
        dataset,
        data_subset_type,
        matrix_name,
        num_trees,
        tree_type,
        parallelism,
    ) = get_common_config_details(config_obj)
    output_folder = get_postprocess_folder_general(config_obj)
    output_type = config_obj["machine"]
    file_tag = get_file_tag(config_obj)
    FULL_DF_PATH = output_folder + file_tag + "_vertex_df.pkl"
    vertex_df = None

    # Rebuild if the pickle is missing or a remake was requested, unless this
    # call came from a plotting/table function that only wants the cached copy.
    if (not isfile(FULL_DF_PATH) or config_obj["postprocess"]) and not dont_remake_override:
        to_be_df_dict = None
        trees_list = get_balanced_file_list(config_obj, True)
        if parallelism == "parallel" or parallelism == "spark":
            num_cores = mp.cpu_count()
            vertex_df_start = datetime.now()
            df_tup_list = Parallel(n_jobs=num_cores)(
                delayed(get_per_tree_vertex_dict_tup)(tree, config_obj)
                for tree in trees_list
            )
            to_be_df_dict = {tup[0]: tup[1] for tup in df_tup_list}
        elif parallelism == "serial":
            vertex_df_start = datetime.now()
            to_be_df_dict = {
                tree: get_per_tree_vertex_dict(tree, config_obj)
                for tree in trees_list
            }
        # Finished the base dict; add component and outcome columns.
        vertex_df = create_vertex_df_from_vertex_dict(
            to_be_df_dict, config_obj, trees_list, FULL_DF_PATH
        )
        print_timing_output(
            "VERTEX_DF_TIME: (hh:mm:ss.ms)",
            datetime.now() - vertex_df_start,
            output_type,
        )
    else:
        vertex_df = pd.read_pickle(FULL_DF_PATH)
    return vertex_df
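
# Usage sketch: plotting/table helpers can pass dont_remake_override=True to
# read the cached pickle instead of rebuilding it (illustrative call only):
#
#     vertex_df = postprocess_vertex_df(config_obj, dont_remake_override=True)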