def get_tree_df_row(tree, config_obj):
    """Build a summary-statistics row (dict) for one balanced tree h5 file.

    Parameters
    ----------
    tree : str
        File name of the balanced tree h5, appended to the balanced h5 dir.
    config_obj : dict
        Project config used to resolve the balanced h5 directory.

    Returns
    -------
    dict
        Per-tree stats read from the h5 attributes. Empty if the file is
        unreadable; the bad tree and the error are printed in that case.
    """
    balanced_h5_file = get_balanced_h5_path(config_obj, True) + tree
    row = {}
    try:
        # Context manager guarantees the handle is closed even if an
        # attribute read raises part-way (original leaked the file here).
        with h5py.File(balanced_h5_file, 'r') as balanced_h5:
            attrs = balanced_h5.attrs
            # Hoist repeated attribute reads.
            pv_c0 = attrs['pendant_vertices_c0']
            pv_c1 = attrs['pendant_vertices_c1']
            pv_total = pv_c0 + pv_c1
            c0_size = attrs['c0_vertex_size']
            c1_size = attrs['c1_vertex_size']
            row = {
                "pv_c0_pct": 100 * (pv_c0 / pv_total),
                "pv_c1_pct": 100 * (pv_c1 / pv_total),
                "pv_pct": 100 * (pv_total / attrs['total_vertex_size']),
                "tree_pendant_vertices": pv_total,
                "c0_pct": (c0_size / (c0_size + c1_size)) * 100,
                "++_edges": attrs['++_edges'],
                "-+_edges": attrs['-+_edges'],
                "+-_edges": attrs['+-_edges'],
                "--_edges": attrs['--_edges'],
                "num_changed_edges": attrs['num_changed_edges'],
                "mean_tree_vertex_deg": attrs['mean_tree_vertex_deg'],
                "stdev_tree_vertex_deg": attrs['stdev_tree_vertex_deg'],
            }
            # Optional attribute: only present on some runs.
            if 'tree_bias_score' in attrs:
                row["tree_bias_score"] = attrs['tree_bias_score']
    except Exception as error:
        print("Tree DF OSError. The following tree is bad: ", tree,
              " - path: ", balanced_h5_file)
        print(error)
        # os.remove(balanced_h5_file)
    return row
def get_per_tree_vertex_dict(tree, config_obj):
    """Build a per-vertex data dict for one balanced tree h5 file,
    recomputing the c0/c1 component-size weights from the stored attrs.

    NOTE(review): a second definition of this same name appears later in
    this file and shadows this one at import time — confirm which variant
    is intended to be live.

    Returns an empty dict (and prints the error) if the file is unreadable.
    """
    balanced_h5_file = get_balanced_h5_path(config_obj, True) + tree
    row = {}
    try:
        # Context manager closes the h5 even when a read raises
        # (original leaked the handle on error).
        with h5py.File(balanced_h5_file, 'r') as balanced_h5:
            ### TODO: Make this a config option in the yaml file instead of
            ### comment/uncomment. This block recomputes the c0 and c1 size
            ### percentages for Data h5s that were made before those values
            ### were stored in the h5 itself.
            total_nodes = (balanced_h5.attrs['c0_vertex_size']
                           + balanced_h5.attrs['c1_vertex_size'])
            c0_size_percent = balanced_h5.attrs['c0_vertex_size'] / total_nodes
            c1_size_percent = balanced_h5.attrs['c1_vertex_size'] / total_nodes
            components = list(balanced_h5['component_list'])
            # Pair each vertex's component id with the size fraction of the
            # component it belongs to (id 0 -> c0 fraction, else c1 fraction).
            component_list_with_tuples = [
                (components[i],
                 c0_size_percent if components[i] == 0 else c1_size_percent)
                for i in range(total_nodes)
            ]
            ### Use the recomputed list when the weights were NOT stored at
            ### tree-generation time; otherwise read the stored
            ### 'weighted_component_list' dataset (commented alternative).
            row['component_list_with_tuples'] = component_list_with_tuples
            # row['component_list_with_tuples'] = list(balanced_h5['weighted_component_list'])
            row['component_list'] = components
            row['vertex_neighbor_amt_list'] = list(
                balanced_h5['vertex_neighbor_amt_list'])
            row['mean_tree_vertex_deg'] = balanced_h5.attrs['mean_tree_vertex_deg']
            row['stdev_tree_vertex_deg'] = balanced_h5.attrs['stdev_tree_vertex_deg']
            row['pendant_vertices_c0'] = balanced_h5.attrs['pendant_vertices_c0']
            row['pendant_vertices_c1'] = balanced_h5.attrs['pendant_vertices_c1']
            row['pendant_vertices_total'] = balanced_h5.attrs['pendant_vertices_total']
    except Exception as error:
        # Original had a raw newline inside this string literal (a syntax
        # error as written); escaped here to preserve the intended output.
        print("Vertex DF error. The following tree is bad. \n", tree,
              " - path: ", balanced_h5_file)
        print("error: ", error)
        ## Eric commented out this line so things can be tested in this
        ## function without deleting tree files.
        # os.remove(balanced_h5_file)
    return row
def get_per_tree_vertex_dict(tree, config_obj):
    """Read precomputed per-vertex data for one balanced tree h5 file.

    This variant reads the stored 'weighted_component_list' dataset
    directly instead of recomputing the component-size weights (compare
    the earlier definition of the same name in this file, which this one
    shadows at import time).

    Returns an empty dict (and prints the error) if the file is unreadable.
    """
    balanced_h5_file = get_balanced_h5_path(config_obj, True) + tree
    row = {}
    try:
        # Context manager closes the h5 even when a read raises
        # (original leaked the handle on error).
        with h5py.File(balanced_h5_file, "r") as balanced_h5:
            row["component_list_with_tuples"] = list(
                balanced_h5["weighted_component_list"])
            row["component_list"] = list(balanced_h5["component_list"])
            row["vertex_neighbor_amt_list"] = list(
                balanced_h5["vertex_neighbor_amt_list"])
            row["mean_tree_vertex_deg"] = balanced_h5.attrs["mean_tree_vertex_deg"]
            row["stdev_tree_vertex_deg"] = balanced_h5.attrs["stdev_tree_vertex_deg"]
            row["pendant_vertices_c0"] = balanced_h5.attrs["pendant_vertices_c0"]
            row["pendant_vertices_c1"] = balanced_h5.attrs["pendant_vertices_c1"]
            row["pendant_vertices_total"] = balanced_h5.attrs["pendant_vertices_total"]
    except Exception as error:
        print(
            "Vertex DF error. The following tree is bad. ",
            tree,
            " - path: ",
            balanced_h5_file,
        )
        print("error: ", error)
    return row
def create_vertex_df_from_vertex_dict(to_be_df_dict, config_obj, trees_list,
                                      FULL_DF_PATH):
    """Assemble the per-vertex DataFrame across all trees and pickle it.

    Parameters
    ----------
    to_be_df_dict : dict
        Maps tree name -> per-tree vertex dict (see get_per_tree_vertex_dict).
    config_obj : dict
        Project config (tiebreak_node, weighted_status, machine, h5 paths).
    trees_list : list[str]
        Tree file names; trees_list[0] is used to read shared h5 attributes.
    FULL_DF_PATH : str
        Destination path for the pickled vertex DataFrame.

    Returns
    -------
    pandas.DataFrame or None
        The vertex df, or None when a tree is missing its component list.
    """
    try:
        # Building these frames doubles as validation that every tree has
        # its component lists; a missing key lands in the except below.
        components_dict = {
            tree: to_be_df_dict[tree]["component_list"] for tree in trees_list
        }
        component_df = pd.DataFrame(components_dict)
        component_dict_with_weights = {
            tree: to_be_df_dict[tree]["component_list_with_tuples"]
            for tree in trees_list
        }
        component_weights_df = pd.DataFrame(component_dict_with_weights)
    except Exception:
        # Original fell through here with component_weights_df unbound and
        # crashed below with a NameError; return early to actually skip.
        print("tree without component list found, moving to next.")
        return None

    balanced_h5_file = get_balanced_h5_path(config_obj, True) + trees_list[0]
    # `with` replaces the original try/finally and avoids the NameError the
    # finally-close raised when h5py.File itself failed to open.
    with h5py.File(balanced_h5_file, "r") as balanced_h5:
        vertex_df = add_c0_pct_and_outcomes(
            trees_list[0],
            balanced_h5,
            component_weights_df,
            config_obj["tiebreak_node"],
            config_obj["weighted_status"],
            config_obj["machine"],
        )
        pd.to_pickle(vertex_df, FULL_DF_PATH)
    return vertex_df
def get_balanced_file_list(config_obj, local_override=False):
    """Return the sorted file names in the balanced-h5 directory.

    Directories and macOS Finder metadata (".DS_Store") are excluded.
    """
    balanced_dir = get_balanced_h5_path(config_obj, local_override)
    candidates = (name for name in listdir(balanced_dir) if name != ".DS_Store")
    balanced_file_list = sorted(
        name for name in candidates if isfile(join(balanced_dir, name))
    )
    #print("Tree list Length: ", len(balanced_file_list))
    return balanced_file_list
def write_balanced_h5(
    config_obj,
    mapping_to_original,
    b_csr_unbal_adj_matrix,
    csr_st_adj_matrix,
    nx_st_graph,
    csr_bal_adj_matrix,
    changed_edge_list,
    tree_name,
):
    """Compute balance components for a spanning-tree graph and persist the
    matrices plus per-tree statistics to <balanced_h5_dir>/<tree_name>.h5.

    Best-effort: any failure is caught and reported, not raised. Nothing is
    written when the component list collapses to a single component.
    """
    bal_adj_path = get_balanced_h5_path(config_obj) + tree_name + ".h5"
    try:
        output_type = config_obj["machine"]
        start_component_list = datetime.now()
        component_list = get_balance_components(nx_st_graph)
        print_timing_output(
            "COMPONENT_LIST_GEN_TIME: Component List Acquired, took: (hh:mm:ss.ms)",
            datetime.now() - start_component_list,
            output_type,
        )
        # Only trees that split into more than one component are saved.
        if len(np.unique(component_list)) > 1:
            # `with` closes the h5 even if a stats writer raises below
            # (original leaked the open handle on error).
            with h5py.File(bal_adj_path, "w") as f:
                g = f.create_group("csr_st_adj_matrix")
                create_matrix_h5_file(g, csr_st_adj_matrix)
                h = f.create_group("csr_bal_adj_matrix")
                create_matrix_h5_file(h, csr_bal_adj_matrix)
                f.create_dataset("changed_edge_list", data=changed_edge_list)
                f.create_dataset("mapping_to_original", data=mapping_to_original)
                component_stats_start = datetime.now()
                write_component_stats(f, csr_bal_adj_matrix, csr_st_adj_matrix,
                                      component_list, config_obj)
                print_timing_output(
                    "COMPONENT_STATS_TIME: (hh:mm:ss:ms)",
                    datetime.now() - component_stats_start,
                    output_type,
                )
                write_vertex_degree_stats(f, csr_bal_adj_matrix,
                                          csr_st_adj_matrix, component_list)
                write_edge_stats(f, b_csr_unbal_adj_matrix, csr_bal_adj_matrix)
                write_balanced_attributes(f, config_obj, tree_name)
                if config_obj["has_labels"]:
                    ## writes c0 and c1 wins and losses to the existing h5
                    write_outcome_stats(f, config_obj, csr_bal_adj_matrix,
                                        component_list)
    except Exception as error:
        # Keep the best-effort contract, but surface what actually failed
        # (the original bare except hid the error entirely).
        print("Error writing balanced h5")
        print(error)
def write_balanced_h5(config_obj, mapping_to_original, b_csr_unbal_adj_matrix,
                      csr_st_adj_matrix, nx_st_graph, csr_bal_adj_matrix,
                      changed_edge_list, tree_name):
    """Compute balance components and persist matrices plus statistics to
    <balanced_h5_dir>/<tree_name>.h5, with verbose progress logging.

    NOTE(review): this duplicates (and shadows) the earlier definition of
    the same name; it logs instead of using print_timing_output and calls
    write_component_stats without config_obj — confirm which is intended.
    Exceptions propagate to the caller (unlike the earlier variant).
    """
    bal_adj_path = get_balanced_h5_path(config_obj) + tree_name + ".h5"
    print("Writing balanced h5: ", "(", tree_name, ", ",
          config_obj['dataset'], ", ", config_obj['data_subset_type'], ", ",
          config_obj['matrix_name'], ", ", config_obj['component_no'], ") ")
    try:
        start_component_list = datetime.now()
        component_list = get_balance_components(nx_st_graph)
        print('Component List Acquired, took: (hh:mm:ss.ms) {}'.format(
            datetime.now() - start_component_list))
        # Only trees that split into more than one component are saved.
        if len(np.unique(component_list)) > 1:
            # `with` closes the h5 even if a stats writer raises below
            # (the original finally-block only printed and leaked the handle).
            with h5py.File(bal_adj_path, 'w') as f:
                g = f.create_group("csr_st_adj_matrix")
                create_matrix_h5_file(g, csr_st_adj_matrix)
                h = f.create_group("csr_bal_adj_matrix")
                create_matrix_h5_file(h, csr_bal_adj_matrix)
                f.create_dataset('changed_edge_list', data=changed_edge_list)
                f.create_dataset('mapping_to_original', data=mapping_to_original)
                write_component_stats(f, csr_bal_adj_matrix, csr_st_adj_matrix,
                                      component_list)
                write_vertex_degree_stats(f, csr_bal_adj_matrix,
                                          csr_st_adj_matrix, component_list)
                write_edge_stats(f, b_csr_unbal_adj_matrix, csr_bal_adj_matrix)
                write_balanced_attributes(f, config_obj, tree_name)
                if config_obj['has_labels']:
                    ## writes c0 and c1 wins and losses to the existing h5
                    write_outcome_stats(f, config_obj, csr_bal_adj_matrix,
                                        component_list)
                print("Wrote balanced h5: ", "(", tree_name, ", ",
                      config_obj['dataset'], ", ",
                      config_obj['data_subset_type'], ", ",
                      config_obj['matrix_name'], ", ",
                      config_obj['component_no'], ")")
        else:
            print(
                "Component list had only one component. No Agreeable Minority exists for this tree. Tree not saved to h5."
            )
    finally:
        # Blank line separates per-tree log sections.
        print()