def convert_huffner():
    """Convert all non-blacklisted graphs in the original/huffner/ directory."""
    # Define some directories-of-interest paths
    original_dir = Path('.') / 'data' / 'original'
    preprocessed_dir = Path('.') / 'data' / 'preprocessed'

    # Huffner files we don't preprocess
    blacklist = ['aa12', 'j12', 'j27']

    # Identify the Huffner data
    data_names = sorted(
        filter(lambda n: n not in blacklist,
               names_in_dir(original_dir / 'huffner', '.graph')))
    print('Identified {} Huffner files'.format(len(data_names)))

    # Convert datasets
    for dataset in data_names:
        print('Processing', dataset)
        start_time = time.time()

        # Process the graph
        graph = read_huffner(original_dir / 'huffner', dataset)
        oct_set = set()
        graph_reduced = True
        while graph_reduced:
            # Require a change for graph_reduced to be triggered again
            graph_reduced = False

            # Compute OCT reductions
            print("- Computing OCT reduction")
            graph = reset_labels(graph)
            changed, graph, oct_set = oct_reductions(graph, oct_set)
            if changed:
                print("-- OCT reduced graph")
                graph_reduced = True

            # Compute VC reductions
            print("- Computing VC reduction")
            graph = reset_labels(graph)
            write_snap(graph, preprocessed_dir / 'snap')
            changed, graph, oct_set = vc_reductions(graph, oct_set)
            if changed:
                print("-- VC reduced graph")
                graph_reduced = True

        total_time = time.time() - start_time
        print('Preprocessing `{}` took {} seconds'.format(
            dataset, round(total_time, 1)))

        # Write the results
        graph = reset_labels(graph)
        write_summary(graph, preprocessed_dir / 'summary', 'huffner.csv')
        write_oct_set(graph, oct_set, preprocessed_dir / 'oct')
        write_name_lookup(graph, preprocessed_dir / 'lookup')
        write_edgelist(graph, preprocessed_dir / 'edgelist')
        write_huffner(graph, preprocessed_dir / 'huffner')
        write_snap(graph, preprocessed_dir / 'snap')
    print('Preprocessed Huffner data')
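

# `names_in_dir` is imported from elsewhere in this repo. Based on its use
# above (the '.graph' extension is passed separately, and _sanitize_huffner
# below re-appends it), a minimal sketch might return the extension-stripped
# file names in a directory; this stand-in is hypothetical and the real
# helper may differ in signature and return type.
def _names_in_dir_sketch(directory, extension):
    """Hypothetical stand-in for names_in_dir(directory, extension)."""
    from pathlib import Path
    return [path.stem for path in Path(directory).glob('*' + extension)]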


def _sanitize_select_gka(original_dir, sanitized_dir, data_names):
    """Sanitize select graphs in the original/gka/ directory."""
    for dataset in data_names:
        # Sanitize the graph and write
        print('Sanitizing', dataset)
        graph = read_beasley(original_dir / 'gka', dataset + '.txt')
        graph = reset_labels(graph)
        write_edgelist(graph, sanitized_dir / 'edgelist')
        write_huffner(graph, sanitized_dir / 'huffner')
        write_snap(graph, sanitized_dir / 'snap')
    print('Sanitized GKA data')


def convert_select_gka(data_names):
    """Convert select graphs in the original/gka/ directory."""
    # Define some directories-of-interest paths
    original_dir = Path('.') / 'data' / 'original'
    preprocessed_dir = Path('.') / 'data' / 'preprocessed'

    # Remove the old statistics CSV
    summary_filename = preprocessed_dir / 'summary' / 'gka.csv'
    if summary_filename.is_file():
        summary_filename.unlink()

    # Convert datasets
    for dataset in data_names:
        print('Processing', dataset)
        start_time = time.time()

        # Process the graph
        graph = read_beasley(original_dir / 'gka', dataset)
        oct_set = set()
        graph_reduced = True
        while graph_reduced:
            # Require a change for graph_reduced to be triggered again
            graph_reduced = False

            # Compute OCT reductions
            print("- Computing OCT reduction")
            graph = reset_labels(graph)
            changed, graph, oct_set = oct_reductions(graph, oct_set)
            if changed:
                print("-- OCT reduced graph")
                graph_reduced = True

            # Compute VC reductions
            print("- Computing VC reduction")
            graph = reset_labels(graph)
            write_snap(graph, preprocessed_dir / 'snap')
            changed, graph, oct_set = vc_reductions(graph, oct_set)
            if changed:
                print("-- VC reduced graph")
                graph_reduced = True

        total_time = time.time() - start_time
        print('Preprocessing `{}` took {} seconds'.format(
            dataset, round(total_time, 1)))

        # Write the results
        graph = reset_labels(graph)
        write_summary(graph, preprocessed_dir / 'summary', 'gka.csv')
        write_oct_set(graph, oct_set, preprocessed_dir / 'oct')
        write_name_lookup(graph, preprocessed_dir / 'lookup')
        write_edgelist(graph, preprocessed_dir / 'edgelist')
        write_huffner(graph, preprocessed_dir / 'huffner')
        write_snap(graph, preprocessed_dir / 'snap')
    print('Preprocessed GKA data')
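

# Both reduction routines used above share one contract: given the current
# graph and the partial OCT set, they return a (changed, graph, oct_set)
# triple, where `changed` reports whether any reduction rule fired (this is
# what drives the fixed-point loop). A hypothetical no-op that satisfies
# the same interface:
def _identity_reduction_sketch(graph, oct_set):
    """Hypothetical stand-in matching the oct_reductions/vc_reductions API."""
    return False, graph, oct_set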


def _sanitize_select_beasley(original_dir, sanitized_dir, data_names):
    """Sanitize select graphs in the original/beasley/ directory."""
    for dataset in data_names:
        # Sanitize the graph and write
        print('Sanitizing', dataset)
        graph = read_beasley(original_dir / 'beasley', dataset + '.txt')
        graph = reset_labels(graph)
        write_edgelist(graph, sanitized_dir / 'edgelist')
        write_huffner(graph, sanitized_dir / 'huffner')
        write_snap(graph, sanitized_dir / 'snap')
    print('Sanitized Beasley data')


def _sanitize_huffner(original_dir, sanitized_dir):
    """Sanitize all graphs in the original/huffner/ directory."""
    # Identify the Huffner data
    data_names = sorted(names_in_dir(original_dir / 'huffner', '.graph'))
    print('Identified {} Huffner files'.format(len(data_names)))

    # Convert datasets
    for dataset in data_names:
        # Sanitize the graph and write
        print('Sanitizing', dataset)
        graph = read_huffner(original_dir / 'huffner', dataset + '.graph')
        graph = reset_labels(graph)
        write_edgelist(graph, sanitized_dir / 'edgelist')
        write_huffner(graph, sanitized_dir / 'huffner')
        write_snap(graph, sanitized_dir / 'snap')
    print('Sanitized Huffner data')
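

# `reset_labels` is imported from elsewhere in the repo. Judging from its
# use above (always called before writing, with its return value reassigned),
# a minimal networkx-based sketch could relabel vertices to 0..n-1 while
# remembering the original names; the real implementation may track more
# state, so treat this stand-in as hypothetical.
def _reset_labels_sketch(graph):
    """Hypothetical stand-in for reset_labels(graph)."""
    import networkx as nx
    return nx.convert_node_labels_to_integers(
        graph, first_label=0, label_attribute='original_name')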


def _convert_quantum(data_names):
    """Preprocess select sanitized graphs into data/preprocessed/."""
    # Define some directories-of-interest paths
    input_dir = Path('.') / 'data' / 'sanitized'
    output_dir = Path('.') / 'data' / 'preprocessed'

    # Remove the old statistics CSV
    summary_dir = output_dir / 'summary'
    summary_filename = summary_dir / 'quantum.csv'
    if summary_filename.is_file():
        summary_filename.unlink()
    else:
        summary_dir.mkdir(exist_ok=True, parents=True)
    _write_summary_header(summary_filename)

    # Convert datasets
    for dataset in data_names:
        timestamp = datetime.datetime.fromtimestamp(
            time.time()).strftime('%Y/%m/%d-%H:%M:%S:')
        print('{} Processing {}'.format(timestamp, dataset))

        # Process the graph
        graph = read_edgelist(input_dir / 'edgelist', dataset)
        graph = reset_labels(graph)
        graph.graph['original_vertices'] = graph.order()
        graph.graph['original_edges'] = graph.size()
        oct_set = set()
        graph_reduced = True
        while graph_reduced:
            # Require a change for graph_reduced to be triggered again
            graph_reduced = False

            # Compute OCT reductions
            print("- Computing OCT reduction")
            graph = reset_labels(graph)
            changed, graph, oct_set = oct_reductions(graph, oct_set)
            if changed:
                print("-- OCT reduced graph")
                graph_reduced = True

            # Compute VC reductions
            print("- Computing VC reduction")
            graph = reset_labels(graph)
            write_snap(graph, output_dir / 'snap')
            changed, graph, oct_set = vc_reductions(graph, oct_set)
            if changed:
                print("-- VC reduced graph")
                graph_reduced = True

        # Write the results
        graph = reset_labels(graph)
        _write_summary(graph, output_dir / 'summary', 'quantum.csv')
        _write_oct_set(graph, oct_set, output_dir / 'oct')
        _write_name_lookup(graph, output_dir / 'lookup')
        write_edgelist(graph, output_dir / 'edgelist')
        write_huffner(graph, output_dir / 'huffner')
        write_snap(graph, output_dir / 'snap')
    print('Finished preprocessing quantum data')
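

# `_write_summary_header` only needs to seed the CSV that `_write_summary`
# later appends rows to. A minimal sketch, assuming (hypothetically) that
# the summary records the original and reduced graph sizes; the real column
# set is defined wherever `_write_summary` lives.
def _write_summary_header_sketch(summary_filename):
    """Hypothetical stand-in for _write_summary_header(filename)."""
    with open(summary_filename, 'w') as outfile:
        outfile.write('name,original_vertices,original_edges,'
                      'vertices,edges,oct\n')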


# Keep only the non-synthetic data
datasets = sorted(filter(lambda x: '-' not in x, datasets))

# Read in the pre-computed optimal OCT sizes
oct_upper_bound = _populate_oct_upper_bound_lookup()

# For every dataset and seed, generate a synthetic graph with each model
for dataset, seed in product(datasets, args.seeds):
    print('For {} and seed {}'.format(dataset, seed))

    # Generate the sanitized ER random graph
    print('- Generating Erdos-Renyi')
    graph = read_edgelist(input_dir, dataset + '.edgelist')
    er_graph = _generate_er(graph, seed)
    er_graph = reset_labels(er_graph)

    # Write the graph
    write_edgelist(er_graph, sanitized_dir / 'edgelist')
    write_huffner(er_graph, sanitized_dir / 'huffner')
    write_snap(er_graph, sanitized_dir / 'snap')

    # Generate the sanitized CL random graph
    print('- Generating Chung-Lu')
    graph = read_edgelist(input_dir, dataset + '.edgelist')
    cl_graph = _generate_cl(graph, seed)
    cl_graph = reset_labels(cl_graph)

    # Write the graph
    write_edgelist(cl_graph, sanitized_dir / 'edgelist')
    write_huffner(cl_graph, sanitized_dir / 'huffner')
    write_snap(cl_graph, sanitized_dir / 'snap')

    # Generate the sanitized BA random graph
    print('- Generating Barabasi-Albert')
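

# `_generate_er` and `_generate_cl` are defined elsewhere. Minimal sketches
# of the standard constructions (hypothetical; the repo's versions may
# parameterize differently): an Erdos-Renyi G(n, m) graph matching the
# input's vertex and edge counts, and a Chung-Lu graph matching its
# degree sequence.
import networkx as nx


def _generate_er_sketch(graph, seed):
    """Hypothetical stand-in for _generate_er(graph, seed)."""
    return nx.gnm_random_graph(graph.order(), graph.size(), seed=seed)


def _generate_cl_sketch(graph, seed):
    """Hypothetical stand-in for _generate_cl(graph, seed)."""
    degrees = [degree for _, degree in graph.degree()]
    return nx.expected_degree_graph(degrees, seed=seed, selfloops=False)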