def parallel_computation(input_path, dataset, model):
    """
    Run sublevel_parallel_computation once per file found in
    <input_path>/<dataset>/<model>, using a small process pool.

    :param input_path: root directory of the input data
    :param dataset: dataset name (sub-directory of input_path)
    :param model: model name (sub-directory of the dataset directory)
    :return: the tuple (model, dataset)
    """
    path = os.path.join(input_path, dataset, model)
    # only regular files count as tasks; the workers receive the file *index*
    number_of_files = sum(1 for f in listdir(path) if isfile(join(path, f)))
    n_threads = 2
    # `total=` must be passed by keyword: tqdm's first positional argument is
    # the iterable to wrap, not the expected number of updates
    pbar_inner = tqdm(total=number_of_files)

    def pbar_update(result):
        # apply_async callback: invoked in the parent process when a task succeeds
        pbar_inner.update()
        pbar_inner.set_postfix_str(result)

    async_results = []
    try:
        with mp.Pool(n_threads) as inner_pool:
            ColorPrint.print_green(
                f"Starting Pool with {n_threads} threads with {number_of_files} tasks."
            )
            for idx in range(number_of_files):
                pending = inner_pool.apply_async(
                    sublevel_parallel_computation,
                    [input_path, dataset, model, idx],
                    callback=pbar_update)
                async_results.append(pending)
            # block until every task is done before the pool is torn down;
            # wait() does not re-raise worker exceptions, so failed tasks are
            # skipped on a best-effort basis
            for pending in async_results:
                try:
                    pending.wait()
                except Exception:
                    continue
    finally:
        pbar_inner.close()
    return model, dataset
def main() -> None:
    """
    Entry point: parse the command line and run one infinity-mirror job per
    trial (trials are numbered from 1) on the requested number of cores.
    """
    cli_args = parse_args()
    num_jobs = int(cli_args.cores[0])
    num_trials = int(cli_args.trials[0])
    CP.print_green(
        f'Running infinity mirror on {num_jobs} cores for {num_trials} trials')

    worker_pool = Parallel(n_jobs=num_jobs, backend='multiprocessing')
    trial_jobs = (
        delayed(run_infinity_mirror)(trial=trial_number, args=cli_args)
        for trial_number in range(1, num_trials + 1)
    )
    worker_pool(trial_jobs)
def write_stats_jsons(self, stats: Union[str, list], overwrite: bool = False) -> None:
    """
    Write each requested statistic as a compressed json file.

    Files are written to
    <imt output dir>/graph_stats/<dataset>/<model>/<statistic>/gs_<trial>_<iteration>.json.gz

    :param stats: a statistic name or a list of names; every name must be a
        public callable attribute of this object
    :param overwrite: when False, a statistic whose output file already passes
        verify_file is skipped
    :return: None
    """
    # standardize incoming type
    if isinstance(stats, str):
        stats = [stats]

    # computed once: the set of public callables does not depend on the statistic
    public_methods = {
        method_name for method_name in dir(self)
        if callable(getattr(self, method_name))
        and not method_name.startswith('_')
    }

    for statistic in stats:
        assert statistic in public_methods, f'Unknown statistic: {statistic!r}'

        output_directory = get_imt_output_directory()
        file_output_directory = os.path.join(output_directory, 'graph_stats',
                                             self.dataset, self.model,
                                             statistic)
        ensure_dir(file_output_directory, recursive=True)
        filename = os.path.join(
            file_output_directory,
            f'gs_{self.trial}_{self.iteration}.json.gz')

        # if the file already exists and overwrite flag is not set, then don't rework.
        # Only this statistic is skipped; the remaining ones are still processed.
        if not overwrite and verify_file(filename):
            CP.print_green(
                f'Statistic: {statistic} output file for {self.model}-{self.dataset}-{self.trial} already exists. Skipping.'
            )
            continue

        # Pre-bind so the failure handler below never hits an unbound name
        # when computing the statistic itself raises.
        data = None
        try:
            data = self[statistic]  # todo : maybe there's a better way?!
            save_zipped_json(data, filename)
            CP.print_blue(f'Stats json stored at {filename}')
        except Exception as e:
            CP.print_red(f'Exception occurred on {filename}!')
            CP.print_red(str(e))
            if statistic == 'netlsd':
                # leave a '.failed' file next to the expected output; it holds
                # whatever was computed (None if the computation failed)
                save_zipped_json(data, filename + '.failed')
model = 'GCN_AE' if model == 'Linear': model = 'Linear_AE' #for dataset in datasets: # for model in models: # input_directory = f"/data/infinity-mirror/stats/pgd/" # input_filenames = [input_directory + f for f in listdir(input_directory) if # isfile(join(input_directory, f)) and f'{dataset}_{model}_pgd_full' in f] graph_dists = defaultdict(defaultdict) #if len(input_filenames) != 1: # ColorPrint.print_red(f'There is file inconsistancy for {dataset} using {model} \n') # exit() #filename = input_filenames[0] data = pd.read_csv(os.path.join(subdir, filename), sep="\t") ColorPrint.print_green(f'Loaded {filename}') data['trial'] = data['trial'].apply( lambda x: int(str(x).strip('.pkl.gz'))) original_data = data.loc[(data.gen == 0) & (data.trial == 1)] original_data = original_data.drop(['model', 'gen', 'trial'], axis=1) original_data = original_data.to_numpy()[0] org_max = original_data.max() results = defaultdict(defaultdict) for chain_id in [x for x in data.trial.unique() if x != 1]: for gen_id in [ x for x in data.loc[data.trial == chain_id].gen.unique() if x != 0
def run(self, use_pickle: bool) -> None:
    """
    New runner - uses list of graphs.

    Starting from the last available graph, repeatedly updates self.model on
    the current graph and generates the next one, until self.num_generations
    generations exist. After every generation the graph list and the feature
    list are checkpointed to '_temp_<level>' pickles and the previous
    checkpoint is deleted. When the last generation is reached, the
    checkpoints are removed and the final pickles are written.

    :param use_pickle: when True, return immediately if the completed pickle
        exists; otherwise resume from this trial's partial pickle if one is
        found next to it
    :return: None
    """
    pickle_ext = '.pkl.gz'
    self.graphs = []

    if use_pickle:
        if check_file_exists(self.graphs_pickle_path + pickle_ext):  # the whole pickle exists
            # NOTE(review): the loaded list is only validated, it is not
            # stored on self before returning - confirm this is intended.
            graphs = load_pickle(self.graphs_pickle_path + pickle_ext)
            assert len(graphs) == self.num_generations + 1, \
                f'Expected {self.num_generations + 1} graphs, found {len(graphs)}'
            CP.print_green(
                f'Using completed pickle at {self.graphs_pickle_path + pickle_ext!r}. Loaded {len(graphs)} graphs'
            )
            return
        else:
            # raw string + escaped literals: '\d' must reach the regex engine
            # untouched and the dots of the extension must match literally
            temp_file_pattern = re.compile(
                rf'list_(\d+)_{re.escape(str(self.trial))}_temp_(\d+){re.escape(pickle_ext)}')
            dir_name = os.path.dirname(self.graphs_pickle_path) or '.'

            # fullmatch (not match) so that every accepted name is guaranteed
            # to yield groups below
            input_files = [
                f for f in os.listdir(dir_name)
                if temp_file_pattern.fullmatch(f)
            ]
            if len(input_files) > 0:
                assert len(input_files) == 1, \
                    f'More than one matches found: {input_files}'

                input_file = input_files[0]
                total_generations, progress = map(
                    int, temp_file_pattern.fullmatch(input_file).groups())
                graphs = load_pickle(join(dir_name, input_file))
                assert len(graphs) == progress + 1, \
                    f'Found {len(graphs)}, expected: {progress + 1}'
                CP.print_blue(
                    f'Partial pickle found at {input_file!r} trial: {self.trial} progress: {progress}/{total_generations}'
                )
                self.graphs = graphs
                # NOTE(review): self.features is not reloaded from its temp
                # pickle here - presumably initialised elsewhere; confirm.

    # graphs[0] is the initial graph, so k stored graphs mean k - 1 finished generations
    completed_generations = max(len(self.graphs) - 1, 0)
    remaining_generations = self.num_generations - completed_generations

    tqdm.write(
        f'Running Infinity Mirror on {self.initial_graph.name!r} {self.initial_graph.order(), self.initial_graph.size()} {self.model.model_name!r} {remaining_generations} generations'
    )
    pbar = tqdm(total=remaining_generations,
                bar_format='{l_bar}{bar}|[{elapsed}<{remaining}]',
                ncols=50)

    if len(self.graphs) == 0:
        self.initial_graph.level = 0
        self.graphs = [self.initial_graph]
        self.features = [None]

    completed_trial = False
    for level in range(len(self.graphs), self.num_generations + 1):
        curr_graph = self.graphs[-1]  # use the last graph

        try:
            fit_time_start = time.perf_counter()
            self.model.update(new_input_graph=curr_graph)  # update the model
            fit_time = time.perf_counter() - fit_time_start
        except Exception as e:
            print(f'Model fit failed {e}')
            break

        try:
            gen_time_start = time.perf_counter()
            generated_graphs = self.model.generate(
                num_graphs=self.num_graphs,
                gen_id=level)  # generate a new set of graphs
            gen_time = time.perf_counter() - gen_time_start
        except Exception as e:
            print(f'Generation failed {e}')
            break

        if self.features:
            self.features.append(self.model.params)
        curr_graph = generated_graphs[0]  # we are only generating one graph
        curr_graph.name = f'{self.initial_graph.name}_{level}_{self.trial}'
        curr_graph.gen = level
        self.graphs.append(curr_graph)

        temp_pickle_path = self.graphs_pickle_path + f'_temp_{level}{pickle_ext}'
        prev_temp_pickle_path = self.graphs_pickle_path + f'_temp_{level-1}{pickle_ext}'

        temp_features_path = self.graphs_features_path + f'_temp_{level}{pickle_ext}'
        prev_temp_features_path = self.graphs_features_path + f'_temp_{level-1}{pickle_ext}'

        # write the new checkpoint before removing the previous one, so a
        # checkpoint always exists on disk
        save_pickle(obj=self.graphs, path=temp_pickle_path)
        save_pickle(obj=self.features, path=temp_features_path)

        delete_files(prev_temp_pickle_path)
        delete_files(prev_temp_features_path)

        self.write_timing_csv(iter_=level,
                              fit_time=fit_time,
                              gen_time=gen_time)

        if level == self.num_generations:
            completed_trial = True
        pbar.update(1)
    pbar.close()

    if completed_trial:  # only delete the temp pickle if the trial finishes successfully
        delete_files(temp_pickle_path)  # delete the temp file if the loop finishes normally
        delete_files(temp_features_path)  # delete the temp file if the loop finishes normally
        CP.print_green(
            f'List of {len(self.graphs)} Graphs is pickled at "{self.graphs_pickle_path + pickle_ext}"'
        )
        save_pickle(obj=self.graphs,
                    path=self.graphs_pickle_path + pickle_ext)
        save_pickle(obj=self.features,
                    path=self.graphs_features_path + pickle_ext)
'lambda_distance': 'laplacian_eigenvalues' } # datasets = ['clique-ring-500-4', 'eucore', 'flights', 'tree'] models = ['Chung-Lu', 'CNRG', 'SBM', 'Erdos-Renyi', 'BUGGE', 'HRG'] # models = ['BTER', 'BUGGE', 'Chung-Lu', 'CNRG', 'Erdos-Renyi', 'Kronecker', 'SBM', 'GCN_AE', 'Linear_AE'] #stats = ['pagerank_js', 'degree_js', 'pgd_distance', 'netlsd_distance', 'lambda_distance', 'portrait_divergence'] stats = ['degree_js', 'pagerank_js', 'lambda_distance'] # datasets, models, trials, filenames = walker() datasets = ['cond-mat', 'enron'] for dataset in datasets: for model in models: for stat in stats: ColorPrint.print_green( f'computing {stat} distances for {dataset} {model}') trials = walker_texas_ranger(dataset, model, stat=implemented_metrics[stat], unique=True) args = [[dataset, model, trial, stat] for trial in trials] print(args[:5]) # exit(-1) try: results = parallel_async(distance_computation, args, num_workers=10) df = pd.concat(results) except Exception as e: ColorPrint.print_red( f'Error, for {dataset!r} {model!r} {stat!r}')