def save_data(self, res, out_dir=''):
    """Persist experiment results to disk as ``res.dat`` plus a CSV summary.

    Parameters
    ----------
    res : object
        Result payload; serialized verbatim by ``dump_data`` and converted
        to CSV by ``_dat2csv``.
        Layout (per original note): (data_file, model_name), (_best_res, _middle_res).
    out_dir : str
        Output directory; created (with parents) if missing.

    Returns
    -------
    str
        Path of the written ``res.dat`` file.
    """
    # exist_ok=True closes the race between the original pth.exists() check
    # and makedirs(): two concurrent runs could both see "missing" and one
    # would then crash with FileExistsError.
    os.makedirs(out_dir, exist_ok=True)

    out_file = pth.join(out_dir, 'res.dat')
    dump_data(res, out_file=out_file)

    try:
        _dat2csv(res, out_file=out_file + '.csv', feat_set=self.data_cfg['feat'])
    except Exception as e:
        # Best-effort CSV export: a CSV failure must not lose the .dat dump.
        print(f"Error({e})")

    return out_file
def _merge_models(feat, is_header, is_before_proj, is_gs, ds, covariance_type, dataset_name, model_name):
    """Merge the results of one (dataset, model) pair across every projection
    dimension in ``ds`` into a single .dat file and CSV.

    Parameters
    ----------
    feat :
        Feature-set name; used to build the output path.
    is_header :
        Whether packet headers were included (path component only here).
    is_before_proj :
        Whether results are taken before the projection step.
    is_gs :
        Whether grid search was used.
    ds :
        Iterable of ``(label, d)`` tuples, one per projection dimension.
    covariance_type :
        GMM covariance type; for OCSVM models ``None`` takes its slot.
    dataset_name :
        Name of the dataset whose results are merged.
    model_name :
        Model name; must contain 'OCSVM' or 'GMM', else NotImplementedError.

    Returns
    -------
        Path of the CSV written (the .dat dump is the same path + '.dat').

    NOTE(review): relies on a module-level ``in_dir`` for the output root —
    confirm it is defined before this function is called.
    """
    res_ = {}
    vs = []  # rows accumulated for the CSV, across all d values
    for i, d_tup in enumerate(ds):
        _, d = d_tup
        if "OCSVM" in model_name:
            # OCSVM has no covariance type, so None fills that cfg slot.
            pth_cfg = (feat, is_header, is_before_proj, is_gs, d, None, model_name)
            data = _merge_datasets([dataset_name], pth_cfg)
            res_[d_tup] = data
        elif 'GMM' in model_name:
            pth_cfg = (feat, is_header, is_before_proj, is_gs, d, covariance_type, model_name)
            data = _merge_datasets([dataset_name], pth_cfg)
            res_[d_tup] = data
        else:
            msg = d_tup
            raise NotImplementedError(msg)

        # store for csv
        if i == 0:
            # deep copy so the later extend() calls cannot mutate res_[d_tup]
            vs = copy.deepcopy(data)
        else:
            vs.extend(data)
    # print(vs)
    # e.g. 'feat-header_false-before_proj_False-gs_True-diag-std_False_center_False-d_5'
    out_file_ = pth.join(in_dir, feat + "-header_" + str(is_header),
                         "before_proj_" + str(is_before_proj) + "-gs_" + str(is_gs),
                         f"std_False_center_False-{str(covariance_type)}",
                         f'{dataset_name}-{model_name}.csv')
    print(f'data_models: {out_file_}')
    check_path(out_file_)
    out_file_dat = out_file_ + '.dat'
    dump_data(res_, out_file=out_file_dat)
    # save as csv
    pd.DataFrame(vs).to_csv(out_file_, index=False, encoding='utf-8-sig')
    # # save as xlsx
    # out_xlsx = dat2xlxs_new(out_file_dat, out_file=out_file_dat + '.xlsx', models=models)
    # # compute ratio OCSVM/GMM
    # out_xlsx_ratio = improvement(out_xlsx, feat_set=feat,
    #                              out_file=os.path.splitext(out_file_dat)[0] + '-ratio.xlsx')
    # print(out_xlsx)
    #
    # # for paper
    # out_latex = dat2latex(out_xlsx_ratio, out_file=os.path.splitext(out_file_dat)[0] + '-latex.xlsx')
    # print(out_latex)

    # show(in_file=out_file_)  # show model separately
    return out_file_
def main1(directions=(('direction', 'src_dst'),),
          feats=(('feat', 'iat_size'), ('feat', 'stats')),
          headers=(('is_header', True), ('is_header', False)),
          gses=(('is_gs', True), ('is_gs', False)),
          before_projs=(('before_proj', False),),
          ds=(('d_kjl', 5),),
          out_dir='speedup/out',
          train_sizes=(('train_size', 5000),),
          is_parallel=True):
    """Run every (dataset, model) configuration via ``_main`` and dump all
    results to ``{out_dir}/res.dat``.

    Parameters
    ----------
    directions, feats, headers, gses, before_projs, ds, train_sizes :
        Iterables of ``(key, value)`` tuples; their Cartesian product with
        DATASETS/MODELS defines the experiment grid.
        Defaults are tuples (not lists) to avoid the shared mutable-default
        pitfall; behavior is unchanged since they are only iterated.
    out_dir : str
        Directory for per-experiment output and the final ``res.dat``.
    is_parallel : bool
        Run the grid with joblib ``Parallel`` (True) or sequentially (False).
    """
    # Store all the results
    res = []

    # Get all datasets
    datasets = [('data_name', v) for v in DATASETS]
    datasets_cfg = list(itertools.product(datasets, directions, feats, headers))

    # Get all models
    models = [('model_name', v) for v in MODELS]
    models_cfg = list(itertools.product(models, gses, before_projs, ds, train_sizes))

    # Build the full combination list ONCE (the original materialized the
    # same itertools.product three times).
    combos = list(itertools.product(datasets_cfg, models_cfg))
    n_tot = len(combos)
    lg.info(f'n_tot: {n_tot}')
    for i, (data_cfg, model_cfg) in enumerate(combos):
        lg.info(f'{i}/{n_tot}, {dict(data_cfg)}, {dict(model_cfg)}')

    n_cpus = os.cpu_count()
    lg.info(f'n_cpus: {n_cpus}')

    if is_parallel:
        # Execute all experiments in parallel.
        parallel = Parallel(n_jobs=5, verbose=30)
        with parallel:
            res = parallel(delayed(_main)(dict(data_cfg), dict(model_cfg), out_dir)
                           for data_cfg, model_cfg in combos)
    else:
        # Run each combination in sequence.
        for i, (data_cfg, model_cfg) in enumerate(combos):
            res_, time_token = _main(dict(data_cfg), dict(model_cfg), out_dir)
            res.append(res_)
            lg.info(f'{i + 1}/{n_tot}, it takes {time_token:.5f}s')

    # Dump all results to disk
    dump_data(res, out_file=f'{out_dir}/res.dat')
    lg.info('\n\n***finish!')
def main(directions=(('direction', 'src_dst'),),
         feats=(('feat', 'iat_size'), ('feat', 'stats')),
         headers=(('is_header', True), ('is_header', False)),
         gses=(('is_gs', True), ('is_gs', False)),
         before_projs=(('before_proj', False),),
         ds=(('d_kjl', 5),),
         train_sizes=(('train_size', 5000),),
         k_qs=(('k_qs', 5000 ** (5 / 7)), ('k_qs', 5000 ** (2 / 3))),
         out_dir='speedup/out',
         is_parallel=True):
    """Run every (dataset, model) configuration — including the ``k_qs``
    quantization sizes — via ``_main`` and dump all results to
    ``{out_dir}/res.dat``.

    Parameters
    ----------
    directions, feats, headers, gses, before_projs, ds, train_sizes, k_qs :
        Iterables of ``(key, value)`` tuples; their Cartesian product with
        DATASETS/MODELS defines the experiment grid.
        Defaults are tuples (not lists) to avoid the shared mutable-default
        pitfall; behavior is unchanged since they are only iterated.
    out_dir : str
        Directory for per-experiment output and the final ``res.dat``.
    is_parallel : bool
        Run the grid with joblib ``Parallel`` (True) or sequentially (False).
    """
    # Store all the results
    res = []

    datasets = [('data_name', v) for v in DATASETS]
    datasets_cfg = list(itertools.product(datasets, directions, feats, headers))

    models = [('model_name', v) for v in MODELS]
    models_cfg = list(itertools.product(models, gses, before_projs, ds, train_sizes, k_qs))

    # Build the full combination list ONCE (the original materialized the
    # same itertools.product three times).
    combos = list(itertools.product(datasets_cfg, models_cfg))
    n_tot = len(combos)
    lg.info(f'n_tot: {n_tot}')
    for i, (data_cfg, model_cfg) in enumerate(combos):
        lg.info(f'{i}/{n_tot}, {dict(data_cfg)}, {dict(model_cfg)}')

    n_cpus = os.cpu_count()
    lg.info(f'n_cpus: {n_cpus}')

    if is_parallel:
        # NOTE (from original): parallel mode has been observed to be killed
        # by the server — n_jobs(=5) * _single_main(n_job=10) may be too much.
        parallel = Parallel(n_jobs=5, verbose=30)
        with parallel:
            res = parallel(delayed(_main)(dict(data_cfg), dict(model_cfg), out_dir)
                           for data_cfg, model_cfg in combos)
    else:
        # Run each combination in sequence. It's slow but it works.
        for i, (data_cfg, model_cfg) in enumerate(combos):
            res_, time_token = _main(dict(data_cfg), dict(model_cfg), out_dir)
            res.append(res_)
            lg.info(f'{i + 1}/{n_tot}, it takes {time_token:.5f}s')

    # dump all data to disk
    dump_data(res, out_file=f'{out_dir}/res.dat')
    lg.info('\n\n***finish!')
def save_each_result(data, case_str, out_file=None):
    """Write one experiment's one-line summary to ``out_file`` and dump the
    raw result next to it as ``<out_file stem>.dat``.

    Parameters
    ----------
    data : dict
        Must contain 'aucs', 'train_times', 'test_times' (sequences) and
        'params'; also consumed by ``_get_line``.
    case_str : str
        Label prefixed to the summary line.
    out_file : str
        Destination path for the summary text file. Required despite the
        ``None`` default (kept for signature compatibility).

    Raises
    ------
    ValueError
        If ``out_file`` is None (the original crashed later with TypeError).
    """
    if out_file is None:
        raise ValueError('out_file must be a file path, not None')

    # Create the parent directory race-free; skip when out_file is a bare
    # filename (dirname == ''), where makedirs('') would raise.
    parent = pt.dirname(out_file)
    if parent:
        os.makedirs(parent, exist_ok=True)

    # dump the raw result first, next to the text summary
    dump_data(data, pt.splitext(out_file)[0] + '.dat')

    with open(out_file, 'w') as f:
        aucs = data['aucs']
        train_times = data['train_times']
        test_times = data['test_times']
        params = data['params']
        # fixed local typo: _suffex -> _suffix
        _prefix, _line, _suffix = _get_line(data, feat_set='iat_size')

        aucs_str = "-".join([str(v) for v in aucs])
        train_times_str = "-".join([str(v) for v in train_times])
        test_times_str = "-".join([str(v) for v in test_times])

        line = f'{case_str}, {_prefix}, {_line}, => aucs:{aucs_str}, train_times:{train_times_str}, test_times:{test_times_str}, with params: {params}: {_suffix}'
        f.write(line + '\n')
def main1(DATASETS=(), MODELS=(),
          directions=(('direction', 'src_dst'),),
          feats=(('feat', 'iat_size'), ('feat', 'stats')),
          headers=(('is_header', True), ('is_header', False)),
          gses=(('is_gs', True), ('is_gs', False)),
          before_projs=(('before_proj', False),),
          ds=(('kjl_d', 5),),
          out_dir='speedup/out',
          train_sizes=(('train_size', 5000),),
          is_parallel=False):
    """Get all results on all datasets via ``single_main`` and save them to
    ``{out_dir}/res.dat``.

    NOTE(review): this redefines ``main1``; if the earlier ``main1`` in this
    module is still wanted, one of them should be renamed — confirm callers.

    Parameters
    ----------
    DATASETS, MODELS :
        Names of datasets / models to run. Defaults are tuples (not lists)
        to avoid the shared mutable-default pitfall; behavior is unchanged
        since they are only iterated.
    directions, feats, headers, gses, before_projs, ds, train_sizes :
        Iterables of ``(key, value)`` tuples defining the experiment grid.
    out_dir : str
        Directory for per-experiment output and the final ``res.dat``.
    is_parallel : bool
        Run the grid with joblib ``Parallel`` (True) or sequentially (False,
        the default, which yields correct train/test timings).
    """
    start = time.time()
    # Store all the results
    res = []

    ################################################################################################################
    # 1. Get all datasets
    datasets = [('data_name', v) for v in DATASETS]
    datasets_cfg = list(itertools.product(datasets, directions, feats, headers))

    ################################################################################################################
    # 2. Get all models
    models = [('model_name', v) for v in MODELS]
    models_cfg = list(itertools.product(models, gses, before_projs, ds, train_sizes))

    ################################################################################################################
    # 3. Build the combination list ONCE (the original materialized the same
    # itertools.product three times), then print each combination.
    combos = list(itertools.product(datasets_cfg, models_cfg))
    n_tot = len(combos)
    lg.info(f'n_tot: {n_tot}')
    for i, (data_cfg, model_cfg) in enumerate(combos):
        lg.info(f'{i}/{n_tot}, {dict(data_cfg)}, {dict(model_cfg)}')

    n_cpus = os.cpu_count()
    lg.info(f'n_cpus: {n_cpus}')

    ################################################################################################################
    # 4. Run experiments in parallel or serial
    if is_parallel:
        # With backend='loky' the time taken is less than serial; with
        # backend='multiprocessing' we get very similar time cost to serial.
        # The reason may be that the loky module manages a pool of workers
        # that can be re-used across time: it provides a robust and dynamic
        # implementation of the ProcessPoolExecutor and a
        # get_reusable_executor() which hides the pool management under the hood.
        with Parallel(n_jobs=-1, verbose=30, backend='multiprocessing') as parallel:
            res = parallel(delayed(single_main)(copy.deepcopy(dict(data_cfg_)),
                                                copy.deepcopy(dict(model_cfg_)),
                                                copy.deepcopy(out_dir))
                           for data_cfg_, model_cfg_ in combos)
    else:
        # Running each combination in serial to obtain correct train time and test time
        for i, (data_cfg_, model_cfg_) in enumerate(combos):
            res_, time_taken = single_main(copy.deepcopy(dict(data_cfg_)),
                                           copy.deepcopy(dict(model_cfg_)),
                                           copy.deepcopy(out_dir))
            res.append((res_, time_taken))
            lg.info(f'{i + 1}/{n_tot}, it takes {time_taken:.5f}s')

    ################################################################################################################
    # 5. Dump all results to disk
    dump_data(res, out_file=f'{out_dir}/res.dat')

    ################################################################################################################
    end = time.time()
    lg.info(f'\n***It takes {end - start:.5f}s to finish {n_tot} experiments!')
def save_result2(result, out_file):
    """Dump ``result`` to disk as ``<out_file stem>.dat`` via ``dump_data``."""
    stem, _ext = pth.splitext(out_file)
    dump_data(result, stem + '.dat')