def load_df_from_all_folders_matching_list_of_patterns(
        list_of_path_patterns=None,
        legend_name=None,
        y_ind=0,
        column_names=None,
        query_str=None,
        task_ids=None,
        **kwargs):
    pprint(">>> BEGIN load_df_from_all_folders_matching_list_of_patterns")
    list_of_match_df = list()
    for path_pattern in list_of_path_patterns:
        cur_alg_df = load_df_from_all_folders_that_match_pattern(
            path_pattern,
            y_ind=y_ind,
            task_ids=task_ids,
            column_names=column_names)
        if query_str is not None:
            cur_alg_df = cur_alg_df.query(query_str).copy()
        # Append to list of all matching dataframes
        list_of_match_df.append(cur_alg_df)
    # Create all-matching DataFrame
    all_matching_runs_df = pd.concat(list_of_match_df)
    pprint("<<< END load_df_from_all_folders_matching_list_of_patterns")
    return all_matching_runs_df

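# Example usage (hypothetical result paths; assumes each matched folder
# contains the per-task CSV files that
# load_df_from_all_folders_that_match_pattern expects):
#
# df = load_df_from_all_folders_matching_list_of_patterns(
#     list_of_path_patterns=[
#         '/results/mydataset/slda-K=25*',
#         '/results/mydataset/slda-K=50*'],
#     query_str="SPLIT_NAME == 'VALID'",
#     task_ids=range(1, 6))
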
def make_best_task_df(df,
                      target_query="SPLIT_NAME == 'VALID' and LAP > 50",
                      score_colname='Y_ERROR_RATE',
                      score_ranking_func=np.argmin,
                      default_score=None,
                      verbose=False):
    ''' Find best task for each unique job in provided df.

    Returns
    -------
    best_df : dataframe of best tasks for each unique job
    '''
    if default_score is None:
        default_score = fetch_default_score(score_ranking_func.__name__)
    best_task_df_list = list()
    job_paths = np.unique(df['JOB_PATH'].values)
    for job_path in job_paths:
        if job_path is None:
            continue
        job_df = df.query("JOB_PATH == '%s'" % job_path)
        taskids = np.unique(job_df['TASKID'].values)
        best_score_idx = np.zeros_like(taskids, dtype=np.int32)
        best_score = default_score * np.ones_like(taskids, dtype=np.float64)
        for tt, taskidstr in enumerate(taskids):
            task_df = job_df.query(
                target_query + " and TASKID == '%s'" % taskidstr)
            if task_df.shape[0] < 1:
                continue
            if not np.all(np.isfinite(task_df[score_colname].values)):
                pprint(task_df[score_colname].values)
            best_score_idx[tt] = score_ranking_func(
                task_df[score_colname].values)
            best_score[tt] = task_df[score_colname].values[best_score_idx[tt]]
        best_task_idx = score_ranking_func(best_score)
        best_task_df = job_df.query("TASKID == '%s'" % taskids[best_task_idx])
        best_task_df_list.append(best_task_df)
        if verbose:
            pprint(job_path)
            pprint("best task: %s" % best_task_idx)
    return pd.concat(best_task_df_list)

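# Sketch of a typical call (column names follow the defaults above; the df
# is assumed to carry JOB_PATH, TASKID, SPLIT_NAME, LAP, and the chosen
# score column):
#
# best_df = make_best_task_df(
#     df,
#     target_query="SPLIT_NAME == 'VALID' and LAP > 50",
#     score_colname='Y_ERROR_RATE',
#     score_ranking_func=np.argmin)
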
def minimize(
        loss_func_wrt_paramvec_and_step=None,
        grad_func_wrt_paramvec_and_step=None,
        save_func_wrt_param_dict=None,
        callback_func_wrt_param_dict=None,
        callback_kwargs=None,
        param_tfm_manager=None,
        dim_P=None,
        init_param_dict=None,
        step_direction='steepest',
        step_size=0.01,
        decay_rate=1.0,
        decay_interval=25,
        decay_staircase=0,
        b1=0.9,
        b2=0.999,
        eps=1e-8,
        max_l2_norm_of_grad_per_entry=10.0,
        **kwargs):
    pprint('[grad_descent_minimizer] Begin training...')
    pprint('--step_direction %s' % step_direction)
    pprint('--step_size %.3f' % step_size)
    pprint('--decay_rate %.3f' % decay_rate)

    # Parse user input
    step_direction = str(step_direction)
    assert step_direction in ['adam', 'steepest']
    step_size = float(step_size)
    decay_rate = float(decay_rate)
    decay_staircase = int(decay_staircase)
    decay_interval = float(decay_interval)
    b1 = float(b1)
    b2 = float(b2)
    eps = float(eps)

    # Convert provided common param dict
    # to a flat 1D array with unconstrained values
    param_vec = param_tfm_manager.flatten_to_differentiable_param_vec(
        init_param_dict, **dim_P)

    # Warmup
    start_time_sec = time.time()
    init_loss_val = loss_func_wrt_paramvec_and_step(param_vec, step_id=0)
    loss_eval_time_sec = time.time() - start_time_sec
    pprint("Loss @ init: %8.3f sec | val %.6e" % (
        loss_eval_time_sec, init_loss_val))
    pprint("Params @ init: %8s | %5d params | l2 norm / entry %.4e" % (
        ' ', param_vec.size, calc_l2_norm_of_vector_per_entry(param_vec)))

    start_time_sec = time.time()
    init_grad_vec = grad_func_wrt_paramvec_and_step(param_vec, step_id=0)
    elapsed_time_sec = time.time() - start_time_sec
    init_grad_norm_per_entry = calc_l2_norm_of_vector_per_entry(init_grad_vec)
    pprint("Gradient @ init: %8.3f sec | %5d params | l2 norm / entry %.4e" % (
        elapsed_time_sec, init_grad_vec.size, init_grad_norm_per_entry))

    # Create settings that track algorithm state
    # cur_step, cur_lap, n_laps, n_steps, etc.
    alg_state_kwargs = init_alg_state_kwargs(
        cur_step=0.0,
        **kwargs)
    n_steps = alg_state_kwargs['n_steps']
    if 'output_path' in alg_state_kwargs:
        laps_to_save_str, steps_to_save_str = calc_laps_when_snapshots_saved(
            return_str=True,
            keep_first=5,
            keep_last=5,
            **alg_state_kwargs)
        pprint("Snapshots will be saved at intervals:")
        pprint(" laps: %s" % laps_to_save_str)
        pprint(" steps: %s" % steps_to_save_str)
        pprint("Snapshot saved to --output_path:\n%s" % (
            alg_state_kwargs['output_path']))

    # Adam estimates of gradient mean/variance
    m = np.zeros_like(param_vec)
    v = np.zeros_like(param_vec)

    cur_step_size = step_size
    cur_loss_val = init_loss_val
    cur_grad_norm_per_entry = init_grad_norm_per_entry
    for step_id in xrange(0, n_steps + 1):
        if step_id > 0:
            grad_vec = grad_func_wrt_paramvec_and_step(
                param_vec, step_id=step_id)
            cur_grad_norm_per_entry = calc_l2_norm_of_vector_per_entry(
                grad_vec)
            assert np.isfinite(cur_grad_norm_per_entry)
            if cur_grad_norm_per_entry > max_l2_norm_of_grad_per_entry:
                warn_msg = (
                    'WARNING: clipping gradient enforced.'
                    + '\n cur l2 norm / entry = %.2e'
                    + '\n new l2 norm / entry = %.2e')
                pprint(warn_msg % (
                    cur_grad_norm_per_entry, max_l2_norm_of_grad_per_entry))
                grad_vec *= (
                    max_l2_norm_of_grad_per_entry / cur_grad_norm_per_entry)
                cur_grad_norm_per_entry = calc_l2_norm_of_vector_per_entry(
                    grad_vec)
                assert (cur_grad_norm_per_entry
                        <= max_l2_norm_of_grad_per_entry)

            # Decay learning rate, like tensorflow's exponential decay
            if decay_staircase:
                cur_step_count = int(step_id) // int(decay_interval)
            else:
                cur_step_count = float(step_id) / float(decay_interval)
            cur_step_size = step_size * decay_rate ** (cur_step_count)

            if step_direction == 'adam':
                g = grad_vec
                m = (1 - b1) * g + b1 * m       # First moment estimate.
                v = (1 - b2) * (g**2) + b2 * v  # Second moment estimate.
                mhat = m / (1 - b1**(step_id))  # Bias correction.
                vhat = v / (1 - b2**(step_id))
                step_vec = -1.0 * cur_step_size * mhat / (np.sqrt(vhat) + eps)
            elif step_direction.count('steep'):
                step_vec = -1.0 * cur_step_size * grad_vec
            else:
                raise ValueError(
                    "Unrecognized step_direction: %s" % step_direction)
            param_vec = param_vec + step_vec
            assert np.all(np.isfinite(param_vec))

            # Update step counter, timer, etc.
            alg_state_kwargs = update_alg_state_kwargs(
                **alg_state_kwargs)

        if do_print_now(**alg_state_kwargs):
            cur_loss_val = loss_func_wrt_paramvec_and_step(
                param_vec, step_id=step_id)
            pprint(make_status_string(
                cur_loss_val=cur_loss_val,
                cur_grad_norm_per_entry=cur_grad_norm_per_entry,
                **alg_state_kwargs))
            save_status_to_txt_files(
                cur_loss_val=cur_loss_val,
                cur_grad_norm_per_entry=cur_grad_norm_per_entry,
                cur_step_size=cur_step_size,
                **alg_state_kwargs)
            alg_state_kwargs = update_alg_state_kwargs_after_print(
                **alg_state_kwargs)
        if do_save_now(**alg_state_kwargs):
            param_dict = param_tfm_manager.unflatten_to_common_param_dict(
                param_vec, **dim_P)
            if save_func_wrt_param_dict is not None:
                save_func_wrt_param_dict(
                    param_dict=param_dict,
                    **alg_state_kwargs)
            if callback_func_wrt_param_dict is not None:
                callback_func_wrt_param_dict(
                    param_dict=param_dict,
                    losstrain_ttl=cur_loss_val,
                    alg_state_kwargs=alg_state_kwargs,
                    **callback_kwargs)
            alg_state_kwargs = update_alg_state_kwargs_after_save(
                **alg_state_kwargs)
    param_dict = param_tfm_manager.unflatten_to_common_param_dict(
        param_vec, **dim_P)
    pprint('[grad_descent_minimizer] Done with training.')
    return param_dict, alg_state_kwargs

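# A minimal, self-contained sketch of the Adam update rule used above,
# applied to the toy 1D quadratic loss f(x) = 0.5 * x**2 (so grad = x).
# The _toy_* name is illustrative only and not part of this module's API.
def _toy_adam_demo(n_steps=200, step_size=0.1, b1=0.9, b2=0.999, eps=1e-8):
    x = np.array([5.0])
    m = np.zeros_like(x)
    v = np.zeros_like(x)
    for step_id in range(1, n_steps + 1):
        g = x                               # gradient of 0.5 * x**2 at x
        m = (1 - b1) * g + b1 * m           # first moment estimate
        v = (1 - b2) * (g ** 2) + b2 * v    # second moment estimate
        mhat = m / (1 - b1 ** step_id)      # bias correction
        vhat = v / (1 - b2 ** step_id)
        x = x - step_size * mhat / (np.sqrt(vhat) + eps)
    return x  # approaches the minimizer 0.0
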
def train_and_eval_clf_with_best_params_via_grid_search(
        classifier_name='logreg',
        param_grid_dict=None,
        datasets_by_split=None,
        verbose=True,
        feat_colnames=None,
        feat_preproc_grid_dict=None,
        y_col_id=0,
        y_orig_col_id=0,
        y_col_name='',
        output_path='/tmp/',
        max_grid_search_steps=None,
        class_weight_opts='',
        c_logspace_arg_str='',
        random_state=8675309,
        n_bootstraps=5000,
        seed_bootstrap=42,
        bootstrap_stratify_pos_and_neg=True,
        ):
    (make_classifier, score_classifier, calc_best_idx,
     make_clf_report, make_csv_row_dict, make_interp_report) = \
        make_constructor_and_evaluator_funcs(
            classifier_name,
            n_bootstraps=n_bootstraps,
            seed_bootstrap=seed_bootstrap,
            bootstrap_stratify_pos_and_neg=bootstrap_stratify_pos_and_neg)

    if param_grid_dict is None:
        param_grid_dict = default_param_grid(
            classifier_name, c_logspace_arg_str=c_logspace_arg_str)
    if class_weight_opts == 'balanced':
        if 'class_weight' in param_grid_dict:
            param_grid_dict['class_weight'].insert(0, 'balanced')
    if isinstance(feat_preproc_grid_dict, dict):
        param_grid_dict.update(feat_preproc_grid_dict)

    n_grid = 1
    for key, val_list in param_grid_dict.items():
        n_grid *= len(val_list)
    if verbose:
        if max_grid_search_steps:
            pprint('Max configs in grid search: %d' % max_grid_search_steps)
        pprint('Total configs in grid search: %d' % n_grid)

    param_generator = make_param_dict_generator(param_grid_dict)
    clf_list = list()
    param_dict_list = list()
    score_list = list()
    start_time = time.time()

    x_tr, y_tr = make_nonnan_xy_for_target(
        datasets_by_split['train'], y_col_id)
    x_va, y_va = make_nonnan_xy_for_target(
        datasets_by_split['valid'], y_col_id)
    x_te, y_te = make_nonnan_xy_for_target(
        datasets_by_split['test'], y_col_id)

    for ii, param_dict in enumerate(param_generator):
        np.random.seed(random_state)
        clf = make_classifier(
            feat_colnames=feat_colnames,
            random_state=random_state,
            **param_dict)
        clf.fit(x_tr, y_tr)
        score = score_classifier(clf, x_va, y_va)
        clf_list.append(clf)
        score_list.append(score)
        param_dict_list.append(param_dict)
        if verbose:
            tr_score = score_classifier(clf, x_tr, y_tr)
            elapsed_time = time.time() - start_time
            param_str = str(param_dict)
            param_str = param_str.replace('),', ' ')
            for badstr in ['OrderedDict', '[', ']', '(', ')', ',']:
                param_str = param_str.replace(badstr, '')
            pprint("%4d/%d %10.2f sec va_auc %.4f tr_auc %.4f %s" % (
                1 + ii, n_grid, elapsed_time, score, tr_score, param_str))
        if max_grid_search_steps and ((ii + 1) >= max_grid_search_steps):
            if verbose:
                pprint("Exceeded max_grid_search_steps. Break!")
            break

    best_id = calc_best_idx(score_list)
    best_score = score_list[best_id]
    best_param_dict = param_dict_list[best_id]
    best_clf = clf_list[best_id]
    if verbose:
        pprint("------")
        pprint(" best param dict, using function " + calc_best_idx.__name__)
        pprint("------")
        pprint("va_auc = %.4f %s" % (best_score, str(best_param_dict)))

    ## Now tune the decision threshold, if applicable
    if isinstance(best_clf.named_steps['clf'], ThresholdClassifier):
        for cur_split_name, x_split in [
                ('train', x_tr),
                ('test', x_te),
                ('valid', x_va),
                ]:
            yproba_class1 = best_clf.predict_proba(x_split)[:, 1]
            if verbose:
                pprint("Percentiles of clf Pr(y=1) on SPLIT = %s..."
                       % cur_split_name)
                perc_str_list = list()
                for perc in [0, 1, 10, 25, 50, 75, 90, 99, 100]:
                    perc_str = "%3d%% %.4f" % (
                        perc, np.percentile(yproba_class1, perc))
                    perc_str_list.append(perc_str)
                pprint(" " + " ".join(perc_str_list))

        ## DEPRECATED
        #thr_min = np.maximum(0.001, [1])
        #thr_max = np.minimum(0.999, np.unique(yproba_class1)[-2])
        #thr_grid = np.linspace(thr_min, thr_max, num=101)

        ## Grid search on validation over possible threshold values.
        # Make sure all candidates at least provide
        # one instance of each class (positive and negative).
        assert cur_split_name == 'valid'
        nontrivial_thr_vals = np.unique(yproba_class1)[1:-1]
        if nontrivial_thr_vals.size > 100:
            # Too many possible thr values for typical compute power
            thr_grid = np.linspace(
                nontrivial_thr_vals[0], nontrivial_thr_vals[-1], 100)
        else:
            # Just look at all possible thresholds
            # that give distinct operating points.
            thr_grid = nontrivial_thr_vals
        if verbose:
            pprint("Searching thresholds...")
            pprint("thr_grid = %.4f, %.4f, %.4f ... %.4f, %.4f" % (
                thr_grid[0], thr_grid[1], thr_grid[2],
                thr_grid[-2], thr_grid[-1]))
        score_grid = np.zeros_like(thr_grid, dtype=np.float64)
        acc_grid = np.zeros_like(thr_grid, dtype=np.float64)
        tmp_clf = copy.deepcopy(best_clf)
        for gg, thr in enumerate(thr_grid):
            tmp_clf.named_steps['clf'].set_threshold(thr)
            yhat = tmp_clf.predict(x_va)
            score_grid[gg] = f1_score(y_va, yhat, pos_label=1)
            acc_grid[gg] = accuracy_score(y_va, yhat)
        gg_best = np.argmax(score_grid)
        best_clf.named_steps['clf'].set_threshold(thr_grid[gg_best])
        if verbose:
            pprint("------")
            pprint(" best threshold by f1 score on validation")
            pprint("------")
            pprint("thr = %.4f f1_score %.4f acc_score %.4f" % (
                thr_grid[gg_best], score_grid[gg_best], acc_grid[gg_best]))

    if verbose:
        pprint()
        pprint(make_clf_report(best_clf, x_tr, y_tr, y_col_name + '_train'))
        pprint(make_clf_report(best_clf, x_va, y_va, y_col_name + '_valid'))
        pprint(make_clf_report(best_clf, x_te, y_te, y_col_name + '_test'))
    ireport = make_interp_report(best_clf, feat_colnames, y_col_name)
    if len(ireport) > 0:
        clf_ireport_path = os.path.join(
            output_path, 'clf_%d_interpretation.txt' % (y_orig_col_id))
        with open(clf_ireport_path, 'w') as f:
            f.write(ireport)
        if verbose:
            pprint(ireport)

    # Write the classifier obj to disk
    if classifier_name != 'k_nearest_neighbors':
        clf_path = os.path.join(
            output_path, 'clf_%d_object.dump' % (y_orig_col_id))
        joblib.dump(best_clf, clf_path, compress=1)
        pprint("wrote clf object to file via joblib:")
        pprint(clf_path)
    clf_repr_path = os.path.join(
        output_path, 'clf_%d_repr.txt' % (y_orig_col_id))
    with open(clf_repr_path, 'w') as f:
        f.write(repr(best_clf) + "\n")
    clf_repr_path = os.path.join(
        output_path, 'clf_%d_best_param_dict_repr.txt' % (y_orig_col_id))
    with open(clf_repr_path, 'w') as f:
        f.write(repr(best_param_dict) + "\n")
    if verbose:
        pprint("completed clf saving after %11.2f sec" % (
            time.time() - start_time))

    if os.path.exists(output_path):
        for ss, split in enumerate(['valid', 'test', 'train']):
            csv_fpath = os.path.join(
                output_path,
                'clf_%d_callback_%s.csv' % (y_orig_col_id, split))
            x_cursplit, y_cursplit = make_nonnan_xy_for_target(
                datasets_by_split[split], y_col_id=y_col_id)
            row_dict = make_csv_row_dict(
                best_clf, x_cursplit, y_cursplit,
                y_col_name, split, classifier_name)
            csv_df = pd.DataFrame([row_dict], columns=row_dict.keys())
            csv_df.to_csv(csv_fpath, index=False)
            if hasattr(best_clf, 'predict_proba'):
                for nbins in [6, 10, 20]:
                    fig_fpath = os.path.join(
                        output_path,
                        'clf_%d_calibration_%02dbin_%s.pdf' % (
                            y_orig_col_id, nbins, split))
                    info_per_bin = calc_calibration_info(
                        best_clf, x_cursplit, y_cursplit, bins=nbins)
                    plot_binary_clf_calibration_curve_and_histograms(
                        info_per_bin=info_per_bin)
                    plt.savefig(fig_fpath, bbox_inches='tight', pad_inches=0)
            if verbose:
                elapsed_time = time.time() - start_time
                pprint("eval %d/%d on %5s split done after %11.2f sec" % (
                    ss + 1, 3, split, elapsed_time))
                pprint("wrote csv file: " + csv_fpath)
    return best_clf, best_param_dict

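# Sketch of the expected param_grid_dict format: an ordered mapping from a
# classifier keyword arg to the list of candidate values tried in the grid.
# These particular values are illustrative, not recommended defaults:
#
# param_grid_dict = OrderedDict([
#     ('C', np.logspace(-6, 4, 7).tolist()),
#     ('class_weight', [None]),
# ])
# best_clf, best_param_dict = \
#     train_and_eval_clf_with_best_params_via_grid_search(
#         'logistic_regression',
#         param_grid_dict=param_grid_dict,
#         datasets_by_split=datasets_by_split,
#         feat_colnames=feat_colnames)
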
def read_args_from_stdin_and_run():
    ''' Main executable function to train and evaluate classifier.

    Post Condition
    --------------
    AUC and other eval info printed to stdout.
    Trained classifier saved ???.
    '''
    if not sys.stdin.isatty():
        for line in sys.stdin.readlines():
            line = line.strip()
            sys.argv.extend(line.split(' '))
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--dataset_path',
        default='/tmp/',
        type=str,
        help="Path to folder containing:" +
             " *.npy files: X_train, y_train, P_train"
             " *.txt files: X_colnames.txt and y_colnames.txt")
    parser.add_argument(
        '--output_path',
        default='/tmp/',
        type=str,
        help="Path to folder to hold output from classifier. Includes:" +
             " perf_metric*.txt files: auc_train.txt & auc_test.txt" +
             " settings.txt: description of all settings to reproduce.")
    parser.add_argument(
        '--feature_arr_names',
        type=str,
        default='X',
        help='Name of feature files to use for training')
    parser.add_argument(
        '--features_path',
        default='/tmp/',
        type=str,
        help="Path to folder with extra feature files")
    parser.add_argument(
        '--target_arr_name',
        default='Y',
        type=str,
        )
    parser.add_argument(
        '--target_names',
        default='all',
        type=str,
        help='Name of response/intervention to test.' +
             ' To try specific interventions, write names separated by commas.' +
             ' To try all interventions, use special name "all"')
    parser.add_argument(
        '--n_folds',
        default=1,
        type=int,
        help='Number of folds for cross validation during classification.')
    parser.add_argument(
        '--classifier_name',
        default='logistic_regression',
        choices=[
            'k_nearest_neighbors', 'mlp', 'logistic_regression',
            'extra_trees', 'svm_with_linear_kernel', 'svm_with_rbf_kernel',
            ],
        help='Name of classifier')
    parser.add_argument(
        '--class_weight_opts',
        choices=['none', 'balanced'],
        default='none',
        )
    parser.add_argument(
        '--max_grid_search_steps',
        default=None,
        type=int,
        help='max number of steps for grid search')
    parser.add_argument(
        '--frac_labels_train',
        default=1.0,
        type=float,
        help='Fraction of the training data to use')
    parser.add_argument(
        '--c_logspace_arg_str',
        default="-6,4,7",
        type=str,
        help='Comma-sep list of args to np.logspace')
    parser.add_argument(
        '--seed',
        default=8675309,
        type=int,
        help='Seed for random number generation')
    parser.add_argument(
        '--seed_bootstrap',
        default=42,
        type=int,
        help='Seed for bootstrap')
    parser.add_argument(
        '--n_bootstraps',
        default=5000,
        type=int,
        help='Number of samples for bootstrap conf. intervals')
    parser.add_argument(
        '--bootstrap_stratify_pos_and_neg',
        default=True,
        type=int,
        help='Whether to stratify examples or not')
    args, unk_list = parser.parse_known_args()
    arg_dict = vars(args)

    dataset_path = arg_dict['dataset_path']
    for key, val in arg_dict.items():
        if arg_dict['output_path'].count('$' + key):
            arg_dict['output_path'] = \
                arg_dict['output_path'].replace('$' + key, str(val))
    if not os.path.exists(arg_dict['output_path']):
        mkpath(arg_dict['output_path'])
    config_pprint_logging(
        arg_dict['output_path'],
        txtfile='stdout_%s.txt' % arg_dict['target_names'])
    pprint('[run_classifier says:] Parsing args ...')

    # Parse possible preprocessors
    feat_preproc_grid_dict = dict()
    for key, val in zip(unk_list[::2], unk_list[1::2]):
        if key.startswith('--preproc_'):
            feat_preproc_grid_dict[key[2:]] = str(val).split(',')
            pprint(key + " : " + val)
            arg_dict[key[2:]] = val

    for key in feat_preproc_grid_dict.keys():
        ii = unk_list.index('--' + key)
        del unk_list[ii + 1]
        del unk_list[ii]
    if len(unk_list) > 0:
        pprint("UNKNOWN ARGS (ignored)")
        for key in unk_list:
            pprint(key)

    # Set default seed for numpy
    np.random.seed(arg_dict['seed'])

    # Write parsed args to plain-text file
    # so we can exactly reproduce later
    with open(os.path.join(arg_dict['output_path'], 'settings.txt'), 'w') as f:
        for key, val in arg_dict.items():
            f.write(key + ' = ' + str(val) + '\n')
            pprint(key + ' = ' + str(val))
    with open(os.path.join(arg_dict['output_path'], 'args.txt'), 'w') as f:
        for key, val in arg_dict.items():
            f.write('--' + key + ' ' + str(val) + '\n')
    pprint('')

    feat_path_list = [arg_dict['dataset_path'], arg_dict['features_path']]

    pprint('[run_classifier says:] Loading dataset ...')
    start_time = time.time()
    feature_arr_names = arg_dict['feature_arr_names'].split(',')
    pprint('feature_arr_names:')
    feat_colnames_by_arr = OrderedDict()
    for feat_arr_name in feature_arr_names:
        pprint(feat_arr_name)
        cur_feat_colnames = None
        for feat_path in feat_path_list:
            colname_fpath = os.path.join(
                feat_path, feat_arr_name + '_colnames.txt')
            if os.path.exists(colname_fpath):
                cur_feat_colnames = \
                    [str(feat_arr_name + ":") + s
                     for s in load_list_of_unicode_from_txt(colname_fpath)]
                break
        feat_colnames_by_arr[feat_arr_name] = cur_feat_colnames

    target_arr_name = arg_dict['target_arr_name']
    all_target_names = load_list_of_strings_from_txt(
        os.path.join(
            arg_dict['dataset_path'], target_arr_name + '_colnames.txt'))
    target_names = arg_dict['target_names']
    if target_names == 'all':
        target_names = all_target_names
        target_cols = np.arange(len(all_target_names)).tolist()
    else:
        target_names = target_names.split(',')
        target_cols = list()
        for name in target_names:
            assert name in all_target_names
            target_cols.append(all_target_names.index(name))

    datasets_by_split = dict()
    for split_name in ['train', 'valid', 'test']:
        datasets_by_split[split_name] = dict()
        split_dataset = datasets_by_split[split_name]

        # Load Y
        dense_fpath = os.path.join(
            dataset_path, target_arr_name + "_%s.npy" % split_name)
        y = np.asarray(
            np.load(dense_fpath), order='C', dtype=np.float32)  # 0/1/nan
        if y.ndim < 2:
            y = y[:, np.newaxis]
        assert y.ndim == 2
        assert y.shape[1] == len(all_target_names)
        split_dataset['y'] = y[:, target_cols]
        assert split_dataset['y'].shape[1] == len(target_cols)

        # Load X
        x_list = list()
        for feat_arr_name in feature_arr_names:
            for ii, feat_path in enumerate(feat_path_list):
                dense_fpath = os.path.join(
                    feat_path, feat_arr_name + "_%s.npy" % split_name)
                sparse_fpath = os.path.join(
                    feat_path, feat_arr_name + "_csr_%s.npz" % split_name)
                x_cur = None
                try:
                    if os.path.exists(sparse_fpath):
                        print("Here is sparse_fpath", sparse_fpath)
                        x_cur = load_csr_matrix(sparse_fpath)
                        print(x_cur)
                        assert np.all(np.isfinite(x_cur.data))
                        break
                    else:
                        x_cur = np.asarray(
                            np.load(dense_fpath), order='C', dtype=np.float64)
                        if x_cur.ndim < 2:
                            x_cur = np.atleast_2d(x_cur).T
                        assert np.all(np.isfinite(x_cur))
                        break
                except IOError as e:
                    if ii == len(feat_path_list) - 1:
                        # Couldn't find desired file in any feat_path
                        raise e
                    else:
                        # Try the next feat_path in the list
                        pass
            if x_cur is not None:
                if feat_colnames_by_arr[feat_arr_name] is not None:
                    feat_dim = len(feat_colnames_by_arr[feat_arr_name])
                    print('feat name, %s, feat_dim %d' % (
                        feat_arr_name, feat_dim))
                    print('x_cur shape', x_cur.shape[1])
                    assert x_cur.shape[1] == feat_dim
                else:
                    # Add dummy colnames
                    feat_dim = x_cur.shape[1]
                    n_sig_digits = np.maximum(
                        3, int(np.ceil(np.log10(feat_dim))))
                    fmt_str = "%s_%0" + str(n_sig_digits) + "d"
                    feat_colnames_by_arr[feat_arr_name] = [
                        fmt_str % (feat_arr_name, fid)
                        for fid in range(feat_dim)
                        ]
                x_list.append(x_cur)

        if isinstance(x_list[0], np.ndarray):
            split_dataset['x'] = np.hstack(x_list)
        else:
            split_dataset['x'] = scipy.sparse.hstack(x_list, format='csr')

        # Use only a fraction of the training dataset if specified
        frac_labels_train = arg_dict['frac_labels_train']
        if split_name == 'train' and frac_labels_train < 1.0:
            # Same random seed taken from bow_dataset.py
            data_prng = np.random.RandomState(int(42))
            n_rows = y.shape[0]
            # Note: does not handle truly missing labels
            indexed_rows = np.arange(n_rows)
            shuffled_rows = data_prng.permutation(indexed_rows)
            n_visible = int(np.ceil(frac_labels_train * n_rows))
            visible_rows = shuffled_rows[:n_visible]
            split_dataset['x'] = split_dataset['x'][visible_rows, :]
            split_dataset['y'] = split_dataset['y'][visible_rows, :]

        assert split_dataset['x'].ndim == 2
        assert split_dataset['x'].shape[0] == split_dataset['y'].shape[0]
        assert (isinstance(split_dataset['x'], np.ndarray)
                or isinstance(split_dataset['x'], scipy.sparse.csr_matrix))

        if split_name == 'train':
            # Flatten feat colnames into single list
            feat_colnames = sum(feat_colnames_by_arr.values(), [])
            assert isinstance(feat_colnames, list)
            assert len(feat_colnames) == split_dataset['x'].shape[1]
            if len(feat_colnames) > 10:
                pprint('x colnames: %s ... %s' % (
                    ' '.join(feat_colnames[:5]),
                    ' '.join(feat_colnames[-5:])))
            else:
                pprint('x colnames: %s' % ' '.join(feat_colnames))
            pprint('y colnames: %s' % ' '.join(target_names))

        pprint('---- %5s dataset summary' % split_name)
        pprint('%9d total examples' % y.shape[0])
        pprint('y : %d x %d targets' % split_dataset['y'].shape)
        pprint('x : %d x %d features' % split_dataset['x'].shape)

        for c in range(len(target_names)):
            y_c = split_dataset['y'][:, c]
            nan_bmask = np.isnan(y_c)
            pos_bmask = y_c == 1
            neg_bmask = y_c == 0
            pprint('target %s :' % target_names[c])
            pprint(' %6d pos examples | %.3f' % (
                np.sum(pos_bmask), calcfrac(pos_bmask)))
            pprint(' %6d neg examples | %.3f' % (
                np.sum(neg_bmask), calcfrac(neg_bmask)))
            pprint(' %6d NaN examples | %.3f' % (
                np.sum(nan_bmask), calcfrac(nan_bmask)))
            assert (nan_bmask.sum() + pos_bmask.sum() + neg_bmask.sum()
                    == neg_bmask.size)

    elapsed_time = time.time() - start_time
    pprint('[run_classifier says:] dataset loaded after %.2f sec.'
           % elapsed_time)

    n_cols = len(target_names)
    for c in range(n_cols):
        pprint('[run_classifier says:] train for target %s' % target_names[c])
        train_and_eval_clf_with_best_params_via_grid_search(
            arg_dict['classifier_name'],
            datasets_by_split=datasets_by_split,
            y_col_id=c,
            y_orig_col_id=all_target_names.index(target_names[c]),
            y_col_name=target_names[c],
            feat_colnames=feat_colnames,
            feat_preproc_grid_dict=feat_preproc_grid_dict,
            output_path=arg_dict['output_path'],
            max_grid_search_steps=arg_dict['max_grid_search_steps'],
            class_weight_opts=arg_dict['class_weight_opts'],
            c_logspace_arg_str=arg_dict['c_logspace_arg_str'],
            random_state=arg_dict['seed'],
            seed_bootstrap=arg_dict['seed_bootstrap'],
            n_bootstraps=arg_dict['n_bootstraps'],
            bootstrap_stratify_pos_and_neg=arg_dict[
                'bootstrap_stratify_pos_and_neg'],
            )
        elapsed_time = time.time() - start_time
        pprint('[run_classifier says:] target %s completed after %.2f sec'
               % (target_names[c], elapsed_time))

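# The stdin-reading block at the top of read_args_from_stdin_and_run lets a
# flat args file drive this script: any whitespace-separated tokens piped in
# are appended to sys.argv before parsing. Hypothetical shell example (the
# script name here is illustrative, not confirmed by this module):
#
#   $ echo "--dataset_path /data/toy --classifier_name logistic_regression" \
#       | python run_classifier.py
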
def calc_perf_metrics_for_snapshot_param_dict(
        param_dict=None,
        topics_KV=None,
        w_CK=None,
        datasets_by_split=None,
        model_hyper_P=None,
        dim_P=None,
        alg_state_kwargs=None,
        output_path=None,
        cur_lap=0.0,
        cur_step=None,
        elapsed_time_sec=0.0,
        losstrain_ttl=None,
        verbose_timings=False,
        disable_output=False,
        do_force_update_w_CK=0,
        perf_metrics_pi_optim_kwargs=None,
        **unused_kwargs):
    ''' Compute performance metrics at provided topic model param dict.

    Returns
    -------
    info_dict : dict
        Contains all perf. metric information.

    Post Condition
    --------------
    Row appended to CSV files in output_path/
    * snapshot_perf_metrics_train.csv
    * snapshot_perf_metrics_valid.csv
    * snapshot_perf_metrics_test.csv
    '''
    if perf_metrics_pi_optim_kwargs is None:
        perf_metrics_pi_optim_kwargs = dict()
    etimes = OrderedDict()
    etimes = start_timer_segment(etimes, 'total')

    # Unpack parameters
    if param_dict is not None:
        topics_KV = param_dict['topics_KV']
        w_CK = param_dict['w_CK']
    if topics_KV is None:
        raise ValueError("topics_KV should not be None")
    if not np.all(np.isfinite(topics_KV)):
        raise ValueError("topics_KV should not be NaN or Inf")
    if w_CK is None:
        raise ValueError("w_CK should not be None")
    if not np.all(np.isfinite(w_CK)):
        raise ValueError("w_CK should not be NaN or Inf")

    # Track norms of params (crude debugging tool)
    l1_norm_logtopics = np.mean(np.abs(np.log(topics_KV.flatten())))
    l1_norm_w = np.mean(np.abs(w_CK.flatten()))

    # Unpack hyperparams
    alpha = model_hyper_P['alpha']
    tau = model_hyper_P['tau']
    lambda_w = model_hyper_P['lambda_w']
    weight_y = model_hyper_P['weight_y']

    # Unpack state kwargs
    if alg_state_kwargs is not None:
        output_path = alg_state_kwargs['output_path']
        cur_lap = alg_state_kwargs['cur_lap']
        cur_step = alg_state_kwargs['cur_step']
        elapsed_time_sec = alg_state_kwargs['elapsed_time_sec']

    # TODO check if dataset is semisupervised
    y_DC = datasets_by_split['train']['y_DC']
    n_labels = y_DC.shape[1]
    u_y_vals = np.unique(y_DC.flatten())
    if u_y_vals.size <= 2 and np.union1d([0.0, 1.0], u_y_vals).size == 2:
        output_data_type = 'binary'
    else:
        output_data_type = 'real'

    # Count the number of docs in which each pair of vocab words co-occurs
    _, ndocs_csc_VV = coh.calc_pairwise_cooccurance_counts(
        dataset=datasets_by_split['train'])

    split_names = ['train', 'valid', 'test']
    for split_name in split_names:
        etimes = start_timer_segment(etimes, '%s_calc_lossmap' % split_name)
        ans_dict = pc_toolbox.model_slda.slda_loss__cython.calc_loss__slda(
            dataset=datasets_by_split[split_name],
            topics_KV=topics_KV,
            w_CK=w_CK,
            LP=None,
            weight_x=1.0,
            weight_y=1.0,
            alpha=alpha,
            tau=tau,
            lambda_w=lambda_w,
            pi_estimation_mode='missing_y',
            pi_estimation_weight_y=0.0,
            return_dict=True,
            **perf_metrics_pi_optim_kwargs)
        etimes = stop_timer_segment(etimes, '%s_calc_lossmap' % split_name)
        assert 'summary_msg' in ans_dict

        # Extract doc-topic features
        assert 'pi_DK' in ans_dict
        pi_DK = ans_dict.pop('pi_DK')

        info_dict = OrderedDict([
            ('step', float(cur_step)),
            ('lap', float(cur_lap)),
            ('elapsed_time_sec', float(elapsed_time_sec)),
            ('logpdf_x_pertok', -1 * ans_dict['uloss_x__pertok']),
            ('logpdf_y_perdoc', -1 * ans_dict['uloss_y__perdoc']),
            ('lossmap_ttl_pertok', ans_dict['loss_ttl']),
            ('lossmap_x_pertok', ans_dict['loss_x']),
            ('lossmap_y_pertok', ans_dict['loss_y']),
            ('lossmap_pi_pertok', ans_dict['loss_pi']),
            ('lossmap_topic_pertok', ans_dict['loss_topics']),
            ('lossmap_w_pertok', ans_dict['loss_w']),
            ])
        if losstrain_ttl is not None:
            info_dict['losstrain_ttl'] = float(losstrain_ttl)

        ## Compute y metrics
        etimes = start_timer_segment(etimes, '%s_calc_y_metrics' % split_name)
        assert 'y_proba_DC' in ans_dict
        # Case 1/2: binary labels
        if output_data_type.count('binary'):
            y_proba_DC = ans_dict.pop('y_proba_DC')
            C = y_proba_DC.shape[1]
            assert np.nanmin(y_proba_DC) >= 0.0
            assert np.nanmax(y_proba_DC) <= 1.0
            for c in range(n_labels):
                ytrue_c_D = datasets_by_split[split_name]['y_DC'][:, c]
                yproba_c_D = y_proba_DC[:, c]
                # Keep only finite values
                rowmask = np.logical_and(
                    np.isfinite(yproba_c_D), np.isfinite(ytrue_c_D))
                ytrue_c_D = ytrue_c_D[rowmask]
                yproba_c_D = yproba_c_D[rowmask]
                if ytrue_c_D.size == 0:
                    raise ValueError(
                        "Label id c=%d has no observed y values" % c)
                yhat_c_D = np.asarray(yproba_c_D > 0.5, dtype=ytrue_c_D.dtype)
                # Error rate
                error_rate_y__c = np.sum(np.logical_xor(ytrue_c_D, yhat_c_D))
                error_rate_y__c /= float(ytrue_c_D.size)
                info_dict['y_%d_error_rate' % c] = error_rate_y__c
                # Area under precision-recall curve
                try:
                    #auprc_y__c = roc_auc_score(ytrue_c_D, yproba_c_D)
                    auprc_y__c = average_precision_score(
                        ytrue_c_D, yproba_c_D)
                except ValueError as e:
                    # Error occurs when not enough examples of each label
                    auprc_y__c = 0.0
                info_dict['y_%d_auprc' % c] = auprc_y__c
        # Case 2/2: real values
        elif output_data_type.count('real'):
            # Remember, y_proba_DC is really the estimated mean of y_DC
            y_est_DC = ans_dict.pop('y_proba_DC')
            for c in range(n_labels):
                y_true_c_D = datasets_by_split[split_name]['y_DC'][:, c]
                y_est_c_D = y_est_DC[:, c]
                # Keep only finite values
                rowmask = np.logical_and(
                    np.isfinite(y_true_c_D), np.isfinite(y_est_c_D))
                y_true_c_D = y_true_c_D[rowmask]
                y_est_c_D = y_est_c_D[rowmask]
                if y_true_c_D.size == 0:
                    raise ValueError(
                        "Label id c=%d has no observed y values" % c)
                # Compute RMSE
                rmse = np.sqrt(np.mean(np.square(y_true_c_D - y_est_c_D)))
                info_dict['y_%d_rmse' % c] = rmse
        etimes = stop_timer_segment(etimes, '%s_calc_y_metrics' % split_name)

        ## Compute variational lower bound on logpdf x
        etimes = start_timer_segment(
            etimes, '%s_calc_lb_logpdf_x' % split_name)
        lb_logpdf_x, lb_logpdf_x_pertok = calc_elbo_for_many_docs(
            dataset=datasets_by_split[split_name],
            topics_KV=topics_KV,
            alpha=alpha,
            init_name_list=['warm'],
            init_pi_DK=pi_DK,
            verbose=False,
            do_trace_elbo=False,
            )
        etimes = stop_timer_segment(
            etimes, '%s_calc_lb_logpdf_x' % split_name)
        info_dict['elbo_logpdf_x_pertok'] = lb_logpdf_x_pertok

        ## Compute topic coherence
        etimes = start_timer_segment(
            etimes, '%s_calc_coher_metrics' % split_name)
        K = topics_KV.shape[0]
        npmi_K = np.zeros(K)
        for k in range(K):
            # Select at most 20 vocab words per topic.
            # But if fewer than that take up 90% of the mass, take only those.
            top_vocab_ids = np.argsort(-1 * topics_KV[k])[:20]
            cumsum_mass = np.cumsum(topics_KV[k, top_vocab_ids])
            m = np.searchsorted(cumsum_mass, 0.9)
            top_vocab_ids = top_vocab_ids[:(m + 1)]
            npmi_K[k], _ = \
                coh.calc_npmi_and_pmi_coherence_for_top_ranked_terms_in_topic(
                    ndocs_csc_VV=ndocs_csc_VV,
                    top_vocab_ids=top_vocab_ids,
                    pair_smooth_eps=0.1)
        if K < 10:
            perc_list = [0, 50, 100]
        else:
            perc_list = [0, 10, 50, 90, 100]
        for perc in perc_list:
            pstr = '%06.2f' % perc
            info_dict['topic_npmi_p' + pstr] = np.percentile(npmi_K, perc)
        etimes = stop_timer_segment(
            etimes, '%s_calc_coher_metrics' % split_name)

        info_dict['losstrain_weight_y'] = weight_y
        info_dict['alpha'] = alpha
        info_dict['tau'] = tau
        info_dict['lambda_w'] = lambda_w
        info_dict['n_states'] = float(topics_KV.shape[0])
        info_dict['l1norm_w'] = float(l1_norm_w)
        info_dict['l1norm_logtopics'] = float(l1_norm_logtopics)

        info_df = pd.DataFrame([info_dict])
        col_order = info_dict.keys()
        ppinfo_str = info_df.to_csv(
            None,
            float_format='% 20.12g',
            na_rep='%20s' % 'nan',
            index=False,
            header=False,
            columns=col_order)  # relying on an ordered dict here
        info_str = info_df.to_csv(
            None,
            float_format='% .12g',
            na_rep='nan',
            index=False,
            header=False,
            columns=col_order)  # relying on an ordered dict here
        assert np.max(list(map(len, col_order))) <= 20

        if not disable_output:
            csv_fpath = os.path.join(
                output_path,
                'snapshot_perf_metrics_%s.csv' % split_name)
            ppcsv_fpath = os.path.join(
                output_path,
                'pretty_snapshot_perf_metrics_%s.csv' % split_name)
            if int(cur_step) == 0:
                with open(csv_fpath, 'w') as f:
                    header_str = ','.join(['%s' % s for s in col_order])
                    f.write(header_str + "\n")
                with open(ppcsv_fpath, 'w') as f:
                    header_str = ','.join(['%20s' % s for s in col_order])
                    f.write(header_str + "\n")
            with open(csv_fpath, 'a') as f:
                f.write(info_str)
            with open(ppcsv_fpath, 'a') as f:
                f.write(ppinfo_str)
            pi_summary_txt_fpath = os.path.join(
                output_path,
                'perf_metrics_pi_optim_summaries_%s.txt' % split_name)
            lap_prefix = 'lap %011.3f ' % cur_lap
            with open(pi_summary_txt_fpath, 'a') as f:
                f.write(lap_prefix + ans_dict['summary_msg'] + "\n")

    # Write timings to txt file for comparison
    msg = pprint_timer_segments(etimes, prefix='lap%011.3f' % (cur_lap))
    if verbose_timings:
        pprint(msg)
    if not disable_output:
        timings_txt = os.path.join(
            output_path, 'timings_for_perf_metrics.txt')
        with open(timings_txt, 'a') as f:
            f.write(msg)
    return info_dict

def get_stratified_subsample_ids(y_DC=None,
                                 n_subsamples=1000,
                                 min_per_label=5,
                                 seed=42,
                                 verbose=False):
    ''' Get row ids of examples to keep in subsample for initializing weights

    Returns
    -------
    doc_ids : 1D array of ids

    Examples
    --------
    >>> y_DC = np.zeros((1000, 3))
    >>> y_DC[200:205, 0] = 1
    >>> y_DC[400:405, 1] = 1
    >>> y_DC[:995, 2] = 1
    >>> mask = get_stratified_subsample_ids(y_DC, 10, min_per_label=5)
    >>> mask.tolist()
    [200, 201, 202, 203, 204, 400, 401, 402, 403, 404, 995, 996, 997, 998, 999]
    >>> np.sum(y_DC[mask] == 0, axis=0).tolist()
    [10, 10, 10]
    >>> np.sum(y_DC[mask] == 1, axis=0).tolist()
    [5, 5, 5]
    '''
    n_labels = y_DC.shape[1]
    n_examples = y_DC.shape[0]
    if n_subsamples >= n_examples:
        return np.arange(n_examples)

    # If here, we actually need to subsample.
    # Make version of y_DC where 1 is the minority class in EVERY column.
    sums_total = np.sum(y_DC, axis=0)
    need_flip = sums_total / n_examples > 0.5
    y_DC[:, need_flip] = 1.0 - y_DC[:, need_flip]
    sums_total[need_flip] = n_examples - sums_total[need_flip]

    keep_mask = np.zeros(y_DC.shape[0], dtype=np.bool)
    sums_subsample = np.sum(y_DC[keep_mask], axis=0)
    for c in xrange(n_labels):
        if sums_subsample[c] < min_per_label \
                and sums_subsample[c] < sums_total[c]:
            # Keep up to min_per_label minority examples for this label
            n_more = np.minimum(min_per_label, sums_total[c])
            on_ids = np.flatnonzero(y_DC[:, c])[:int(n_more)]
            keep_mask[on_ids] = True
    size = np.sum(keep_mask)
    if size < n_subsamples:
        prng = np.random.RandomState(seed)
        eligible_ids = np.flatnonzero(keep_mask == 0)
        chosen_ids = prng.choice(
            eligible_ids, n_subsamples - size, replace=False)
        keep_mask[chosen_ids] = 1
        size = np.sum(keep_mask)
    assert size >= n_subsamples
    sums_subsample = np.sum(y_DC[keep_mask], axis=0)
    if verbose:
        pprint('Minority examples per label in dataset of size %d'
               % n_examples)
        pprint(' '.join(['%4d' % val for val in sums_total]))
        pprint('Minority examples per label in subsample of size %d:' % size)
        pprint(' '.join(['%4d' % val for val in sums_subsample]))
    return np.flatnonzero(keep_mask)

def calc_nef_map_pi_DK(dataset=None,
                       topics_KV=None,
                       alpha=None,
                       nef_alpha=None,
                       init_pi_DK=None,
                       n_seconds_between_print=-1,
                       active_proba_thr=0.005,
                       return_info=False,
                       calc_pi_d_K=calc_nef_map_pi_d_K,
                       **some_pi_estimation_kwargs):
    ''' Extract doc-topic probability features for every doc in dataset.

    Args
    ----
    dataset : dict with array fields
        'n_docs' : int, non-negative
            number of documents in dataset
        'word_id_U' : 1D array, size U, dtype=int
            vocab ids for each doc-term pair in dataset
        'word_ct_U' : 1D array, size U, dtype=float
            counts for each doc-term pair in dataset
        'doc_indptr_Dp1' : 1D array, size D+1, dtype=int
            indptr / fenceposts delineating where individual docs begin/end
    topics_KV : 2D array, size K x V, rows sum to one
        probability of each word v appearing under each topic k
    alpha : float, positive value
        concentration parameter of Dirichlet prior on doc-topic probas

    Returns
    -------
    pi_DK : 2D array, size D x K
        Each row has positive entries and sums to one.
    info_dict : dict
        Only returned if called with return_info=True
    '''
    # Parse pi estimation kwargs, coercing each provided value
    # to the type of its default
    pi_estimation_kwargs = dict(**DefaultDocTopicOptKwargs)
    for key in pi_estimation_kwargs.keys():
        if key in some_pi_estimation_kwargs:
            val = DefaultDocTopicOptKwargs[key]
            if isinstance(val, float):
                pi_estimation_kwargs[key] = float(
                    some_pi_estimation_kwargs[key])
            else:
                pi_estimation_kwargs[key] = int(
                    some_pi_estimation_kwargs[key])

    assert topics_KV is not None
    K = int(topics_KV.shape[0])
    n_docs = dataset['n_docs']
    doc_indptr_Dp1 = dataset['doc_indptr_Dp1']
    word_id_U = dataset['word_id_U']
    word_ct_U = dataset['word_ct_U']

    pi_DK = np.zeros((n_docs, K))
    n_docs_converged = 0
    n_docs_restarted = 0
    iters_per_doc = np.zeros(n_docs, dtype=np.int32)
    n_active_per_doc = np.zeros(n_docs, dtype=np.int32)
    restarts_per_doc = np.zeros(n_docs, dtype=np.int32)
    step_size_per_doc = np.zeros(n_docs, dtype=np.float32)
    dist_per_doc = np.zeros(n_docs, dtype=np.float32)
    loss_per_doc = np.zeros(n_docs, dtype=np.float32)

    is_time = False
    start_time_sec = time.time()
    last_print_sec = start_time_sec
    for d in xrange(n_docs):
        start_d = doc_indptr_Dp1[d]
        stop_d = doc_indptr_Dp1[d + 1]
        if init_pi_DK is None:
            init_pi_d_K = None
        else:
            init_pi_d_K = init_pi_DK[d]
        # MCH: Cannot autograd when doing this kind of assignment
        pi_DK[d, :], info_dict = \
            calc_pi_d_K(
                word_id_U[start_d:stop_d],
                word_ct_U[start_d:stop_d],
                topics_KV=topics_KV,
                alpha=alpha,
                nef_alpha=nef_alpha,
                init_pi_d_K=init_pi_d_K,
                **pi_estimation_kwargs)
        if return_info or n_seconds_between_print > 0:
            n_active_per_doc[d] = \
                np.sum(pi_DK[d, :] > active_proba_thr)
            n_docs_restarted += info_dict['n_restarts'] > 0
            n_docs_converged += info_dict['did_converge']
            iters_per_doc[d] = info_dict['n_iters']
            step_size_per_doc[d] = info_dict['pi_step_size']
            try:
                dist_per_doc[d] = info_dict['cur_L1_diff']
            except KeyError:
                dist_per_doc = None
            try:
                restarts_per_doc[d] = info_dict['n_restarts']
            except KeyError:
                restarts_per_doc = None
            try:
                loss_per_doc[d] = info_dict['loss']
            except KeyError:
                pass
        cur_time_sec = time.time()
        if n_seconds_between_print > 0:
            is_time = (cur_time_sec - last_print_sec
                       > n_seconds_between_print)
        is_last = (d + 1) == n_docs
        if is_last or is_time:
            msg = make_readable_summary_for_pi_DK_estimation(
                elapsed_time_sec=cur_time_sec - start_time_sec,
                n_docs=n_docs,
                n_docs_completed=d + 1,
                n_docs_converged=n_docs_converged,
                n_docs_restarted=n_docs_restarted,
                iters_per_doc=iters_per_doc,
                n_active_per_doc=n_active_per_doc,
                dist_per_doc=dist_per_doc,
                restarts_per_doc=restarts_per_doc,
                step_size_per_doc=step_size_per_doc,
                loss_per_doc=loss_per_doc)
            last_print_sec = cur_time_sec
            if n_seconds_between_print > 0:
                pprint(msg)
    if return_info:
        agg_info_dict = dict(
            summary_msg=msg,
            iters_per_doc=iters_per_doc,
            n_active_per_doc=n_active_per_doc,
            dist_per_doc=dist_per_doc,
            restarts_per_doc=restarts_per_doc,
            step_size_per_doc=step_size_per_doc,
            loss_per_doc=loss_per_doc,
            loss=np.sum(loss_per_doc),
            alpha=alpha)
        return pi_DK, agg_info_dict
    else:
        return pi_DK

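# Minimal sketch of the bag-of-words dataset dict this function expects,
# for a toy 2-document corpus over a 3-word vocabulary (values illustrative
# only; topics_KV is assumed to be a row-normalized K x 3 array):
#
# dataset = dict(
#     n_docs=2,
#     doc_indptr_Dp1=np.asarray([0, 2, 5]),  # doc 0 owns pairs 0:2, doc 1 owns 2:5
#     word_id_U=np.asarray([0, 2, 0, 1, 2]),
#     word_ct_U=np.asarray([3.0, 1.0, 2.0, 2.0, 1.0]),
#     )
# pi_DK = calc_nef_map_pi_DK(
#     dataset=dataset, topics_KV=topics_KV, alpha=1.1)
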
def minimize(loss_func_wrt_paramvec_and_step=None,
             grad_func_wrt_paramvec_and_step=None,
             save_func_wrt_param_dict=None,
             callback_func_wrt_param_dict=None,
             callback_kwargs=None,
             param_tfm_manager=None,
             dim_P=None,
             init_param_dict=None,
             n_line_search_steps=10,
             n_terms_approx_hessian=10,
             **kwargs):
    """ Minimize provided loss function using the L-BFGS algorithm.

    Returns
    -------
    param_dict : dict
        Contains estimated parameters that minimize the loss
    alg_state_dict : dict
        Contains algorithm information (num steps completed, etc.)
    """
    pprint('[scipy_lbfgs_minimizer] Begin training...')
    pprint('--n_line_search_steps %.3f' % n_line_search_steps)
    pprint('--n_terms_approx_hessian %.3f' % n_terms_approx_hessian)

    # Parse user input
    n_line_search_steps = int(n_line_search_steps)
    n_terms_approx_hessian = int(n_terms_approx_hessian)

    # Convert provided common param dict
    # to a flat 1D array with unconstrained values
    param_vec = param_tfm_manager.flatten_to_differentiable_param_vec(
        init_param_dict, **dim_P)

    # Warmup
    start_time_sec = time.time()
    init_loss_val = loss_func_wrt_paramvec_and_step(param_vec, step_id=0)
    loss_eval_time_sec = time.time() - start_time_sec
    pprint("Loss @ init: %8.3f sec | val %.6e" % (
        loss_eval_time_sec, init_loss_val))
    pprint("Params @ init: %8s | %5d params | l2 norm / entry %.4e" % (
        ' ', param_vec.size, calc_l2_norm_of_vector_per_entry(param_vec)))

    start_time_sec = time.time()
    init_grad_vec = grad_func_wrt_paramvec_and_step(param_vec, step_id=0)
    elapsed_time_sec = time.time() - start_time_sec
    init_grad_norm_per_entry = calc_l2_norm_of_vector_per_entry(init_grad_vec)
    pprint("Gradient @ init: %8.3f sec | %5d params | l2 norm / entry %.4e" % (
        elapsed_time_sec, init_grad_vec.size, init_grad_norm_per_entry))

    # Create settings that track algorithm state
    # cur_step, cur_lap, n_laps, n_steps, etc.
    alg_state_kwargs = init_alg_state_kwargs(cur_step=0.0, **kwargs)
    n_steps = alg_state_kwargs['n_steps']
    if 'output_path' in alg_state_kwargs:
        laps_to_save_str, steps_to_save_str = calc_laps_when_snapshots_saved(
            return_str=True, keep_first=5, keep_last=5, **alg_state_kwargs)
        pprint("Snapshots will be saved at intervals:")
        pprint(" laps: %s" % laps_to_save_str)
        pprint(" steps: %s" % steps_to_save_str)
        pprint("Snapshot saved to --output_path:\n%s" % (
            alg_state_kwargs['output_path']))

    # Translate settings into scipy's specific options format
    options_dict = dict(
        maxiter=n_steps,
        maxfun=n_line_search_steps * n_steps,
        maxcor=n_terms_approx_hessian,
        maxls=n_line_search_steps,
        ftol=0.0,
        gtol=0.0,
        )
    alg_state_kwargs['cur_loss_val'] = init_loss_val

    ## Define special callback function,
    # which does things like print progress at relevant steps,
    # save snapshots to files at relevant steps, etc.
    def my_callback_func(
            cur_param_vec, is_init=False, alg_state_kwargs=alg_state_kwargs):
        # Update step counter, timer, etc.
        if not is_init:
            alg_state_kwargs.update(
                update_alg_state_kwargs(**alg_state_kwargs))
        if do_print_now(**alg_state_kwargs) or do_save_now(**alg_state_kwargs):
            cur_loss_val = loss_func_wrt_paramvec_and_step(cur_param_vec)
            alg_state_kwargs['cur_loss_val'] = cur_loss_val
        if do_print_now(**alg_state_kwargs):
            pprint(make_status_string(
                **alg_state_kwargs))  # assume cur_loss_val is inside
            save_status_to_txt_files(**alg_state_kwargs)
            alg_state_kwargs.update(
                update_alg_state_kwargs_after_print(**alg_state_kwargs))
        if do_save_now(**alg_state_kwargs):
            param_dict = param_tfm_manager.unflatten_to_common_param_dict(
                cur_param_vec, **dim_P)
            if save_func_wrt_param_dict is not None:
                save_func_wrt_param_dict(
                    param_dict=param_dict, **alg_state_kwargs)
            if callback_func_wrt_param_dict is not None:
                callback_func_wrt_param_dict(
                    param_dict=param_dict,
                    losstrain_ttl=alg_state_kwargs.get(
                        'cur_loss_val', init_loss_val),
                    alg_state_kwargs=alg_state_kwargs,
                    **callback_kwargs)
            alg_state_kwargs.update(
                update_alg_state_kwargs_after_save(**alg_state_kwargs))

    ## Run training ...
    my_callback_func(param_vec, is_init=True)
    if n_steps > 0:
        opt_result_obj = scipy.optimize.minimize(
            loss_func_wrt_paramvec_and_step,
            param_vec,
            method='l-bfgs-b',
            jac=grad_func_wrt_paramvec_and_step,
            options=options_dict,
            callback=my_callback_func)
        pprint('[scipy_lbfgs_minimizer] msg %s' % opt_result_obj.message)
        param_vec = opt_result_obj.x
        # Relies on alg_state_kwargs already being defined in callback
        my_callback_func(param_vec)
    param_dict = param_tfm_manager.unflatten_to_common_param_dict(
        param_vec, **dim_P)
    pprint('[scipy_lbfgs_minimizer] Done with training.')
    return param_dict, alg_state_kwargs

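# A minimal "transform manager" sketch satisfying the interface both
# minimizers above assume (flatten/unflatten between a common param dict and
# an unconstrained 1D vector). This identity version assumes a single,
# already-unconstrained parameter named 'theta' (a hypothetical name); real
# managers also apply constraint-removing transforms (e.g. log for positivity).
class _IdentityParamTfmManager(object):

    @staticmethod
    def flatten_to_differentiable_param_vec(param_dict, **dim_P):
        # Copy so the optimizer never mutates the caller's dict in place
        return np.asarray(param_dict['theta'], dtype=np.float64).copy()

    @staticmethod
    def unflatten_to_common_param_dict(param_vec, **dim_P):
        return dict(theta=param_vec.copy())
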
def estimate_w_CK__given_pi_DK(
        dataset=None,
        pi_DK=None,
        lambda_w=0.001,
        seed=42,
        prefix='',
        verbose=False,
        **kwargs):
    """ Estimate regression weights from provided probability features.

    Uses sklearn's regularized regressors under the hood.

    Returns
    -------
    w_CK : 2D array, size C x K
        Regression weights
    """
    K = pi_DK.shape[1]
    C = int(dataset['n_labels'])
    if verbose:
        pprint('%s Fitting %d regressions...' % (prefix, C))
    w_CK = np.zeros((C, K))

    u_y_vals = np.unique(dataset['y_DC'].flatten())
    if u_y_vals.size <= 2 and np.union1d([0.0, 1.0], u_y_vals).size == 2:
        output_data_type = 'binary'
    else:
        output_data_type = 'real'

    if 'y_rowmask' in dataset:
        y_DC = dataset['y_DC'][1 == dataset['y_rowmask']]
        pi_DK = pi_DK[1 == dataset['y_rowmask']]
        u_y_vals = np.unique(y_DC.sum(axis=1))
        assert u_y_vals.size > 1
    else:
        y_DC = dataset['y_DC']

    for c in xrange(C):
        # Do a quick regression to get initial weights!
        if output_data_type.count('binary') > 0:
            clf = LogisticRegression(
                fit_intercept=False,
                C=0.5 / lambda_w,
                random_state=seed,
                )
        else:
            clf = RidgeRegression(
                fit_intercept=False,
                alpha=lambda_w,
                random_state=seed,
                )
        clf.fit(pi_DK, y_DC[:, c])
        w_CK[c] = clf.coef_
        if verbose:
            pprint(' w_CK[%d, :5]=' % c
                   + ' '.join(['% .2f' % w for w in w_CK[c, :5]]))
            pprint(' label id %d / %d done with lambda_w = %.5f' % (
                c + 1, C, lambda_w))
    return w_CK

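# Note on the C = 0.5 / lambda_w mapping above: sklearn's LogisticRegression
# minimizes 0.5 * ||w||^2 + C * sum_of_log_losses. Dividing through by C, the
# effective coefficient on ||w||^2 relative to a unit-weight data term is
# 1 / (2 * C), which equals lambda_w exactly when C = 0.5 / lambda_w, matching
# a penalty of lambda_w * ||w||^2 up to an overall constant.
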
def read_args_from_stdin_and_run():
    ''' Main executable function to evaluate a pretrained classifier.

    Post Condition
    --------------
    AUC and other eval info printed to stdout.
    '''
    if not sys.stdin.isatty():
        for line in sys.stdin.readlines():
            line = line.strip()
            sys.argv.extend(line.split(' '))
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--dataset_path',
        default='/tmp/',
        type=str,
        help="Path to folder containing:" +
             " *.npy files: X_train, y_train, P_train"
             " *.txt files: X_colnames.txt and y_colnames.txt")
    parser.add_argument(
        '--pretrained_clf_path',
        default='/tmp/',
        type=str,
        help="Path to folder holding output from a trained classifier." +
             " Includes:" +
             " perf_metric*.txt files: auc_train.txt & auc_test.txt" +
             " settings.txt: description of all settings to reproduce.")
    parser.add_argument('--split_names', default='test')
    parser.add_argument('--split_nicknames', default='evaltest')
    parser.add_argument(
        '--features_path',
        default='/tmp/',
        type=str,
        help="Path to folder with SSAMfeat*.npy files")
    parser.add_argument(
        '--target_arr_name',
        default='Y',
        type=str,
        )
    parser.add_argument(
        '--target_names',
        default='all',
        type=str,
        help='Name of response/intervention to test.' +
             ' To try specific interventions, write names separated by commas.' +
             ' To try all interventions, use special name "all"')
    parser.add_argument(
        '--seed_bootstrap',
        default=42,
        type=int,
        help='Seed for bootstrap')
    parser.add_argument(
        '--n_bootstraps',
        default=5000,
        type=int,
        help='Number of samples for bootstrap conf. intervals')
    parser.add_argument(
        '--bootstrap_stratify_pos_and_neg',
        default=True,
        type=int,
        help='Whether to stratify examples or not')
    args, unk_list = parser.parse_known_args()
    arg_dict = vars(args)

    dataset_path = arg_dict['dataset_path']
    assert os.path.exists(arg_dict['pretrained_clf_path'])
    output_path = arg_dict['pretrained_clf_path']

    # Read the settings used when the classifier was trained,
    # so we evaluate with exactly the same configuration
    clf_opts = list()
    with open(os.path.join(output_path, 'settings.txt'), 'r') as f:
        for line in f.readlines():
            line = line.strip()
            clf_opts.append(line.split(' = '))
    clf_opts = dict(clf_opts)

    feat_path_list = [arg_dict['dataset_path'], arg_dict['features_path']]

    pprint('[run_classifier says:] Loading dataset ...')
    start_time = time.time()
    feature_arr_names = clf_opts['feature_arr_names'].split(',')
    pprint('feature_arr_names:')
    feat_colnames_by_arr = OrderedDict()
    for feat_arr_name in feature_arr_names:
        pprint(feat_arr_name)
        cur_feat_colnames = None
        for feat_path in feat_path_list:
            colname_fpath = os.path.join(
                feat_path, feat_arr_name + '_colnames.txt')
            if os.path.exists(colname_fpath):
                cur_feat_colnames = \
                    [unicode(feat_arr_name + ":") + s
                     for s in load_list_of_unicode_from_txt(colname_fpath)]
                break
        feat_colnames_by_arr[feat_arr_name] = cur_feat_colnames

    target_arr_name = arg_dict['target_arr_name']
    all_target_names = load_list_of_strings_from_txt(
        os.path.join(
            arg_dict['dataset_path'], target_arr_name + '_colnames.txt'))
    target_names = arg_dict['target_names']
    if target_names == 'all':
        target_names = all_target_names
        target_cols = np.arange(len(all_target_names)).tolist()
    else:
        target_names = target_names.split(',')
        target_cols = list()
        for name in target_names:
            assert name in all_target_names
            target_cols.append(all_target_names.index(name))

    datasets_by_split = dict()
    split_nicknames = arg_dict['split_nicknames'].split(',')
    split_names = arg_dict['split_names'].split(',')
    for nickname, split_name in zip(split_nicknames, split_names):
        datasets_by_split[nickname] = dict()
        split_dataset = datasets_by_split[nickname]

        # Load Y
        dense_fpath = os.path.join(
            dataset_path, target_arr_name + "_%s.npy" % split_name)
        y = np.asarray(np.load(dense_fpath), order='C', dtype=np.int32)
        if y.ndim < 2:
            y = y[:, np.newaxis]
        assert y.ndim == 2
        assert y.shape[1] == len(all_target_names)
        split_dataset['y'] = y[:, target_cols]
        assert split_dataset['y'].shape[1] == len(target_cols)

        # Load X
        x_list = list()
        for feat_arr_name in feature_arr_names:
            x_cur = None

            def fpath_generator():
                for feat_path in feat_path_list:
                    for sname in [nickname, split_name]:
                        dense_fpath = os.path.join(
                            feat_path,
                            feat_arr_name + "_" + sname + ".npy")
                        sparse_fpath = os.path.join(
                            feat_path,
                            feat_arr_name + "_csr_" + sname + ".npz")
                        yield dense_fpath, sparse_fpath
            ds_path_list = [pair for pair in fpath_generator()]
            for ii, (dense_fpath, sparse_fpath) in enumerate(ds_path_list):
                try:
                    if os.path.exists(sparse_fpath):
                        x_cur = load_csr_matrix(sparse_fpath)
                        assert np.all(np.isfinite(x_cur.data))
                        break
                    else:
                        x_cur = np.asarray(
                            np.load(dense_fpath), order='C', dtype=np.float64)
                        if x_cur.ndim < 2:
                            x_cur = np.atleast_2d(x_cur).T
                        assert np.all(np.isfinite(x_cur))
                        break
                except IOError as e:
                    if ii == len(ds_path_list) - 1:
                        # Couldn't find desired file in any feat_path
                        raise e
                    else:
                        # Try the next feat_path in the list
                        pass
            if x_cur is not None:
                if feat_colnames_by_arr[feat_arr_name] is not None:
                    feat_dim = len(feat_colnames_by_arr[feat_arr_name])
                    assert x_cur.shape[1] == feat_dim
                else:
                    # Add dummy colnames
                    feat_dim = x_cur.shape[1]
                    n_sig_digits = np.maximum(
                        3, int(np.ceil(np.log10(feat_dim))))
                    fmt_str = "%s_%0" + str(n_sig_digits) + "d"
                    feat_colnames_by_arr[feat_arr_name] = [
                        fmt_str % (feat_arr_name, fid)
                        for fid in range(feat_dim)
                        ]
                x_list.append(x_cur)

        if isinstance(x_list[0], np.ndarray):
            split_dataset['x'] = np.hstack(x_list)
        else:
            split_dataset['x'] = scipy.sparse.hstack(x_list, format='csr')

        assert split_dataset['x'].ndim == 2
        assert split_dataset['x'].shape[0] == split_dataset['y'].shape[0]
        assert (isinstance(split_dataset['x'], np.ndarray)
                or isinstance(split_dataset['x'], scipy.sparse.csr_matrix))

        if split_name == split_names[0]:
            # Flatten feat colnames into single list
            feat_colnames = sum(feat_colnames_by_arr.values(), [])
            assert isinstance(feat_colnames, list)
            assert len(feat_colnames) == split_dataset['x'].shape[1]
            print('y colnames: %s' % ' '.join(target_names))
            if len(feat_colnames) > 10:
                print('x colnames: %s ... %s' % (
                    ' '.join(feat_colnames[:5]),
                    ' '.join(feat_colnames[-5:])))
            else:
                print('x colnames: %s' % ' '.join(feat_colnames))

        print('---- %5s dataset summary' % split_name)
        print('%9d total examples' % y.shape[0])
        print('y : %d x %d targets' % split_dataset['y'].shape)
        print('x : %d x %d features' % split_dataset['x'].shape)
        for c in xrange(len(target_names)):
            y_c = split_dataset['y'][:, c]
            print('target %s : frac pos %.3f' % (
                target_names[c], np.mean(y_c)))
            print(' %6d pos examples' % np.sum(y_c == 1))
            print(' %6d neg examples' % np.sum(y_c == 0))

    elapsed_time = time.time() - start_time
    print('[run_classifier says:] dataset loaded after %.2f sec.'
          % elapsed_time)

    n_cols = len(target_names)
    for c in xrange(n_cols):
        print('[eval_pretrained_classifier says:] eval for target %s'
              % target_names[c])
        eval_pretrained_clf(
            classifier_name=clf_opts['classifier_name'],
            classifier_path=arg_dict['pretrained_clf_path'],
            datasets_by_split=datasets_by_split,
            y_col_id=c,
            y_orig_col_id=all_target_names.index(target_names[c]),
            y_col_name=target_names[c],
            feat_colnames=feat_colnames,
            output_path=arg_dict['pretrained_clf_path'],
            seed_bootstrap=arg_dict['seed_bootstrap'],
            n_bootstraps=arg_dict['n_bootstraps'],
            bootstrap_stratify_pos_and_neg=arg_dict[
                'bootstrap_stratify_pos_and_neg'],
            )
        elapsed_time = time.time() - start_time
        print('[eval_pretrained_classifier says:] '
              'target %s completed after %.2f sec'
              % (target_names[c], elapsed_time))

def select_best_from_many_runs(legend_name=None, results_path_patterns=None, output_path=None, txt_src_path=None, target_y_name=None, all_y_names=None, col_names_to_use_at_selection=['N_STATES'], col_names_to_keep="", col_names_to_keep_per_split="", min_lap_to_use_at_selection=10, split_name_to_use_at_selection='VALID', selection_score_colname='LOSS_X', selection_score_ranking_func='argmin', unk_list=None, **kwargs): """ """ provided_arg_dict = dict(**locals()) # Create output_path on disk if output_path.count("$"): for key, val in locals().items(): if output_path.count('$' + key): output_path = output_path.replace("$" + key, str(val)) if not os.path.exists(output_path): mkpath(output_path) # Setup logging suffix = "__select_best__y_target=%s_score=%s_legend=%s" % ( target_y_name, selection_score_colname, legend_name) config_pprint_logging(output_path, txtfile='stdout%s.txt' % suffix) # Write parsed args to plain-text file # so we can exactly reproduce later this_script_prefix = '[select_best.py says:]' pprint("%s Parsing args ..." % this_script_prefix) with open(os.path.join(output_path, 'settings%s.txt' % suffix), 'w') as f: for key, val in provided_arg_dict.items(): f.write(key + ' = ' + str(val) + '\n') pprint(key + ' = ' + str(val)) with open(os.path.join(output_path, 'args%s.txt' % suffix), 'w') as f: for key, val in provided_arg_dict.items(): f.write('--' + key + ' ' + str(val) + '\n') pprint('') # Parse unknown args if unk_list is not None and len(unk_list) > 0: pprint("UNKNOWN ARGS (ignored)") for key in unk_list: pprint(key) del unk_list # Parse target y names target_y_name = unicode(target_y_name) if not isinstance(all_y_names, list): if os.path.exists(all_y_names): all_y_names = load_list_of_unicode_from_txt(all_y_names) else: all_y_names = map(unicode, all_y_names.split(",")) def force_list_of_strings(val): if not isinstance(val, list): val = map(str, val.split(",")) return val results_path_patterns = force_list_of_strings(results_path_patterns) col_names_to_use_at_selection = force_list_of_strings( col_names_to_use_at_selection) col_names_to_keep = force_list_of_strings(col_names_to_keep) col_names_to_keep_per_split = force_list_of_strings( col_names_to_keep_per_split) # Load df for all runs that match the query all_matching_runs_df = load_df_from_all_folders_matching_list_of_patterns( list_of_path_patterns=results_path_patterns, legend_name=legend_name, y_ind=all_y_names.index(target_y_name), column_names=COLUMN_NAMES, task_ids=range(1, 10), ) all_matching_runs_df['TARGET_LABEL_NAME'] = target_y_name if selection_score_colname.startswith("="): formula = selection_score_colname.lstrip("=") all_matching_runs_df[selection_score_colname] = 0.0 add_ops = formula.split("+") for op in add_ops: coef, colname = op.lstrip('(').rstrip(')').split("*") coef = float(coef) all_matching_runs_df[ selection_score_colname] += coef * all_matching_runs_df[ colname].values if selection_score_ranking_func is None: selection_score_ranking_func = get_score_ranking_function_for_colname( selection_score_colname) elif selection_score_ranking_func == 'argmax': selection_score_ranking_func = np.argmax else: selection_score_ranking_func = np.argmin ## Create dataframe with only the best task at each legend name best_df = select_best_df_at_each_value_of_specific_vars( all_matching_runs_df, legend_name=legend_name, keys=col_names_to_use_at_selection, query_min_lap=min_lap_to_use_at_selection, score_colname=selection_score_colname, score_ranking_func=selection_score_ranking_func, 
        target_splitname=split_name_to_use_at_selection,
        )

    row_dict_list = list()
    # Write the legend names to output path
    for cur_legend_name in np.unique(best_df['LEGEND_NAME_ASCII'].values):
        ## Make symlink to best run's task_path directory
        cur_query_str = (
            "LEGEND_NAME_ASCII == '%s' and IS_BEST_SNAPSHOT > 0"
            % (cur_legend_name))
        # Prepare existing path
        best_snapshot_df = best_df.query(cur_query_str)
        assert best_snapshot_df.shape[0] == len(SPLIT_NAMES)
        best_task_path = best_snapshot_df[
            'TASK_PATH_AT_BEST_SNAPSHOT'].values[0]
        best_task_path = best_task_path.rstrip(os.path.sep)
        assert os.path.exists(best_task_path)
        # Prepare symlink path
        job_path = "best_snapshot_run-legend_name=%s" % (
            cur_legend_name.replace(" ", "_"))
        cur_symlink_output_job_path = os.path.join(output_path, job_path)
        mkpath(cur_symlink_output_job_path)
        cur_symlink_output_task_path = os.path.join(
            output_path, job_path, 'best_task')
        # Remove any old version
        if os.path.islink(cur_symlink_output_task_path):
            os.unlink(cur_symlink_output_task_path)
        # Finally, make the symlink happen
        os.symlink(best_task_path, cur_symlink_output_task_path)
        pprint("\nLEGEND_NAME %s" % cur_legend_name)
        pprint("NEW BEST TASK PATH:\n%s" % cur_symlink_output_task_path)

        ## Make symlink to best snapshot directory
        # Prepare existing snapshot path (download content if necessary)
        snapshot_path = make_snapshot_path_for_lap(
            task_path=best_snapshot_df[
                'TASK_PATH_AT_BEST_SNAPSHOT'].values[0],
            lap=best_snapshot_df['LAP_AT_BEST_SNAPSHOT'].values[0],
            )
        if not os.path.exists(snapshot_path):
            download_snapshot(snapshot_path)
        # Prepare new symlink path
        cur_symlink_snapshot_path = os.path.join(
            cur_symlink_output_job_path, 'best_snapshot')
        # Remove any old version
        if os.path.islink(cur_symlink_snapshot_path):
            os.unlink(cur_symlink_snapshot_path)
        # Finally, make the symlink happen
        os.symlink(snapshot_path, cur_symlink_snapshot_path)
        pprint("NEW BEST SNAPSHOT PATH:\n%s" % cur_symlink_snapshot_path)

        ## If needed, make brand new snapshot with only target y column
        if len(all_y_names) > 1 and target_y_name != 'avg':
            GP = load_param_dict_at_specific_snapshot(
                snapshot_path=snapshot_path)
            new_GP = dict(**GP)
            new_GP['w_CK'] = GP['w_CK'][
                all_y_names.index(target_y_name), :][np.newaxis, :]
            save_topic_model_snapshot(
                output_path=cur_symlink_output_job_path,
                prefix='targety=%s' % (target_y_name),
                **new_GP)

        ## Append to .csv file
        row_dict = OrderedDict()
        row_dict['LEGEND_NAME'] = legend_name
        for key in col_names_to_use_at_selection:
            row_dict[key] = best_snapshot_df[key].values[0]
        for key in col_names_to_keep:
            row_dict[key] = best_snapshot_df[key].values[0]
        for split_name in SPLIT_NAMES:
            best_split_df = best_snapshot_df.query(
                "SPLIT_NAME == '%s'" % split_name)
            assert best_split_df.shape[0] == 1
            assert isinstance(col_names_to_keep_per_split, list)
            for key in col_names_to_keep_per_split:
                split_key = "%s_%s" % (split_name.upper(), key)
                row_dict[split_key] = best_split_df[key].values[0]
        row_dict['LAP'] = best_snapshot_df['LAP'].values[0]
        row_dict['LABEL_NAME'] = best_snapshot_df[
            'TARGET_LABEL_NAME'].values[0]
        row_dict['SNAPSHOT_SRCFILE'] = cur_symlink_snapshot_path
        row_dict['TXTSRCFILES_PATH'] = txt_src_path
        row_dict_list.append(row_dict)

    pprint("\nWriting csv file documenting all best snapshots for legend %s"
           % (legend_name))
    my_df = pd.DataFrame(row_dict_list)
    basename = "best_snapshots_%s.csv" % legend_name
    csv_fpath = os.path.join(output_path, basename)
    my_df.to_csv(csv_fpath, columns=row_dict_list[0].keys(), index=False)
    pprint("WROTE CSV FILE:\n%s" % csv_fpath)
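# Usage sketch for select_best_from_many_runs. Every path, pattern, and
# label name below is a hypothetical placeholder, not a value shipped with
# this project. It also shows the optional "=..." syntax for the selection
# score, which builds a weighted sum of existing result columns.
def _demo_select_best_from_many_runs():
    select_best_from_many_runs(
        legend_name='Gibbs_LDA',
        results_path_patterns='/results/lda-*-n_states=*/',
        output_path='/results/best_runs/',
        txt_src_path='/data/txt_src/',
        target_y_name='anxiety',
        all_y_names='anxiety,depression',
        col_names_to_use_at_selection='N_STATES',
        min_lap_to_use_at_selection=10,
        split_name_to_use_at_selection='VALID',
        selection_score_colname='=(0.5*LOSS_X)+(0.5*Y_ERROR_RATE)',
        selection_score_ranking_func='argmin',
        )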
def make_best_job_df(
        df,
        target_query="SPLIT_NAME == 'VALID' and LAP > 50",
        target_splitname='VALID',
        score_colname='Y_ERROR_RATE',
        score_ranking_func=np.argmin,
        verbose=False):
    ''' Find single best task among all jobs in provided df.

    Returns
    -------
    best_job_df : dataframe of best single task
    '''
    default_score = fetch_default_score(score_ranking_func.__name__)
    job_paths = np.unique(df['JOB_PATH'].values)
    best_task_idstr_list = ['' for a in range(len(job_paths))]
    best_score_idx = np.zeros_like(job_paths, dtype=np.int32)
    best_score = default_score * np.ones_like(job_paths, dtype=np.float64)
    best_lap_idx = np.zeros_like(job_paths, dtype=np.float64)
    for jj, job_path in enumerate(job_paths):
        if job_path is None:
            continue
        cur_job_best_df = make_best_task_df(
            df.query("JOB_PATH == '%s'" % job_path),
            target_query=target_query,
            score_colname=score_colname,
            score_ranking_func=score_ranking_func,
            default_score=default_score,
            verbose=verbose)
        # Narrow down to the target split, after the minimum lap
        cur_job_best_df = cur_job_best_df.query(target_query)
        if verbose:
            pprint(job_path.split(os.path.sep)[-1])
        if cur_job_best_df.shape[0] < 1:
            if verbose:
                pprint(' skipped. Too small to satisfy query.')
            continue
        split_name_chk = np.unique(cur_job_best_df['SPLIT_NAME'].values)
        assert len(split_name_chk) == 1
        assert split_name_chk[0].lower() == target_splitname.lower()
        best_task_idstr_list[jj] = str(cur_job_best_df['TASKID'].values[0])
        best_score_idx[jj] = score_ranking_func(
            cur_job_best_df[score_colname].values)
        best_score[jj] = cur_job_best_df[score_colname].values[
            best_score_idx[jj]]
        best_lap_idx[jj] = cur_job_best_df['LAP'].values[best_score_idx[jj]]
        if verbose:
            print(" best %s = %.4f at lap %9.3f of task %s" % (
                score_colname, best_score[jj], best_lap_idx[jj],
                best_task_idstr_list[jj]))
    # No tasks/jobs exist that satisfy target_query
    # This can happen when runs haven't gone long enough yet
    if np.allclose(best_score, default_score):
        return None
    best_job_idx = score_ranking_func(best_score)
    best_job_df = df.query(
        "JOB_PATH == '%s' and TASKID == '%s'" % (
            job_paths[best_job_idx],
            best_task_idstr_list[best_job_idx])).copy()
    best_job_df['SCORE_AT_BEST_SNAPSHOT'] = best_score[best_job_idx]
    best_job_df['LAP_AT_BEST_SNAPSHOT'] = best_lap_idx[best_job_idx]
    best_job_df['IS_BEFORE_BEST_SNAPSHOT'] = np.asarray(
        best_job_df['LAP'].values.copy() <= best_lap_idx[best_job_idx],
        dtype=np.int32)
    best_job_df['TASK_PATH_AT_BEST_SNAPSHOT'] = os.path.join(
        job_paths[best_job_idx], best_task_idstr_list[best_job_idx])
    best_job_df['IS_BEST_SNAPSHOT'] = np.asarray(
        best_job_df['LAP'].values.copy() == best_lap_idx[best_job_idx],
        dtype=np.int32)
    best_job_df['FRAC_PROGRESS'] = \
        1.0 * best_job_df['LAP'].values.copy() \
        / np.max(best_job_df['LAP'].values)
    return best_job_df
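# Usage sketch for make_best_job_df: rank every job's best task by
# validation error rate and return the rows of the single winner. The
# dataframe passed in is assumed to follow the column conventions used
# above (JOB_PATH, TASKID, SPLIT_NAME, LAP, Y_ERROR_RATE).
def _demo_make_best_job_df(df):
    best_job_df = make_best_job_df(
        df,
        target_query="SPLIT_NAME == 'VALID' and LAP > 50",
        target_splitname='VALID',
        score_colname='Y_ERROR_RATE',
        score_ranking_func=np.argmin,
        verbose=True)
    if best_job_df is None:
        # No snapshot satisfied the query (e.g. runs too short so far)
        pprint('No best job available yet.')
    return best_job_df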
def select_best_df_at_each_value_of_specific_vars(
        df,
        legend_name='Gibbs_LDA',
        keys=['N_STATES'],
        disp_keys=None,
        no_legend_keys=[],
        query="SPLIT_NAME == '$target_splitname' and LAP >= $query_min_lap",
        query_min_lap=5,
        target_splitname='VALID',
        score_colname='LOSS_X',
        score_ranking_func=np.argmin,
        **kwargs):
    ''' Produce dataframe of best runs at each value of specific variables.

    Args
    ----
    df : pandas DataFrame
        Each row represents a snapshot during training.
    legend_name : string
        Nickname of all runs provided.
    keys : list of strings
        Column names of specified variables used for best run selection.

    Returns
    -------
    best_df : pandas DataFrame
    '''
    if disp_keys is None:
        disp_keys = ['LAP_AT_BEST_SNAPSHOT', 'TASKID'] + keys
    query = query.replace("$query_min_lap", str(query_min_lap))
    query = query.replace("$target_splitname", str(target_splitname))
    pprint("Finding snapshots with %s of %s" % (
        score_ranking_func.__name__, score_colname))
    pprint("Among snapshots satisfying query: %s" % query)

    def expand_query_str_list(cur_list, new_vals):
        new_list = list()
        if len(cur_list) == 0:
            for new_q_str in new_vals:
                new_list.append(new_q_str)
        else:
            for q_str in cur_list:
                for new_q_str in new_vals:
                    new_list.append(q_str + " and " + new_q_str)
        return new_list

    query_str_list = list()
    pprint("Finding best task for each possible combo of these legend keys:")
    for key in keys:
        is_finite_mask = np.logical_not(pd.isnull(df[key].values))
        if np.sum(is_finite_mask) > 0:
            u_vals = np.unique(df[key].values[is_finite_mask]).tolist()
        else:
            u_vals = []
        if not np.all(is_finite_mask):
            u_vals += [np.nan]
        new_queries = list()
        for u_val in u_vals:
            if isinstance(u_val, str):
                new_query_str = "%s == '%s'" % (key, u_val)
            elif np.isfinite(u_val):
                new_query_str = "%s == %s" % (key, u_val)
            else:
                # NaN never equals itself, so this selects missing values
                new_query_str = "%s != %s" % (key, key)
            new_queries.append(new_query_str)
        if len(new_queries) == 1:
            # A single-valued key does not multiply the query combos
            if len(query_str_list) < 1:
                query_str_list.extend(new_queries)
            continue
        pprint(" %s: %s" % (key, ','.join(map(str, u_vals))))
        query_str_list = expand_query_str_list(query_str_list, new_queries)

    best_df_list = list()
    for query_str in query_str_list:
        best_job_df = make_best_job_df(
            df.query(query_str),
            target_query=query,
            score_colname=score_colname,
            score_ranking_func=score_ranking_func,
            target_splitname=target_splitname,
            **kwargs)
        if best_job_df is None:
            pprint("NO BEST TASK AVAILABLE FOR %s + %s" % (
                legend_name, query_str))
            continue
        # _UNIQUE_LEGEND_NAME distinctly identifies each "best job",
        #   like 'Gibbs_LDA K == 5'
        # LEGEND_NAME may be simpler, with duplicates,
        #   like 'Gibbs_LDA' for each of K in [5, 10, 20]
        cur_queries = [s for s in query_str.split('and')]
        cur_legend_name = legend_name
        cur_ulegend_name = legend_name
        for cur_query_str in cur_queries:
            is_bad = False
            for no_leg_key in no_legend_keys:
                if cur_query_str.count(no_leg_key) > 0:
                    is_bad = True
            cur_ulegend_name += " " + cur_query_str.strip()
            if not is_bad:
                cur_legend_name += " " + cur_query_str.strip()
        best_job_df['_UNIQUE_LEGEND_NAME'] = cur_ulegend_name
        best_job_df['LEGEND_NAME'] = cur_legend_name
        best_df_list.append(best_job_df)
    best_df = pd.concat(best_df_list)

    pprint("ON SPLIT %s:" % (target_splitname))
    q_df = best_df.query(
        "IS_BEST_SNAPSHOT > 0 and SPLIT_NAME == '%s'" % target_splitname)
    disp_df = q_df[[score_colname] + disp_keys]
    disp_df = disp_df.apply(pd.to_numeric, errors='ignore')
    pprint(disp_df.to_string(
        index=False, header=True,
        float_format=lambda x: ' %.3f' % float(x)))
    best_df.reset_index(inplace=True)
    best_df = simplify_best_df_and_make_unicode_friendly(best_df)
    return best_df
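# Usage sketch for select_best_df_at_each_value_of_specific_vars: pick the
# best task separately at each unique N_STATES value, ranking VALID-split
# snapshots past lap 5 by LOSS_X (lower wins). The input dataframe follows
# the same column conventions assumed throughout this file.
def _demo_select_best_df(all_matching_runs_df):
    best_df = select_best_df_at_each_value_of_specific_vars(
        all_matching_runs_df,
        legend_name='Gibbs_LDA',
        keys=['N_STATES'],
        query_min_lap=5,
        target_splitname='VALID',
        score_colname='LOSS_X',
        score_ranking_func=np.argmin)
    return best_df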
def init_param_dict(
        dataset=None,
        topics_KV=None,
        w_CK=None,
        n_states=None,
        init_name=None,
        init_name_topics='rand_docs',
        init_name_w='regress_given_topics',
        init_model_path=None,
        max_n_docs=100000,
        min_n_docs_per_label=10,
        seed=0,
        alpha=1.1,
        tau=1.1,
        lambda_w=.001,
        verbose=True,
        **kwargs):
    ''' Create initial param dict for slda optimization problem.

    Returns
    -------
    init_param_dict : dict, with fields
        topics_KV : 2D array, K x V
        w_CK : 2D array, C x K
    '''
    n_states = int(n_states)
    lambda_w = float(lambda_w)
    tau = float(tau)
    alpha = float(alpha)

    # Parse init_name
    # For backwards compat: init_name means same thing as init_name_topics
    if init_name is not None:
        init_name_topics = init_name
    del init_name

    if str(init_model_path).lower() != 'none':
        pprint('[init_params] Loading from init_model_path ...')
        if init_model_path.count('snapshot'):
            initfromdisk_param_dict = load_topic_model_param_dict(
                snapshot_path=init_model_path)
        else:
            if init_model_path.endswith(os.path.sep):
                init_model_path = os.path.join(
                    init_model_path, 'param_dict.dump')
            initfromdisk_param_dict = joblib.load(init_model_path)
        topics_KV = initfromdisk_param_dict['topics_KV']
        if 'w_CK' in initfromdisk_param_dict:
            w_CK = initfromdisk_param_dict['w_CK']

    if topics_KV is None or topics_KV.shape[0] < n_states:
        pprint('[init_params] Running init_topics_KV %s ...' % (
            init_name_topics))
        topics_KV = init_topics_KV(
            dataset=dataset,
            topics_KV=topics_KV,
            n_states=n_states,
            seed=seed,
            init_name=init_name_topics,
            alpha=alpha,
            tau=tau,
            )
    if w_CK is None or w_CK.shape[1] < n_states:
        pprint('[init_params] Running init_w_CK %s ...' % (init_name_w))
        if init_name_w.count('regress'):
            assert dataset['n_docs'] < 1e6  # don't want this too big
            pprint('[init_params] Regress Step 1/2: Extract pi_DK...')
            pi_DK = calc_nef_map_pi_DK(
                dataset,
                topics_KV=topics_KV,
                alpha=alpha,
                n_seconds_between_print=600)
            prefix = '[init_params] Regress Step 2/2:'
            w_CK = estimate_w_CK__given_pi_DK(
                dataset=dataset,
                pi_DK=pi_DK,
                lambda_w=lambda_w,
                prefix=prefix,
                verbose=verbose,
                )
        else:
            raise ValueError("Unsupported init_name_w: " + init_name_w)
    assert topics_KV is not None
    assert w_CK is not None
    assert topics_KV.shape[0] == n_states
    assert w_CK.shape[1] == n_states
    pprint('[init_params] Done. Created init_param_dict.')
    return dict(w_CK=w_CK, topics_KV=topics_KV)
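# Usage sketch for init_param_dict: build a K=10 sLDA initialization from
# scratch, seeding topics from random documents and then fitting w_CK by
# regression on the MAP doc-topic proportions. The `dataset` dict is
# assumed to follow the same conventions as elsewhere in this codebase
# (in particular, it must expose 'n_docs').
def _demo_init_param_dict(dataset):
    init_dict = init_param_dict(
        dataset=dataset,
        n_states=10,
        init_name_topics='rand_docs',
        init_name_w='regress_given_topics',
        seed=0,
        alpha=1.1,
        tau=1.1,
        lambda_w=0.001)
    assert init_dict['topics_KV'].shape[0] == 10
    assert init_dict['w_CK'].shape[1] == 10
    return init_dict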