def read_args_from_stdin_and_run():
    ''' Main executable function to train and evaluate classifier.

    Post Condition
    --------------
    AUC and other eval info printed to stdout.
    Trained classifier saved ???.
    '''
    if not sys.stdin.isatty():
        for line in sys.stdin.readlines():
            line = line.strip()
            sys.argv.extend(line.split(' '))
    parser = argparse.ArgumentParser()
    parser.add_argument('--dataset_path',
                        default='/tmp/',
                        type=str,
                        help="Path to folder containing:" +
                        " *.npy files: X_train, y_train, P_train"
                        " *.txt files: X_colnames.txt and y_colnames.txt")
    parser.add_argument(
        '--output_path',
        default='/tmp/',
        type=str,
        help="Path to folder to hold output from classifier. Includes:" +
        " perf_metric*.txt files: auc_train.txt & auc_test.txt" +
        " settings.txt: description of all settings to reproduce.")
    parser.add_argument('--feature_arr_names',
                        type=str,
                        default='X',
                        help='Name of feature files to use for training')
    parser.add_argument('--features_path',
                        default='/tmp/',
                        type=str,
                        help="Path to folder with extra feature files")
    parser.add_argument(
        '--target_arr_name',
        default='Y',
        type=str,
    )
    parser.add_argument(
        '--target_names',
        default='all',
        type=str,
        help='Name of response/intervention to test.' +
        ' To try specific interventions, write names separated by commas.' +
        ' To try all interventions, use special name "all"')
    parser.add_argument(
        '--n_folds',
        default=1,
        type=int,
        help='Number of folds for cross validation during classification.')
    parser.add_argument('--classifier_name',
                        default='logistic_regression',
                        choices=[
                            'k_nearest_neighbors', 'mlp',
                            'logistic_regression', 'extra_trees',
                            'svm_with_linear_kernel', 'svm_with_rbf_kernel'
                        ],
                        help='Name of classifier')
    parser.add_argument(
        '--class_weight_opts',
        choices=['none', 'balanced'],
        default='none',
    )
    parser.add_argument('--max_grid_search_steps',
                        default=None,
                        type=int,
                        help='max number of steps for grid search')
    parser.add_argument('--frac_labels_train',
                        default=1.0,
                        type=float,
                        help='Fraction of the training data to use')
    parser.add_argument('--c_logspace_arg_str',
                        default="-6,4,7",
                        type=str,
                        help='Comma-sep list of args to np.logspace')
    parser.add_argument('--seed',
                        default=8675309,
                        type=int,
                        help='Seed for random number generation')
    parser.add_argument('--seed_bootstrap',
                        default=42,
                        type=int,
                        help='Seed for bootstrap')
    parser.add_argument('--n_bootstraps',
                        default=5000,
                        type=int,
                        help='Number of samples for bootstrap conf. intervals')
    parser.add_argument('--bootstrap_stratify_pos_and_neg',
                        default=True,
                        type=int,
                        help='Whether to stratify examples or not')
    args, unk_list = parser.parse_known_args()
    arg_dict = vars(args)

    dataset_path = arg_dict['dataset_path']
    for key, val in arg_dict.items():
        if arg_dict['output_path'].count('$' + key):
            arg_dict['output_path'] = \
                arg_dict['output_path'].replace('$' + key, str(val))
    if not os.path.exists(arg_dict['output_path']):
        mkpath(arg_dict['output_path'])
    config_pprint_logging(arg_dict['output_path'],
                          txtfile='stdout_%s.txt' % arg_dict['target_names'])
    pprint('[run_classifier says:] Parsing args ...')

    # Parse possible preprocessors
    feat_preproc_grid_dict = dict()
    for key, val in zip(unk_list[::2], unk_list[1::2]):
        if key.startswith('--preproc_'):
            feat_preproc_grid_dict[key[2:]] = str(val).split(',')
            pprint(key + " : " + val)
            arg_dict[key[2:]] = val

    for key in feat_preproc_grid_dict.keys():
        ii = unk_list.index('--' + key)
        del unk_list[ii + 1]
        del unk_list[ii]
    if len(unk_list) > 0:
        pprint("UNKNOWN ARGS (ignored)")
        for key in unk_list:
            pprint(key)

    # Set default seed for numpy
    np.random.seed(arg_dict['seed'])

    # Write parsed args to plain-text file
    # so we can exactly reproduce later
    with open(os.path.join(arg_dict['output_path'], 'settings.txt'), 'w') as f:
        for key, val in arg_dict.items():
            f.write(key + ' = ' + str(val) + '\n')
            pprint(key + ' = ' + str(val))
    with open(os.path.join(arg_dict['output_path'], 'args.txt'), 'w') as f:
        for key, val in arg_dict.items():
            f.write('--' + key + ' ' + str(val) + '\n')
    pprint('')

    feat_path_list = [arg_dict['dataset_path'], arg_dict['features_path']]

    pprint('[run_classifier says:] Loading dataset ...')
    start_time = time.time()
    feature_arr_names = arg_dict['feature_arr_names'].split(',')
    pprint('feature_arr_names:')
    feat_colnames_by_arr = OrderedDict()
    for feat_arr_name in feature_arr_names:
        pprint(feat_arr_name)
        cur_feat_colnames = None
        for feat_path in feat_path_list:
            colname_fpath = os.path.join(feat_path,
                                         feat_arr_name + '_colnames.txt')
            if os.path.exists(colname_fpath):
                cur_feat_colnames = \
                    [str(feat_arr_name + ":") + s
                     for s in load_list_of_unicode_from_txt(colname_fpath)]
                break
        feat_colnames_by_arr[feat_arr_name] = cur_feat_colnames

    target_arr_name = arg_dict['target_arr_name']
    all_target_names = load_list_of_strings_from_txt(
        os.path.join(arg_dict['dataset_path'],
                     target_arr_name + '_colnames.txt'))

    target_names = arg_dict['target_names']
    if target_names == 'all':
        target_names = all_target_names
        target_cols = np.arange(len(all_target_names)).tolist()
    else:
        target_names = target_names.split(',')
        target_cols = list()
        for name in target_names:
            assert name in all_target_names
            target_cols.append(all_target_names.index(name))

    datasets_by_split = dict()
    for split_name in ['train', 'valid', 'test']:
        datasets_by_split[split_name] = dict()
        split_dataset = datasets_by_split[split_name]

        # Load Y
        dense_fpath = os.path.join(dataset_path,
                                   target_arr_name + "_%s.npy" % split_name)
        y = np.asarray(np.load(dense_fpath),
                       order='C', dtype=np.float32)  # 0/1/nan
        if y.ndim < 2:
            y = y[:, np.newaxis]
        assert y.ndim == 2
        assert y.shape[1] == len(all_target_names)
        split_dataset['y'] = y[:, target_cols]
        assert split_dataset['y'].shape[1] == len(target_cols)

        # Load X
        x_list = list()
        for feat_arr_name in feature_arr_names:
            for ii, feat_path in enumerate(feat_path_list):
                dense_fpath = os.path.join(
                    feat_path, feat_arr_name + "_%s.npy" % split_name)
                sparse_fpath = os.path.join(
                    feat_path, feat_arr_name + "_csr_%s.npz" % split_name)
                x_cur = None
                try:
                    if os.path.exists(sparse_fpath):
                        print("Here is sparse_fpath", sparse_fpath)
                        x_cur = load_csr_matrix(sparse_fpath)
                        print(x_cur)
                        assert np.all(np.isfinite(x_cur.data))
                        break
                    else:
                        x_cur = np.asarray(np.load(dense_fpath),
                                           order='C',
                                           dtype=np.float64)
                        if x_cur.ndim < 2:
                            x_cur = np.atleast_2d(x_cur).T
                        assert np.all(np.isfinite(x_cur))
                        break
                except IOError as e:
                    if ii == len(feat_path_list) - 1:
                        # Couldn't find desired file in any feat_path
                        raise e
                    else:
                        # Try the next feat_path in the list
                        pass
            if x_cur is not None:
                if feat_colnames_by_arr[feat_arr_name] is not None:
                    feat_dim = len(feat_colnames_by_arr[feat_arr_name])
                    print('feat name, %s, feat_dim %d'
                          % (feat_arr_name, feat_dim))
                    print('x_cur shape', x_cur.shape[1])
                    assert x_cur.shape[1] == feat_dim
                else:
                    # Add dummy colnames
                    feat_dim = x_cur.shape[1]
                    n_sig_digits = np.maximum(
                        3, int(np.ceil(np.log10(feat_dim))))
                    fmt_str = "%s_%0" + str(n_sig_digits) + "d"
                    feat_colnames_by_arr[feat_arr_name] = [
                        fmt_str % (feat_arr_name, fid)
                        for fid in range(feat_dim)
                    ]
                x_list.append(x_cur)

        if isinstance(x_list[0], np.ndarray):
            split_dataset['x'] = np.hstack(x_list)
        else:
            split_dataset['x'] = scipy.sparse.hstack(x_list, format='csr')

        # Use only a fraction of the training dataset if specified
        frac_labels_train = arg_dict['frac_labels_train']
        if split_name == 'train' and frac_labels_train < 1.0:
            # Same random seed taken from bow_dataset.py
            data_prng = np.random.RandomState(int(42))
            n_rows = y.shape[0]
            # Note: does not handle truly missing labels
            indexed_rows = np.arange(n_rows)
            shuffled_rows = data_prng.permutation(indexed_rows)
            n_visible = int(np.ceil(frac_labels_train * n_rows))
            visible_rows = shuffled_rows[:n_visible]
            split_dataset['x'] = split_dataset['x'][visible_rows, :]
            split_dataset['y'] = split_dataset['y'][visible_rows, :]

        assert split_dataset['x'].ndim == 2
        assert split_dataset['x'].shape[0] == split_dataset['y'].shape[0]
        assert (isinstance(split_dataset['x'], np.ndarray)
                or isinstance(split_dataset['x'], scipy.sparse.csr_matrix))

        if split_name == 'train':
            # Flatten feat colnames into single list
            feat_colnames = sum(feat_colnames_by_arr.values(), [])
            assert isinstance(feat_colnames, list)
            assert len(feat_colnames) == split_dataset['x'].shape[1]

            if len(feat_colnames) > 10:
                pprint('x colnames: %s ... %s' % (' '.join(
                    feat_colnames[:5]), ' '.join(feat_colnames[-5:])))
            else:
                pprint('x colnames: %s' % ' '.join(feat_colnames))
            pprint('y colnames: %s' % ' '.join(target_names))

        pprint('---- %5s dataset summary' % split_name)
        pprint('%9d total examples' % y.shape[0])
        pprint('y : %d x %d targets' % split_dataset['y'].shape)
        pprint('x : %d x %d features' % split_dataset['x'].shape)

        for c in range(len(target_names)):
            y_c = split_dataset['y'][:, c]
            nan_bmask = np.isnan(y_c)
            pos_bmask = y_c == 1
            neg_bmask = y_c == 0
            pprint('target %s :' % target_names[c])
            pprint(' %6d pos examples | %.3f'
                   % (np.sum(pos_bmask), calcfrac(pos_bmask)))
            pprint(' %6d neg examples | %.3f'
                   % (np.sum(neg_bmask), calcfrac(neg_bmask)))
            pprint(' %6d NaN examples | %.3f'
                   % (np.sum(nan_bmask), calcfrac(nan_bmask)))
            assert nan_bmask.sum() + pos_bmask.sum() + neg_bmask.sum() \
                == neg_bmask.size

    elapsed_time = time.time() - start_time
    pprint('[run_classifier says:] dataset loaded after %.2f sec.'
           % elapsed_time)

    n_cols = len(target_names)
    for c in range(n_cols):
        pprint('[run_classifier says:] train for target %s' % target_names[c])

        train_and_eval_clf_with_best_params_via_grid_search(
            arg_dict['classifier_name'],
            datasets_by_split=datasets_by_split,
            y_col_id=c,
            y_orig_col_id=all_target_names.index(target_names[c]),
            y_col_name=target_names[c],
            feat_colnames=feat_colnames,
            feat_preproc_grid_dict=feat_preproc_grid_dict,
            output_path=arg_dict['output_path'],
            max_grid_search_steps=arg_dict['max_grid_search_steps'],
            class_weight_opts=arg_dict['class_weight_opts'],
            c_logspace_arg_str=arg_dict['c_logspace_arg_str'],
            random_state=arg_dict['seed'],
            seed_bootstrap=arg_dict['seed_bootstrap'],
            n_bootstraps=arg_dict['n_bootstraps'],
            bootstrap_stratify_pos_and_neg=arg_dict[
                'bootstrap_stratify_pos_and_neg'],
        )

        elapsed_time = time.time() - start_time
        pprint('[run_classifier says:] target %s completed after %.2f sec'
               % (target_names[c], elapsed_time))
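
# Usage note (illustrative sketch, not from the source). Because the driver
# above appends any whitespace-separated tokens read from stdin to sys.argv
# before parsing, it can be invoked either with flags on the command line or
# by piping a flag string. The script name below is hypothetical and assumes
# the module's __main__ hook calls read_args_from_stdin_and_run():
#
#     echo '--dataset_path /path/to/data \
#           --output_path /results/$classifier_name \
#           --classifier_name logistic_regression \
#           --target_names all' \
#         | python run_classifier.py
#
# Any '$<arg_name>' token inside --output_path is replaced with that
# argument's parsed value, so the example above would write settings.txt,
# args.txt, and perf_metric*.txt files under /results/logistic_regression/.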
def read_args_from_stdin_and_run():
    ''' Main executable function to evaluate a pre-trained classifier.

    Post Condition
    --------------
    AUC and other eval info printed to stdout.
    Eval results saved to the pre-trained classifier's folder.
    '''
    if not sys.stdin.isatty():
        for line in sys.stdin.readlines():
            line = line.strip()
            sys.argv.extend(line.split(' '))
    parser = argparse.ArgumentParser()
    parser.add_argument('--dataset_path',
                        default='/tmp/',
                        type=str,
                        help="Path to folder containing:" +
                        " *.npy files: X_train, y_train, P_train"
                        " *.txt files: X_colnames.txt and y_colnames.txt")
    parser.add_argument(
        '--pretrained_clf_path',
        default='/tmp/',
        type=str,
        help="Path to folder holding the pre-trained classifier's output," +
        " including settings.txt (used to reproduce its settings)." +
        " Eval results (perf_metric*.txt files such as auc_train.txt &" +
        " auc_test.txt) are written here as well.")
    parser.add_argument('--split_names', default='test')
    parser.add_argument('--split_nicknames', default='evaltest')
    parser.add_argument('--features_path',
                        default='/tmp/',
                        type=str,
                        help="Path to folder with SSAMfeat*.npy files")
    parser.add_argument(
        '--target_arr_name',
        default='Y',
        type=str,
    )
    parser.add_argument(
        '--target_names',
        default='all',
        type=str,
        help='Name of response/intervention to test.' +
        ' To try specific interventions, write names separated by commas.' +
        ' To try all interventions, use special name "all"')
    parser.add_argument('--seed_bootstrap',
                        default=42,
                        type=int,
                        help='Seed for bootstrap')
    parser.add_argument('--n_bootstraps',
                        default=5000,
                        type=int,
                        help='Number of samples for bootstrap conf. intervals')
    parser.add_argument('--bootstrap_stratify_pos_and_neg',
                        default=True,
                        type=int,
                        help='Whether to stratify examples or not')
    args, unk_list = parser.parse_known_args()
    arg_dict = vars(args)

    dataset_path = arg_dict['dataset_path']
    assert os.path.exists(arg_dict['pretrained_clf_path'])
    output_path = arg_dict['pretrained_clf_path']

    clf_opts = list()
    # Read settings saved by the training run
    # so we can exactly reproduce its configuration
    with open(os.path.join(output_path, 'settings.txt'), 'r') as f:
        for line in f.readlines():
            line = line.strip()
            clf_opts.append(line.split(' = '))
    clf_opts = dict(clf_opts)

    feat_path_list = [arg_dict['dataset_path'], arg_dict['features_path']]

    pprint('[run_classifier says:] Loading dataset ...')
    start_time = time.time()
    feature_arr_names = clf_opts['feature_arr_names'].split(',')
    pprint('feature_arr_names:')
    feat_colnames_by_arr = OrderedDict()
    for feat_arr_name in feature_arr_names:
        pprint(feat_arr_name)
        cur_feat_colnames = None
        for feat_path in feat_path_list:
            colname_fpath = os.path.join(feat_path,
                                         feat_arr_name + '_colnames.txt')
            if os.path.exists(colname_fpath):
                cur_feat_colnames = \
                    [str(feat_arr_name + ":") + s
                     for s in load_list_of_unicode_from_txt(colname_fpath)]
                break
        feat_colnames_by_arr[feat_arr_name] = cur_feat_colnames

    target_arr_name = arg_dict['target_arr_name']
    all_target_names = load_list_of_strings_from_txt(
        os.path.join(arg_dict['dataset_path'],
                     target_arr_name + '_colnames.txt'))

    target_names = arg_dict['target_names']
    if target_names == 'all':
        target_names = all_target_names
        target_cols = np.arange(len(all_target_names)).tolist()
    else:
        target_names = target_names.split(',')
        target_cols = list()
        for name in target_names:
            assert name in all_target_names
            target_cols.append(all_target_names.index(name))

    datasets_by_split = dict()
    split_nicknames = arg_dict['split_nicknames'].split(',')
    split_names = arg_dict['split_names'].split(',')

    for nickname, split_name in zip(split_nicknames, split_names):
        datasets_by_split[nickname] = dict()
        split_dataset = datasets_by_split[nickname]

        # Load Y
        dense_fpath = os.path.join(dataset_path,
                                   target_arr_name + "_%s.npy" % split_name)
        y = np.asarray(np.load(dense_fpath), order='C', dtype=np.int32)
        if y.ndim < 2:
            y = y[:, np.newaxis]
        assert y.ndim == 2
        assert y.shape[1] == len(all_target_names)
        split_dataset['y'] = y[:, target_cols]
        assert split_dataset['y'].shape[1] == len(target_cols)

        # Load X
        x_list = list()
        for feat_arr_name in feature_arr_names:
            x_cur = None

            def fpath_generator():
                for feat_path in feat_path_list:
                    for sname in [nickname, split_name]:
                        dense_fpath = os.path.join(
                            feat_path, feat_arr_name + "_" + sname + ".npy")
                        sparse_fpath = os.path.join(
                            feat_path,
                            feat_arr_name + "_csr_" + sname + ".npz")
                        yield dense_fpath, sparse_fpath

            ds_path_list = [pair for pair in fpath_generator()]
            for ii, (dense_fpath, sparse_fpath) in enumerate(ds_path_list):
                try:
                    if os.path.exists(sparse_fpath):
                        x_cur = load_csr_matrix(sparse_fpath)
                        assert np.all(np.isfinite(x_cur.data))
                        break
                    else:
                        x_cur = np.asarray(np.load(dense_fpath),
                                           order='C',
                                           dtype=np.float64)
                        if x_cur.ndim < 2:
                            x_cur = np.atleast_2d(x_cur).T
                        assert np.all(np.isfinite(x_cur))
                        break
                except IOError as e:
                    if ii == len(ds_path_list) - 1:
                        # Couldn't find desired file in any feat_path
                        raise e
                    else:
                        # Try the next feat_path in the list
                        pass

            if x_cur is not None:
                if feat_colnames_by_arr[feat_arr_name] is not None:
                    feat_dim = len(feat_colnames_by_arr[feat_arr_name])
                    assert x_cur.shape[1] == feat_dim
                else:
                    # Add dummy colnames
                    feat_dim = x_cur.shape[1]
                    n_sig_digits = np.maximum(
                        3, int(np.ceil(np.log10(feat_dim))))
                    fmt_str = "%s_%0" + str(n_sig_digits) + "d"
                    feat_colnames_by_arr[feat_arr_name] = [
                        fmt_str % (feat_arr_name, fid)
                        for fid in range(feat_dim)
                    ]
                x_list.append(x_cur)

        if isinstance(x_list[0], np.ndarray):
            split_dataset['x'] = np.hstack(x_list)
        else:
            split_dataset['x'] = scipy.sparse.hstack(x_list, format='csr')

        assert split_dataset['x'].ndim == 2
        assert split_dataset['x'].shape[0] == split_dataset['y'].shape[0]
        assert (isinstance(split_dataset['x'], np.ndarray)
                or isinstance(split_dataset['x'], scipy.sparse.csr_matrix))

        if split_name == split_names[0]:
            # Flatten feat colnames into single list
            feat_colnames = sum(feat_colnames_by_arr.values(), [])
            assert isinstance(feat_colnames, list)
            assert len(feat_colnames) == split_dataset['x'].shape[1]

            print('y colnames: %s' % ' '.join(target_names))
            if len(feat_colnames) > 10:
                print('x colnames: %s ... %s' % (' '.join(
                    feat_colnames[:5]), ' '.join(feat_colnames[-5:])))
            else:
                print('x colnames: %s' % ' '.join(feat_colnames))

        print('---- %5s dataset summary' % split_name)
        print('%9d total examples' % y.shape[0])
        print('y : %d x %d targets' % split_dataset['y'].shape)
        print('x : %d x %d features' % split_dataset['x'].shape)

        for c in range(len(target_names)):
            y_c = split_dataset['y'][:, c]
            print('target %s : frac pos %.3f'
                  % (target_names[c], np.mean(y_c)))
            print(' %6d pos examples' % np.sum(y_c == 1))
            print(' %6d neg examples' % np.sum(y_c == 0))

    elapsed_time = time.time() - start_time
    print('[run_classifier says:] dataset loaded after %.2f sec.'
          % elapsed_time)

    n_cols = len(target_names)
    for c in range(n_cols):
        print('[eval_pretrained_classifier says:] eval for target %s'
              % target_names[c])

        eval_pretrained_clf(
            classifier_name=clf_opts['classifier_name'],
            classifier_path=arg_dict['pretrained_clf_path'],
            datasets_by_split=datasets_by_split,
            y_col_id=c,
            y_orig_col_id=all_target_names.index(target_names[c]),
            y_col_name=target_names[c],
            feat_colnames=feat_colnames,
            output_path=arg_dict['pretrained_clf_path'],
            seed_bootstrap=arg_dict['seed_bootstrap'],
            n_bootstraps=arg_dict['n_bootstraps'],
            bootstrap_stratify_pos_and_neg=arg_dict[
                'bootstrap_stratify_pos_and_neg'],
        )

        elapsed_time = time.time() - start_time
        print('[eval_pretrained_classifier says:] '
              'target %s completed after %.2f sec'
              % (target_names[c], elapsed_time))
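
# Usage note (illustrative sketch, not from the source). This eval driver
# re-reads the settings.txt written by the training run found in
# --pretrained_clf_path, so only data locations and split names need to be
# supplied. The script name below is hypothetical and assumes the module's
# __main__ hook calls read_args_from_stdin_and_run():
#
#     python eval_pretrained_classifier.py \
#         --dataset_path /path/to/data \
#         --pretrained_clf_path /results/logistic_regression \
#         --split_names test \
#         --split_nicknames evaltest \
#         --target_names all
#
# Per-split feature arrays are looked up under both the nickname and the
# original split name (e.g. X_evaltest.npy, then X_test.npy) in each
# candidate features folder, preferring a sparse *_csr_*.npz file if present.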