Example #1

import argparse
import os
import sys
import time
from collections import OrderedDict
from distutils.dir_util import mkpath

import numpy as np
import scipy.sparse

# Project-local helpers (assumed importable from this repo's utility modules):
# config_pprint_logging, pprint, calcfrac, load_csr_matrix,
# load_list_of_strings_from_txt, load_list_of_unicode_from_txt,
# train_and_eval_clf_with_best_params_via_grid_search, eval_pretrained_clf

def read_args_from_stdin_and_run():
    ''' Main executable function to train and evaluate classifier.

    Post Condition
    --------------
    AUC and other eval info printed to stdout.
    Trained classifier saved ???.
    '''
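    # If arguments are piped via stdin, append them to sys.argv before parsing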
    if not sys.stdin.isatty():
        for line in sys.stdin.readlines():
            line = line.strip()
            sys.argv.extend(line.split(' '))
    parser = argparse.ArgumentParser()
    parser.add_argument('--dataset_path',
                        default='/tmp/',
                        type=str,
                        help="Path to folder containing:" +
                        " *.npy files: X_train, y_train, P_train"
                        " *.txt files: X_colnames.txt and y_colnames.txt")
    parser.add_argument(
        '--output_path',
        default='/tmp/',
        type=str,
        help="Path to folder to hold output from classifier. Includes:" +
        " perf_metric*.txt files: auc_train.txt & auc_test.txt" +
        " settings.txt: description of all settings to reproduce.")
    parser.add_argument('--feature_arr_names',
                        type=str,
                        default='X',
                        help='Comma-separated names of feature arrays'
                             ' to use for training')
    parser.add_argument('--features_path',
                        default='/tmp/',
                        type=str,
                        help="Path to folder with extra feature files")
    parser.add_argument(
        '--target_arr_name',
        default='Y',
        type=str,
    )
    parser.add_argument(
        '--target_names',
        default='all',
        type=str,
        help='Name of response/intervention to test.' +
        ' To try specific interventions, write names separated by commas.' +
        ' To try all interventions, use special name "all"')
    parser.add_argument(
        '--n_folds',
        default=1,
        type=int,
        help='Number of folds for cross validation during classification.')
    parser.add_argument('--classifier_name',
                        default='logistic_regression',
                        choices=[
                            'k_nearest_neighbors', 'mlp',
                            'logistic_regression', 'extra_trees',
                            'svm_with_linear_kernel', 'svm_with_rbf_kernel'
                        ],
                        help='Name of classifier')
    parser.add_argument(
        '--class_weight_opts',
        choices=['none', 'balanced'],
        default='none',
    )
    parser.add_argument('--max_grid_search_steps',
                        default=None,
                        type=int,
                        help='max number of steps for grid search')
    parser.add_argument('--frac_labels_train',
                        default=1.0,
                        type=float,
                        help='Fraction of the training data to use')
    parser.add_argument('--c_logspace_arg_str',
                        default="-6,4,7",
                        type=str,
                        help='Comma-sep list of args to np.logspace')
    parser.add_argument('--seed',
                        default=8675309,
                        type=int,
                        help='Seed for random number generation')
    parser.add_argument('--seed_bootstrap',
                        default=42,
                        type=int,
                        help='Seed for bootstrap')
    parser.add_argument('--n_bootstraps',
                        default=5000,
                        type=int,
                        help='Number of samples for bootstrap conf. intervals')
    parser.add_argument('--bootstrap_stratify_pos_and_neg',
                        default=True,
                        type=int,
                        help='Whether to stratify bootstrap samples by'
                             ' positive/negative label (0 or 1)')
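    # parse_known_args keeps unrecognized args so --preproc_* options
    # can be handled separately below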
    args, unk_list = parser.parse_known_args()
    arg_dict = vars(args)

    dataset_path = arg_dict['dataset_path']
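    # Expand any $<arg_name> placeholders in output_path with parsed arg values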
    for key, val in arg_dict.items():
        if arg_dict['output_path'].count('$' + key):
            arg_dict['output_path'] = \
                arg_dict['output_path'].replace('$' + key, str(val))
    if not os.path.exists(arg_dict['output_path']):
        mkpath(arg_dict['output_path'])

    config_pprint_logging(arg_dict['output_path'],
                          txtfile='stdout_%s.txt' % arg_dict['target_names'])
    pprint('[run_classifier says:] Parsing args ...')

    # Parse possible preprocessors
    feat_preproc_grid_dict = dict()
    for key, val in zip(unk_list[::2], unk_list[1::2]):
        if key.startswith('--preproc_'):
            feat_preproc_grid_dict[key[2:]] = str(val).split(',')
            pprint(key + " : " + val)
            arg_dict[key[2:]] = val

    for key in feat_preproc_grid_dict.keys():
        ii = unk_list.index('--' + key)
        del unk_list[ii + 1]
        del unk_list[ii]
    if len(unk_list) > 0:
        pprint("UNKNOWN ARGS (ignored)")
        for key in unk_list:
            pprint(key)

    # Set default seed for numpy
    np.random.seed(arg_dict['seed'])

    # Write parsed args to plain-text file
    # so we can exactly reproduce later
    with open(os.path.join(arg_dict['output_path'], 'settings.txt'), 'w') as f:
        for key, val in arg_dict.items():
            f.write(key + ' = ' + str(val) + '\n')
            pprint(key + ' = ' + str(val))
    with open(os.path.join(arg_dict['output_path'], 'args.txt'), 'w') as f:
        for key, val in arg_dict.items():
            f.write('--' + key + ' ' + str(val) + '\n')
    pprint('')

    feat_path_list = [arg_dict['dataset_path'], arg_dict['features_path']]

    pprint('[run_classifier says:] Loading dataset ...')
    start_time = time.time()
    feature_arr_names = arg_dict['feature_arr_names'].split(',')
    pprint('feature_arr_names:')
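    # Map each feature array name to its column names
    # (None if no *_colnames.txt file is found)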
    feat_colnames_by_arr = OrderedDict()
    for feat_arr_name in feature_arr_names:
        pprint(feat_arr_name)
        cur_feat_colnames = None
        for feat_path in feat_path_list:
            colname_fpath = os.path.join(feat_path,
                                         feat_arr_name + '_colnames.txt')
            if os.path.exists(colname_fpath):
                cur_feat_colnames = \
                    [str(feat_arr_name + ":") + s
                        for s in load_list_of_unicode_from_txt(colname_fpath)]
                break
        feat_colnames_by_arr[feat_arr_name] = cur_feat_colnames

    target_arr_name = arg_dict['target_arr_name']
    all_target_names = load_list_of_strings_from_txt(
        os.path.join(arg_dict['dataset_path'],
                     target_arr_name + '_colnames.txt'))

    target_names = arg_dict['target_names']
    if target_names == 'all':
        target_names = all_target_names
        target_cols = np.arange(len(all_target_names)).tolist()
    else:
        target_names = target_names.split(',')
        target_cols = list()
        for name in target_names:
            assert name in all_target_names
            target_cols.append(all_target_names.index(name))

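    # For each split, load y from dataset_path and each x array from the
    # first folder in feat_path_list that provides it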
    datasets_by_split = dict()
    for split_name in ['train', 'valid', 'test']:
        datasets_by_split[split_name] = dict()
        split_dataset = datasets_by_split[split_name]

        # Load Y
        dense_fpath = os.path.join(dataset_path,
                                   target_arr_name + "_%s.npy" % split_name)
        y = np.asarray(np.load(dense_fpath), order='C',
                       dtype=np.float32)  # 0/1/nan
        if y.ndim < 2:
            y = y[:, np.newaxis]
        assert y.ndim == 2
        assert y.shape[1] == len(all_target_names)
        split_dataset['y'] = y[:, target_cols]
        assert split_dataset['y'].shape[1] == len(target_cols)

        # Load X
        x_list = list()
        for feat_arr_name in feature_arr_names:
            for ii, feat_path in enumerate(feat_path_list):
                dense_fpath = os.path.join(
                    feat_path, feat_arr_name + "_%s.npy" % split_name)
                sparse_fpath = os.path.join(
                    feat_path, feat_arr_name + "_csr_%s.npz" % split_name)
                x_cur = None
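                # Prefer a sparse CSR file if present; otherwise load the dense .npy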
                try:
                    if os.path.exists(sparse_fpath):
                        print("Here is sparse_fpath", sparse_fpath)
                        x_cur = load_csr_matrix(sparse_fpath)
                        print(x_cur)
                        assert np.all(np.isfinite(x_cur.data))
                        break
                    else:
                        x_cur = np.asarray(np.load(dense_fpath),
                                           order='C',
                                           dtype=np.float64)
                        if x_cur.ndim < 2:
                            x_cur = np.atleast_2d(x_cur).T
                        assert np.all(np.isfinite(x_cur))
                        break
                except IOError as e:
                    if ii == len(feat_path_list) - 1:
                        # Couldn't find desired file in any feat_path
                        raise e
                    else:
                        # Try the next feat_path in the list
                        pass
            if x_cur is not None:
                if feat_colnames_by_arr[feat_arr_name] is not None:
                    feat_dim = len(feat_colnames_by_arr[feat_arr_name])
                    print('feat name, %s, feat_dim %d' %
                          (feat_arr_name, feat_dim))
                    print('x_cur n_cols', x_cur.shape[1])
                    assert x_cur.shape[1] == feat_dim
                else:
                    # Add dummy colnames
                    feat_dim = x_cur.shape[1]
                    n_sig_digits = np.maximum(3,
                                              int(np.ceil(np.log10(feat_dim))))
                    fmt_str = "%s_%0" + str(n_sig_digits) + "d"
                    feat_colnames_by_arr[feat_arr_name] = [
                        fmt_str % (feat_arr_name, fid)
                        for fid in range(feat_dim)
                    ]
                x_list.append(x_cur)

        if isinstance(x_list[0], np.ndarray):
            split_dataset['x'] = np.hstack(x_list)
        else:
            split_dataset['x'] = scipy.sparse.hstack(x_list, format='csr')

        # Use only a fraction of the training dataset if specified
        frac_labels_train = arg_dict['frac_labels_train']
        if split_name == 'train' and frac_labels_train < 1.0:
            # Same random seed taken from bow_dataset.py
            data_prng = np.random.RandomState(int(42))
            n_rows = y.shape[0]

            # Note: does not handle truly missing labels
            indexed_rows = np.arange(n_rows)
            shuffled_rows = data_prng.permutation(indexed_rows)
            n_visible = int(np.ceil(frac_labels_train * n_rows))
            visible_rows = shuffled_rows[:n_visible]
            split_dataset['x'] = split_dataset['x'][visible_rows, :]
            split_dataset['y'] = split_dataset['y'][visible_rows, :]

        assert split_dataset['x'].ndim == 2
        assert split_dataset['x'].shape[0] == split_dataset['y'].shape[0]
        assert (isinstance(split_dataset['x'], np.ndarray)
                or isinstance(split_dataset['x'], scipy.sparse.csr_matrix))

        if split_name == 'train':
            # Flatten feat colnames into single list
            feat_colnames = sum(feat_colnames_by_arr.values(), [])
            assert isinstance(feat_colnames, list)
            assert len(feat_colnames) == split_dataset['x'].shape[1]
            if len(feat_colnames) > 10:
                pprint('x colnames: %s ... %s' % (' '.join(
                    feat_colnames[:5]), ' '.join(feat_colnames[-5:])))
            else:
                pprint('x colnames: %s' % ' '.join(feat_colnames))
            pprint('y colnames: %s' % ' '.join(target_names))

        pprint('---- %5s dataset summary' % split_name)
        pprint('%9d total examples' % y.shape[0])
        pprint('y : %d x %d targets' % split_dataset['y'].shape)
        pprint('x : %d x %d features' % split_dataset['x'].shape)

        for c in range(len(target_names)):
            y_c = split_dataset['y'][:, c]
            nan_bmask = np.isnan(y_c)
            pos_bmask = y_c == 1
            neg_bmask = y_c == 0
            pprint('target %s :' % target_names[c])
            pprint('    %6d pos examples | %.3f' %
                   (np.sum(pos_bmask), calcfrac(pos_bmask)))
            pprint('    %6d neg examples | %.3f' %
                   (np.sum(neg_bmask), calcfrac(neg_bmask)))
            pprint('    %6d NaN examples | %.3f' %
                   (np.sum(nan_bmask), calcfrac(nan_bmask)))
            assert (nan_bmask.sum() + pos_bmask.sum() + neg_bmask.sum()
                    == neg_bmask.size)

    elapsed_time = time.time() - start_time
    pprint('[run_classifier says:] dataset loaded after %.2f sec.' %
           elapsed_time)

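    # Grid-search, train, and evaluate one classifier per target column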
    n_cols = len(target_names)
    for c in range(n_cols):
        pprint('[run_classifier says:] train for target %s' % target_names[c])
        train_and_eval_clf_with_best_params_via_grid_search(
            arg_dict['classifier_name'],
            datasets_by_split=datasets_by_split,
            y_col_id=c,
            y_orig_col_id=all_target_names.index(target_names[c]),
            y_col_name=target_names[c],
            feat_colnames=feat_colnames,
            feat_preproc_grid_dict=feat_preproc_grid_dict,
            output_path=arg_dict['output_path'],
            max_grid_search_steps=arg_dict['max_grid_search_steps'],
            class_weight_opts=arg_dict['class_weight_opts'],
            c_logspace_arg_str=arg_dict['c_logspace_arg_str'],
            random_state=arg_dict['seed'],
            seed_bootstrap=arg_dict['seed_bootstrap'],
            n_bootstraps=arg_dict['n_bootstraps'],
            bootstrap_stratify_pos_and_neg=arg_dict[
                'bootstrap_stratify_pos_and_neg'],
        )
        elapsed_time = time.time() - start_time
        pprint('[run_classifier says:] target %s completed after %.2f sec' %
               (target_names[c], elapsed_time))
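

# Hypothetical usage sketch (script name and paths below are assumptions, not
# taken from the source). Because read_args_from_stdin_and_run() extends
# sys.argv with whatever is piped in, arguments can also be supplied on stdin:
#   echo '--dataset_path ./data/ --output_path ./results/' | python run_classifier.py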
def read_args_from_stdin_and_run():
    ''' Main executable function to evaluate a pretrained classifier.

    Post Condition
    --------------
    AUC and other eval info printed to stdout.
    '''
    if not sys.stdin.isatty():
        for line in sys.stdin.readlines():
            line = line.strip()
            sys.argv.extend(line.split(' '))
    parser = argparse.ArgumentParser()
    parser.add_argument('--dataset_path',
                        default='/tmp/',
                        type=str,
                        help="Path to folder containing:" +
                        " *.npy files: X_train, y_train, P_train"
                        " *.txt files: X_colnames.txt and y_colnames.txt")
    parser.add_argument(
        '--pretrained_clf_path',
        default='/tmp/',
        type=str,
        help="Path to folder containing the pretrained classifier and the" +
        " settings.txt written at training time." +
        " Evaluation output (perf_metric*.txt files) is also written here.")
    parser.add_argument('--split_names', default='test')
    parser.add_argument('--split_nicknames', default='evaltest')

    parser.add_argument('--features_path',
                        default='/tmp/',
                        type=str,
                        help="Path to folder with SSAMfeat*.npy files")
    parser.add_argument(
        '--target_arr_name',
        default='Y',
        type=str,
    )
    parser.add_argument(
        '--target_names',
        default='all',
        type=str,
        help='Name of response/intervention to test.' +
        ' To try specific interventions, write names separated by commas.' +
        ' To try all interventions, use special name "all"')
    parser.add_argument('--seed_bootstrap',
                        default=42,
                        type=int,
                        help='Seed for bootstrap')
    parser.add_argument('--n_bootstraps',
                        default=5000,
                        type=int,
                        help='Number of samples for bootstrap conf. intervals')
    parser.add_argument('--bootstrap_stratify_pos_and_neg',
                        default=True,
                        type=int,
                        help='Whether to stratify bootstrap samples by'
                             ' positive/negative label (0 or 1)')
    args, unk_list = parser.parse_known_args()
    arg_dict = vars(args)

    dataset_path = arg_dict['dataset_path']
    assert os.path.exists(arg_dict['pretrained_clf_path'])
    output_path = arg_dict['pretrained_clf_path']

    clf_opts = list()
    # Read back the settings written at training time
    # so evaluation exactly matches the training configuration
    with open(os.path.join(output_path, 'settings.txt'), 'r') as f:
        for line in f.readlines():
            line = line.strip()
            clf_opts.append(line.split(' = '))
    clf_opts = dict(clf_opts)

    feat_path_list = [arg_dict['dataset_path'], arg_dict['features_path']]

    pprint('[run_classifier says:] Loading dataset ...')
    start_time = time.time()
    feature_arr_names = clf_opts['feature_arr_names'].split(',')
    pprint('feature_arr_names:')
    feat_colnames_by_arr = OrderedDict()
    for feat_arr_name in feature_arr_names:
        pprint(feat_arr_name)
        cur_feat_colnames = None
        for feat_path in feat_path_list:
            colname_fpath = os.path.join(feat_path,
                                         feat_arr_name + '_colnames.txt')
            if os.path.exists(colname_fpath):
                cur_feat_colnames = \
                    [str(feat_arr_name + ":") + s
                        for s in load_list_of_unicode_from_txt(colname_fpath)]
                break
        feat_colnames_by_arr[feat_arr_name] = cur_feat_colnames

    target_arr_name = arg_dict['target_arr_name']
    all_target_names = load_list_of_strings_from_txt(
        os.path.join(arg_dict['dataset_path'],
                     target_arr_name + '_colnames.txt'))
    target_names = arg_dict['target_names']
    if target_names == 'all':
        target_names = all_target_names
        target_cols = np.arange(len(all_target_names)).tolist()
    else:
        target_names = target_names.split(',')
        target_cols = list()
        for name in target_names:
            assert name in all_target_names
            target_cols.append(all_target_names.index(name))

    datasets_by_split = dict()
    split_nicknames = arg_dict['split_nicknames'].split(',')
    split_names = arg_dict['split_names'].split(',')

    for nickname, split_name in zip(split_nicknames, split_names):
        datasets_by_split[nickname] = dict()
        split_dataset = datasets_by_split[nickname]

        # Load Y
        dense_fpath = os.path.join(dataset_path,
                                   target_arr_name + "_%s.npy" % split_name)
        y = np.asarray(np.load(dense_fpath), order='C', dtype=np.int32)
        if y.ndim < 2:
            y = y[:, np.newaxis]
        assert y.ndim == 2
        assert y.shape[1] == len(all_target_names)
        split_dataset['y'] = y[:, target_cols]
        assert split_dataset['y'].shape[1] == len(target_cols)

        # Load X
        x_list = list()
        for feat_arr_name in feature_arr_names:
            x_cur = None

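            # Candidate (dense, sparse) file paths: try the split nickname,
            # then the raw split name, within each feature folder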
            def fpath_generator():
                for feat_path in feat_path_list:
                    for sname in [nickname, split_name]:
                        dense_fpath = os.path.join(
                            feat_path, feat_arr_name + "_" + sname + ".npy")
                        sparse_fpath = os.path.join(
                            feat_path,
                            feat_arr_name + "_csr_" + sname + ".npz")
                        yield dense_fpath, sparse_fpath

            ds_path_list = [pair for pair in fpath_generator()]
            for ii, (dense_fpath, sparse_fpath) in enumerate(ds_path_list):
                try:
                    if os.path.exists(sparse_fpath):
                        x_cur = load_csr_matrix(sparse_fpath)
                        assert np.all(np.isfinite(x_cur.data))
                        break
                    else:
                        x_cur = np.asarray(np.load(dense_fpath),
                                           order='C',
                                           dtype=np.float64)
                        if x_cur.ndim < 2:
                            x_cur = np.atleast_2d(x_cur).T
                        assert np.all(np.isfinite(x_cur))
                        break
                except IOError as e:
                    if ii == len(ds_path_list) - 1:
                        # Couldn't find desired file in any feat_path
                        raise e
                    else:
                        # Try the next feat_path in the list
                        pass
            if x_cur is not None:
                if feat_colnames_by_arr[feat_arr_name] is not None:
                    feat_dim = len(feat_colnames_by_arr[feat_arr_name])
                    assert x_cur.shape[1] == feat_dim
                else:
                    # Add dummy colnames
                    feat_dim = x_cur.shape[1]
                    n_sig_digits = np.maximum(3,
                                              int(np.ceil(np.log10(feat_dim))))
                    fmt_str = "%s_%0" + str(n_sig_digits) + "d"
                    feat_colnames_by_arr[feat_arr_name] = [
                        fmt_str % (feat_arr_name, fid)
                        for fid in range(feat_dim)
                    ]
                x_list.append(x_cur)

        if isinstance(x_list[0], np.ndarray):
            split_dataset['x'] = np.hstack(x_list)
        else:
            split_dataset['x'] = scipy.sparse.hstack(x_list, format='csr')

        assert split_dataset['x'].ndim == 2
        assert split_dataset['x'].shape[0] == split_dataset['y'].shape[0]
        assert (isinstance(split_dataset['x'], np.ndarray)
                or isinstance(split_dataset['x'], scipy.sparse.csr_matrix))

        if split_name == split_names[0]:
            # Flatten feat colnames into single list
            feat_colnames = sum(feat_colnames_by_arr.values(), [])
            assert isinstance(feat_colnames, list)
            assert len(feat_colnames) == split_dataset['x'].shape[1]

            print('y colnames: %s' % ' '.join(target_names))
            if len(feat_colnames) > 10:
                print('x colnames: %s ... %s' % (' '.join(
                    feat_colnames[:5]), ' '.join(feat_colnames[-5:])))
            else:
                print('x colnames: %s' % ' '.join(feat_colnames))

        print('---- %5s dataset summary' % split_name)
        print('%9d total examples' % y.shape[0])
        print('y : %d x %d targets' % split_dataset['y'].shape)
        print('x : %d x %d features' % split_dataset['x'].shape)

        for c in range(len(target_names)):
            y_c = split_dataset['y'][:, c]
            print('target %s : frac pos %.3f' %
                  (target_names[c], np.mean(y_c)))
            print('    %6d pos examples' % np.sum(y_c == 1))
            print('    %6d neg examples' % np.sum(y_c == 0))

    elapsed_time = time.time() - start_time
    print('[run_classifier says:] dataset loaded after %.2f sec.' %
          elapsed_time)

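    # Evaluate the pretrained classifier on each requested target column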
    n_cols = len(target_names)
    for c in range(n_cols):
        print('[eval_pretrained_classifier says:] eval for target %s' %
              target_names[c])
        eval_pretrained_clf(
            classifier_name=clf_opts['classifier_name'],
            classifier_path=arg_dict['pretrained_clf_path'],
            datasets_by_split=datasets_by_split,
            y_col_id=c,
            y_orig_col_id=all_target_names.index(target_names[c]),
            y_col_name=target_names[c],
            feat_colnames=feat_colnames,
            output_path=arg_dict['pretrained_clf_path'],
            seed_bootstrap=arg_dict['seed_bootstrap'],
            n_bootstraps=arg_dict['n_bootstraps'],
            bootstrap_stratify_pos_and_neg=arg_dict[
                'bootstrap_stratify_pos_and_neg'],
        )
        elapsed_time = time.time() - start_time
        print(
            '[eval_pretrained_classifier says:] target %s completed after %.2f sec'
            % (target_names[c], elapsed_time))