Example #1
def main(_):
  experiment_string = utils.get_experiment_string(
      FLAGS.dataset,
      vec=FLAGS.vec,
      dim_red=FLAGS.dim_red,
      features=FLAGS.features,
      problem=FLAGS.problem,
      alternate=FLAGS.alternate)
  worker_dir = '{}/params/{}/worker_out'.format(FILE_PATH, experiment_string)

  logging.info('\n' * 3)
  logging.info('Parameter search search_idx=%d for %s and method=%s',
               FLAGS.search_idx, experiment_string, FLAGS.method)
  logging.info('\n' * 3)

  if FLAGS.debug:
    logging.warn('Running in debug mode')

  # setup methods
  methods = parse_method_flag(FLAGS.method)

  # setup datasets
  assert FLAGS.problem in ['classification', 'regression']
  if (FLAGS.dataset not in ['20news', 'sentiment_sentences'] and
      FLAGS.vec != 'none'):
    raise ValueError(
        'Should not be using text vectorization with {} dataset'.format(
            FLAGS.dataset))

  datasets = []
  if 'sim' in FLAGS.dataset:
    d = utils.get_sim(
        FLAGS.dataset,
        problem=FLAGS.problem,
        features=FLAGS.features,
        alternate=FLAGS.alternate)
    for _ in range(FLAGS.num_dataset):
      d.reset()
      # must use get() over generate() for consistency in random sampling calls
      x_train, y_train, _, _ = d.get()

      if FLAGS.debug:  # use smaller subset
        x_train, y_train = shuffle_coordinately(x_train, y_train)
        x_train, y_train = x_train[:500, :250], y_train[:500]
      datasets.append((x_train, y_train))
  else:
    assert FLAGS.num_dataset == 1
    x_train, y_train, _, _ = utils.load_nonsim_data(
        FLAGS.dataset, vec=FLAGS.vec, dim_red=FLAGS.dim_red)

    if FLAGS.debug:  # use smaller subset
      x_train, y_train = shuffle_coordinately(x_train, y_train)
      x_train, y_train = x_train[:500, :250], y_train[:500]

    datasets.append((x_train, y_train))

  for method in methods:
    # define methods and parameter grids here
    if method == 'l1_linear' and FLAGS.problem == 'regression':
      submodule = linear
      param_grid = LINEAR_REGR_PARAM_GRID.copy()
      param_grid['penalty'] = ['l1']
    elif method == 'l1_linear':
      submodule = linear
      param_grid = LINEAR_CLF_PARAM_GRID.copy()
      param_grid['penalty'] = ['l1']
    elif method == 'l2_linear' and FLAGS.problem == 'regression':
      submodule = linear
      param_grid = LINEAR_REGR_PARAM_GRID.copy()
      param_grid['penalty'] = ['l2']
    elif method == 'l2_linear':
      submodule = linear
      param_grid = LINEAR_CLF_PARAM_GRID.copy()
      param_grid['penalty'] = ['l2']
    elif method == 'random_forest':
      submodule = random_forest
      param_grid = RF_PARAM_GRID.copy()
    elif method == 'l1_gbdt':
      submodule = gbdt
      param_grid = GBDT_PARAM_GRID.copy()
      param_grid['reg_alpha'] = [0.0, 0.5, 1.0, 2.0, 4.0, 10.]
    elif method == 'l2_gbdt':
      submodule = gbdt
      param_grid = GBDT_PARAM_GRID.copy()
      param_grid['reg_lambda'] = [0.0, 0.5, 1.0, 2.0, 4.0, 10.]
    elif method == 'l1_dnn':
      submodule = dnn
      param_grid = DNN_PARAM_GRID.copy()
      param_grid['l1'] = [0.0, 1e-3, 1e-2, 1e-1]
    elif method == 'l2_dnn':
      submodule = dnn
      param_grid = DNN_PARAM_GRID.copy()
      param_grid['l2'] = [0.0, 1e-3, 1e-2, 1e-1]
    else:
      raise ValueError('Unknown learning method: {}'.format(method))

    params = generate_param_configs(
        param_grid, num_iteration=FLAGS.num_search, seed=SEED)

    if FLAGS.search_idx >= len(params):  # fewer configs than number of searches
      continue

    param_dict = params[FLAGS.search_idx]

    for dataset_idx, (x_train, y_train) in enumerate(datasets):
      # recursively make parent directory
      save_dir = '{}/dataset_idx={}_{}/{}'.format(worker_dir, dataset_idx,
                                                  FLAGS.num_dataset, method)
      tf.gfile.MakeDirs(save_dir)

      # skip search if already performed
      save_path = '{}/search_idx={}_{}.out'.format(save_dir, FLAGS.search_idx,
                                                   FLAGS.num_search)
      if tf.gfile.Exists(save_path) and not FLAGS.overwrite and not FLAGS.debug:
        logging.info('Parameter search already completed for %s, dataset %d/%d',
                     method, dataset_idx, FLAGS.num_dataset)
        continue

      # k-fold cross-validation
      start = time.time()
      tuning_scores = []
      kf = KFold(n_splits=FLAGS.k, shuffle=True, random_state=SEED)
      for cv_train_idx, cv_test_idx in kf.split(x_train):
        x_train_cv, y_train_cv = x_train[cv_train_idx], y_train[cv_train_idx]
        x_test_cv, y_test_cv = x_train[cv_test_idx], y_train[cv_test_idx]

        _, metrics = submodule.pipeline(
            x_train_cv,
            y_train_cv,
            x_test_cv,
            y_test_cv,
            param_dict,
            problem=FLAGS.problem)

        # assume that we maximize the score
        if FLAGS.problem == 'regression':
          tuning_scores.append(-metrics['test_mse'])
        else:
          tuning_scores.append(metrics['test_acc'])
        if 'dnn' in method:  # method names are 'l1_dnn'/'l2_dnn'
          dnn.clear_keras_session()

      mean_score = np.mean(tuning_scores)

      logging.info(
          'Worker result for method=%s, search %d/%d, dataset %d/%d '
          '(%.3f s)', method, FLAGS.search_idx, FLAGS.num_search, dataset_idx,
          FLAGS.num_dataset,
          time.time() - start)
      logging.info(param_dict)
      logging.info('%.4f', mean_score)
      logging.info('\n' * 2)

      if not FLAGS.debug:
        # save parameters and worker results to file
        with tf.gfile.GFile(save_path, 'w') as f:
          s = ','.join(['{}={}'.format(k, v) for k, v in param_dict.items()])
          f.write(s)
          f.write('\n')
          f.write('{:.8f}'.format(mean_score))
        logging.info('Saved results to %s', save_path)
    logging.info('\n\n')
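
# The helpers generate_param_configs() and shuffle_coordinately() used above
# are not shown in this listing. Minimal sketches follow, assuming that
# generate_param_configs() samples num_iteration random configurations from
# the grid with a fixed seed (so every worker builds the same list and can
# index it with FLAGS.search_idx) and that shuffle_coordinately() permutes
# features and labels with the same permutation. The real implementations may
# differ; treat these as illustrations only.
import itertools
import random

import numpy as np


def generate_param_configs(param_grid, num_iteration, seed=0):
  """Returns up to num_iteration parameter dicts sampled from the grid."""
  keys = sorted(param_grid)
  # enumerate the full Cartesian product of the grid, then sample from it
  all_configs = [
      dict(zip(keys, values))
      for values in itertools.product(*(param_grid[k] for k in keys))
  ]
  rng = random.Random(seed)
  rng.shuffle(all_configs)  # deterministic order for a given seed
  return all_configs[:num_iteration]


def shuffle_coordinately(x, y, seed=None):
  """Shuffles x and y with the same permutation so rows stay aligned."""
  rng = np.random.RandomState(seed)
  perm = rng.permutation(len(y))
  return x[perm], y[perm]
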
def main(_):
    experiment_string = utils.get_experiment_string(FLAGS.dataset,
                                                    vec=FLAGS.vec,
                                                    dim_red=FLAGS.dim_red,
                                                    features=FLAGS.features,
                                                    problem=FLAGS.problem,
                                                    alternate=FLAGS.alternate)

    worker_dir = '{}/params/{}/worker_out'.format(FILE_PATH, experiment_string)
    param_dir = '{}/params/{}'.format(FILE_PATH, experiment_string)

    logging.info('\n' * 3)
    logging.info('Collation (parameter search results) for %s',
                 experiment_string)
    logging.info('\n' * 3)

    # worker out path format is {0}/dataset_idx={1}_{2}/{3}/search_idx={4}_{5}.out
    # where 0 = worker_out_dir, 1 = dataset_idx, 2 = num_dataset, 3 = method,
    # 4 = search_idx, 5 = num_search

    # gets idx from a str with format {text}={idx}_{text}
    extract_idx_func = lambda x: int(x.split('=')[1].split('_')[0])

    dataset_ids = sorted(
        tf.gfile.ListDirectory(worker_dir), key=extract_idx_func)
    assert dataset_ids  # check non-empty

    num_dataset = int(dataset_ids[0].split('_')[-1])
    if FLAGS.strict:
        assert num_dataset == FLAGS.num_dataset

    for d_idx, dataset_id in enumerate(dataset_ids):
        assert dataset_id == 'dataset_idx={}_{}'.format(d_idx, num_dataset)
        dataset_id_path = '{}/{}'.format(worker_dir, dataset_id)
        methods = tf.gfile.ListDirectory(dataset_id_path)

        for method in methods:
            # setup directories
            method_path = '{}/{}'.format(dataset_id_path, method)
            save_dir = '{}/{}'.format(param_dir, dataset_id)
            tf.gfile.MakeDirs(save_dir)

            # skip collation if already performed
            save_path = '{}/{}.param'.format(save_dir, method)
            if tf.gfile.Exists(save_path) and not FLAGS.overwrite:
                logging.info(
                    'Collation already completed for %s, dataset %d/%d',
                    method, d_idx, num_dataset)
                continue

            search_ids = sorted(
                tf.gfile.ListDirectory(method_path), key=extract_idx_func)
            num_search = int(search_ids[0].split('_')[-1].rstrip('.out'))

            start = time.time()

            # look for best and worst tuning scores and save related parameters
            best_score, worst_score = float('-inf'), float('inf')
            best_param_str, worst_param_str = None, None
            best_path = None
            for s_idx, search_id in enumerate(search_ids):
                if FLAGS.strict:
                    assert search_id == 'search_idx={}_{}.out'.format(
                        s_idx, num_search)

                read_path = '{}/{}'.format(method_path, search_id)
                with tf.gfile.GFile(read_path, 'r') as f:
                    lines = f.read().splitlines()

                score = float(lines[1])
                # assume that scores should be maximized (e.g., negative MSE or acc)
                if score > best_score:
                    best_param_str, best_path = lines[0], read_path
                    best_score = score
                if score < worst_score:  # save worst config for debugging
                    worst_param_str, worst_score = lines[0], score

            # note: reports the number of worker results read, which may be
            # less than num_search (the *maximum* number of worker searches)
            logging.info(
                'Collation for method=%s, dataset %d/%d (%d reads, %.3f s)',
                method, d_idx, num_dataset, len(search_ids),
                time.time() - start)
            logging.info('best score=%.3f, %s', best_score, best_param_str)
            logging.info('worst score=%.3f, %s', worst_score, worst_param_str)
            logging.info('\n' * 2)

            # save best parameters
            tf.gfile.Copy(best_path, save_path, overwrite=True)
            logging.info('Saved collation results to %s', save_path)
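
# The collation above depends on the directory layout and filename convention
# produced by the parameter-search worker: output lives at
#   <worker_dir>/dataset_idx=<i>_<num_dataset>/<method>/search_idx=<j>_<num_search>.out
# with the "key=value" parameter string on the first line of each file and the
# mean cross-validation score on the second. The sketch below (values are
# illustrative only) shows how extract_idx_func recovers the leading index
# from such names so directory entries can be sorted.
def _demo_extract_idx():
    extract_idx = lambda x: int(x.split('=')[1].split('_')[0])
    assert extract_idx('dataset_idx=3_10') == 3
    assert extract_idx('search_idx=7_50.out') == 7
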
def main(_):
    experiment_string = utils.get_experiment_string(
        FLAGS.dataset,
        vec=FLAGS.vec,
        dim_red=FLAGS.dim_red,
        which_features=FLAGS.which_features,
        problem=FLAGS.problem,
        alternate=FLAGS.alternate)
    file_path = os.path.expanduser(FILE_PATH)
    logging.info('DIRECTORY USED %s', file_path)
    param_dir = '{}/params/{}'.format(file_path, experiment_string)
    save_dir = '{}/logs/{}'.format(file_path, experiment_string)
    tf.gfile.MakeDirs(save_dir)

    logging.info('\n' * 3)
    logging.info('Experiment %s and method=%s', experiment_string,
                 FLAGS.method)
    logging.info('\n' * 3)

    if FLAGS.debug:
        logging.warn('Running in debug mode')

    # setup methods
    methods = parse_method_flag(FLAGS.method)

    # setup datasets
    assert FLAGS.problem in ['classification', 'regression']
    if (FLAGS.dataset not in ['20news', 'sentiment_sentences']
            and FLAGS.vec != 'none'):
        raise ValueError(
            'Should not be using text vectorization with {} dataset'.format(
                FLAGS.dataset))

    datasets = []
    num_inf_feature = None  # only set below for simulation datasets
    if 'sim' in FLAGS.dataset:
        oracle_preds = []
        d = utils.get_sim(FLAGS.dataset,
                          problem=FLAGS.problem,
                          which_features=FLAGS.which_features,
                          alternate=FLAGS.alternate)
        try:
            num_inf_feature = d._num_inf_feature  # pylint: disable=protected-access
        except:  # pylint: disable=bare-except
            num_inf_feature = None

        for _ in range(FLAGS.num_dataset):
            d.reset()
            # must use get() over generate() for consistency in random sampling calls
            x_train, y_train, x_test, y_test = d.get()

            if FLAGS.debug:  # use smaller subset
                x_train, y_train = shuffle_coordinately(x_train, y_train)
                x_train, y_train = x_train[:500, :], y_train[:500]
                x_test, y_test = shuffle_coordinately(x_test, y_test)
                x_test, y_test = x_test[:500, :], y_test[:500]

            datasets.append((x_train, y_train, x_test, y_test))
            oracle_preds.append(
                (d.oracle_predict(x_train), d.oracle_predict(x_test)))
    else:
        x_train, y_train, x_test, y_test = utils.load_nonsim_data(
            FLAGS.dataset, vec=FLAGS.vec, dim_red=FLAGS.dim_red)

        if FLAGS.debug:  # use smaller subset
            x_train, y_train = shuffle_coordinately(x_train, y_train)
            x_train, y_train = x_train[:500, :250], y_train[:500]
            x_test, y_test = shuffle_coordinately(x_test, y_test)
            x_test, y_test = x_test[:500, :250], y_test[:500]

        datasets.append((x_train, y_train, x_test, y_test))

    # evaluate oracle if experiment involves a simulation dataset
    if 'sim' in FLAGS.dataset:
        if FLAGS.problem == 'regression':
            oracle_metrics = {'train_mse': [], 'test_mse': []}
        else:
            oracle_metrics = {'train_acc': [], 'test_acc': []}

        for ((_, y_train, _, y_test),
             (y_train_pred, y_test_pred)) in zip(datasets, oracle_preds):
            if FLAGS.problem == 'regression':
                oracle_metrics['train_mse'].append(
                    mean_squared_error(y_train, y_train_pred))
                oracle_metrics['test_mse'].append(
                    mean_squared_error(y_test, y_test_pred))
            else:
                oracle_metrics['train_acc'].append(
                    accuracy(y_train, y_train_pred))
                oracle_metrics['test_acc'].append(accuracy(
                    y_test, y_test_pred))

        logging.info('\n' * 3)
        logging.info('oracle_results')
        logging.info('---')
        oracle_metrics = sorted(oracle_metrics.items(), key=lambda x: x[0])
        print_out = '\n'.join([
            '{}={:.6f}'.format(metric, np.mean(values))
            for metric, values in oracle_metrics
        ])
        print_out += '\n\n'
        print_out += '\n'.join([
            '{}_SE={:.6f}'.format(
                metric,
                np.true_divide(np.std(values), np.sqrt(FLAGS.num_dataset)))
            for metric, values in oracle_metrics
        ])
        logging.info(print_out)

        if not FLAGS.debug and FLAGS.logtofile:
            save_path = '{}/{}.log'.format(save_dir, 'oracle')
            with tf.gfile.GFile(save_path, 'w') as f:  # save logs to file
                f.write(print_out)
            logging.info('Saved oracle results to %s', save_path)

        logging.info('\n' * 2)

    # evaluate learning methods
    for method in methods:
        if method in ['l1_linear', 'l2_linear']:
            submodule = linear
        elif method == 'random_forest':
            submodule = random_forest
        elif method in ['l1_gbdt', 'l2_gbdt']:
            submodule = gbdt
        elif method in ['l1_dnn', 'l2_dnn']:
            submodule = dnn
        else:
            raise ValueError('Unknown learning method: {}'.format(method))

        start = time.time()
        all_metrics = {}
        other_info = {}
        for d_idx, (x_train, y_train, x_test, y_test) in enumerate(datasets):
            load_path = '{}/dataset_idx={}_{}/{}.param'.format(
                param_dir, d_idx, FLAGS.num_dataset, method)

            if tf.gfile.Exists(load_path):
                with tf.gfile.GFile(
                        load_path, 'r') as f:  # load best parameters from file
                    lines = f.read().splitlines()
                    param_str = lines[0]
                    param_dict = {
                        i.split('=')[0]: parse_value(i.split('=')[1])
                        for i in param_str.split(',')
                    }
            else:
                if FLAGS.fast:
                    logging.warn(
                        'No tuned parameters found (at %s), but using default '
                        'parameters since running in FAST mode.', load_path)
                    param_dict = None
                else:
                    raise RuntimeError(
                        '{} does not exist on Colossus'.format(load_path))

            model, metrics = submodule.pipeline(x_train,
                                                y_train,
                                                x_test,
                                                y_test,
                                                param_dict=param_dict,
                                                problem=FLAGS.problem)
            for k, v in metrics.items():
                if k not in all_metrics:
                    all_metrics[k] = []
                all_metrics[k].append(v)

            if FLAGS.problem == 'classification':
                if 'class_props' not in other_info:
                    other_info['class_props'] = []
                class_prop = np.true_divide(np.sum(y_test), np.size(y_test, 0))
                other_info['class_props'].append(class_prop)

            if (num_inf_feature is not None and FLAGS.which_features == 'all'
                    and 'dnn' not in method):
                if 'prop_inf_feat_importance' not in other_info:
                    other_info['prop_inf_feat_importance'] = []
                other_info['prop_inf_feat_importance'].append(
                    get_prop_inf_feature_importance(method, model,
                                                    num_inf_feature))

            if 'dnn' in method:
                dnn.clear_keras_session()

        logging.info('Experiment results for method=%s, (%d datasets, %.3f s)',
                     method, FLAGS.num_dataset,
                     time.time() - start)
        logging.info('---')

        all_metrics = sorted(all_metrics.items(), key=lambda x: x[0])
        print_out = '\n'.join([
            '{}={:.6f}'.format(metric, np.mean(values))
            for metric, values in all_metrics
        ])
        print_out += '\n\n'
        print_out += '\n'.join([
            '{}_SE={:.6f}'.format(
                metric,
                np.true_divide(np.std(values), np.sqrt(FLAGS.num_dataset)))
            for metric, values in all_metrics
        ])

        if other_info:
            print_out += '\n\n'
            other_info = sorted(other_info.items(), key=lambda x: x[0])
            print_out += '\n'.join([
                '{}={:.6f}'.format(metric, np.mean(values))
                for metric, values in other_info
            ])

        logging.info(print_out)

        if not FLAGS.debug and FLAGS.logtofile:
            save_path = '{}/{}.log'.format(save_dir, method)
            with tf.gfile.GFile(save_path, 'w') as f:  # save logs to file
                f.write(print_out)
            logging.info('Saved %s results to %s', method, save_path)

        logging.info('\n' * 2)
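
# parse_value(), used above to reload tuned parameters from the .param files,
# is not shown in this listing. A minimal sketch follows, assuming it converts
# the string values written by the search worker back into Python types
# (bools, None, ints, floats, or plain strings); the real helper may handle
# more cases.
def parse_value(value_str):
    """Best-effort conversion of a value string from 'key=value' to a type."""
    if value_str in ('True', 'False'):
        return value_str == 'True'
    if value_str == 'None':
        return None
    try:
        return int(value_str)
    except ValueError:
        pass
    try:
        return float(value_str)
    except ValueError:
        return value_str  # fall back to the raw string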