Exemple #1
0
def main(argv):
    """Parse CLI args, set TF logging, and launch a contamination run."""
    args = parser.parse_args(argv[1:])

    # Map the first letter of the --log flag to a TF verbosity level.
    verbosity_by_letter = {'e': tf.logging.ERROR,
                           'i': tf.logging.INFO,
                           'w': tf.logging.WARN,
                           'd': tf.logging.DEBUG}
    tf.logging.set_verbosity(
        verbosity_by_letter.get(args.log.lower()[0], tf.logging.ERROR))

    if args.eager:
        tf.enable_eager_execution()
        print('******** TF EAGER MODE ENABLED ***************')

    params = Bunch(perturb_norm_bound=args.eps,
                   perturb_norm_order=args.norm,
                   epochs=args.epochs,
                   batch_size=args.batch_size,
                   lr=0.01,
                   adv_reg_lambda=0.0,
                   clean_pre_train=0.0,
                   multi_gpu=args.multi_gpu)

    get_run_results(args.dataset, params.mod(perturb_frac=args.cont))
Exemple #2
0
def main(argv):
    """Load plotting parameters from a YAML file and plot the drop curves."""
    args = parser.parse_args(argv[1:])
    # NOTE(review): yaml.load without an explicit Loader is unsafe on
    # untrusted input — confirm the params file is trusted.
    params = Bunch(yaml.load(open(args.params)))

    # The pickle file named on the command line overrides the one in params.
    params.file = args.file or params.file

    plot_drop(params)
Exemple #3
0
def main(argv):
    """Run a grid of experiments from a YAML hparams file and save results.

    Writes a summary CSV and a pickled full dataframe under a sibling
    'results' directory, named after the hparams file and a timestamp.
    """
    args = parser.parse_args(argv[1:])

    verbosity_by_letter = {'e': tf.logging.ERROR,
                           'i': tf.logging.INFO,
                           'w': tf.logging.WARN,
                           'd': tf.logging.DEBUG}

    # NOTE(review): yaml.load without an explicit Loader is unsafe on
    # untrusted input — confirm the hparams file is trusted.
    params = Bunch(yaml.load(open(args.hparams)))

    tf.logging.set_verbosity(
        verbosity_by_letter.get(params.log.lower()[0], tf.logging.ERROR))

    if params.eager:
        tf.enable_eager_execution()
        print('******** TF EAGER MODE ENABLED ***************')

    # Default output base: <hparams-file-stem>/<timestamp>
    timestamp = time.strftime("%Y%m%d-%H%M%S")
    hparams_stem = os.path.basename(args.hparams).split('.')[0]
    out_file_base = args.out or os.path.join(hparams_stem, timestamp)

    results = run_grid(params)

    if not params.get('dataset'):  # i.e if synthetic, not UCI/pmlb
        # For synthetic data, add the L1 mass of the weights in each
        # feature group (name prefixes 'x', 'r', 'w').
        def group_weight_l1(prefix):
            return results['wts_dict'].map(
                lambda w: np.sum(np.abs(np.array(
                    list(sub_dict_prefix(w, prefix).values())))))

        results['Wts_x_L1'] = group_weight_l1('x')
        results['Wts_r_L1'] = group_weight_l1('r')
        results['Wts_w_L1'] = group_weight_l1('w')

    # The dict-valued columns don't belong in a flat CSV.
    results_simple = results.drop(['wts_dict', 'f_g_dict', 'f_a_dict'], axis=1)

    results_dir = make_sibling_dir(__file__, 'results')
    results_simple_file = os.path.join(results_dir, out_file_base + '.csv')
    os.makedirs(os.path.dirname(results_simple_file), exist_ok=True)
    with open(results_simple_file, 'w+') as fd:
        results_simple.to_csv(fd, float_format='%.3f', index=False)

    pkl_file = os.path.join(results_dir, out_file_base + '.pkl')
    results.to_pickle(pkl_file)
    print(df_simple(results))

    print(f'******** Summary csv written to {results_simple_file}')
    print(f'******** Pickled results dataframe at {pkl_file}')
Exemple #4
0
def main(argv):
    """Build a tiny hand-rolled synthetic dataset and run it through the pipeline.

    One feature agrees with the label 80% of the time; the remaining nF
    features are i.i.d. +/-1 noise.
    """
    args = parser.parse_args(argv[1:])

    verbosity_by_letter = {'e': tf.logging.ERROR,
                           'i': tf.logging.INFO,
                           'w': tf.logging.WARN,
                           'd': tf.logging.DEBUG}
    tf.logging.set_verbosity(
        verbosity_by_letter.get(args.log.lower()[0], tf.logging.ERROR))

    if args.eager:
        tf.enable_eager_execution()
        print('******** TF EAGER MODE ENABLED ***************')

    params = Bunch(
        perturb_norm_bound=args.eps,
        perturb_norm_order=args.norm,
        # NOTE(review): the original comment said "DO NOT NORMALIZE DATA"
        # but std=True — confirm which was intended.
        std=True,
        epochs=args.epochs,
        batch_size=args.batch_size,
        lr=0.01,
        adv_reg_lambda=0.0,
        clean_pre_train=0.0,
        multi_gpu=args.multi_gpu)

    # synthetic dataset
    N = 1000
    nF = 2  # number of noise features

    np.random.seed(123)
    y = np.random.choice(2, N, p=[0.5, 0.5])
    signed_y = 2 * y - 1.0
    # x1 agrees with the label 80% of the time, disagrees 20%.
    agree = 2 * np.random.choice(2, N, p=[0.2, 0.8]) - 1.0
    df1 = pd.DataFrame(dict(x1=np.array(agree * signed_y, dtype=np.float32)))
    df_tar = pd.DataFrame(dict(target=y))

    # nF i.i.d. +/-1 noise columns named x2..x{nF+1}.
    noise = np.array(np.random.choice(2, N * nF),
                     dtype=np.float32).reshape([-1, nF])
    noise_cols = ['x' + str(i + 2) for i in range(nF)]
    df_noise = pd.DataFrame((2 * noise - 1.0), columns=noise_cols)

    df = pd.concat([df1, df_noise, df_tar], axis=1)
    get_run_results(args.dataset, params.mod(perturb_frac=args.cont), df=df)
    def __init__(self, config: Bunch = None):
        """Generate a synthetic binary-classification dataset.

        Builds feature columns from ``config.col_spec`` (categoricals are
        one-hot encoded before computing logits), draws labels from a
        logistic model with coefficients ``config.coefs``, intercept
        ``config.bias`` and Gaussian noise ``config.noise``, and stores the
        result as ``self.X`` / ``self.Y`` tensors.

        :param config: Bunch with N, batch_size, col_spec, noise, coefs,
            bias; a default config is used when None/empty.
        """
        if not config:
            config = Bunch(
                N=5000,
                batch_size=50,
                col_spec=[
                    dict(type='cat', card=3),
                    dict(type='cat', card=5),
                    dict(type='num', min=0, max=6),
                    dict(type='num', min=-3, max=3)
                ],
                noise=0.4,
                coefs=np.array([
                    -1.0,
                    1.0,
                    0.4,  # cat feature 1
                    -1.0,
                    3.0,
                    0.9,
                    -1.0,
                    -0.5,  # cat feature 2
                    0.8,  # n1
                    -0.9
                ]),
                # BUGFIX: the default config previously omitted 'bias', but
                # config.bias is read unconditionally below.
                bias=0.0)
        self.config = config
        N = config.N
        columns = []
        for spec in config.col_spec:
            if spec['type'] == 'cat':
                columns += [make_cat(spec['card'], N)]
            else:
                columns += [make_num(N)]

        arr = np.array(columns).transpose()
        cat_features = [
            i for i, spec in enumerate(config.col_spec)
            if spec['type'] == 'cat'
        ]
        arr_hot = arr
        if len(cat_features) > 0:
            # NOTE(review): `categorical_features` was removed from sklearn's
            # OneHotEncoder in 0.22 — confirm the pinned sklearn version.
            enc = OneHotEncoder(categorical_features=cat_features)
            enc.fit(arr)
            arr_hot = enc.transform(arr).toarray()

        logits = np.matmul(arr_hot, config.coefs) + config.bias

        noise = np.random.normal(0.0, config.noise, size=N)
        logits += noise

        # Bernoulli labels drawn from the logistic probabilities.
        probs = 1.0 / (1 + np.exp(-logits))
        labels = np.array([
            np.random.choice(2, 1, p=[1 - p, p])[0] for p in probs
        ]).reshape([-1, 1])

        # Note: X keeps the raw (non-one-hot) feature matrix.
        self.X = tf.cast(arr, tf.float32)
        self.Y = tf.cast(labels, tf.float32)
Exemple #6
0
    def train(self, params: Bunch):
        """Train an Estimator on this runner's training dataframe.

        :param params: run parameters; reads epochs, batch_size,
            perturb_frac, and (optionally) multi_gpu.
        :return: the trained tf.estimator.Estimator.
        """
        tf.set_random_seed(123)
        np.random.seed(123)

        config = None
        # BUGFIX: `multi_gpu` was referenced as a bare (undefined) name;
        # it lives in params (see the callers that set multi_gpu there).
        if platform.system() == 'Linux' and params.get('multi_gpu'):
            # we're on the GPU box. Thanks to
            # https://medium.com/tensorflow/multi-gpu-training-with-estimators-tf-keras-and-tf-data-ba584c3134db
            NUM_GPUS = 10
            strategy = tf.contrib.distribute.MirroredStrategy(
                num_gpus=NUM_GPUS)
            config = tf.estimator.RunConfig(train_distribute=strategy)

        classifier = tf.estimator.Estimator(
            model_fn=binary_classification_model,
            model_dir=self.model_dir,
            config=config,
            params=params.mod(segments=self.segments,
                              feature_columns=self.feature_columns,
                              train_perturb_frac=params.perturb_frac,
                              test_perturb_frac=0.0))

        def input_fn():
            # Shuffled, repeated, prefetched batches from the train dataframe.
            df_target = self.df_train[self.target_name]
            ds = tf.data.Dataset.from_tensor_slices(
                (dict(self.df_train), df_target))
            ds = ds.shuffle(buffer_size=1000, seed=123).\
              repeat(params.epochs).prefetch(params.batch_size*3).\
              batch(params.batch_size)
            if platform.system() == 'Linux':
                ds = ds.apply(
                    tf.contrib.data.prefetch_to_device(device='/device:gpu:0'))
            return ds

        # Train the Model.
        classifier.train(
            input_fn=input_fn,
            steps=None)
        return classifier
Exemple #7
0
    def __init__(self, run_args, lite_mode=True):
        """Set up args, tokenizer and model; patch the decoder in lite mode."""
        if lite_mode:
            EnsembleModel.forward_decoder = forward_decoder

        # Command-line args override the defaults.
        merged_args = merge_dicts(default_args, vars(run_args))
        self._fill_hardware_args(merged_args)
        self.args = Bunch(merged_args)
        self._load_tokenizer()
        self._load_model(lite_mode)
Exemple #8
0
def main(argv):
    """Run the pipeline on a synthetic dataset generated from CLI knobs."""
    args = parser.parse_args(argv[1:])

    verbosity_by_letter = {'e': tf.logging.ERROR,
                           'i': tf.logging.INFO,
                           'w': tf.logging.WARN,
                           'd': tf.logging.DEBUG}
    tf.logging.set_verbosity(
        verbosity_by_letter.get(args.log.lower()[0], tf.logging.ERROR))

    if args.eager:
        tf.enable_eager_execution()
        print('******** TF EAGER MODE ENABLED ***************')

    params = Bunch(
        perturb_norm_bound=args.eps,
        perturb_norm_order=args.norm,
        std=False,  # DO NOT NORMALIZE DATA
        epochs=args.epochs,
        activation=args.act,
        batch_size=args.batch_size,
        lr=0.01,
        adv_reg_lambda=0.0,
        clean_pre_train=0.0,
        multi_gpu=args.multi_gpu)

    # synthetic dataset: one feature strongly indicative of the label,
    # the other two highly correlated by random
    data_params = Bunch(N=1000,
                        nc=args.nc,
                        nr=args.nf,
                        nw=args.nw,
                        corr=args.corr,
                        corrp=args.corr1,
                        cat=args.cat)
    df = gen_synth_df(data_params)

    get_run_results(args.dataset, params.mod(perturb_frac=args.cont), df=df)
Exemple #9
0
def run_5_combos(dataset_name, df, model_dir, params: Bunch):
    """Evaluate five train/test perturbation combinations.

    Trains naturally then adversarially, evaluating each model on natural,
    partially-perturbed and fully-perturbed test data; returns the five
    result dicts keyed nat_nat, nat_per, per_nat, per_per, per_per_all.
    """
    runner = Runner(dataset_name, df, model_dir, params)
    make_clear_dir(model_dir)

    def show(result):
        # Print only the scalar-ish columns (skip the dict-valued ones).
        print(pd.DataFrame([result])[printed_cols])

    # natural training
    runner.train(params.mod(perturb_frac=0.0))
    nat_nat = runner.eval(params.mod(perturb_frac=0.0))
    nat_nat.update(train=0.0, test=0.0)
    printed_cols = list(
        set(nat_nat.keys()).difference(
            ['f_a_dict', 'f_g_dict', 'wts_dict', 'corrs_dict']))
    show(nat_nat)

    nat_per = runner.eval(params)
    nat_per.update(train=0.0, test=params.perturb_frac)
    show(nat_per)

    # adversarial training: start from a naturally trained classifier for a
    # fraction of the epochs, then train on adversarial inputs for the rest
    make_clear_dir(model_dir)
    clean_train_epochs = int(
        params.get('clean_pre_train', 0.5) * params.epochs)
    dirty_train_epochs = params.epochs - clean_train_epochs
    if clean_train_epochs > 0:
        runner.train(params.mod(perturb_frac=0.0, epochs=clean_train_epochs))
    runner.train(params.mod(epochs=dirty_train_epochs))

    per_nat = runner.eval(params.mod(perturb_frac=0.0))
    per_nat.update(train=params.perturb_frac, test=0.0)
    show(per_nat)

    per_per = runner.eval(params)
    per_per.update(train=params.perturb_frac, test=params.perturb_frac)
    show(per_per)

    per_per_all = runner.eval(params.mod(perturb_frac=1.0))
    per_per_all.update(train=params.perturb_frac, test=1.0)
    show(per_per_all)

    return dict(nat_nat=nat_nat,
                nat_per=nat_per,
                per_nat=per_nat,
                per_per=per_per,
                per_per_all=per_per_all)
Exemple #10
0
def run_grid(params: Bunch):
    """Run a grid of experiments based on params.grid.

    Fetches the named dataset (or generates a synthetic one when no
    dataset is given), runs one experiment per parameter combination in
    params.grid, and collates the results.
    :param params: run parameters, including the `grid` dict
    :return: pd.DataFrame with one row per grid combination
    """
    dataset = params.get('dataset')
    df = fetch_data(dataset) if dataset else gen_synth_df(params)

    rows = [
        run_one(df, params.mod(combo), name=params.get('dataset', 'synth'))
        for combo in ParameterGrid(params.grid)
    ]
    return pd.DataFrame(rows)
Exemple #11
0
def label_corr_stats(X, y):
    """Per-batch moments for a streaming feature/label correlation.

    :param X: (exploded) feature tensor, shape (?nB, nF)
    :param y: label tensor, shape (?nB)
    :return: Bunch of batch means E[xy], E[x], E[y], E[x^2], E[y^2]
        (feature-wise quantities have shape (nF))
    """
    def batch_mean(t):
        return tf.reduce_mean(t, axis=0)

    return Bunch(xy=batch_mean(X * y),
                 x=batch_mean(X),
                 y=batch_mean(y),
                 xsq=batch_mean(X * X),
                 ysq=batch_mean(y * y))
Exemple #12
0
def main(argv):
    """Sweep the adversarial-training fraction and plot accuracy curves."""
    args = parser.parse_args(argv[1:])

    verbosity_by_letter = {'e': tf.logging.ERROR,
                           'i': tf.logging.INFO,
                           'w': tf.logging.WARN,
                           'd': tf.logging.DEBUG}
    tf.logging.set_verbosity(
        verbosity_by_letter.get(args.log.lower()[0], tf.logging.ERROR))

    if args.eager:
        tf.enable_eager_execution()
        print('******** TF EAGER MODE ENABLED ***************')

    params = Bunch(perturb_l2_bound=0.2,
                   epochs=args.epochs,
                   batch_size=args.batch_size,
                   multi_gpu=args.multi_gpu,
                   lr=0.01)

    # One run per adversarial-training fraction.
    fracs = [0.1, 0.2, 0.5, 0.8, 1.0]
    dfs = [
        get_run_results(args.dataset, params.mod(perturb_frac=frac))
        for frac in fracs
    ]

    def get_field(dfs, conditions, column):
        return [df.query(conditions).iloc[0][column] for df in dfs]

    df = pd.DataFrame(
        dict(TrainAdvPct=[10, 20, 50, 80, 100],
             TestNat=get_field(dfs, 'train > 0 & test == 0.0', 'acc'),
             TestAdv=get_field(dfs, 'train > 0 & test == 1.0', 'acc')))

    if platform.system() != 'Linux':
        # Plot only off the Linux GPU box.
        dfm = df.melt('TrainAdvPct',
                      var_name='TestMode',
                      value_name='Accuracy')
        g = sns.factorplot(x="TrainAdvPct",
                           y="Accuracy",
                           hue='TestMode',
                           data=dfm)
        plt.show()
Exemple #13
0
def test(model: RobustLogisticModel,
         data,
         perturb=0.0,
         attacker: RobustLogisticModel = None):
    """Evaluate `model` on `data`, optionally under adversarial perturbation.

    :param model: the model being evaluated.
    :param data: iterable of (x, y) batches.
    :param perturb: perturbation fraction passed to the loss/attributions
        (0.0 = natural evaluation).
    :param attacker: model used to craft the perturbation; defaults to
        `model` itself.
    :return: Bunch of rounded metrics (auc, r2, acc, loss), model weights
        (coefs, bias), average and mean-absolute feature attributions, and
        the entropy of the absolute attributions.
    """
    if not attacker:
        attacker = model
    test_loss_avg = tfe.metrics.Mean()
    feat_attribs = []
    ytrue = ypred = np.array([])
    for (x, y) in data:
        loss_value, y_ = loss(model, x, y, robust=perturb, attacker=attacker)
        ypred = np.append(ypred, y_.numpy().squeeze())
        ytrue = np.append(ytrue, y.numpy().squeeze())
        test_loss_avg(loss_value)
        feat_attrib_values = model.feature_attributions(x, y, perturb=perturb)
        feat_attribs += [feat_attrib_values]
    auc = roc_auc_score(ytrue, ypred)
    r2 = r2_score(ytrue, ypred)
    ypred_binary = 1 * (ypred > 0.5)
    acc = accuracy_score(ytrue, ypred_binary)
    feat_attribs = tf.concat(feat_attribs, axis=0)
    feat_abs_attribs = tf.reduce_mean(tf.abs(feat_attribs), axis=0)
    ent = entropy(feat_abs_attribs)
    feat_avg_attribs = tf.reduce_mean(feat_attribs, axis=0)
    av_loss = test_loss_avg.result()
    # BUGFIX: acc was formatted with '{acc:2}' (min field width 2), not
    # 2 significant digits like the other metrics.
    print(
        f"Test set loss: {av_loss:.2}, AUC={auc:.2}, R2={r2:.2}, acc={acc:.2}")
    weights = model.get()
    return Bunch(auc=np.round(auc, 2),
                 r2=np.round(r2, 2),
                 acc=np.round(acc, 2),
                 loss=np.round(av_loss.numpy(), 3),
                 coefs=np.round(weights['coefs'].squeeze(), 2),
                 bias=np.round(weights['bias'][0], 2),
                 attr_ave=np.round(feat_avg_attribs.numpy().squeeze(), 2),
                 attr_abs=np.round(feat_abs_attribs.numpy().squeeze(), 2),
                 ent=ent)
Exemple #14
0
    def __init__(self,
                 dataset_name,
                 df: pd.DataFrame,
                 model_dir: str,
                 params: Bunch = None):
        """Prepare train/test splits and feature metadata for `dataset_name`.

        Derives column specs from `df`, builds per-feature-value names and
        a segment index (mapping each one-hot feature value back to its
        original column), renames feature columns with a numeric prefix so
        weight order is recoverable from TF's variables, splits (and
        optionally standardizes) the data, and writes train/test CSVs
        under datasets/<dataset_name>.
        """
        self.col_spec, self.target_name = df_column_specs(df)

        # segments[j] = index of the original column that one-hot value j
        # belongs to; feature_value_names matches that one-hot layout.
        self.segments = np.array([], dtype=np.int32)
        self.feature_value_names = []
        for i, s in enumerate(self.col_spec):
            self.segments = np.append(self.segments, [np.repeat(i, s['card'])])
            col_name = s['name']
            if s['card'] == 1:
                self.feature_value_names += [col_name]
            else:
                self.feature_value_names += [
                    col_name + '=' + str(i) for i in range(s['card'])
                ]

        # prepend numeric to enforce lexicographic order so
        # we can recover the model weights from tensorflow's variables
        # in the right order
        df = df.copy(deep=True)
        df.columns = [c if c == self.target_name else f'{i:05d}_' + c \
                      for i, c in enumerate(df.columns)]

        self.feature_columns = tf_feature_columns(df)
        self.df_train, self.df_test = \
          split_and_standardize(df, params.get('std', True))

        # FIX: renamed local `dir`, which shadowed the builtin.
        dataset_dir = make_sibling_dir(__file__, f'datasets/{dataset_name}')
        self.train_file = f'{dataset_dir}/train.csv'
        self.test_file = f'{dataset_dir}/test.csv'

        with open(self.train_file, mode='w+') as fd:
            self.df_train.to_csv(fd, header=True, index=False)

        with open(self.test_file, mode='w+') as fd:
            self.df_test.to_csv(fd, header=True, index=False)

        self.model_dir = model_dir
Exemple #15
0
def run_one(df: pd.DataFrame, params: Bunch, name='one'):
    """Run a single train (nat or adv) / test (nat or adv) combination.

    Trains (optionally with a clean pre-training phase), evaluates with the
    requested test perturbation, and returns the result dict augmented with
    the simple (int/float/str/bool) params.
    :param df: dataset
    :param params: run parameters
    :param name: run name, used for the model directory
    :return: result dict of metrics/values
    """
    tf.set_random_seed(123)
    np.random.seed(123)
    model_dir = os.path.join('/tmp/robulin/exp', name)

    runner = Runner(name, df, model_dir, params)
    make_clear_dir(model_dir)

    # adversarial training: pre-train on natural examples for a fraction
    # of the epochs, then on adversarial examples for the remainder
    clean_epochs = int(params.get('clean_pre_train', 0.5) * params.epochs)
    adv_epochs = params.epochs - clean_epochs
    if clean_epochs > 0:
        runner.train(params.mod(perturb_frac=0.0, epochs=clean_epochs))
    runner.train(params.mod(epochs=adv_epochs))

    result = runner.eval(params.mod(perturb_frac=params.test_perturb_frac))
    result.update(train=params.perturb_frac, test=params.test_perturb_frac)

    few_fields = params.get('fields', [
        'train', 'test', 'loss', 'auc', 'acc', 'wts_ent', 'wts_l1',
        'wts_l1_linf', 'wts_1pct', 'wts_pct1pct', 'av_ent', 'av_high', 'a_ent',
        'g_ent', 'f_a_ent', 'f_g_ent'
    ])
    simple_keys = [
        k for k, v in params.items() if type(v) in [int, float, str, bool]
    ]

    brief = sub_dict(result, few_fields)
    brief.update(sub_dict(params, simple_keys))
    print(brief)

    result.update(sub_dict(params, simple_keys))
    return result
Exemple #16
0
import pandas as pd

# evaluation on natural test data

# Synthetic-data config: two categorical columns (cards 3 and 4, one-hot
# encoded into 7 coefficient slots) plus two numeric columns; labels get
# Gaussian noise of std 0.5 and an intercept of 0.5.
config = Bunch(
    N=1000,
    batch_size=20,
    perturb_one_hot=False,
    col_spec=[
        dict(type='cat', card=3, name='c1'),
        dict(type='cat', card=4, name='c2'),
        dict(type='num', min=0, max=6, name='n1'),
        dict(type='num', min=-3, max=3, name='n2')
    ],
    noise=0.5,
    coefs=np.array([
        -1.0,
        1.0,
        0.4,  # cat feature 1
        -1.0,
        3.0,
        -1.0,
        -0.5,  # cat feature 2
        0.00001,  # n1 (near-zero: essentially irrelevant)
        0.7
    ]),
    bias=0.5)

# Two independent draws from the same generator: one for train, one for test.
train_data = SyntheticDataGenerator(config).data
test_data = SyntheticDataGenerator(config).data
Exemple #17
0
def binary_classification_model(features, labels, mode, params: utils.Bunch):
    """Custom model; initially just linear (logistic or poisson).

  Estimator model_fn for a linear binary classifier that can be trained
  and/or evaluated on adversarially perturbed inputs.

  :param features: batch of input features (transformed via
      params.feature_columns)
  :param labels: batch of binary labels
  :param mode: a tf.estimator.ModeKeys value (TRAIN or EVAL are handled)
  :param params: utils.Bunch; reads perturb_norm_bound,
      perturb_norm_order, train/test_perturb_frac, activation, optimizer,
      l1_reg, l2_reg, adv_reg_lambda, lr, feature_columns, thresh
  """
    params = utils.Bunch(**params)
    optimizer = params.get('optimizer', 'ftrl')
    l1_reg = params.get('l1_reg', 0.0)
    l2_reg = params.get('l2_reg', 0.0)
    tf.set_random_seed(123)
    labels = tf.cast(labels, tf.float32)
    epsilon = params.perturb_norm_bound
    norm_order = params.get('perturb_norm_order', 2)
    train_perturb_frac = params.train_perturb_frac
    test_perturb_frac = params.test_perturb_frac
    # do the various feature transforms according to the
    # 'feature_column' param, so now we have the feature-vector
    # that we will do computations on.
    x = tf.feature_column.input_layer(features, params.feature_columns)
    # for units in params['hidden_units']:
    #   net = tf.layers.dense(net, units=units, activation=tf.nn.relu)

    # Compute logits (1 per class).
    #logits = tf.layers.dense(net, params['n_classes'], activation=None)
    #logits = tf.layers.dense(net, 1, activation=None, name='dense')
    # Single linear unit; kernel and bias start at zero for reproducibility.
    dense = tf.layers.Dense(1, activation=None,
                            kernel_initializer=\
                              tf.keras.initializers.zeros(),
                              #tf.keras.initializers.RandomNormal(seed=123),
                            bias_initializer= \
                              tf.keras.initializers.zeros())
    #tf.keras.initializers.RandomNormal(seed=123))

    if len(dense.trainable_variables) == 0:
        dense(x)  # to force the kernel initialization
    # this is the "kernel" i.e. weights, does not include bias
    coefs = dense.trainable_variables[0]
    bias = dense.trainable_variables[1][0]
    # The fraction of examples to perturb depends on train vs eval mode.
    perturb_frac = train_perturb_frac if mode == tf.estimator.ModeKeys.TRAIN \
      else test_perturb_frac
    # Adversarially perturb (a fraction of) the inputs w.r.t. current coefs.
    x_perturbed, _ = RobustLogisticModel.perturb_continuous(
        x,
        labels,
        coefs,
        norm_bound=epsilon,
        norm_order=norm_order,
        perturb_frac=perturb_frac,
        seed=123)
    logits = dense(x_perturbed)
    if params.activation == 'sigmoid':
        predictions = tf.sigmoid(logits)
    elif params.activation == 'sign':
        predictions = tf.maximum(0.0, tf.sign(logits))
    else:  # assume relu
        predictions = tf.nn.relu(logits)
    labels = tf.reshape(labels, [-1, 1])
    # Compute predictions.
    predicted_classes = tf.maximum(tf.sign(predictions - 0.5), 0)

    # if mode == tf.estimator.ModeKeys.PREDICT:
    #   predictions = {
    #     'class_ids': predicted_classes[:, tf.newaxis],
    #     'probabilities': tf.nn.softmax(logits), # not really used
    #     'logits': logits,
    #   }
    #   return tf.estimator.EstimatorSpec(mode, predictions=predictions)

    # Compute loss.
    if params.activation == 'sigmoid':
        loss = tf.reduce_mean(
            tf.keras.backend.binary_crossentropy(target=labels,
                                                 output=logits,
                                                 from_logits=True))
    elif params.activation == 'sign':
        loss = tf.reduce_mean(-(2 * labels - 1) * logits)
    else:
        raise Exception(f'loss not known for activation {params.activation}')

    # ftrl applies l1/l2 regularization inside the optimizer (see below),
    # so only add explicit penalty terms for the other optimizers.
    if l1_reg > 0 and optimizer != 'ftrl':
        loss = loss + l1_reg * tf.norm(coefs, ord=1)
    if l2_reg > 0 and optimizer != 'ftrl':
        loss = loss + l2_reg * tf.sqrt(tf.maximum(0.0, tf.nn.l2_loss(coefs)))

    # Optionally use the adversarial loss as a regularizer on top of the
    # clean-data loss instead of as the sole objective.
    adv_reg_lambda = params.get('adv_reg_lambda', 0.0)
    if adv_reg_lambda and perturb_frac > 0.0:
        clean_logits = dense(x)
        clean_loss = tf.reduce_mean(
            tf.keras.backend.binary_crossentropy(target=labels,
                                                 output=clean_logits,
                                                 from_logits=True))
        loss = clean_loss + adv_reg_lambda * loss

    # Compute evaluation metrics.
    accuracy = tf.metrics.accuracy(labels=labels,
                                   predictions=predicted_classes,
                                   name='acc_op')

    auc = tf.metrics.auc(labels=labels, predictions=predictions, name='auc-op')

    # add metrics etc for tensorboard
    tf.summary.scalar('accuracy', accuracy[1])
    tf.summary.scalar('auc', auc[1])
    tf.summary.scalar('loss', loss)

    if mode == tf.estimator.ModeKeys.EVAL:
        # axiomatic attribution (Integrated Grads)
        feat_val_attribs = attribution.logistic_attribution(x, coefs, bias)
        feat_val_corr_stats = attribution.label_corr_stats(x, labels)
        av_attribs = tf.reduce_mean(feat_val_attribs, axis=0)
        attrib_entropy = tf_entropy(feat_val_attribs)
        num_high_attribs = num_above_relative_threshold(feat_val_attribs,
                                                        thresh=params.get(
                                                            'thresh', 0.1))
        # Streaming means so the metrics aggregate across eval batches.
        av_attrib_entropy = tf.metrics.mean(attrib_entropy)
        av_high_attribs = tf.metrics.mean(num_high_attribs)
        mean_attribs = tf.metrics.mean_tensor(av_attribs, name='attrib')
        xy_av = tf.metrics.mean_tensor(feat_val_corr_stats.xy, name='xy_av')
        x_av = tf.metrics.mean_tensor(feat_val_corr_stats.x, name='x_av')
        y_av = tf.metrics.mean_tensor(feat_val_corr_stats.y, name='y_av')
        xsq_av = tf.metrics.mean_tensor(feat_val_corr_stats.xsq, name='xsq_av')
        ysq_av = tf.metrics.mean_tensor(feat_val_corr_stats.ysq, name='ysq_av')

        # ad-hoc attribution (AFVI)
        afvi = attribution.logistic_afvi(x, coefs, bias)
        mean_afvi = tf.metrics.mean_tensor(afvi, name='afvi')

        metrics = dict(accuracy=accuracy,
                       auc=auc,
                       attrib_ent=av_attrib_entropy,
                       high_attribs=av_high_attribs,
                       attrib=mean_attribs,
                       afvi=mean_afvi,
                       xy_av=xy_av,
                       x_av=x_av,
                       y_av=y_av,
                       xsq_av=xsq_av,
                       ysq_av=ysq_av)

        # the histograms don't work in eval mode??
        tf.summary.histogram('attrib', mean_attribs[1])
        tf.summary.histogram('afvi', mean_afvi[1])

        return tf.estimator.EstimatorSpec(mode,
                                          loss=loss,
                                          eval_metric_ops=metrics)

    # Create training op.
    assert mode == tf.estimator.ModeKeys.TRAIN

    # Pick the optimizer named in params (ftrl carries the l1/l2 strengths).
    if optimizer == 'adam':
        loss_optimizer = tf.train.AdamOptimizer(learning_rate=params.lr)
    elif optimizer == 'ftrl':
        loss_optimizer = tf.train.FtrlOptimizer(
            learning_rate=params.lr,
            l1_regularization_strength=l1_reg,
            l2_regularization_strength=l2_reg)
    elif optimizer == 'adagrad':
        loss_optimizer = tf.train.AdagradOptimizer(learning_rate=params.lr)
    elif optimizer == 'sgd':
        loss_optimizer = tf.train.GradientDescentOptimizer(
            learning_rate=params.lr)
    else:
        raise Exception(f"Unknown optimizer: {optimizer}")

    train_op = loss_optimizer.minimize(loss,
                                       global_step=tf.train.get_global_step())
    return tf.estimator.EstimatorSpec(mode, loss=loss, train_op=train_op)
Exemple #18
0
def gen_synth_df(params: Bunch):
    """Generate a synthetic binary-classification dataframe.

    Columns:
      - x1..x{nc}: "predictive" features agreeing with the label with
        probability `pred` (identical copies when `corrp` is True);
      - r / r1..r{nr}: random features uncorrelated with the label — a
        single categorical column 'r' when `cat` is True, else +/-1 columns
        (identical copies when `corr` is True);
      - w / w1..w{nw}: normal features weakly correlated with the label;
      - target: the 0/1 label.

    :param params: any object with a dict-like .get(); recognized keys are
        N, nc, nr, nw, corrp, corr, cat, pred, weak_pred.
    :return: pd.DataFrame of the feature columns plus 'target'.
    """
    N = params.get('N', 1000)
    nC = params.get('nc', 8)  # number of features predictive of label
    nF = params.get('nr', 8)  # how many features are random, uncorr with label
    nW = params.get('nw', 0)  # how many features weakly correlated with label
    corr1 = params.get('corrp', True)  # whether predictive feats are corr
    corr = params.get('corr', False)  # whether random feats are corr
    cat = params.get('cat',
                     False)  # whether the random features are categorical
    pred = params.get('pred', 0.7)  # how often the "predictive" features agree
    weak_pred = params.get('weak_pred', 1.0)  # predictivity of weak feature

    np.random.seed(123)

    y = np.random.choice(2, N, p=[0.5, 0.5])
    y1 = (2 * y - 1.0).reshape([-1, 1])
    df_agree = pd.DataFrame()
    p = pred  # probability of agreement with label
    if nC > 0:
        if corr1:  # identical, i.e. highly correlated
            agree = np.repeat(
              np.array(np.random.choice(2, N, p=[1-p, p]), dtype=np.float32). \
                reshape([-1,1]), nC, axis=1)
        else:  # uncorrelated
            agree = np.array(np.random.choice(2, N * nC, p=[1-p, p]),
                             dtype=np.float32). \
              reshape([-1, nC])
        agree = y1 * (2 * agree - 1)
        agree_cols = ['x' + str(i + 1) for i in range(nC)]
        df_agree = pd.DataFrame(agree, columns=agree_cols)

    df_tar = pd.DataFrame(dict(target=y))

    # rest are random
    df_rest = pd.DataFrame()
    if nF > 0:
        if cat:  # 1 categorical feature with nF possible values
            rest = np.array(np.random.choice(nF, N), dtype=np.int64). \
              reshape([-1,1])
        else:
            if corr:  # identical, i.e. highly correlated
                rest = np.repeat(
                  np.array(np.random.choice(2, N, p=[0.5, 0.5]), dtype=np.float32). \
                    reshape([-1,1]), nF, axis=1)
            else:  # uncorrelated, i.i.d -1/1
                rest = np.array(np.random.choice(2, N * nF, p=[0.5, 0.5]),
                                dtype=np.float32).reshape([-1, nF])
            rest = 2 * rest - 1.0
        rest_cols = ['r'] if cat else ['r' + str(i + 1) for i in range(nF)]
        df_rest = pd.DataFrame(rest, columns=rest_cols)

    df_weak = pd.DataFrame()
    if nW > 0:  # normal, slightly correlated with label
        means = y1 * weak_pred / math.sqrt(nW)
        # NOTE(review): all nW weak columns repeat the same per-row draw —
        # confirm whether independent draws per column were intended.
        weak = np.repeat(np.random.normal(means, 1.0), nW, axis=1)
        # BUGFIX: the weak block always has nW columns, so it needs nW
        # names; the old `['w'] if cat` raised when cat=True and nW > 1.
        weak_cols = (['w'] if cat and nW == 1
                     else ['w' + str(i + 1) for i in range(nW)])
        df_weak = pd.DataFrame(weak, columns=weak_cols)

    df = pd.concat([df_agree, df_rest, df_weak, df_tar], axis=1)
    return df
Exemple #19
0
# Example script: load the PMLB 'credit-a' dataset and set up a
# robust-logistic training configuration in TF eager mode.
import tensorflow as tf  # fix: tf was referenced below but never imported
import pandas as pd
from src.train import run
from src.utils import Bunch, pmlb_dataset_x_y
tf.enable_eager_execution()  # TF1: must run before any other TF op
import numpy as np
import yaml

dataset = 'credit-a'
X, y, col_spec, target = pmlb_dataset_x_y(dataset)

# Seed numpy for reproducible runs.
np.random.seed(123)

config = Bunch(batch_size=20,
               perturb_one_hot=False,
               zap_categoricals=False,
               l2_epsilon=0.0,
               num_perturbed_categoricals=1,
               col_spec=col_spec,
               label_name=target)

# Widen pandas display limits so wide result frames print fully.
pd.set_option('display.max_columns', 500)
pd.set_option('max_colwidth', 1000)
pd.set_option('display.width', 1000)
epochs = 400
batch = 20
lr = 0.01

res_10 = run(config,
             X,
             y,
             robust_frac=0.1,
Exemple #20
0
# Example script: build a small synthetic two-feature logistic dataset
# via the project's SyntheticDataGenerator, in TF eager mode.
import tensorflow as tf
from src.robust_logistic import RobustLogisticModel
from data_loader.synthetic_data_generator import SyntheticDataGenerator
from src.train import train, test
from src.utils import loss, grad, Bunch
tf.enable_eager_execution()  # TF1: must run before any other TF op
import numpy as np
import pandas as pd

# Generator config: N=500 rows, two numeric features n1/n2 with the true
# linear coefficients/bias below; noise=0.4 presumably controls label
# noise — TODO confirm against SyntheticDataGenerator.
config = Bunch(N=500, batch_size=5,
               col_spec=[
                 #dict(type='cat', card=3),
                 #dict(type='cat', card=5),
                 dict(type='num', min=0, max=6, name='n1'),
                 dict(type='num', min=-3, max=3, name='n2')],
               noise=0.4,
               coefs=np.array([#-1.0, 1.0, 0.4,  # cat feature 1
                               #-1.0, 3.0, 0.9, -1.0, -0.5,  # cat feature 2
                               0.8,  # n1
                               -0.9]),
               bias = 0.3
               )
syndata = SyntheticDataGenerator(config)

# model = RobustLogisticModel(1, l2_epsilon=0.1,  config=config)
# wts = model.get_weights()
#
# # testing
# inputs, labels = syndata.next_batch()
# out = model(inputs, label=1)
# attribs = model.attributions(inputs, labels)
Exemple #21
0
def plot_multi(df: pd.DataFrame,
               x_ids=None,
               var='type',
               value='value',
               include=None,
               ignore=None,
               order_by=None,
               ascending=False,
               title=None,
               rename=None,
               legend=True,
               threshold=0.001,
               kind='bar',
               show=True,
               tight=True,
               ax=None,
               params: Bunch = None):
    """Melt *df* to long form and draw one bar (or line) group per column.

    The frame is melted with ``pd.melt`` (id column ``x_ids``, variable
    column ``var``, value column ``value``) and rendered with seaborn.
    Entries whose magnitude is below ``threshold`` times the largest
    absolute value are dropped before plotting.

    Args:
        df: data to plot; one trace per non-id column.
        x_ids: name of the id column; if None, the index is used.
        var: name for the melted variable column (also the legend title).
        value: name for the melted value column.
        include: if given, keep only these columns (applied after ``ignore``).
        ignore: columns to drop before melting.
        order_by: column to sort rows by; ``ascending`` sets the direction.
        title: optional plot title.
        rename: optional ``{old: new}`` column renaming applied first.
        legend: whether to draw a legend.
        threshold: relative-magnitude cutoff for plotted values.
        kind: 'bar' for a seaborn catplot, anything else for a lineplot.
        show: call ``plt.show()`` when done.
        tight: call ``plt.tight_layout()``.
        ax: optional matplotlib axes to draw into.
        params: extra options; honors ``xtick_font_size`` for bar plots.

    Returns:
        The seaborn grid (bar) or axes (line) object created.
    """
    # Fix: avoid mutable default arguments — build fresh objects per call.
    if params is None:
        params = Bunch()
    # rcParams.update({'figure.autolayout': True})
    if rename:
        df = df.rename(columns=rename)
    if x_ids is None:
        # Fix: copy before adding '__x' so the caller's frame is not mutated.
        df = df.copy()
        df['__x'] = df.index
        x_ids = '__x'
    if order_by is not None:
        df = df.sort_values(by=order_by, ascending=ascending)
    if ignore:
        df = df.drop(ignore, axis=1)
    if include:
        df = df[include]
    plt.interactive(False)
    dfm = pd.melt(df, id_vars=x_ids, var_name=var, value_name=value)
    # Drop values that are negligible relative to the largest magnitude.
    biggest = np.max(np.abs(dfm[value]))
    dfm = dfm[abs(dfm[value]) >= biggest * threshold]
    # Keep only x labels that survived the filter, in frame order.
    # (Set membership replaces the original O(n^2) list-rebuild per item.)
    x_labels = df[x_ids].astype(str).tolist()
    kept = set(dfm[x_ids].astype(str).tolist())
    x_labels = [x for x in x_labels if x in kept]
    # default=0 guards against all values being filtered out (empty labels).
    max_x_label = max((len(s) for s in x_labels), default=0)
    if kind == 'bar':
        g = sns.catplot(x=x_ids,
                        y=value,
                        hue=var,
                        data=dfm,
                        kind=kind,
                        order=x_labels,
                        row_order=x_labels,
                        legend=False,
                        legend_out=True,
                        ax=ax)
        if params.get('xtick_font_size'):
            g.set_xticklabels(fontdict=dict(fontsize=params.xtick_font_size))
        if max_x_label > 4:
            # Long labels overlap when horizontal; rotate them.
            g.set_xticklabels(rotation=90)
    else:  # line
        g = sns.lineplot(x=x_ids,
                         y=value,
                         hue=var,
                         data=dfm,
                         legend=False,
                         ax=ax)
        #g.set_xticklabels(labels=x_labels)
    if title:
        plt.title(title)
    if tight:
        plt.tight_layout()
    if legend:
        plt.legend(loc='upper right', title=var)
    if show:
        plt.show()
    return g
Exemple #22
0
    def eval(self, params: Bunch) -> dict:
        """Evaluate the checkpoint in ``self.model_dir`` and collect accuracy
        plus weight/attribution concentration metrics.

        No training happens here: an Estimator is built over the existing
        model directory and ``evaluate`` is run on both test and train
        splits. All attribution metrics are computed on *unperturbed train*
        data; only the test-time perturbation fraction from ``params`` is
        applied.

        NOTE(review): relies on instance state set elsewhere —
        ``self.segments``, ``self.feature_columns``, ``self.df_train``,
        ``self.df_test``, ``self.target_name``, ``self.col_spec``,
        ``self.feature_value_names``, ``self.model_dir``.

        Args:
            params: Bunch providing at least ``batch_size`` and
                ``perturb_frac``.

        Returns:
            dict of rounded scalar metrics (accuracy, auc, loss, entropy and
            L1/Linf sparsity measures of the learned weights) plus
            per-feature attribution / weight / feature-label correlation
            dicts.
        """
        # Fixed seeds so repeated evaluations are reproducible.
        tf.set_random_seed(123)
        np.random.seed(123)

        classifier = tf.estimator.Estimator(
            model_fn=binary_classification_model,
            model_dir=self.model_dir,
            config=tf.estimator.RunConfig(tf_random_seed=123),
            params=params.mod(segments=self.segments,
                              feature_columns=self.feature_columns,
                              train_perturb_frac=0.0,
                              test_perturb_frac=params.perturb_frac))

        def input_fn(train=False):
            # Feed the whole train or test DataFrame as a tf.data pipeline.
            df = self.df_train if train else self.df_test
            df_target = df[self.target_name]
            ds = tf.data.Dataset.from_tensor_slices((dict(df), df_target)).\
              prefetch(params.batch_size*3).\
              batch(params.batch_size)
            if platform.system() == 'Linux':
                # Assumes a GPU exists on Linux hosts — TODO confirm.
                ds = ds.apply(
                    tf.contrib.data.prefetch_to_device(device='/device:gpu:0'))
            return ds

        # input_fn = lambda: tf.contrib.data.make_csv_dataset(
        #   self.test_file,
        #   batch_size=batch_size,
        #   prefetch_buffer_size=batch_size*3,
        #   num_parallel_reads=10,
        #   num_epochs=1,
        #   shuffle=False,
        #   label_name=self.target_name
        # )

        eval_result = classifier.evaluate(
            input_fn=lambda: input_fn(train=False))
        train_result = classifier.evaluate(
            input_fn=lambda: input_fn(train=True))

        # Note we are computing ALL attribution metrics on
        # UNPERTURBED TRAIN data !
        feature_value_attribs = train_result['attrib']

        # Per-feature-value attributions; segment_sum regroups the
        # feature-value columns back into their source features.
        feature_value_attribs_ent = tf_entropy(feature_value_attribs)
        feature_attribs = tf.segment_sum(feature_value_attribs, self.segments)
        feature_attribs_ent = tf_entropy(feature_attribs)

        # Same treatment for the 'afvi' metric produced by the model_fn
        # (acronym semantics defined there — TODO confirm).
        afvi = train_result['afvi']
        afvi_ent = tf_entropy(afvi)
        feature_afvi = tf.segment_sum(afvi, self.segments)
        feature_afvi_ent = tf_entropy(feature_afvi)

        col_names = [s['name'] for s in self.col_spec]
        feature_attribs_dict = dict(
            zip(col_names, np.round(tf_numpy(feature_attribs), 4)))
        feature_afvi_dict = dict(
            zip(col_names, np.round(tf_numpy(feature_afvi), 4)))
        # Learned linear weights of the single dense layer.
        wts = classifier.get_variable_value('dense/kernel').squeeze()
        wts_dict = dict(zip(self.feature_value_names, wts))

        # Pearson correlation of each feature value with the label, from
        # running moments accumulated during evaluation.
        feat_label_corrs = \
          (train_result['xy_av'] - train_result['x_av'] * train_result['y_av']) / \
          np.sqrt( (train_result['xsq_av'] - train_result['x_av']**2) * \
                   (train_result['ysq_av'] - train_result['y_av']**2) )

        corrs_dict = dict(zip(self.feature_value_names, feat_label_corrs))

        # Weight-concentration measures: entropy, L1 norm, L1/Linf ratio,
        # and count/percentage of weights above 1% of the largest magnitude.
        wts_ent = tf_numpy(tf_entropy(wts))
        wts_l1 = tf_numpy(tf.norm(wts, ord=1))
        wts_max = np.max(np.abs(wts))
        wts_l1_linf = wts_l1 / wts_max
        wts_1pct = np.sum(np.abs(wts) > 0.01 * wts_max)
        wts_pct1pct = 100 * np.sum(np.abs(wts) > 0.01 * wts_max) / len(wts)
        results = dict(acc=np.round(eval_result['accuracy'], 3),
                       auc=np.round(eval_result['auc'], 3),
                       loss=np.round(eval_result['loss'], 3),
                       wts_ent=np.round(wts_ent, 3),
                       wts_1pct=wts_1pct,
                       wts_pct1pct=wts_pct1pct,
                       wts_l1=np.round(wts_l1, 3),
                       wts_l1_linf=np.round(wts_l1_linf, 3),
                       av_ent=np.round(train_result['attrib_ent'], 7),
                       av_high=np.round(train_result['high_attribs'], 1),
                       a_ent=np.round(tf_numpy(afvi_ent), 3),
                       f_a_ent=np.round(tf_numpy(feature_afvi_ent), 3),
                       g_ent=np.round(tf_numpy(feature_value_attribs_ent), 3),
                       f_g_ent=np.round(tf_numpy(feature_attribs_ent), 3),
                       f_a_dict=feature_afvi_dict,
                       f_g_dict=feature_attribs_dict,
                       wts_dict=wts_dict,
                       corrs_dict=corrs_dict)
        return results