Esempio n. 1
0
def train():
    """
    Builds the SVM based on training data.

    Loads the 'train' split, assembles a bag-of-words -> tf-idf -> SGD
    (hinge loss, i.e. a linear SVM) pipeline, fits it, reports wall-clock
    training time, and evaluates the fitted model on the training data.

    Returns:
        The fitted sklearn Pipeline.
    """
    features, labels = __init__.load_data('train')

    vectorizer = text.CountVectorizer(decode_error='ignore',
                                      stop_words='english')
    transformer = text.TfidfTransformer()

    classifier = linear_model.SGDClassifier(loss='hinge',
                                            penalty='l2',
                                            alpha=1e-3,
                                            tol=1e-3,
                                            random_state=42)

    # Serializes the processing steps that would be required of the above.
    text_clf = pipeline.Pipeline(
        steps=[('vect', vectorizer), ('tfidf',
                                      transformer), ('clf-sgdc', classifier)])

    start = time.time()
    text_clf.fit(features, labels)
    # print() is valid on both Python 2 and 3; the bare statement form is
    # a SyntaxError on Python 3.
    print('Training time:\t%1.4f seconds' % (time.time() - start))

    __init__.evaluate(text_clf, features, labels)

    return text_clf
Esempio n. 2
0
def train():
    """Builds the random forest based on training data.

    Loads the 'train' split, assembles a bag-of-words -> tf-idf ->
    random-forest pipeline, fits it, reports wall-clock training time,
    and evaluates the fitted model on the training data.

    Returns:
        The fitted sklearn Pipeline.
    """
    features, labels = __init__.load_data('train')

    vectorizer = text.CountVectorizer(decode_error='ignore',
                                      stop_words='english')
    transformer = text.TfidfTransformer()
    classifier = ensemble.RandomForestClassifier(n_estimators=10)

    text_clf = pipeline.Pipeline(
        steps=[('vect', vectorizer), ('tfidf',
                                      transformer), ('clf-rf', classifier)])

    start = time.time()
    text_clf.fit(features, labels)
    # print() is valid on both Python 2 and 3; the bare statement form is
    # a SyntaxError on Python 3.
    print('Training time:\t%1.4f seconds' % (time.time() - start))

    __init__.evaluate(text_clf, features, labels)

    return text_clf
Esempio n. 3
0
def bin(model):
    """Uses binning to identify the posts the model was most uncertain with.

    Predictions (assumed to lie roughly in [0, 1] -- TODO confirm with the
    model's output range) are sorted into ten equal-width bins; bin k holds
    predictions in [k/10, (k+1)/10), with everything >= 0.9 in the last bin.
    Each bin is written to bins/<lower-bound>.txt as lines of
    "<label> : <feature>".

    NOTE: this function shadows the builtin ``bin``; the name is kept to
    preserve the public interface.
    """
    if not os.path.isdir('bins/'):
        os.makedirs('bins/')

    features, labels = __init__.load_data('train')

    features = [feature.replace('\n', '') for feature in features]

    predictions = model.predict(features)

    bins = [list() for _ in range(10)]

    # Scan the upper bounds in order, exactly mirroring the original
    # if/elif ladder: the first threshold the prediction is below picks the
    # bin; anything >= 0.9 (or NaN) falls through to the last bin.
    thresholds = (0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9)
    for prediction, label, feature in zip(predictions, labels, features):
        entry = str(label) + ' : ' + feature + '\n'
        target = 9
        for idx, upper in enumerate(thresholds):
            if prediction < upper:
                target = idx
                break
        bins[target].append(entry)

    for idx, contents in enumerate(bins):
        with open('bins/%1.1f.txt' % (idx / 10.0), 'w') as out:
            out.writelines(contents)
Esempio n. 4
0
def train():
    """
    Builds the classifier based on training data.

    Loads the 'train' split, assembles a bag-of-words -> tf-idf ->
    logistic-regression pipeline, fits it, reports wall-clock training
    time, and evaluates the fitted model on the training data.

    Returns:
        The fitted sklearn Pipeline.
    """
    features, labels = __init__.load_data('train')

    vectorizer = text.CountVectorizer(decode_error='ignore', stop_words='english')
    transformer = text.TfidfTransformer()

    classifier = linear_model.LogisticRegression(solver='lbfgs')

    # Serializes the processing steps that would be required of the above.
    text_clf = pipeline.Pipeline(steps=[('vect', vectorizer),
                                       ('tfidf', transformer),
                                       ('clf-lr', classifier)])

    start = time.time()
    text_clf.fit(features, labels)
    # print() is valid on both Python 2 and 3; the bare statement form is
    # a SyntaxError on Python 3.
    print('Training time:\t%1.4f seconds' % (time.time() - start))

    __init__.evaluate(text_clf, features, labels)

    return text_clf
Esempio n. 5
0
def train():
    """
    Builds the SVM based on training data.

    Loads the 'train' split, assembles a bag-of-words -> tf-idf ->
    support-vector-regression (sigmoid kernel) pipeline, fits it, reports
    wall-clock training time, and evaluates the fitted model on the
    training data.

    Returns:
        The fitted sklearn Pipeline.
    """
    features, labels = __init__.load_data('train')

    vectorizer = text.CountVectorizer(decode_error='ignore',
                                      stop_words='english')
    transformer = text.TfidfTransformer()

    classifier = svm.SVR(kernel='sigmoid', gamma='scale')

    # Serializes the processing steps that would be required of the above.
    text_clf = pipeline.Pipeline(
        steps=[('vect', vectorizer), ('tfidf',
                                      transformer), ('clf-svr', classifier)])

    start = time.time()
    text_clf.fit(features, labels)
    # print() is valid on both Python 2 and 3; the bare statement form is
    # a SyntaxError on Python 3.
    print('Training time:\t%1.4f seconds' % (time.time() - start))

    __init__.evaluate(text_clf, features, labels)

    return text_clf
Esempio n. 6
0
def test(model):
    """Tests the classifier based on test data."""
    test_features, test_labels = __init__.load_data('test')
    __init__.evaluate(model, test_features, test_labels)
Esempio n. 7
0
def dev(model):
    """Tests the classifier based on dev data."""
    dev_features, dev_labels = __init__.load_data('dev')
    __init__.evaluate(model, dev_features, dev_labels)
def main():
    """Entry point for the NoBox attack experiments.

    Parses command-line arguments, builds CIFAR/MNIST data loaders and the
    source/critic models, reports the clean (un-attacked) error rate on the
    test classifiers, then either trains the NoBox attacker or loads and
    evaluates a saved one, depending on ``--command``.
    """

    parser = argparse.ArgumentParser(description='NoBox')
    # Hparams
    parser.add_argument('--gp_coeff', type=float, default=0.,
                        help='coeff for the gradient penalty')
    parser.add_argument('--latent_dim', type=int, default=20, metavar='N',
                        help='Latent dim for VAE')
    parser.add_argument('--lr', type=float, default=0.01, metavar='LR',
                        help='learning rate for the generator (default: 0.01)')
    parser.add_argument('--lr_model', type=float, default=None, metavar='LR',
                        help='learning rate for the model (default: None -> default to args.lr)')
    parser.add_argument('--momentum', type=float, default=0.5, metavar='M',
                        help='optimizer momentum (default: 0.5)')
    parser.add_argument('--extragradient', default=False, action='store_true',
                        help='Use extragadient algorithm')
    parser.add_argument('--latent_size', type=int, default=50, metavar='N',
                        help='Size of latent distribution (default: 50)')
    parser.add_argument('--flow_model', default=None, const='soft',
                    nargs='?', choices=[None, 'RealNVP', 'planar', 'radial'],
                    help='Type of Normalizing Flow (default: %(default)s)')
    parser.add_argument('--flow_layer_type', type=str, default='Linear',
                        help='Which type of layer to use ---i.e. GRevNet or Linear')
    parser.add_argument('--flow_hidden_size', type=int, default=128,
                        help='Hidden layer size for Flows.')
    parser.add_argument('--n_blocks', type=int, default=2,
                        help='Number of blocks to stack in flow')
    parser.add_argument('--flow_hidden', type=int, default=1, help='Number of hidden layers in each Flow.')
    parser.add_argument('--eval_set', default="test",
                        help="Evaluate model on test or validation set.")
    parser.add_argument('--train_with_critic_path', type=str, default=None,
                        help='Train generator with saved critic model')
    parser.add_argument('--train_on_file', default=False, action='store_true',
                        help='Train using Madry tf grad')
    # Training
    parser.add_argument('--lambda_on_clean', default=0.0, type=float,
                        help='train the critic on clean examples of the train set')
    parser.add_argument('--not_use_labels', default=False, action='store_true',
                        help='Use the labels for the conditional generator')
    parser.add_argument('--hinge_coeff', default=10., type=float,
                        help='coeff for the hinge loss penalty')
    parser.add_argument('--anneal_eps', default=0., type=float,
                        help='coeff for the epsilon annealing')
    parser.add_argument('--fixed_critic', default=False, action='store_true',
                        help='Critic is not trained')
    parser.add_argument('--train_on_list', default=False, action='store_true',
                        help='train on a list of classifiers')
    parser.add_argument('--train_set', default='train',
                        choices=['train_and_test','test','train'],
                        help='add the test set in the training set')
    parser.add_argument('--epochs', type=int, default=10, metavar='N',
                        help='number of epochs to train (default: 10)')
    parser.add_argument('--n_iter', type=int, default=500,
                        help='N iters for quere based attacks')
    parser.add_argument('--PGD_steps', type=int, default=40, metavar='N',
                        help='max gradient steps (default: 30)')
    parser.add_argument('--max_iter', type=int, default=10, metavar='N',
                        help='max gradient steps (default: 10)')
    parser.add_argument('--epsilon', type=float, default=0.1, metavar='M',
                        help='Epsilon for Delta (default: 0.1)')
    parser.add_argument('--attack_ball', type=str, default="L2",
                        choices= ['L2','Linf'],
                        help='type of box attack')
    parser.add_argument('--bb_steps', type=int, default=2000, metavar='N',
                        help='Max black box steps per sample(default: 1000)')
    parser.add_argument('--attack_epochs', type=int, default=100, metavar='N',
                        help='Max numbe of epochs to train G')
    parser.add_argument('--num_flows', type=int, default=2, metavar='N',
                        help='Number of Flows')
    parser.add_argument('--seed', type=int, metavar='S',
                        help='random seed (default: None)')
    parser.add_argument('--input_size', type=int, default=784, metavar='S',
                        help='Input size for MNIST is default')
    parser.add_argument('--batch_size', type=int, default=256, metavar='S',
                        help='Batch size')
    parser.add_argument('--test_batch_size', type=int, default=512, metavar='S',
                        help='Test Batch size')
    parser.add_argument('--pgd_on_critic', default=False, action='store_true',
                        help='Train Critic on pgd samples')
    parser.add_argument('--train_with_robust', default=False, action='store_true',
                        help='Train with Robust model + Critic')
    parser.add_argument('--test', default=False, action='store_true',
                        help='just test model and print accuracy')
    # NOTE(review): default=True with action='store_true' means this flag is
    # always True and cannot be disabled from the command line -- confirm
    # whether that is intended.
    parser.add_argument('--clip_grad', default=True, action='store_true',
                        help='Clip grad norm')
    parser.add_argument('--train_vae', default=False, action='store_true',
                        help='Train VAE')
    parser.add_argument('--train_ae', default=False, action='store_true',
                        help='Train AE')
    parser.add_argument('--attack_type', type=str, default='nobox',
                        help='Which attack to run')
    parser.add_argument('--attack_loss', type=str, default='cross_entropy',
                        help='Which loss func. to use to optimize G')
    parser.add_argument('--perturb_loss', type=str, default='L2', choices= ['L2','Linf'],
                        help='Which loss func. to use to optimize to compute constraint')
    parser.add_argument('--dataset', type=str, default='mnist')
    parser.add_argument('--model', type=str, default=None)
    parser.add_argument('--deterministic_G', default=False, action='store_true',
                        help='Deterministic Latent State')
    parser.add_argument('--run_baseline', default=False, action='store_true',
                        help='Run baseline PGD')
    parser.add_argument('--resample_test', default=False, action='store_true',
                help='Load model and test resampling capability')
    parser.add_argument('--resample_iterations', type=int, default=100, metavar='N',
                        help='How many times to resample (default: 100)')
    parser.add_argument('--architecture', default="VGG16",
                        help="The architecture we want to attack on CIFAR.")
    parser.add_argument('--eval_freq', default=5, type=int,
                        help="Evaluate and save model every eval_freq epochs.")
    parser.add_argument('--num_test_samples', default=None, type=int,
                        help="The number of samples used to train and test the attacker.")
    parser.add_argument('--num_eval_samples', default=None, type=int,
                        help="The number of samples used to train and test the attacker.")
    # Bells
    parser.add_argument("--wandb", action="store_true", default=False, help='Use wandb for logging')
    parser.add_argument('--model_path', type=str, default="mnist_cnn.pt",
                        help='where to save/load')
    parser.add_argument('--namestr', type=str, default='NoBox', \
            help='additional info in output filename to describe experiments')
    parser.add_argument('--dir_test_models', type=str, default="./dir_test_models",
                        help="The path to the directory containing the classifier models for evaluation.")
    parser.add_argument('--robust_model_path', type=str,
                        default="./madry_challenge_models/mnist/adv_trained/mnist_lenet5_advtrained.pt",
                        help="The path to our adv robust classifier")
    parser.add_argument('--robust_sample_prob', type=float, default=1e-1, metavar='N',
                        help='1-P(robust)')
    #parser.add_argument('--madry_model_path', type=str, default="./madry_challenge_models",
    #                    help="The path to the directory containing madrys classifiers for testing")
    parser.add_argument("--max_test_model", type=int, default=1,
                    help="The maximum number of pretrained classifiers to use for testing.")
    parser.add_argument("--perturb_magnitude", type=float, default=None,
                        help="The amount of perturbation we want to enforce with lagrangian.")
    parser.add_argument("--log_path", type=str, default="./logs",
                        help="Where to save logs if logger is specified.")
    parser.add_argument("--save_model", type=str, default=None,
                            help="Where to save the models, if it is specified.")
    parser.add_argument("--fixed_testset", action="store_true",
                            help="If used then makes sure that the same set of samples is always used for testing.")
    parser.add_argument('--normalize', default=None, choices=(None, "default", "meanstd"))

    ###
    parser.add_argument('--source_arch', default="res18",
                        help="The architecture we want to attack on CIFAR.")
    parser.add_argument('--target_arch', nargs='*',
                        help="The architecture we want to blackbox transfer to on CIFAR.")
    parser.add_argument('--ensemble_adv_trained', action='store_true')
    parser.add_argument('--adv_models', nargs='*', help='path to adv model(s)')
    parser.add_argument('--type', type=int, default=0, help='Model type (default: 0)')
    parser.add_argument('--model_name', help='path to model')
    parser.add_argument('--transfer', action='store_true')
    parser.add_argument('--command', choices=("eval", "train"), default="train")
    parser.add_argument('--split', type=int, default=None,
                        help="Which subsplit to use.")
    parser.add_argument('--path_to_data', default="../data", type=str)
    args = parser.parse_args()

    # Run on GPU when one is available.
    args.dev = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # Translate the --normalize choice into an actual transform (or None).
    normalize = None
    if args.normalize == "meanstd":
        normalize = transforms.Normalize(cf.mean["cifar10"], cf.std["cifar10"])
    elif args.normalize == "default":
        normalize = CIFAR_NORMALIZATION

    train_loader, test_loader, split_train_loader, split_test_loader = create_loaders(args,
            root=args.path_to_data, split=args.split, num_test_samples=args.num_test_samples, normalize=normalize)

    # When a subsplit is requested, the split loaders replace the full ones.
    if args.split is not None:
        train_loader = split_train_loader
        test_loader = split_test_loader

    # Pull the wandb API key from ../settings.json when the file exists.
    # NOTE(review): if --wandb is passed but ../settings.json is missing,
    # args.wandb_apikey is never set and the block below raises -- confirm.
    if os.path.isfile("../settings.json"):
        with open('../settings.json') as f:
            data = json.load(f)
        args.wandb_apikey = data.get("wandbapikey")

    if args.wandb:
        os.environ['WANDB_API_KEY'] = args.wandb_apikey
        wandb.init(project='NoBox-table2',
                   name='NoBox-Attack-{}-{}'.format(args.dataset, args.namestr))

    # Build the source model, any adversarially-trained models, and the list
    # of held-out test classifiers (paths) used for transfer evaluation.
    model, adv_models, l_test_classif_paths, args.model_type = data_and_model_setup(args, no_box_attack=True)
    model.to(args.dev)
    model.eval()

    print("Testing on %d Test Classifiers with Source Model %s" %(len(l_test_classif_paths), args.source_arch))
    x_test, y_test = load_data(args, test_loader)

    # Choose the critic architecture per dataset; for CIFAR an "adv" source
    # falls back to res18 for the critic.
    if args.dataset == "mnist":
        critic = load_unk_model(args)
    elif args.dataset == "cifar":
        name = args.source_arch
        if args.source_arch == "adv":
            name = "res18"
        critic = load_unk_model(args, name=name)

    misclassify_loss_func = kwargs_attack_loss[args.attack_loss]
    attacker = NoBoxAttack(critic, misclassify_loss_func, args)

    print("Evaluating clean error rate:")
    list_model = [args.source_arch]
    if args.source_arch == "adv":
        list_model = [args.model_type]
    if args.target_arch is not None:
        list_model = args.target_arch

    for model_type in list_model:
        num_samples = args.num_eval_samples
        if num_samples is None:
            num_samples = len(test_loader.dataset)

        # Random subset of the test set (sampled WITH replacement, since
        # np.random.randint can repeat indices).
        eval_loader = torch.utils.data.Subset(test_loader.dataset,
                         np.random.randint(len(test_loader.dataset), size=(num_samples,)))
        eval_loader = torch.utils.data.DataLoader(eval_loader, batch_size=args.test_batch_size)
        baseline_transfer(args, None, "Clean", model_type, eval_loader,
            list_classifiers=l_test_classif_paths)

    def eval_fn(model):
        """Attack `model` with the trained attacker, print its fool rate,
        and (with --transfer) evaluate transfer to the test classifiers."""
        advcorrect = 0
        model.to(args.dev)
        model.eval()
        with ctx_noparamgrad_and_eval(model):
            if args.source_arch == 'googlenet':
                # googlenet is attacked batch-by-batch until ~batch_size
                # adversarial samples have been produced.
                adv_complete_list = []
                for batch_idx, (x_batch, y_batch) in enumerate(test_loader):
                    if (batch_idx + 1) * args.test_batch_size > args.batch_size:
                        break
                    x_batch, y_batch = x_batch.to(args.dev), y_batch.to(args.dev)
                    adv_complete_list.append(attacker.perturb(x_batch, target=y_batch))
                adv_complete = torch.cat(adv_complete_list)
            else:
                adv_complete = attacker.perturb(x_test[:args.batch_size],
                                        target=y_test[:args.batch_size])
            # Keep adversarial images inside the valid pixel range.
            adv_complete = torch.clamp(adv_complete, min=0., max=1.0)
            output = model(adv_complete)
            pred = output.max(1, keepdim=True)[1]
            advcorrect += pred.eq(y_test[:args.batch_size].view_as(pred)).sum().item()
            fool_rate = 1 - advcorrect / float(args.batch_size)
            print('Test set base model fool rate: %f' %(fool_rate))
        model.cpu()

        if args.transfer:
            adv_img_list = []
            y_orig = y_test[:args.batch_size]
            for i in range(0, len(adv_complete)):
                adv_img_list.append([adv_complete[i].unsqueeze(0), y_orig[i]])
            # Free memory
            del model
            torch.cuda.empty_cache()
            baseline_transfer(args, attacker, "AEG", model_type,
                            adv_img_list, l_test_classif_paths, adv_models)


    # Dispatch: load a saved attacker for evaluation, or train a new one
    # (training calls eval_fn periodically).
    if args.command == "eval":
        attacker.load(args)
    elif args.command == "train":
        attacker.train(train_loader, test_loader, adv_models,
                l_test_classif_paths, l_train_classif={"source_model": model},
                eval_fn=eval_fn)
Esempio n. 9
0
def test(model):
    """Tests the random forest based on test data."""
    test_features, test_labels = __init__.load_data('test')
    __init__.evaluate(model, test_features, test_labels)
Esempio n. 10
0
def dev(model):
    """Tests the random forest based on dev data."""
    dev_features, dev_labels = __init__.load_data('dev')
    __init__.evaluate(model, dev_features, dev_labels)
Esempio n. 11
0
import __init__
import os
import pdb
import wordcloud
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from PIL import Image
from sklearn.feature_extraction import stop_words


if __name__ == '__main__':

    # Load ALL the data.
    train_features, train_labels = __init__.load_data('train')
    dev_features, dev_labels = __init__.load_data('dev')
    test_features, test_labels = __init__.load_data('test')

    features = train_features + dev_features + test_features
    labels = train_labels + dev_labels + test_labels

    # Group the posts by label: feature_lists[k] collects every feature
    # whose label is k (labels are used directly as list indices, so they
    # are assumed to be integers 0..n_labels-1 -- TODO confirm).
    feature_lists = [list() for _ in np.unique(labels)]
    for feature, label in zip(features, labels):
        feature_lists[label].append(feature)

    # Make a word cloud for each bin.
Esempio n. 12
0
def load_data_locl():
    """Return the (data, label) pair produced by load_data()."""
    loaded_data, loaded_label = load_data()
    return loaded_data, loaded_label