def train(): """ Builds the SVM based on training data. """ features, labels = __init__.load_data('train') vectorizer = text.CountVectorizer(decode_error='ignore', stop_words='english') transformer = text.TfidfTransformer() classifier = linear_model.SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3, tol=1e-3, random_state=42) # Serializes the processing steps that would be required of the above. text_clf = pipeline.Pipeline( steps=[('vect', vectorizer), ('tfidf', transformer), ('clf-sgdc', classifier)]) start = time.time() text_clf.fit(features, labels) print 'Training time:\t%1.4f seconds' % (time.time() - start) __init__.evaluate(text_clf, features, labels) return text_clf
def train(): """Builds the random forest based on training data.""" features, labels = __init__.load_data('train') vectorizer = text.CountVectorizer(decode_error='ignore', stop_words='english') transformer = text.TfidfTransformer() classifier = ensemble.RandomForestClassifier(n_estimators=10) text_clf = pipeline.Pipeline( steps=[('vect', vectorizer), ('tfidf', transformer), ('clf-rf', classifier)]) start = time.time() text_clf.fit(features, labels) print 'Training time:\t%1.4f seconds' % (time.time() - start) __init__.evaluate(text_clf, features, labels) return text_clf
def bin(model):
    """Uses binning to identify the posts the model was most uncertain with."""
    if not os.path.isdir('bins/'):
        os.makedirs('bins/')
    features, labels = __init__.load_data('train')
    features = [feature.replace('\n', '') for feature in features]
    predictions = model.predict(features)
    bins = [list() for _ in range(10)]
    for idx, curr in enumerate(predictions):
        # Map the prediction to a decile bin, clamping to the range [0, 9].
        bin_idx = min(max(int(curr * 10), 0), 9)
        bins[bin_idx].append(str(labels[idx]) + ' : ' + features[idx] + '\n')
    for idx, contents in enumerate(bins):
        with open('bins/%1.1f.txt' % (idx / 10.0), 'w') as out:
            out.writelines(contents)
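# A minimal alternative sketch of the decile binning above using numpy.digitize.
# `bin_scores` is a hypothetical helper (not part of the original module); it
# assumes the model's predictions are continuous scores roughly in [0, 1].
# Example: bin_scores([0.05, 0.55, 0.95]) -> array([0, 5, 9])
import numpy as np

def bin_scores(scores):
    edges = np.linspace(0.1, 0.9, 9)   # bin boundaries 0.1, 0.2, ..., 0.9
    return np.digitize(scores, edges)  # decile index 0..9 for each score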
def train(): """ Builds the classifier based on training data. """ features, labels = __init__.load_data('train') vectorizer = text.CountVectorizer(decode_error='ignore', stop_words='english') transformer = text.TfidfTransformer() classifier = linear_model.LogisticRegression(solver='lbfgs') # Serializes the processing steps that would be required of the above. text_clf = pipeline.Pipeline(steps=[('vect', vectorizer), ('tfidf', transformer), ('clf-lr', classifier)]) start = time.time() text_clf.fit(features, labels) print 'Training time:\t%1.4f seconds' % (time.time() - start) __init__.evaluate(text_clf, features, labels) return text_clf
def train(): """ Builds the SVM based on training data. """ features, labels = __init__.load_data('train') vectorizer = text.CountVectorizer(decode_error='ignore', stop_words='english') transformer = text.TfidfTransformer() classifier = svm.SVR(kernel='sigmoid', gamma='scale') # Serializes the processing steps that would be required of the above. text_clf = pipeline.Pipeline( steps=[('vect', vectorizer), ('tfidf', transformer), ('clf-svr', classifier)]) start = time.time() text_clf.fit(features, labels) print 'Training time:\t%1.4f seconds' % (time.time() - start) __init__.evaluate(text_clf, features, labels) return text_clf
def test(model): """Tests the classifier based on test data.""" features, labels = __init__.load_data('test') __init__.evaluate(model, features, labels)
def dev(model): """Tests the classifier based on dev data.""" features, labels = __init__.load_data('dev') __init__.evaluate(model, features, labels)
def main():
    parser = argparse.ArgumentParser(description='NoBox')
    # Hparams
    parser.add_argument('--gp_coeff', type=float, default=0.,
                        help='coeff for the gradient penalty')
    parser.add_argument('--latent_dim', type=int, default=20, metavar='N',
                        help='Latent dim for VAE')
    parser.add_argument('--lr', type=float, default=0.01, metavar='LR',
                        help='learning rate for the generator (default: 0.01)')
    parser.add_argument('--lr_model', type=float, default=None, metavar='LR',
                        help='learning rate for the model (default: None -> defaults to args.lr)')
    parser.add_argument('--momentum', type=float, default=0.5, metavar='M',
                        help='optimizer momentum (default: 0.5)')
    parser.add_argument('--extragradient', default=False, action='store_true',
                        help='Use extragradient algorithm')
    parser.add_argument('--latent_size', type=int, default=50, metavar='N',
                        help='Size of latent distribution (default: 50)')
    parser.add_argument('--flow_model', default=None, const='soft', nargs='?',
                        choices=[None, 'RealNVP', 'planar', 'radial'],
                        help='Type of Normalizing Flow (default: %(default)s)')
    parser.add_argument('--flow_layer_type', type=str, default='Linear',
                        help='Which type of layer to use ---i.e. GRevNet or Linear')
    parser.add_argument('--flow_hidden_size', type=int, default=128,
                        help='Hidden layer size for Flows.')
    parser.add_argument('--n_blocks', type=int, default=2,
                        help='Number of blocks to stack in flow')
    parser.add_argument('--flow_hidden', type=int, default=1,
                        help='Number of hidden layers in each Flow.')
    parser.add_argument('--eval_set', default="test",
                        help="Evaluate model on test or validation set.")
    parser.add_argument('--train_with_critic_path', type=str, default=None,
                        help='Train generator with saved critic model')
    parser.add_argument('--train_on_file', default=False, action='store_true',
                        help='Train using Madry tf grad')
    # Training
    parser.add_argument('--lambda_on_clean', default=0.0, type=float,
                        help='train the critic on clean examples of the train set')
    parser.add_argument('--not_use_labels', default=False, action='store_true',
                        help='Do not use the labels for the conditional generator')
    parser.add_argument('--hinge_coeff', default=10., type=float,
                        help='coeff for the hinge loss penalty')
    parser.add_argument('--anneal_eps', default=0., type=float,
                        help='coeff for the epsilon annealing')
    parser.add_argument('--fixed_critic', default=False, action='store_true',
                        help='Critic is not trained')
    parser.add_argument('--train_on_list', default=False, action='store_true',
                        help='train on a list of classifiers')
    parser.add_argument('--train_set', default='train',
                        choices=['train_and_test', 'test', 'train'],
                        help='add the test set to the training set')
    parser.add_argument('--epochs', type=int, default=10, metavar='N',
                        help='number of epochs to train (default: 10)')
    parser.add_argument('--n_iter', type=int, default=500,
                        help='N iters for query based attacks')
    parser.add_argument('--PGD_steps', type=int, default=40, metavar='N',
                        help='max gradient steps (default: 40)')
    parser.add_argument('--max_iter', type=int, default=10, metavar='N',
                        help='max gradient steps (default: 10)')
    parser.add_argument('--epsilon', type=float, default=0.1, metavar='M',
                        help='Epsilon for Delta (default: 0.1)')
    parser.add_argument('--attack_ball', type=str, default="L2",
                        choices=['L2', 'Linf'], help='type of box attack')
    parser.add_argument('--bb_steps', type=int, default=2000, metavar='N',
                        help='Max black box steps per sample (default: 2000)')
    parser.add_argument('--attack_epochs', type=int, default=100, metavar='N',
                        help='Max number of epochs to train G')
    parser.add_argument('--num_flows', type=int, default=2, metavar='N',
                        help='Number of Flows')
    parser.add_argument('--seed', type=int, metavar='S',
                        help='random seed (default: None)')
    parser.add_argument('--input_size', type=int, default=784, metavar='S',
                        help='Input size (default: 784 for MNIST)')
    parser.add_argument('--batch_size', type=int, default=256, metavar='S',
                        help='Batch size')
    parser.add_argument('--test_batch_size', type=int, default=512, metavar='S',
                        help='Test Batch size')
    parser.add_argument('--pgd_on_critic', default=False, action='store_true',
                        help='Train Critic on pgd samples')
    parser.add_argument('--train_with_robust', default=False, action='store_true',
                        help='Train with Robust model + Critic')
    parser.add_argument('--test', default=False, action='store_true',
                        help='just test model and print accuracy')
    parser.add_argument('--clip_grad', default=True, action='store_true',
                        help='Clip grad norm')
    parser.add_argument('--train_vae', default=False, action='store_true',
                        help='Train VAE')
    parser.add_argument('--train_ae', default=False, action='store_true',
                        help='Train AE')
    parser.add_argument('--attack_type', type=str, default='nobox',
                        help='Which attack to run')
    parser.add_argument('--attack_loss', type=str, default='cross_entropy',
                        help='Which loss func. to use to optimize G')
    parser.add_argument('--perturb_loss', type=str, default='L2',
                        choices=['L2', 'Linf'],
                        help='Which loss func. to use to compute the perturbation constraint')
    parser.add_argument('--dataset', type=str, default='mnist')
    parser.add_argument('--model', type=str, default=None)
    parser.add_argument('--deterministic_G', default=False, action='store_true',
                        help='Deterministic Latent State')
    parser.add_argument('--run_baseline', default=False, action='store_true',
                        help='Run baseline PGD')
    parser.add_argument('--resample_test', default=False, action='store_true',
                        help='Load model and test resampling capability')
    parser.add_argument('--resample_iterations', type=int, default=100, metavar='N',
                        help='How many times to resample (default: 100)')
    parser.add_argument('--architecture', default="VGG16",
                        help="The architecture we want to attack on CIFAR.")
    parser.add_argument('--eval_freq', default=5, type=int,
                        help="Evaluate and save model every eval_freq epochs.")
    parser.add_argument('--num_test_samples', default=None, type=int,
                        help="The number of samples used to train and test the attacker.")
    parser.add_argument('--num_eval_samples', default=None, type=int,
                        help="The number of samples used to evaluate the attacker.")
    # Bells
    parser.add_argument("--wandb", action="store_true", default=False,
                        help='Use wandb for logging')
    parser.add_argument('--model_path', type=str, default="mnist_cnn.pt",
                        help='where to save/load')
    parser.add_argument('--namestr', type=str, default='NoBox',
                        help='additional info in output filename to describe experiments')
    parser.add_argument('--dir_test_models', type=str, default="./dir_test_models",
                        help="The path to the directory containing the classifier models for evaluation.")
    parser.add_argument('--robust_model_path', type=str,
                        default="./madry_challenge_models/mnist/adv_trained/mnist_lenet5_advtrained.pt",
                        help="The path to our adv robust classifier")
    parser.add_argument('--robust_sample_prob', type=float, default=1e-1, metavar='N',
                        help='1-P(robust)')
    # parser.add_argument('--madry_model_path', type=str, default="./madry_challenge_models",
    #                     help="The path to the directory containing madrys classifiers for testing")
    parser.add_argument("--max_test_model", type=int, default=1,
                        help="The maximum number of pretrained classifiers to use for testing.")
    parser.add_argument("--perturb_magnitude", type=float, default=None,
                        help="The amount of perturbation we want to enforce with lagrangian.")
    parser.add_argument("--log_path", type=str, default="./logs",
                        help="Where to save logs if logger is specified.")
    parser.add_argument("--save_model", type=str, default=None,
                        help="Where to save the models, if it is specified.")
    parser.add_argument("--fixed_testset", action="store_true",
                        help="If used then makes sure that the same set of samples is always used for testing.")
    parser.add_argument('--normalize', default=None, choices=(None, "default", "meanstd"))
    ###
    parser.add_argument('--source_arch', default="res18",
                        help="The architecture we want to attack on CIFAR.")
    parser.add_argument('--target_arch', nargs='*',
                        help="The architecture we want to blackbox transfer to on CIFAR.")
    parser.add_argument('--ensemble_adv_trained', action='store_true')
    parser.add_argument('--adv_models', nargs='*', help='path to adv model(s)')
    parser.add_argument('--type', type=int, default=0, help='Model type (default: 0)')
    parser.add_argument('--model_name', help='path to model')
    parser.add_argument('--transfer', action='store_true')
    parser.add_argument('--command', choices=("eval", "train"), default="train")
    parser.add_argument('--split', type=int, default=None, help="Which subsplit to use.")
    parser.add_argument('--path_to_data', default="../data", type=str)
    args = parser.parse_args()
    args.dev = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    normalize = None
    if args.normalize == "meanstd":
        normalize = transforms.Normalize(cf.mean["cifar10"], cf.std["cifar10"])
    elif args.normalize == "default":
        normalize = CIFAR_NORMALIZATION

    train_loader, test_loader, split_train_loader, split_test_loader = create_loaders(
        args, root=args.path_to_data, split=args.split,
        num_test_samples=args.num_test_samples, normalize=normalize)
    if args.split is not None:
        train_loader = split_train_loader
        test_loader = split_test_loader

    if os.path.isfile("../settings.json"):
        with open('../settings.json') as f:
            data = json.load(f)
            args.wandb_apikey = data.get("wandbapikey")

    if args.wandb:
        os.environ['WANDB_API_KEY'] = args.wandb_apikey
        wandb.init(project='NoBox-table2',
                   name='NoBox-Attack-{}-{}'.format(args.dataset, args.namestr))

    model, adv_models, l_test_classif_paths, args.model_type = data_and_model_setup(
        args, no_box_attack=True)
    model.to(args.dev)
    model.eval()

    print("Testing on %d Test Classifiers with Source Model %s"
          % (len(l_test_classif_paths), args.source_arch))

    x_test, y_test = load_data(args, test_loader)

    if args.dataset == "mnist":
        critic = load_unk_model(args)
    elif args.dataset == "cifar":
        name = args.source_arch
        if args.source_arch == "adv":
            name = "res18"
        critic = load_unk_model(args, name=name)

    misclassify_loss_func = kwargs_attack_loss[args.attack_loss]
    attacker = NoBoxAttack(critic, misclassify_loss_func, args)

    print("Evaluating clean error rate:")
    list_model = [args.source_arch]
    if args.source_arch == "adv":
        list_model = [args.model_type]
    if args.target_arch is not None:
        list_model = args.target_arch

    for model_type in list_model:
        num_samples = args.num_eval_samples
        if num_samples is None:
            num_samples = len(test_loader.dataset)

        eval_loader = torch.utils.data.Subset(
            test_loader.dataset,
            np.random.randint(len(test_loader.dataset), size=(num_samples,)))
        eval_loader = torch.utils.data.DataLoader(
            eval_loader, batch_size=args.test_batch_size)
        baseline_transfer(args, None, "Clean", model_type, eval_loader,
                          list_classifiers=l_test_classif_paths)

    def eval_fn(model):
        advcorrect = 0
        model.to(args.dev)
        model.eval()
        with ctx_noparamgrad_and_eval(model):
            if args.source_arch == 'googlenet':
                adv_complete_list = []
                for batch_idx, (x_batch, y_batch) in enumerate(test_loader):
                    if (batch_idx + 1) * args.test_batch_size > args.batch_size:
                        break
                    x_batch, y_batch = x_batch.to(args.dev), y_batch.to(args.dev)
                    adv_complete_list.append(attacker.perturb(x_batch, target=y_batch))
                adv_complete = torch.cat(adv_complete_list)
            else:
                adv_complete = attacker.perturb(x_test[:args.batch_size],
                                                target=y_test[:args.batch_size])
            adv_complete = torch.clamp(adv_complete, min=0., max=1.0)
            output = model(adv_complete)
            pred = output.max(1, keepdim=True)[1]
            advcorrect += pred.eq(y_test[:args.batch_size].view_as(pred)).sum().item()
            fool_rate = 1 - advcorrect / float(args.batch_size)
            print('Test set base model fool rate: %f' % (fool_rate))
        model.cpu()

        if args.transfer:
            adv_img_list = []
            y_orig = y_test[:args.batch_size]
            for i in range(0, len(adv_complete)):
                adv_img_list.append([adv_complete[i].unsqueeze(0), y_orig[i]])
            # Free memory
            del model
            torch.cuda.empty_cache()
            baseline_transfer(args, attacker, "AEG", model_type, adv_img_list,
                              l_test_classif_paths, adv_models)

    if args.command == "eval":
        attacker.load(args)
    elif args.command == "train":
        attacker.train(train_loader, test_loader, adv_models, l_test_classif_paths,
                       l_train_classif={"source_model": model}, eval_fn=eval_fn)
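# Hypothetical entry point and command-line invocation (the script name is an
# assumption and the guard below may already exist elsewhere in the repo; the
# flags are defined in the parser above):
#
#   python main.py --dataset cifar --source_arch res18 --command train \
#       --epochs 10 --attack_loss cross_entropy --attack_ball L2
#
if __name__ == '__main__':
    main()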
def test(model): """Tests the random forest based on test data.""" features, labels = __init__.load_data('test') __init__.evaluate(model, features, labels)
def dev(model): """Tests the random forest based on dev data.""" features, labels = __init__.load_data('dev') __init__.evaluate(model, features, labels)
import __init__
import os
import pdb

import wordcloud
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from PIL import Image
from sklearn.feature_extraction import stop_words

if __name__ == '__main__':
    # Load ALL the data.
    train_features, train_labels = __init__.load_data('train')
    dev_features, dev_labels = __init__.load_data('dev')
    test_features, test_labels = __init__.load_data('test')
    features = train_features + dev_features + test_features
    labels = train_labels + dev_labels + test_labels

    # Need to break up the data by label and sort it into bins.
    curr_label = labels[0]
    idx = 0
    feature_lists = [list() for _ in np.unique(labels)]
    while idx < len(labels):
        feature_lists[labels[idx]].append(features[idx])
        idx += 1

    # Make a word cloud for each bin.
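    # A minimal sketch of the word-cloud step (hypothetical helper code, not from
    # the original file). It assumes each feature_lists entry holds the raw post
    # text for one label; the output filename pattern is also an assumption.
    for label, posts in enumerate(feature_lists):
        cloud = wordcloud.WordCloud(stopwords=stop_words.ENGLISH_STOP_WORDS,
                                    background_color='white')
        cloud.generate(' '.join(posts))
        plt.imshow(cloud, interpolation='bilinear')
        plt.axis('off')
        plt.savefig('wordcloud_%d.png' % label)
        plt.clf()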
def load_data_locl():
    """Thin wrapper around load_data() that returns the data and labels."""
    data, label = load_data()
    return data, label