def main():
    global dataset_name
    # load data
    X, y = dh.load_data(s.DATA_FP, n_features=s.NUM_FEATURES, memmapped=False)
    X, y = X[0:10000, :], y[0:10000]

    # Make feature group combinations for devset and holdout
    mapdict = dh.load_mapdict_file(s.MAPDICT_FP)
    fg_indices = get_feature_group_indices(mapdict)
    combo_indices = combine_feature_groups(fg_indices)

    # fit the ablation experiment
    # NOTE: X_dev/y_dev, X_holdout/y_holdout, clf, cv and timestamp are expected
    # to be defined at module level in the original script.
    ablation_report = {}
    for ablation_name, indices in combo_indices.items():  # .iteritems() is Python 2 only
        dataset_name = ablation_name
        X_dev_abl = slice_feature_group(X_dev, indices, ablation_name)
        X_holdout_abl = slice_feature_group(X_holdout, indices, ablation_name)
        logging.info(
            "\n=============================================================="
            "\n{}: Performing crossvalidation."
            "\n==============================================================".format(
                ablation_name.upper()))
        winner_report = crossvalidate(X_dev_abl, y_dev, X_holdout_abl, y_holdout,
                                      clf, cv)
        ablation_report["true_labels_holdout"] = y_holdout.tolist()
        ablation_report[ablation_name] = winner_report

    with open(os.path.join(s.OPT_DIRP, "%s_ablation_report.json" % timestamp),
              "wt") as f:
        json.dump(ablation_report, f, sort_keys=True)
    pprint(ablation_report, depth=20)

    select_model(X, y)
def main():
    X, y = dh.load_data(s.DATA_FP, n_features=s.NUM_FEATURES, memmapped=False)

    start = time.time()
    Xbns = SelectPercentile(bns, percentile=0.33).fit_transform(X, y)
    print(Xbns.shape)
    logging.info("Done BNS {} s".format(time.time() - start))

    start = time.time()
    Xf = SelectPercentile(f_classif, percentile=0.33).fit_transform(X, y)
    print(Xf.shape)
    logging.info("Done Anova F {} s".format(time.time() - start))
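# NOTE: `bns` above is not a scikit-learn built-in, so it must be a custom
# score function with the same call convention as f_classif: it takes (X, y)
# and returns per-feature scores (or a (scores, pvalues) tuple). The sketch
# below is a hypothetical bi-normal separation scorer for binary labels and
# presence/absence features, not this project's actual implementation.
import numpy as np
from scipy.stats import norm


def bns_score(X, y):
    """Bi-normal separation per feature: |ppf(tpr) - ppf(fpr)|."""
    y = np.asarray(y)
    pos = y == 1
    n_pos, n_neg = max(pos.sum(), 1), max((~pos).sum(), 1)
    Xb = X > 0  # binarize: does the feature occur in the document?
    tp = np.asarray(Xb[pos].sum(axis=0)).ravel()
    fp = np.asarray(Xb[~pos].sum(axis=0)).ravel()
    # keep rates away from 0 and 1 so the inverse normal CDF stays finite
    tpr = np.clip(tp / n_pos, 0.0005, 0.9995)
    fpr = np.clip(fp / n_neg, 0.0005, 0.9995)
    return np.abs(norm.ppf(tpr) - norm.ppf(fpr))


# hypothetical usage, mirroring the call above:
# Xbns = SelectPercentile(bns_score, percentile=0.33).fit_transform(X, y)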
def main():
    # load data
    X, y = dh.load_data(s.DATA_FP, n_features=s.NUM_FEATURES, memmapped=False)

    # # TESTING
    # print("Warning TESTING")
    # X, y = X[0:1000, :], y[0:1000]
    # logging.warning("TESTING with {}".format(X.shape))

    util.ensure_dir(s.OPT_DIRP)
    X = dh.do_memmap(X)
    select_model(X, y)
    util.send_text_message("{}: Ended run and wrote all to {}".format(
        str(datetime.datetime.now()), s.OPT_DIRP))
parser.add_argument('--resume', '-r', default='',
                    help='Resume the optimization from snapshot')
parser.add_argument('--saveflag', '-s', choices=('on', 'off'), default='off',
                    help='Save model and optimizer flag')
args = parser.parse_args()

if args.gpu >= 0:
    cuda.check_cuda_available()

# Prepare dataset
print('load cifar-10 dataset')
if args.data == 'on':
    cifar = dh.process_data(augmentation=args.augmentation)
else:
    cifar = dh.load_data()

# Cropping => insize = 24
cifar['train']['x'], cifar['train']['y'] = dh.crop_data(
    cifar['train']['x'], cifar['train']['y'])
cifar['test']['x'], cifar['test']['y'] = dh.crop_data(
    cifar['test']['x'], cifar['test']['y'])

N, N_test = len(cifar['train']['x']), len(cifar['test']['x'])
print(N, N_test)

batchsize = args.batchsize
n_epoch = args.epoch
assert N % batchsize == 0
assert N_test % batchsize == 0

# Prepare Convolution NN model
parser.add_argument('--optimizer', '-o', choices=('adam', 'adagrad', 'sgd'),
                    default='sgd', help='Optimizer algorithm')
parser.add_argument('--plotflag', '-p', choices=('on', 'off'), default='off',
                    help='Accuracy plot flag')
parser.add_argument('--resume', '-r', default='',
                    help='Resume the optimization from snapshot')
parser.add_argument('--saveflag', '-s', choices=('on', 'off'), default='off',
                    help='Save model and optimizer flag')
args = parser.parse_args()

if args.gpu >= 0:
    cuda.check_cuda_available()

# Prepare dataset
print('load cifar-10 dataset')
if args.data == 'on':
    cifar = dh.process_data(augmentation=args.augmentation)
else:
    cifar = dh.load_data()

N = len(cifar['train']['x'])
N_test = len(cifar['test']['x'])
print(N, N_test)

batchsize = args.batchsize
n_epoch = args.epoch
assert N % batchsize == 0
assert N_test % batchsize == 0

# Prepare Convolution NN model
if args.net == 'alex':
    import model_cnn
    model = model_cnn.CifarCNN_2()
elif args.net == 'alexbn':
detect_label = 1
util.ensure_dir(
    "/home/gilles/repos/cbrole/static/CASCADE_{}".format(LANGUAGE))

# load heldout X, y
DATA_FP = s.langspec[LANGUAGE]["DATA_FP"]
# X, y = dh.load_data(DATA_FP, n_features=s.langspec[LANGUAGE]['NUM_FEATURES'], memmapped=False)
run_dir = os.path.dirname(
    os.path.dirname(os.path.dirname(langspec[LANGUAGE]["role_modelfp"])))
NUM_FEATURES_POSTSPLIT = json.load(
    open(os.path.join(run_dir, "holdinout_split_indices.json"),
         "rt"))["num_features"]
X_in, y_in = dh.load_data(
    "{}/holdin.svm".format(run_dir),
    n_features=NUM_FEATURES_POSTSPLIT,
    memmapped=False,
)
X_out, y_out = dh.load_data(
    "{}/holdout.svm".format(run_dir),
    n_features=NUM_FEATURES_POSTSPLIT,
    memmapped=False,
)

# load detection model
detect_fp = langspec[LANGUAGE]["detect_modelfp"]
role_fp = langspec[LANGUAGE]["role_modelfp"]
detect = load(detect_fp)
role = load(role_fp)
all_classes = reduce(np.union1d, (detect.classes_, role.classes_))
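# Hypothetical illustration (not taken from this repo) of why the shared
# `all_classes` label set is useful in a cascade: the detection and role
# models are fitted on different class sets, so their predict_proba columns
# must be mapped into one common column order before they can be compared or
# combined. np.union1d returns a sorted array, so searchsorted recovers each
# model's column positions.
def align_proba(clf, X, all_classes):
    """Place clf.predict_proba(X) columns into the shared all_classes order."""
    proba = np.zeros((X.shape[0], len(all_classes)))
    cols = np.searchsorted(all_classes, clf.classes_)
    proba[:, cols] = clf.predict_proba(X)
    return proba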
def train(self):
    """Trains a network with a given training set."""
    self.device = torch.device("cuda")
    np.random.seed(self.rand_seed)
    torch.manual_seed(self.rand_seed)

    train, val, test, vocab = datahandler.load_data("./data/ptb", self.maxlen)
    self.vocab_size = len(vocab)

    # make iterable dataset object
    train_iter, val_iter, test_iter = torchtext.data.BucketIterator.splits(
        (train, val, test),
        batch_sizes=[self.batch_size, 1, 1],
        device=self.device,
        repeat=False,
        sort_key=lambda x: len(x.text),
        sort_within_batch=True,
    )
    self.N_train_data = len(train)
    self.N_val_data = len(val)
    self.N_batches = int(self.N_train_data / self.batch_size +
                         int(self.N_train_data % self.batch_size > 0))
    self.log("N_train_data: %d N_mini_batches: %d" %
             (self.N_train_data, self.N_batches))

    # instantiate the dmm
    self.dmm = DMM(input_dim=self.vocab_size, dropout=self.dropout)

    # setup optimizer
    opt_params = {
        "lr": self.lr,
        "betas": (self.beta1, self.beta2),
        "clip_norm": self.cn,
        "lrd": self.lr_decay,
        "weight_decay": self.wd,
    }
    self.adam = ClippedAdam(opt_params)

    # set up inference algorithm
    self.elbo = Trace_ELBO()
    self.svi = SVI(self.dmm.model, self.dmm.guide, self.adam, loss=self.elbo)

    val_f = 10
    print("training dmm")
    times = [time.time()]
    for epoch in range(self.n_epoch):
        if self.ckpt_f > 0 and epoch > 0 and epoch % self.ckpt_f == 0:
            self.save_ckpt()
        # train and report metrics
        train_nll = self._train_batch(train_iter, epoch)
        times.append(time.time())
        t_elps = times[-1] - times[-2]
        self.log("epoch %04d -> train nll: %.4f \t t_elps=%.3f sec" %
                 (epoch, train_nll, t_elps))
        if epoch % val_f == 0:
            val_nll = self._validate(val_iter)
# "2_Bystander_defender", "2_Harasser", # "2_Bystander_assistant"])} LABELS = type_labels DATA_FP = s.langspec[LANGUAGE]["DATA_FP"] # No longer needed run_dir = run_dirs[LANGUAGE] all_runs = [] all_bl = { "rand_baseline": [], "maj_baseline": [], } all_bootstrap_score = {} y_distr = {} X_in, y_in = dh.load_data("{}/holdin.svm".format(run_dir), memmapped=False) X_out, y_true = dh.load_data("{}/holdout.svm".format(run_dir), memmapped=False) fold_log_dirp = glob.glob("{}/fold_log".format(run_dir))[0] fold_meta = get_metadata(fold_log_dirp) split = json.load(open("{}/holdinout_split_indices.json".format(run_dir), "rt")) out_idc = split["holdout"] in_idc = split["holdin"] full_class_counts = np.asarray( np.unique(np.append(y_in, y_true), return_counts=True) ).T.tolist() in_class_counts = np.asarray(np.unique(y_in, return_counts=True)).T.tolist() out_class_counts = np.asarray(np.unique(y_true, return_counts=True)).T.tolist()