def main():
    global dataset_name
    # load data
    X, y = dh.load_data(s.DATA_FP, n_features=s.NUM_FEATURES, memmapped=False)
    X, y = X[0:10000, :], y[0:10000]

    # Make feature group combinations for Devset and holdout
    mapdict = dh.load_mapdict_file(s.MAPDICT_FP)
    fg_indices = get_feature_group_indices(mapdict)
    combo_indices = combine_feature_groups(fg_indices)
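    # NOTE (illustrative assumption, not part of the original snippet): the loop
    # below expects dev/holdout splits plus clf, cv and timestamp to exist, e.g.:
    # X_dev, X_holdout, y_dev, y_holdout = train_test_split(
    #     X, y, test_size=0.2, random_state=42, stratify=y)
    # clf, cv = LinearSVC(), 10
    # timestamp = datetime.datetime.now().strftime("%Y%m%d%H%M%S")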

    # fit the ablation experiment
    ablation_report = {}
    for ablation_name, indices in combo_indices.items():
        dataset_name = ablation_name
        X_dev_abl = slice_feature_group(X_dev, indices, ablation_name)
        X_holdout_abl = slice_feature_group(X_holdout, indices, ablation_name)

        logging.info(
            "\n=============================================================="
            "\n{}: Performing crossvalidation."
            "\n==============================================================".
            format(ablation_name.upper()))

        winner_report = crossvalidate(X_dev_abl, y_dev, X_holdout_abl,
                                      y_holdout, clf, cv)
        ablation_report["true_labels_holdout"] = y_holdout.tolist()
        ablation_report[ablation_name] = winner_report

    with open(os.path.join(s.OPT_DIRP, "%s_ablation_report.json" % timestamp),
              "wt") as f:
        json.dump(ablation_report, f, sort_keys=True)

    pprint(ablation_report, depth=20)
    select_model(X, y)


def main():

    X, y = dh.load_data(s.DATA_FP, n_features=s.NUM_FEATURES, memmapped=False)
    start = time.time()
    # NOTE: SelectPercentile's `percentile` is a percentage in [0, 100], so 0.33
    # keeps only the top 0.33% of features (use 33 to keep roughly a third).
    Xbns = SelectPercentile(bns, percentile=0.33).fit_transform(X, y)
    print(Xbns.shape)
    logging.info("Done BNS {} s".format(time.time() - start))
    start = time.time()
    Xf = SelectPercentile(f_classif, percentile=0.33).fit_transform(X, y)
    print(Xf.shape)
    logging.info("Done Anova F {} s".format(time.time() - start))


def main():
    # load data
    X, y = dh.load_data(s.DATA_FP, n_features=s.NUM_FEATURES, memmapped=False)

    # # # TESTING
    # print("Warning TESTING")
    # X, y = X[0:1000, :], y[0:1000]
    # logging.warning("TESTING with {}".format(X.shape))

    util.ensure_dir(s.OPT_DIRP)
    X = dh.do_memmap(X)

    select_model(X, y)
    util.send_text_message("{}: Ended run and wrote all to {}".format(
        str(datetime.datetime.now()), s.OPT_DIRP))
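dh.do_memmap is not shown in this example; a plausible stand-in (the name, location, and temp-dir handling are assumptions) that persists the matrix and reopens it as a read-only numpy memory map could be:

import os
import tempfile
import numpy as np

def do_memmap(X, tmp_dir=None):
    """Save X to disk and reopen it memory-mapped, so parallel workers share one copy."""
    tmp_dir = tmp_dir or tempfile.mkdtemp(prefix="memmap_")
    fp = os.path.join(tmp_dir, "X.npy")
    np.save(fp, np.asarray(X))
    return np.load(fp, mmap_mode="r")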
Example #4
parser.add_argument('--resume',
                    '-r',
                    default='',
                    help='Resume the optimization from snapshot')
parser.add_argument('--saveflag',
                    '-s',
                    choices=('on', 'off'),
                    default='off',
                    help='Save model and optimizer flag')
args = parser.parse_args()

if args.gpu >= 0: cuda.check_cuda_available()

# Prepare dataset
print('load cifar-10 dataset')
if args.data == 'on': cifar = dh.process_data(augmentation=args.augmentation)
else: cifar = dh.load_data()

# Cropping => insize = 24
cifar['train']['x'], cifar['train']['y'] = dh.crop_data(
    cifar['train']['x'], cifar['train']['y'])
cifar['test']['x'], cifar['test']['y'] = dh.crop_data(cifar['test']['x'],
                                                      cifar['test']['y'])

N, N_test = len(cifar['train']['x']), len(cifar['test']['x'])
print(N, N_test)
batchsize = args.batchsize
n_epoch = args.epoch
assert N % batchsize == 0
assert N_test % batchsize == 0

# Prepare Convolution NN model
Example #5
parser.add_argument('--optimizer', '-o', choices=('adam', 'adagrad', 'sgd'),
                    default='sgd', help='Optimizer algorithm')
parser.add_argument('--plotflag', '-p', choices=('on', 'off'),
                    default='off', help='Accuracy plot flag')
parser.add_argument('--resume', '-r', default='',
                    help='Resume the optimization from snapshot')
parser.add_argument('--saveflag', '-s', choices=('on', 'off'),
                    default='off', help='Save model and optimizer flag')
args = parser.parse_args()

if args.gpu >= 0: cuda.check_cuda_available()

# Prepare dataset
print('load cifar-10 dataset')
if args.data == 'on': cifar = dh.process_data(augmentation=args.augmentation)
else: cifar = dh.load_data()

N = len(cifar['train']['x'])
N_test = len(cifar['test']['x'])
print(N, N_test)
batchsize = args.batchsize
n_epoch = args.epoch
assert N % batchsize == 0
assert N_test % batchsize == 0


# Prepare Convolution NN model
if args.net == 'alex':
    import model_cnn
    model = model_cnn.CifarCNN_2()
elif args.net == 'alexbn':
Example #6
    detect_label = 1

    util.ensure_dir(
        "/home/gilles/repos/cbrole/static/CASCADE_{}".format(LANGUAGE))
    # load heldout X, y
    DATA_FP = s.langspec[LANGUAGE]["DATA_FP"]
    # X, y = dh.load_data(DATA_FP, n_features=s.langspec[LANGUAGE]['NUM_FEATURES'], memmapped=False)

    run_dir = os.path.dirname(
        os.path.dirname(os.path.dirname(langspec[LANGUAGE]["role_modelfp"])))
    NUM_FEATURES_POSTSPLIT = json.load(
        open(os.path.join(run_dir, "holdinout_split_indices.json"),
             "rt"))["num_features"]
    X_in, y_in = dh.load_data(
        "{}/holdin.svm".format(run_dir),
        n_features=NUM_FEATURES_POSTSPLIT,
        memmapped=False,
    )
    X_out, y_out = dh.load_data(
        "{}/holdout.svm".format(run_dir),
        n_features=NUM_FEATURES_POSTSPLIT,
        memmapped=False,
    )

    # load detection model
    detect_fp = langspec[LANGUAGE]["detect_modelfp"]
    role_fp = langspec[LANGUAGE]["role_modelfp"]
    detect = load(detect_fp)
    role = load(role_fp)
    all_classes = reduce(np.union1d, (detect.classes_, role.classes_))  # needs functools.reduce on Python 3
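The example stops right after loading the detection and role models; one plausible way a detect-then-role cascade could be applied to the holdout data (the combination logic and default label below are illustrative assumptions, not the project's code):

import numpy as np

def cascade_predict(detect, role, X, detect_label=1, default_label=0):
    """Run the role classifier only on instances the detector flags as positive."""
    y_pred = np.full(X.shape[0], default_label, dtype=object)
    flagged = detect.predict(X) == detect_label
    if flagged.any():
        y_pred[flagged] = role.predict(X[flagged])
    return y_pred

# e.g.: y_cascade = cascade_predict(detect, role, X_out, detect_label=detect_label)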
Example #7
    def train(self):
        """
        trains a network with a given training set
        """
        self.device = torch.device("cuda")
        np.random.seed(self.rand_seed)
        torch.manual_seed(self.rand_seed)

        train, val, test, vocab = datahandler.load_data(
            "./data/ptb", self.maxlen)

        self.vocab_size = len(vocab)

        # make iterable dataset object
        train_iter, val_iter, test_iter = torchtext.data.BucketIterator.splits(
            (train, val, test),
            batch_sizes=[self.batch_size, 1, 1],
            device=self.device,
            repeat=False,
            sort_key=lambda x: len(x.text),
            sort_within_batch=True,
        )
        self.N_train_data = len(train)
        self.N_val_data = len(val)
        self.N_batches = int(self.N_train_data / self.batch_size +
                             int(self.N_train_data % self.batch_size > 0))

        self.log("N_train_data: %d  N_mini_batches: %d" %
                 (self.N_train_data, self.N_batches))

        # instantiate the dmm
        self.dmm = DMM(input_dim=self.vocab_size, dropout=self.dropout)

        # setup optimizer
        opt_params = {
            "lr": self.lr,
            "betas": (self.beta1, self.beta2),
            "clip_norm": self.cn,
            "lrd": self.lr_decay,
            "weight_decay": self.wd,
        }
        self.adam = ClippedAdam(opt_params)
        # set up inference algorithm
        self.elbo = Trace_ELBO()
        self.svi = SVI(self.dmm.model,
                       self.dmm.guide,
                       self.adam,
                       loss=self.elbo)
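        # SVI with Trace_ELBO performs stochastic variational inference: each
        # step draws a Monte Carlo estimate of the ELBO from the model/guide
        # pair and takes a clipped Adam gradient step on its parameters.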

        val_f = 10

        print("training dmm")
        times = [time.time()]
        for epoch in range(self.n_epoch):

            if self.ckpt_f > 0 and epoch > 0 and epoch % self.ckpt_f == 0:
                self.save_ckpt()

            # train and report metrics
            train_nll = self._train_batch(
                train_iter,
                epoch,
            )

            times.append(time.time())
            t_elps = times[-1] - times[-2]
            self.log("epoch %04d -> train nll: %.4f \t t_elps=%.3f sec" %
                     (epoch, train_nll, t_elps))

            if epoch % val_f == 0:
                val_nll = self._validate(val_iter)
                self.log("epoch %04d -> val nll: %.4f" % (epoch, val_nll))
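The helper _train_batch called above is not shown in this example; a minimal sketch of what such a method might look like (the batch field name, device handling, and per-batch averaging are assumptions) is:

    def _train_batch(self, train_iter, epoch):
        """One pass over the training iterator; returns the mean SVI loss per batch."""
        epoch_loss, n_batches = 0.0, 0
        for batch in train_iter:
            x = batch.text.to(self.device)   # (seq_len, batch_size) token ids
            epoch_loss += self.svi.step(x)   # one gradient step on the negative ELBO
            n_batches += 1
        return epoch_loss / max(n_batches, 1)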
Example #8
    #                                                                  "2_Bystander_defender", "2_Harasser",
    #                                                                  "2_Bystander_assistant"])}
    LABELS = type_labels

    DATA_FP = s.langspec[LANGUAGE]["DATA_FP"]  # No longer needed
    run_dir = run_dirs[LANGUAGE]

    all_runs = []
    all_bl = {
        "rand_baseline": [],
        "maj_baseline": [],
    }
    all_bootstrap_score = {}
    y_distr = {}

    X_in, y_in = dh.load_data("{}/holdin.svm".format(run_dir), memmapped=False)
    X_out, y_true = dh.load_data("{}/holdout.svm".format(run_dir), memmapped=False)

    fold_log_dirp = glob.glob("{}/fold_log".format(run_dir))[0]
    fold_meta = get_metadata(fold_log_dirp)

    split = json.load(open("{}/holdinout_split_indices.json".format(run_dir), "rt"))
    out_idc = split["holdout"]
    in_idc = split["holdin"]

    full_class_counts = np.asarray(
        np.unique(np.append(y_in, y_true), return_counts=True)
    ).T.tolist()
    in_class_counts = np.asarray(np.unique(y_in, return_counts=True)).T.tolist()

    out_class_counts = np.asarray(np.unique(y_true, return_counts=True)).T.tolist()