Example #1
def impurity(self, rows):
    '''Calculates the Gini impurity corresponding to
    the rows provided.
    '''
    counts = utils.label_counts(rows)
    total = rows.shape[0]
    gini_im = 1
    for count in counts.values:
        gini_im -= (count / total)**2
    return gini_im
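The Gini computation above relies on a utils.label_counts helper that only needs to expose per-class counts through a .values attribute. A minimal sketch of such a helper, assuming the class label sits in the last column of a pandas DataFrame (an assumption, not the original utils module):

import pandas as pd

def label_counts(rows):
    # Hypothetical sketch, not the original utils.label_counts: value_counts()
    # over the last (label) column returns one count per class as a pandas
    # Series, so counts.values iterates over the raw counts.
    return rows.iloc[:, -1].value_counts()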
Example #2
def impurity(self, rows):
    '''Calculates the entropy corresponding to
    the rows provided.
    '''
    counts = utils.label_counts(rows)
    total = rows.shape[0]
    entropy = 0
    for count in counts.values:
        entropy -= (count / total) * math.log((count / total), 2)
    return entropy
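As a quick sanity check of both impurity measures (illustrative only, not part of the original examples): a perfectly balanced two-class set should give a Gini impurity of 0.5 and an entropy of 1.0 bit.

import math
import pandas as pd

rows = pd.DataFrame({'x': [1, 2, 3, 4], 'label': ['a', 'a', 'b', 'b']})
counts = rows.iloc[:, -1].value_counts()   # a -> 2, b -> 2
total = rows.shape[0]
gini = 1 - sum((c / total) ** 2 for c in counts.values)
ent = -sum((c / total) * math.log(c / total, 2) for c in counts.values)
print(gini, ent)  # 0.5 1.0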
Example #3
    def build_tree(self, data_array):
        '''
        Recursively builds a decision tree for categorical data_array.
        input:
            current tree node object (self)
            data_array: array of data rows
        return:
            decision tree: linked list of nodes.
        '''
        current_entropy = entropy(data_array)
        feature_count = len(data_array[0])-1

        ig_global = 0.0
        ig_feature_valpair = None
        ig_setpair = None
        # choosing the best feature and value to split the dataset on
        for index in range(0, feature_count):
            # creating a set of unique values for the given feature in the dataset
            vals = set()
            for row in data_array:
                vals.add(row[index])
            # iterating through the unique values and computing the information gain of each split
            for values in vals:
                (subset_1, subset_2) = data_split(
                    data_array, index, values)
                pos = float(len(subset_1))/float(len(data_array))
                neg = float(len(subset_2))/float(len(data_array))
                gain = (current_entropy - pos * entropy(subset_1)
                        - neg * entropy(subset_2))
                # recording the feature index, value and data splits that give the best information gain so far
                if gain > ig_global and len(subset_1) > 0 and len(subset_2) > 0:
                    ig_global = gain
                    ig_feature_valpair = (index, values)
                    ig_setpair = (subset_1, subset_2)
        # ig > 0 for impure sets: recurse on the two subsets of data_array
        if ig_global > 0.0:
            self.feature_index = ig_feature_valpair[0]
            self.feature_value = ig_feature_valpair[1]
            if not self.true:
                self.true = DecisionTree()
                self.true.build_tree(ig_setpair[0])
            if not self.false:
                self.false = DecisionTree()
                self.false.build_tree(ig_setpair[1])
        # leaf reached: store the label counts of the remaining rows as the decision for this leaf
        else:
            self.class_label = label_counts(data_array)
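build_tree above leans on entropy, data_split and label_counts helpers that are not shown. A plausible sketch of data_split, assuming a plain equality split on the chosen categorical value (an assumption, not the original implementation):

def data_split(data_array, index, value):
    # Hypothetical sketch: partition rows on equality with the chosen feature
    # value, matching the true/false branches used by build_tree above.
    subset_1 = [row for row in data_array if row[index] == value]
    subset_2 = [row for row in data_array if row[index] != value]
    return subset_1, subset_2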
Example #4
def train(model,
          train_dataset,
          test_dataset,
          nclasses,
          args,
          val_dataset=None):
    if val_dataset is None:
        new_train_size = int(0.8 * len(train_dataset))
        val_size = len(train_dataset) - new_train_size
        train_dataset, val_dataset = random_split(train_dataset,
                                                  [new_train_size, val_size])

    train_loader = DataLoader(train_dataset,
                              args.batch_size,
                              num_workers=(cpu_count()) // 2)
    val_loader = DataLoader(val_dataset,
                            args.batch_size,
                            shuffle=False,
                            num_workers=(cpu_count()) // 2)
    if not args.test_only:
        criterion = utils.loss_wrapper(args.C)
        optimizer = torch.optim.SGD(get_trainable_params(model),
                                    lr=args.lr,
                                    weight_decay=5e-3,
                                    momentum=0.9)
        scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
            optimizer, mode='max', patience=args.patience, factor=0.5)

        val_label_counts = utils.label_counts(val_loader, nclasses)

        for i in range(args.nepochs):
            epoch_loss = 0
            epoch_correct = 0

            t = tqdm(enumerate(train_loader))
            t.set_description('epoch#%d' % i)
            for j, batch in t:
                train_loss, train_correct = train_on_batch(
                    model, batch, optimizer, criterion)
                epoch_loss += train_loss
                epoch_correct += train_correct
                t.set_postfix(loss=epoch_loss / ((j + 1) * args.batch_size),
                              accuracy=epoch_correct /
                              ((j + 1) * args.batch_size),
                              lr=optimizer.param_groups[0]['lr'])
            epoch_loss /= len(train_dataset)
            epoch_acc = epoch_correct / len(train_dataset)

            val_correct = evaluate(model, val_loader, val_label_counts)
            val_acc = np.mean(val_correct / val_label_counts)
            print('val_accuracy:', val_acc)

            if i == 0 or scheduler.is_better(val_acc, scheduler.best):
                with open(args.outfile, 'wb') as f:
                    torch.save(model, f)

            scheduler.step(val_acc)

            logger.info(
                'epoch#%d train_loss=%.3f train_acc=%.3f val_acc=%.3f lr=%.4f'
                % (i, epoch_loss, epoch_acc, val_acc,
                   optimizer.param_groups[0]['lr']))

    test_loader = DataLoader(test_dataset,
                             args.batch_size,
                             shuffle=False,
                             num_workers=(cpu_count()) // 2)

    test_label_counts = utils.label_counts(test_loader, nclasses)
    test_correct = evaluate(model, test_loader, test_label_counts)
    test_acc = np.mean(test_correct / test_label_counts)
    print('test_accuracy:', test_acc)
    logger.info('test_accuracy = %0.3f' % test_acc)
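This example treats utils.label_counts(loader, nclasses) as a per-class example counter over a DataLoader, so that dividing the per-class correct counts returned by evaluate by it yields per-class accuracies. A hypothetical sketch of such a helper (an assumption, not the original utils module):

import numpy as np

def label_counts(loader, nclasses):
    # Hypothetical sketch: count how many examples of each class the loader
    # yields, as a length-nclasses array.
    counts = np.zeros(nclasses, dtype=np.int64)
    for _, y in loader:                      # each batch is (inputs, labels)
        counts += np.bincount(y.numpy(), minlength=nclasses)
    return counts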
Example #5
def train(model,
          train_dataset,
          test_dataset,
          nclasses,
          adversary,
          args,
          val_dataset=None,
          mLogger=None):
    # use the caller-supplied logger if given; declaring `logger` global keeps
    # it bound to the module-level logger otherwise (avoids an UnboundLocalError
    # in the logging calls below)
    global logger
    if mLogger is not None:
        logger = mLogger
    if val_dataset is None:
        new_train_size = int(0.8 * len(train_dataset))
        val_size = len(train_dataset) - new_train_size
        train_dataset, val_dataset = random_split(train_dataset,
                                                  [new_train_size, val_size])

    train_loader = DataLoader(train_dataset,
                              args.batch_size,
                              num_workers=(cpu_count()) // 2)
    val_loader = DataLoader(val_dataset,
                            args.batch_size,
                            shuffle=False,
                            num_workers=(cpu_count()) // 2)

    criterion = utils.loss_wrapper(args.C)

    if args.optimizer == 'sgd':
        optimizer = torch.optim.SGD(get_trainable_params(model),
                                    lr=args.lr,
                                    weight_decay=5e-4,
                                    momentum=0.9,
                                    nesterov=True)
    if args.optimizer == 'adam':
        optimizer = torch.optim.Adam(get_trainable_params(model),
                                     lr=args.lr,
                                     weight_decay=1e-4)
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
        optimizer, mode='max', patience=args.patience, factor=0.2)

    test_loader = DataLoader(test_dataset,
                             args.batch_size,
                             shuffle=False,
                             num_workers=(cpu_count()) // 2)
    test_label_counts = utils.label_counts(test_loader, nclasses)
    test_correct = evaluate(model, test_loader, test_label_counts)
    test_acc = np.sum(test_correct) / np.sum(test_label_counts)
    print('test_accuracy:', test_acc)
    logger.info('test_accuracy = %0.3f' % test_acc)

    val_label_counts = utils.label_counts(val_loader, nclasses)
    bad_iters = 0
    for i in range(args.nepochs):
        epoch_loss = 0
        epoch_correct = 0
        epoch_count = 0
        t = tqdm(enumerate(train_loader))
        t.set_description('epoch#%d' % i)
        for j, batch in t:
            x, y = batch
            x = x.cuda()
            y = y.cuda()

            if args.gaussian_smoothing:
                # randomized smoothing: add zero-mean Gaussian noise with
                # std args.sigma to the whole batch
                eps = torch.normal(mean=0, std=args.sigma, size=x.shape).cuda()
                x += eps
            else:
                # adversarial training: perturb a random half of the batch
                # with the supplied adversary
                flips = np.random.binomial(1, 0.5, size=x.shape[0])
                flips = flips == 1
                x[flips] = adversary.perturb(x[flips], y[flips])

            train_loss, train_correct = train_on_batch(model, (x, y),
                                                       optimizer, criterion)
            epoch_loss += train_loss
            epoch_correct += train_correct
            epoch_count += x.shape[0]
            t.set_postfix(loss=epoch_loss / ((j + 1) * args.batch_size),
                          accuracy=epoch_correct / (epoch_count),
                          lr=optimizer.param_groups[0]['lr'])
        epoch_loss /= len(train_dataset)
        epoch_acc = epoch_correct / len(train_dataset)

        # val_correct = evaluate(model, val_loader, val_label_counts)
        # val_acc = np.mean(val_correct / val_label_counts)
        # print('val_accuracy:', val_acc, )

        adv, label, pred, advpred = attack_whole_dataset(adversary, val_loader)
        val_acc = get_accuracy(pred, label)
        adv_acc = get_accuracy(advpred, label)
        print('clean val accuracy:', val_acc)
        print('robust val accuracy:', adv_acc)

        if i == 0 or scheduler.is_better(val_acc, scheduler.best):
            with open(args.outfile, 'wb') as f:
                torch.save(model, f)
            bad_iters = 0
        else:
            bad_iters += 1
        if bad_iters >= 3 * args.patience:
            print('early stopping...')
            break
        scheduler.step(adv_acc)

        logger.info(
            'epoch#%d train_loss=%.3f train_acc=%.3f val_acc=%.3f lr=%.4f' %
            (i, epoch_loss, epoch_acc, val_acc,
             optimizer.param_groups[0]['lr']))

    test_loader = DataLoader(test_dataset,
                             args.batch_size,
                             shuffle=False,
                             num_workers=(cpu_count()) // 2)
    model = torch.load(args.outfile)
    test_label_counts = utils.label_counts(test_loader, nclasses)
    test_correct = evaluate(model, test_loader, test_label_counts)
    test_acc = np.sum(test_correct) / np.sum(test_label_counts)
    print('test_accuracy:', test_acc)
    logger.info('test_accuracy = %0.3f' % test_acc)

    adv, label, pred, advpred = attack_whole_dataset(adversary, test_loader)
    test_acc = get_accuracy(pred, label)
    print('clean test accuracy:', test_acc)
    print('robust test accuracy:', get_accuracy(advpred, label))
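The final evaluation relies on attack_whole_dataset and get_accuracy from the adversarial-attack tooling, neither of which is shown here. A minimal sketch of what get_accuracy is assumed to compute (an assumption, not necessarily the library's implementation):

import torch

def get_accuracy(pred, label):
    # Hypothetical sketch: fraction of predicted class indices that match the
    # ground-truth labels (both 1-D tensors of the same length).
    return (pred == label).float().mean().item()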