Example 1
    def train_batch_gen(self,
                        csv_path='data/train_v2.csv',
                        imgs_dir='data/train-tif-v2'):

        logger = logging.getLogger(funcname())

        # Helpers.
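        # `scale` linearly rescales an image's values to [-1, 1]; `onehot_to_distribution`
        # is defined here but not used in this generator.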
        scale = lambda x: (x - np.min(x)) / (np.max(x) - np.min(x)) * 2 - 1
        onehot_to_distribution = lambda x: np.argmax(x, axis=1) / np.sum(
            np.argmax(x, axis=1))

        # Read the CSV and error-check contents.
        df = pd.read_csv(csv_path)
        img_names = [
            '%s/%s.tif' % (imgs_dir, n) for n in df['image_name'].values
        ]
        tag_sets = [set(t.strip().split(' ')) for t in df['tags'].values]

        # Error check.
        for img_name, tag_set in zip(img_names, tag_sets):
            assert path.exists(img_name), img_name
            assert len(tag_set) > 0, tag_set

        # # Build an index of tags to their corresponding indexes in the dataset
        # # so that you can sample tags evenly.
        # TAGS_cycle = cycle(TAGS)
        # tags_to_row_idxs = {t: [] for t in TAGS}
        # for idx, row in df.iterrows():
        #     for t in row['tags'].split(' '):
        #         tags_to_row_idxs[t].append(idx)

        while True:

            # New batches at each iteration to prevent over-writing previous batch before it's used.
            imgs_batch = np.zeros([
                self.config['batch_size'],
            ] + self.config['input_shape'],
                                  dtype=np.float32)
            tags_batch = np.zeros([
                self.config['batch_size'],
            ] + self.config['output_shape'],
                                  dtype=np.uint8)

            # Sample *self.config['batch_size']* random rows and build the batches.
            for batch_idx in range(self.config['batch_size']):
                # data_idx = self.rng.choice(tags_to_row_idxs[next(TAGS_cycle)])
                data_idx = self.rng.randint(0, len(img_names))

                img = resize(tif.imread(img_names[data_idx]),
                             self.config['input_shape'][:2],
                             preserve_range=True,
                             mode='constant')
                if self.config['trn_transform']:
                    imgs_batch[batch_idx] = scale(
                        random_transforms(img, nb_min=0, nb_max=5))
                else:
                    imgs_batch[batch_idx] = scale(img)
                tags_batch[batch_idx] = tagset_to_onehot(tag_sets[data_idx])

            yield imgs_batch, tags_batch
Example 2
    def train_batch_gen(self, imgs_csv, imgs_dir, transform):

        logger = logging.getLogger(funcname())

        # Read the CSV and extract image names and tags.
        df = pd.read_csv(imgs_csv)
        imgs_paths = ['%s/%s.jpg' % (imgs_dir, n) for n in df['image_name'].values]
        tag_sets = [set(t.strip().split(' ')) for t in df['tags'].values]

        # Compute the mean image for pre-processing.
        mean_img = self.get_mean_img(imgs_paths, '%s/mean_img_trn.jpg' % self.cpdir)
        mean_img = mean_img.astype(np.float32) / 255.
        mean_img_mean = np.mean(mean_img)
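        # Pre-processing subtracts a single scalar (the mean of the mean image) from the
        # [0, 1]-scaled image, i.e. a global brightness offset rather than per-pixel mean subtraction.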
        img_preprocess = lambda img: img.astype(np.float32) / 255. - mean_img_mean

        while True:

            imgs_batch = np.zeros([self.config['batch_size'], ] + self.config['input_shape'])
            tags_batch = np.zeros([self.config['batch_size'], ] + self.config['output_shape'])
            random_idxs = cycle(np.random.choice(np.arange(len(imgs_paths)), len(imgs_paths)))

            for batch_idx in range(self.config['batch_size']):
                data_idx = next(random_idxs)
                img = imread(imgs_paths[data_idx], mode='RGB')
                img = img_preprocess(img)
                img = resize(img, self.config['input_shape'], preserve_range=True, mode='constant')
                if transform:
                    img = random_transforms(img, nb_min=0, nb_max=6)
                imgs_batch[batch_idx] = img
                tags_batch[batch_idx] = tagset_to_ints(tag_sets[data_idx])

            yield imgs_batch, tags_batch
Example 3
 def objective(w, yt_trn=yt_trn, yp_trn=yp_trn):
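     # Hyperopt objective for a single tag: evaluate candidate ensemble weights `w`
     # and return the negated optimized F2 as the loss to minimize.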
     f2_opt, thresh_opt, w = get_weighted_optimized_results(yt_trn, yp_trn, w)
     if len(tag_trials.trials) > 1:
         if f2_opt > -tag_trials.best_trial['result']['loss']:
             logger = logging.getLogger(funcname())
             logger.info('%2d - %s: hyperopt f2 improved from %lf to %lf' 
                 % (tag_idx, TAGS_short[tag_idx], -tag_trials.best_trial['result']['loss'], f2_opt))
     if len(tag_trials.trials) % ceil(nb_iter / 10) == 0:
         logger = logging.getLogger(funcname())
         logger.info('%2d - %s: %d/%d: hyperopt f2 is %lf'
                     % (tag_idx, TAGS_short[tag_idx], len(tag_trials.trials), nb_iter, -tag_trials.best_trial['result']['loss']))
     return {
         'loss':-f2_opt,
         'status': STATUS_OK,
         'thresholds': thresh_opt,
         'weights': w,
     }
Example 4
def submission(names, yp, csv_path):
    logger = logging.getLogger(funcname())
    assert len(np.unique(yp)) == 2
    yp = yp.astype(np.uint8)
    df_rows = [[names[i], binary_to_tagstr(yp[i, :])]
               for i in range(yp.shape[0])]
    df_sub = pd.DataFrame(df_rows, columns=['image_name', 'tags'])
    df_sub.to_csv(csv_path, index=False)
    logger.info('Saved %s.' % csv_path)
Example 5
 def on_epoch_begin(self, epoch, logs):
     if epoch == self.unfreeze_epoch:
         logger = logging.getLogger(funcname())
         for idx, layer in enumerate(self.model.layers):
             layer.trainable = True
             logger.info('Unfreezing layer %d: %s' % (idx, layer.name))
         lr = K.get_value(self.model.optimizer.lr) * self.unfreeze_lr_mult
         K.set_value(self.model.optimizer.lr, lr)
         logger.info('Epoch %d: new learning rate %.4lf.' %
                     (epoch, K.get_value(self.model.optimizer.lr)))
Example 6
    def train(self):

        logger = logging.getLogger(funcname())

        # Data setup.
        iidxs = np.arange(len(listdir(self.config['imgs_dir_trn'])))
        iidxs_trn, iidxs_val = train_test_split(
            iidxs, test_size=self.config['prop_val'], random_state=rng)
        steps_trn = ceil(len(iidxs_trn) / self.config['batch_size_trn'])
        steps_val = ceil(len(iidxs_val) / self.config['batch_size_trn'])
        assert len(set(iidxs_trn).intersection(iidxs_val)) == 0
        assert steps_val < steps_trn

        gen_trn = self.batch_gen(iidxs_trn,
                                 steps_trn,
                                 nb_augment_max=self.config['nb_augment_max'])
        gen_val = self.batch_gen(iidxs_val, steps_val, nb_augment_max=1)

        def print_tag_F2_metrics(epoch, logs):

            for tag in self.TAGS_net_short:
                f2_trn = logs['F2_%s' % tag]
                f2_val = logs['val_F2_%s' % tag]
                cnt_trn = logs['cnt_%s' % tag]
                cnt_val = logs['val_cnt_%s' % tag]
                logger.info(
                    '%-6s F2 trn=%-6.3lf cnt=%-6.2lf F2 val=%-6.3lf cnt=%-6.2lf'
                    % (tag, f2_trn, cnt_trn, f2_val, cnt_val))

        cb = [
            LambdaCallback(on_epoch_end=print_tag_F2_metrics),
            HistoryPlot('%s/history.png' % self.cpdir),
            CSVLogger('%s/history.csv' % self.cpdir),
            ModelCheckpoint('%s/wvalF2.hdf5' % self.cpdir,
                            monitor='val_F2',
                            verbose=1,
                            save_best_only=True,
                            mode='max'),
            ReduceLROnPlateau(monitor='val_F2',
                              factor=0.75,
                              patience=10,
                              min_lr=1e-4,
                              epsilon=1e-2,
                              verbose=1,
                              mode='max'),
            EarlyStopping(monitor='val_F2', patience=30, verbose=1, mode='max')
        ]

        self.net.fit_generator(gen_trn,
                               steps_per_epoch=steps_trn,
                               epochs=self.config['nb_epochs'],
                               verbose=1,
                               callbacks=cb,
                               validation_data=gen_val,
                               validation_steps=steps_val)
Example 7
 def get_mean_img(self, imgs_paths, mean_img_path):
     '''Compute the mean image from the given paths and save it to the given path.'''
     logger = logging.getLogger(funcname())
     if not path.exists(mean_img_path):
         mean_img = np.zeros(self.config['image_shape'], dtype=np.float32)
         for idx, img_path in enumerate(imgs_paths):
             mean_img += imread(img_path, mode='RGB').astype(np.float32) / len(imgs_paths)
             if idx % 1000 == 0:
                 logger.info('%d/%d' % (idx, len(imgs_paths)))
         imsave(mean_img_path, mean_img)
     return imread(mean_img_path)
Example 8
 def objective(w, self=self, yt_trn=yt_trn):
     w = np.array(w).reshape((N_trn, NUM_OUTPUTS))
     f2_opt, thresh_opt, w = get_weighted_optimized_results(yt_trn, self.yp_trn_all, w)
     if len(self.trials.trials) > 1:
         if f2_opt > -self.trials.best_trial['result']['loss']:
             logger = logging.getLogger(funcname())
             logger.info('hyperopt f2 improved from %lf to %lf' % (-self.trials.best_trial['result']['loss'], f2_opt))
     return {
         'loss':-f2_opt,
         'status': STATUS_OK,
         'thresholds': thresh_opt,
         'weights': w,
     }
Example 9
    def train(self):

        logger = logging.getLogger(funcname())
        rng = np.random

        imgs_idxs = np.arange(len(listdir(self.config['trn_imgs_dir'])))
        imgs_idxs = rng.choice(imgs_idxs, len(imgs_idxs))
        imgs_idxs_trn = imgs_idxs[:int(len(imgs_idxs) * self.config['trn_prop_trn'])]
        imgs_idxs_val = imgs_idxs[-int(len(imgs_idxs) * self.config['trn_prop_val']):]
        gen_trn = self.batch_gen(self.config['trn_imgs_csv'], self.config['trn_imgs_dir'], imgs_idxs_trn,
                                 transform=self.config['trn_transform'], balanced=True)
        gen_val = self.batch_gen(self.config['trn_imgs_csv'], self.config['trn_imgs_dir'], imgs_idxs_val, False)

        def print_tag_F2_metrics(epoch, logs):

            for tag in TAGS_short:
                f2_trn = logs['F2_%s' % tag]
                f2_val = logs['val_F2_%s' % tag]
                cnt_trn = logs['cnt_%s' % tag]
                cnt_val = logs['val_cnt_%s' % tag]
                logger.info('%-6s F2 trn=%-6.3lf cnt=%-6.2lf F2 val=%-6.3lf cnt=%-6.2lf' %
                            (tag, f2_trn, cnt_trn, f2_val, cnt_val))

        cb = [
            LambdaCallback(on_epoch_end=print_tag_F2_metrics),
            HistoryPlot('%s/history.png' % self.cpdir),
            CSVLogger('%s/history.csv' % self.cpdir),
            ModelCheckpoint('%s/weights_val_F2.hdf5' % self.cpdir, monitor='val_F2', verbose=1,
                            save_best_only=True, mode='max'),
            ReduceLROnPlateau(monitor='val_F2', factor=0.5, patience=2,
                              min_lr=1e-4, epsilon=1e-2, verbose=1, mode='max'),
            EarlyStopping(monitor='val_F2', patience=30, verbose=1, mode='max')
        ]

        nb_steps_trn = ceil(len(imgs_idxs_trn) * 1. / self.config['batch_size_trn'])
        nb_steps_val = ceil(len(imgs_idxs_val) * 1. / self.config['batch_size_trn'])

        self.net.fit_generator(gen_trn, steps_per_epoch=nb_steps_trn, epochs=self.config['trn_nb_epochs'],
                               verbose=1, callbacks=cb,
                               workers=3, pickle_safe=True,
                               validation_data=gen_val, validation_steps=nb_steps_val)
Example 10
    def __init__(self, ensemble_def_file):
        logger = logging.getLogger(funcname())

        # Read spreadsheet values to get prediction paths.
        df = pd.read_csv(ensemble_def_file) 
        paths_yp_trn_initial = []
        for n, gtrn in enumerate(df['yp_trn_glob'].values):
            paths = sorted(glob(os.path.expanduser(gtrn)))
            if len(paths) == 0:
                logger.warning("No files found for glob %d: %s" % (n, gtrn))
            paths_yp_trn_initial += paths 

        self.paths_yp_trn = []

        # Read and concatenate all of the yp_trn and yp_tst files into two matrices.
        logger.info("Loading predictions...")
        self.yp_trn_all = []
        self.yp_tst_all = []
        for i, p_trn in enumerate(paths_yp_trn_initial):
            logger.info(p_trn)

            p_tst = p_trn.replace('trn', 'tst')
            if not os.path.exists(p_tst):
                logger.warning("Skipping %s because it doesn't have a matching test file." % p_trn)
                continue

            yp_trn = np.load(p_trn)
            if yp_trn.shape != (NUM_IMAGES_TRN, NUM_OUTPUTS):
                logger.warning("Skipping %s because the trn shape is incorrect" % p_trn)
                continue

            yp_tst = np.load(p_tst)
            if yp_tst.shape != (NUM_IMAGES_TST, NUM_OUTPUTS):
                logger.warning("Skipping %s because the tst shape is incorrect" % p_tst)
                continue

            self.yp_trn_all.append(yp_trn)
            self.yp_tst_all.append(yp_tst)
            self.paths_yp_trn.append(p_trn)
        self.yp_trn_all = np.array(self.yp_trn_all)
        self.yp_tst_all = np.array(self.yp_tst_all)
Example 11
    def train_batch_gen(self, csv_path='data/train_v2.csv', imgs_dir='data/train-jpg-v2'):

        logger = logging.getLogger(funcname())

        # Helpers.
        scale = lambda x: (x - np.min(x)) / (np.max(x) - np.min(x)) * 2 - 1
        onehot_to_distribution = lambda x: np.argmax(x, axis=1) / np.sum(np.argmax(x, axis=1))

        # Read the CSV and error-check contents.
        df = pd.read_csv(csv_path)
        img_names = ['%s/%s.jpg' % (imgs_dir, n) for n in df['image_name'].values]
        tag_sets = [set(t.strip().split(' ')) for t in df['tags'].values]

        # Error check.
        for img_name, tag_set in zip(img_names, tag_sets):
            assert path.exists(img_name), img_name
            assert len(tag_set) > 0, tag_set

        while True:

            # New batches at each iteration to prevent over-writing previous batch before it's used.
            imgs_batch = np.zeros([self.config['batch_size'], ] + self.config['input_shape'], dtype=np.float32)
            tags_batch = np.zeros([self.config['batch_size'], ] + self.config['output_shape'], dtype=np.uint8)

            # Sample *self.config['batch_size']* random rows and build the batches.
            for batch_idx in range(self.config['batch_size']):
                data_idx = self.rng.randint(0, len(img_names))

                img = resize(imread(img_names[data_idx], mode='RGB'),
                             self.config['input_shape'][:2],
                             preserve_range=True, mode='constant')
                if self.config['trn_transform']:
                    imgs_batch[batch_idx] = scale(random_transforms(img, nb_min=0, nb_max=2))
                else:
                    imgs_batch[batch_idx] = scale(img)
                tags_batch[batch_idx] = tagset_to_boolarray(tag_sets[data_idx])

            yield imgs_batch, tags_batch
Example 12
    def create_net(self, weights_path=None):

        logger = logging.getLogger(funcname())

        inputs = Input(shape=self.config['input_shape'])
        inception = InceptionV3(include_top=False,
                                weights='imagenet',
                                input_tensor=inputs,
                                pooling='max')
        features = Dropout(0.1)(inception.output)

        # Single softmax classifier for the four cloud-cover classes.
        x = BatchNormalization()(features)
        x = Dense(256)(x)
        x = PReLU()(x)
        x = Dropout(0.2)(x)
        x = Dense(4, activation='softmax', name='out_cc')(x)
        tc = x

        # Individual softmax classifiers for remaining classes.
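        # Each classifier is a 2-way softmax; the Lambda slice keeps only the positive-class
        # probability, so the concatenation below yields one column per remaining tag.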
        clsf_rest = []
        for _ in range(13):
            x = BatchNormalization()(features)
            x = Dense(256)(x)
            x = PReLU()(x)
            x = Dropout(0.2)(x)
            x = Dense(2, activation='softmax', name='out_%d' % _)(x)
            x = Lambda(lambda x: x[:, 1:])(x)
            clsf_rest.append(x)

        # Concatenate the remaining tags into a single tensor; the combined loss below uses
        # cloudymult to penalize these predictions when the cloudy tag is present.
        tr = concatenate(clsf_rest, axis=-1)

        # Combine tags into one tensor.
        tags_comb = concatenate([tc, tr])

        self.net = Model(inputs=inputs, outputs=tags_comb)

        def kldiv(yt, yp):
            '''KL divergence for each example in batch.'''
            yt = K.clip(yt, K.epsilon(), 1)
            yp = K.clip(yp, K.epsilon(), 1)
            return K.sum(yt * K.log(yt / yp), axis=1)

        def wlogloss(yt, yp):
            '''Weighted log loss for each example in batch.'''

            # Weight false negative errors. This should decrease as recall increases.
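            # meanpos is the mean predicted probability on positive labels (a soft recall
            # proxy), so the false-negative weight shrinks from wfnmax toward 0 as recall improves.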
            meanpos = K.sum(yp * yt) / (K.sum(yt) + K.epsilon())
            wfnmax = 20.
            wfnmult = (1. - meanpos) * wfnmax

            # Weight false positive errors.
            wfp = 1.

            wmat = (yt * wfnmult) + (K.abs(yt - 1) * wfp)
            errmat = yt * K.log(yp + K.epsilon()) + (
                (1 - yt) * K.log(1 - yp + K.epsilon()))
            return -1 * errmat * wmat

        def cloudymult(ytc, ypr):
            '''Multiplier for cases where the cloudy tag is present but other tags are predicted.'''
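            # ytc[:, 1:2] is the cloud-cover column treated as 'cloudy' here; when it is set,
            # the multiplier grows with the summed activations of the other 13 tags, clipped to [1, 13].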
            mult = K.clip(
                K.sum(ypr, axis=1, keepdims=True) * ytc[:, 1:2], 1, 13)
            return mult

        def cloudyerrors(yt, yp):
            '''Number of samples per batch with cloudy tag errors.'''
            ytc = yt[:, :4]
            ypr = yp[:, 4:]
            x = K.round(K.max(ypr, axis=1)) * ytc[:, 1]
            return K.sum(x)

        def combined_loss(yt, yp):
            ytc = yt[:, :4]
            ypc = yp[:, :4]
            ytr = yt[:, 4:]
            ypr = yp[:, 4:]

            # Losses computed per example in the batch.
            lc = kldiv(ytc, ypc)  # (b, 1)
            lr = wlogloss(ytr, ypr)  # (b, 13)
            cm = cloudymult(ytc, ypr)  # (b, 1)

            # Return a single scalar.
            return K.mean(lc * cm) + K.mean(lr * cm)

        def ccsum(yt, yp):
            return K.mean(K.sum(yp[:, :4], axis=1))

        # Generate an F2 metric for each tag.
        tf2_metrics = []
        for i, t in enumerate(self.TAGS_net_short):

            @rename('F2_%s' % t)
            def tagF2(yt, yp, i=i):
                return F2(yt[:, i], yp[:, i])

            tf2_metrics.append(tagF2)

        # Generate a metric for each tag that tracks how often it occurs in a batch.
        tcnt_metrics = []
        for i, t in enumerate(TAGS_short):

            @rename('cnt_%s' % t)
            def tagcnt(yt, yp, i=i):
                return K.sum(yt[:, i])

            tcnt_metrics.append(tagcnt)

        self.net.compile(optimizer=Adam(0.0007, decay=0.0001),
                         metrics=[F2, prec, reca, cloudyerrors, ccsum] +
                         tf2_metrics + tcnt_metrics,
                         loss=combined_loss)
        self.net.summary()
        plot_model(self.net, to_file='%s/net.png' % self.cpdir)

        if weights_path is not None:
            logger.info('Loading weights from %s' % weights_path)
            self.net.load_weights(weights_path)
Example 13
    def fit(self, yt_trn, nb_iter=5, use_hyperopt=True, use_rand_search=False, max_parallel=cpu_count()):
        logger = logging.getLogger(funcname())
        np.set_printoptions(formatter={'float': '{: 0.3f}'.format})

        N_trn = len(self.paths_yp_trn)

        # results format: [(F2 score, tag thresholds, ensemble weights)]
        results, best_idx = [], 0

        # Try equal weights
        w = np.ones((N_trn, NUM_OUTPUTS), dtype=np.float16)
        f2_opt, thresh_opt, w = get_weighted_optimized_results(yt_trn, self.yp_trn_all, w)
        results.append((f2_opt, thresh_opt, w))
        logger.info('Equal weights: f2 = %f' % f2_opt)


        # Try using each member individually.
        try_individually = False
        if try_individually:
            w = np.zeros((N_trn, NUM_OUTPUTS), dtype=np.float16)
            for it in range(N_trn):
                w *= 0
                w[it] = 1
                f2_opt, thresh_opt, w = get_weighted_optimized_results(yt_trn, self.yp_trn_all, w)
                logger.info('%-70s f2 = %f' % (self.paths_yp_trn[it][-60:], f2_opt))
                if f2_opt > 0.88:
                    results.append((f2_opt, thresh_opt, w))
                    if f2_opt > results[best_idx][0]:
                        old_best = results[best_idx][0]
                        best_idx = len(results)-1
                        logger.info('f2 improved from %lf to %lf' % (old_best, f2_opt))
                        # serialize_ensemble(f2_opt, thresh_opt, w, names_tst, yp_tst_all, self.yp_trn_all, self.paths_yp_trn)

        if use_hyperopt:
            optimize_individually = True
            if optimize_individually:
                logging.getLogger("hyperopt.tpe").setLevel(logging.WARNING)
                if max_parallel == 1:
                    tag_weights = []
                    for tag_idx in range(NUM_OUTPUTS):
                        logger.info("Optimizing tag %d" % tag_idx)
                        tag_weights.append(optimize_single_tag(tag_idx, self.yp_trn_all[:,:,tag_idx], yt_trn[:, tag_idx], nb_iter))
                else:
                    p = Pool(min(max_parallel, NUM_OUTPUTS))
                    tag_weights = p.map(optimize_single_tag, [(tag_idx, self.yp_trn_all[:,:,tag_idx], yt_trn[:, tag_idx], nb_iter) for tag_idx in range(NUM_OUTPUTS)])
                w = np.transpose(tag_weights)
                f2_opt, thresh_opt, w = get_weighted_optimized_results(yt_trn, self.yp_trn_all, w)
                logger.info("Optimized weights: f2 = %f" % f2_opt)
                results.append((f2_opt, thresh_opt, w))
            else:
                self.trials = Trials()
                def objective(w, self=self, yt_trn=yt_trn):
                    w = np.array(w).reshape((N_trn, NUM_OUTPUTS))
                    f2_opt, thresh_opt, w = get_weighted_optimized_results(yt_trn, self.yp_trn_all, w)
                    if len(self.trials.trials) > 1:
                        if f2_opt > -self.trials.best_trial['result']['loss']:
                            logger = logging.getLogger(funcname())
                            logger.info('hyperopt f2 improved from %lf to %lf' % (-self.trials.best_trial['result']['loss'], f2_opt))
                    return {
                        'loss':-f2_opt,
                        'status': STATUS_OK,
                        'thresholds': thresh_opt,
                        'weights': w,
                    }
                weights_space_all = [hp.uniform(str(i), 0, 1) for i in range(N_trn*NUM_OUTPUTS)]
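                # Normalize each sampled weight by the total so the optimizer effectively
                # searches over weight vectors that sum to 1.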
                total = 0
                for s in weights_space_all:
                    total += s
                for i in range(len(weights_space_all)):
                    weights_space_all[i] /= total

                best = fmin(objective, space=weights_space_all, algo=tpe.suggest, max_evals=nb_iter, trials=self.trials)
                sortedTrials = sorted(self.trials.trials, key=lambda x: x['result']['loss'])[:10]

                # Process hyperopt results into our results list
                for t in sortedTrials:
                    f2_opt = -t['result']['loss']
                    thresh_opt = t['result']['thresholds']
                    w = t['result']['weights']
                    if f2_opt > 0.88:
                        results.append((f2_opt, thresh_opt, w))
                        if f2_opt > results[best_idx][0]:
                            best_idx = len(results)-1
        if use_rand_search:
            # Randomly choose a weight for each tag across all member predictions.
            logger.info('Searching random weights...')
            f2_scores = []
            for it in range(nb_iter):
                w = np.random.rand(N_trn, NUM_OUTPUTS)
                f2_opt, thresh_opt, w = get_weighted_optimized_results(yt_trn, self.yp_trn_all, w)
                f2_scores.append(f2_opt)
                if f2_opt > 0.88:
                    results.append((f2_opt, thresh_opt, w))
                    if f2_opt > results[best_idx][0]:
                        old_best = results[best_idx][0]
                        best_idx = len(results)-1
                        logger.info('%-05.2lf: f2 improved from %lf to %lf:\n' % ((it / nb_iter * 100), old_best, f2_opt))
                        # serialize_ensemble(f2_opt, thresh_opt, w, names_tst, yp_tst_all, self.yp_trn_all, self.paths_yp_trn)

                if it % 50 == 0:
                    logger.info('%-05.2lf: f2 mean=%.4lf, min=%.4lf, max=%.4lf, stdv=%.4lf, unique=%d' % \
                         ((it / nb_iter * 100), np.mean(f2_scores), np.min(f2_scores), np.max(f2_scores), 
                            np.std(f2_scores), len(np.unique(f2_scores))))

        return results
Example 14
 def on_train_begin(self, logs):
     logger = logging.getLogger(funcname())
     for idx, layer in enumerate(self.model.layers[:self.frozen_up_to_idx]):
         layer.trainable = False
         logger.info('Freezing layer %d: %s' % (idx, layer.name))
Example 15
    def on_epoch_end(self, epoch, logs):
        '''
        1. Predictions on every image in the validation set.
        2. Evaluate the precision, recall, and F2 of each tag.
        3. Store the metrics and predictions in a pickle file in the cpdir.
        '''

        logger = logging.getLogger(funcname())

        # Make predictions and store all true and predicted tags.
        yt = np.zeros((self.batch_size * self.nb_steps, len(TAGS)))
        yp = np.zeros((self.batch_size * self.nb_steps, len(TAGS)))
        for bidx in tqdm(range(self.nb_steps)):
            ib, tb = next(self.batch_gen)
            yt[bidx * self.batch_size:(bidx + 1) * self.batch_size] = tb
            yp[bidx * self.batch_size:(bidx + 1) *
               self.batch_size] = self.model.predict_on_batch(ib)

        # Find optimal thresholds.
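        # optimize_thresholds returns one threshold per tag; a tag is predicted positive
        # when its activation exceeds its per-tag threshold.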
        thresholds = optimize_thresholds(yt, yp)
        yp_opt = (yp > thresholds).astype(np.uint8)

        # Print per-tag metrics with stress-inducing colors.
        tags_f2, tags_p, tags_r = [], [], []
        for tidx in range(len(TAGS)):
            f2, p, r = f2pr(yt[:, tidx], yp_opt[:, tidx])
            tags_f2.append(f2)
            tags_p.append(p)
            tags_r.append(r)
            s = '%-20s F2=%.3lf p=%.3lf r=%.3lf t=%.3lf' % (
                TAGS[tidx], f2, p, r, thresholds[tidx])
            if f2 > 0.9:
                s = colored(s, 'green')
            elif f2 > 0.8:
                s = colored(s, 'yellow')
            else:
                s = colored(s, 'red')
            logger.info(s)

        # Metric variance across tags.
        logs['val_f2_var'] = np.var(tags_f2)
        logs['val_prec_var'] = np.var(tags_p)
        logs['val_reca_var'] = np.var(tags_r)

        # Metrics on all predictions.
        f2, p, r = f2pr(yt, (yp > 0.5))
        logger.info('Unoptimized F2=%.3lf, p=%.3lf, r=%.3lf' % (f2, p, r))
        f2, p, r = f2pr(yt, yp_opt)
        logger.info('Optimized   F2=%.3lf, p=%.3lf, r=%.3lf' % (f2, p, r))
        logger.info(
            'Variance    F2=%.3lf, p=%.3lf, r=%.3lf' %
            (logs['val_f2_var'], logs['val_prec_var'], logs['val_reca_var']))

        # Record optimized metrics.
        logs['val_F2'] = f2
        logs['val_prec'] = p
        logs['val_reca'] = r

        # Let em know you improved.
        if f2 > self.best_metric:
            logger.info('val_F2 improved from %.3lf to %.3lf %s!' %
                        (self.best_metric, f2, u'\U0001F642'))
            self.best_metric = f2
            self.best_epoch = epoch
        else:
            logger.info('Last improvement %d epochs ago %s. Maybe next time!' %
                        ((epoch - self.best_epoch), u'\U0001F625'))
Example 16
def predict(model_class, args):
    """Instantiates the model and makes augmented predictions. Serializes the predictions
    as numpy matrices."""

    logger = logging.getLogger(funcname())

    # Generate ID from model file. Used for saving files.
    with open(args['model'], 'rb') as fp:
        MD5ID = md5(fp.read()).hexdigest()

    # Setup model.
    model = model_class(model_path=args['model'])
    model.cfg['cpdir'] = '/'.join(args['model'].split('/')[:-1])
    assert 'hdf5_path_trn' in model.cfg
    assert 'hdf5_path_tst' in model.cfg
    assert 'tst_batch_size' in model.cfg

    # Load references to hdf5 data.
    data_trn = h5py.File(model.cfg['hdf5_path_trn'], 'r')
    data_tst = h5py.File(model.cfg['hdf5_path_tst'], 'r')
    imgs_trn, tags_trn = data_trn.get('images'), data_trn.get('tags')[...]
    imgs_tst, tags_tst = data_tst.get('images'), data_tst.get('tags')
    names_trn = data_trn.attrs['names'].split(',')
    names_tst = data_tst.attrs['names'].split(',')

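    # Test-time augmentation: the eight dihedral symmetries (identity, flips, 90-degree
    # rotations and flipped rotations), applied over the batch's spatial axes (1, 2).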
    aug_funcs = [
        ('identity', lambda x: x), ('vflip', lambda x: x[:, ::-1, ...]),
        ('hflip', lambda x: x[:, :, ::-1]),
        ('rot90', lambda x: np.rot90(x, 1, axes=(1, 2))),
        ('rot180', lambda x: np.rot90(x, 2, axes=(1, 2))),
        ('rot270', lambda x: np.rot90(x, 3, axes=(1, 2))),
        ('rot90vflip', lambda x: np.rot90(x, 1, axes=(1, 2))[:, ::-1, ...]),
        ('rot90hflip', lambda x: np.rot90(x, 1, axes=(1, 2))[:, :, ::-1])
    ]

    # Keep mean combination of all augmentations.
    yp_trn_all = np.zeros(tags_trn.shape, dtype=np.float16)
    yp_tst_all = np.zeros(tags_tst.shape, dtype=np.float16)

    # Make training and testing predictions batch-by-batch for multiple
    # augmentations. Serialize the matrix of activations for each augmentation.
    for aug_name, aug_func in aug_funcs:

        logger.info('TTA: %s' % (aug_name))

        # Train set.
        yp = np.zeros(tags_trn.shape, dtype=np.float16)
        for i0 in tqdm(range(0, imgs_trn.shape[0],
                             model.cfg['tst_batch_size'])):
            i1 = i0 + min(model.cfg['tst_batch_size'], imgs_trn.shape[0] - i0)
            ib = np.array([
                imresize(img[...], model.cfg['input_shape'])
                for img in imgs_trn[i0:i1]
            ])
            yp[i0:i1] = model.predict_batch(aug_func(ib))

        # Optimize activation thresholds and print F2 as a sanity check.
        f2, p, r = f2pr(tags_trn, (yp > 0.5).astype(np.uint8))
        logger.info('Default   f2=%.4lf, p=%.4lf, r=%.4lf' % (f2, p, r))

        thresh_opt = optimize_thresholds(tags_trn, yp)
        f2, p, r = f2pr(tags_trn, (yp > thresh_opt))
        logger.info('Optimized f2=%.4lf, p=%.4lf, r=%.4lf' % (f2, p, r))

        # Save csv submission with default and optimized thresholds.
        csv_path = '%s/submission_trn_%s_def_%s.csv' % (model.cpdir, aug_name,
                                                        MD5ID)
        submission(names_trn, (yp > 0.5), csv_path)
        csv_path = '%s/submission_trn_%s_opt_%s.csv' % (model.cpdir, aug_name,
                                                        MD5ID)
        submission(names_trn, (yp > thresh_opt), csv_path)

        # Save raw activations.
        npy_path = '%s/yp_trn_%s_%s.npy' % (model.cpdir, aug_name, MD5ID)
        np.save(npy_path, yp)
        logger.info('Saved %s.' % npy_path)

        # Add to mean combination.
        yp_trn_all += yp / len(aug_funcs)

        # Test set.
        yp = np.zeros(tags_tst.shape, dtype=np.float16)
        for i0 in tqdm(range(0, imgs_tst.shape[0],
                             model.cfg['tst_batch_size'])):
            i1 = i0 + min(model.cfg['tst_batch_size'], imgs_tst.shape[0] - i0)
            ib = np.array([
                imresize(img[...], model.cfg['input_shape'])
                for img in imgs_tst[i0:i1]
            ])
            yp[i0:i1] = model.predict_batch(aug_func(ib))

        # Save csv submission with default and optimized thresholds.
        csv_path = '%s/submission_tst_%s_def_%s.csv' % (model.cpdir, aug_name,
                                                        MD5ID)
        submission(names_tst, (yp > 0.5), csv_path)
        csv_path = '%s/submission_tst_%s_opt_%s.csv' % (model.cpdir, aug_name,
                                                        MD5ID)
        submission(names_tst, (yp > thresh_opt), csv_path)

        # Save raw activations.
        npy_path = '%s/yp_tst_%s_%s.npy' % (model.cpdir, aug_name, MD5ID)
        np.save(npy_path, yp)
        logger.info('Saved %s.' % npy_path)

        # Add to mean combination.
        yp_tst_all += yp / len(aug_funcs)

    # Optimize activation thresholds for combined predictions.
    logger.info('TTA: mean')
    f2, p, r = f2pr(tags_trn, (yp_trn_all > 0.5))
    logger.info('Default   f2 =%.4lf, p=%.4lf, r=%.4lf' % (f2, p, r))
    thresh_opt = optimize_thresholds(tags_trn, yp_trn_all)
    f2, p, r = f2pr(tags_trn, (yp_trn_all > thresh_opt))
    logger.info('Optimized f2 =%.4lf, p=%.4lf, r=%.4lf' % (f2, p, r))

    # Save train and test csv submission with default and optimized thresholds.
    csv_path = '%s/submission_trn_%s_def_%s.csv' % (model.cpdir, 'mean_aug',
                                                    MD5ID)
    submission(names_trn, (yp_trn_all > 0.5), csv_path)
    csv_path = '%s/submission_trn_%s_opt_%s.csv' % (model.cpdir, 'mean_aug',
                                                    MD5ID)
    submission(names_trn, (yp_trn_all > thresh_opt), csv_path)
    csv_path = '%s/submission_tst_%s_def_%s.csv' % (model.cpdir, 'mean_aug',
                                                    MD5ID)
    submission(names_tst, (yp_tst_all > 0.5), csv_path)
    csv_path = '%s/submission_tst_%s_opt_%s.csv' % (model.cpdir, 'mean_aug',
                                                    MD5ID)
    submission(names_tst, (yp_tst_all > thresh_opt), csv_path)

    # Save train and test raw activations.
    npy_path = '%s/yp_trn_%s_%s.npy' % (model.cpdir, 'mean_aug', MD5ID)
    np.save(npy_path, yp_trn_all)
    logger.info('Saved %s.' % npy_path)
    npy_path = '%s/yp_tst_%s_%s.npy' % (model.cpdir, 'mean_aug', MD5ID)
    np.save(npy_path, yp_tst_all)
    logger.info('Saved %s.' % npy_path)