def train_batch_gen(self, csv_path='data/train_v2.csv', imgs_dir='data/train-tif-v2'):
    '''Yield an endless stream of randomly sampled (images, tags) batches.

    Image names and space-separated tags are read from *csv_path*; every
    referenced ``.tif`` file must exist under *imgs_dir*. Each yielded pair
    is ``(imgs_batch, tags_batch)`` where *imgs_batch* is float32 with shape
    ``[batch_size] + input_shape`` and *tags_batch* is uint8 with shape
    ``[batch_size] + output_shape`` (all taken from ``self.config``).
    Rows are drawn with ``self.rng.randint``, so a row may repeat in a batch.

    Raises:
        AssertionError: if an image file is missing or a row has no tags.
    '''

    def scale(x):
        # Linearly map the pixel range of x onto [-1, 1]. A constant image
        # has zero range; return zeros instead of dividing by zero (which
        # would fill the whole batch entry with NaN).
        lo, hi = np.min(x), np.max(x)
        if hi == lo:
            return np.zeros(np.shape(x), dtype=np.float32)
        return (x - lo) / (hi - lo) * 2 - 1

    # Read the CSV and error-check contents.
    df = pd.read_csv(csv_path)
    img_names = ['%s/%s.tif' % (imgs_dir, n) for n in df['image_name'].values]
    tag_sets = [set(t.strip().split(' ')) for t in df['tags'].values]
    for img_name, tag_set in zip(img_names, tag_sets):
        assert path.exists(img_name), img_name
        assert len(tag_set) > 0, tag_set

    while True:
        # New arrays at each iteration so a batch the consumer still holds
        # is never overwritten.
        batch_size = self.config['batch_size']
        imgs_batch = np.zeros([batch_size, ] + self.config['input_shape'],
                              dtype=np.float32)
        tags_batch = np.zeros([batch_size, ] + self.config['output_shape'],
                              dtype=np.uint8)

        # Sample random rows and build the batch.
        for batch_idx in range(batch_size):
            data_idx = self.rng.randint(0, len(img_names))
            img = resize(tif.imread(img_names[data_idx]),
                         self.config['input_shape'][:2],
                         preserve_range=True, mode='constant')
            if self.config['trn_transform']:
                img = random_transforms(img, nb_min=0, nb_max=5)
            imgs_batch[batch_idx] = scale(img)
            tags_batch[batch_idx] = tagset_to_onehot(tag_sets[data_idx])

        yield imgs_batch, tags_batch
def train_batch_gen(self, imgs_csv, imgs_dir, transform):
    '''Yield an endless stream of randomly sampled (images, tags) batches.

    Args:
        imgs_csv: CSV file with ``image_name`` and space-separated ``tags``
            columns.
        imgs_dir: directory holding ``<image_name>.jpg`` files.
        transform: when truthy, each image is passed through
            ``random_transforms`` after resizing.

    Yields:
        ``(imgs_batch, tags_batch)`` arrays with shapes
        ``[batch_size] + input_shape`` and ``[batch_size] + output_shape``
        taken from ``self.config``.
    '''
    # Read the CSV and extract image names and tags.
    df = pd.read_csv(imgs_csv)
    imgs_paths = ['%s/%s.jpg' % (imgs_dir, n) for n in df['image_name'].values]
    tag_sets = [set(t.strip().split(' ')) for t in df['tags'].values]

    # Pre-processing subtracts the scalar mean of the (0-1 scaled) mean image.
    mean_img = self.get_mean_img(imgs_paths, '%s/mean_img_trn.jpg' % self.cpdir)
    mean_img = mean_img.astype(np.float32) / 255.
    mean_img_mean = np.mean(mean_img)

    def img_preprocess(img):
        return img.astype(np.float32) / 255. - mean_img_mean

    while True:
        batch_size = self.config['batch_size']
        imgs_batch = np.zeros([batch_size, ] + self.config['input_shape'])
        tags_batch = np.zeros([batch_size, ] + self.config['output_shape'])

        # Draw only the indexes this batch needs (with replacement) instead
        # of a dataset-sized sample of which most entries were discarded.
        random_idxs = np.random.choice(len(imgs_paths), size=batch_size)

        for batch_idx, data_idx in enumerate(random_idxs):
            img = imread(imgs_paths[data_idx], mode='RGB')
            img = img_preprocess(img)
            img = resize(img, self.config['input_shape'],
                         preserve_range=True, mode='constant')
            if transform:
                img = random_transforms(img, nb_min=0, nb_max=6)
            imgs_batch[batch_idx] = img
            tags_batch[batch_idx] = tagset_to_ints(tag_sets[data_idx])

        yield imgs_batch, tags_batch
def objective(w, yt_trn=yt_trn, yp_trn=yp_trn):
    '''Hyperopt objective for one tag: score the candidate weights *w*.

    Returns a hyperopt result dict whose loss is the negated optimized F2,
    along with the optimized thresholds and the weights returned by
    ``get_weighted_optimized_results``. Logs improvements over the best
    completed trial and periodic progress (roughly every tenth of
    ``nb_iter``).
    '''
    f2_opt, thresh_opt, w = get_weighted_optimized_results(yt_trn, yp_trn, w)

    nb_trials = len(tag_trials.trials)
    if nb_trials > 1:
        # Only consult best_trial once at least one earlier trial exists.
        if f2_opt > -tag_trials.best_trial['result']['loss']:
            improve_logger = logging.getLogger(funcname())
            improve_logger.info(
                '%2d - %s: hyperopt f2 improved from %lf to %lf' %
                (tag_idx, TAGS_short[tag_idx],
                 -tag_trials.best_trial['result']['loss'], f2_opt))

        if nb_trials % ceil(nb_iter/10) == 0:
            progress_logger = logging.getLogger(funcname())
            progress_logger.info(
                '%2d - %s: %d/%d: hyperopt f2 is %lf' %
                (tag_idx, TAGS_short[tag_idx], nb_trials, nb_iter,
                 -tag_trials.best_trial['result']['loss']))

    result = {'loss': -f2_opt, 'status': STATUS_OK}
    result['thresholds'] = thresh_opt
    result['weights'] = w
    return result
def submission(names, yp, csv_path):
    '''Write a submission CSV with one ``image_name,tags`` row per prediction.

    Args:
        names: image names, indexed in the same order as the rows of *yp*.
        yp: 2-D array of binary predictions (0/1 or booleans), one row per
            image. Each row is converted to a tag string with
            ``binary_to_tagstr``.
        csv_path: destination path of the CSV file.

    Raises:
        AssertionError: if *yp* contains anything other than 0/1 values.
    '''
    logger = logging.getLogger(funcname())

    # Accept any binary matrix. The previous check required *exactly* two
    # distinct values and so rejected valid all-0 or all-1 predictions.
    unique_vals = set(np.unique(yp).tolist())
    assert unique_vals <= {0, 1}, 'yp must be binary, found %s' % unique_vals

    yp = yp.astype(np.uint8)
    df_rows = [[names[i], binary_to_tagstr(row)] for i, row in enumerate(yp)]
    df_sub = pd.DataFrame(df_rows, columns=['image_name', 'tags'])
    df_sub.to_csv(csv_path, index=False)
    logger.info('Saved %s.' % csv_path)
def on_epoch_begin(self, epoch, logs):
    '''At ``self.unfreeze_epoch``, mark every layer trainable and rescale the LR.

    The optimizer's learning rate is multiplied by ``self.unfreeze_lr_mult``.
    All other epochs are a no-op.
    '''
    if epoch != self.unfreeze_epoch:
        return

    logger = logging.getLogger(funcname())

    # NOTE(review): depending on the Keras version, flipping `trainable`
    # after compile may need a recompile to take effect -- confirm.
    layer_idx = 0
    for layer in self.model.layers:
        layer.trainable = True
        logger.info('Unfreezing layer %d: %s' % (layer_idx, layer.name))
        layer_idx += 1

    optimizer_lr = self.model.optimizer.lr
    K.set_value(optimizer_lr, K.get_value(optimizer_lr) * self.unfreeze_lr_mult)
    logger.info('Epoch %d: new learning rate %.4lf.' %
                (epoch, K.get_value(self.model.optimizer.lr)))
def train(self):
    '''Split image indexes into train/validation sets and fit ``self.net``.

    The split uses ``train_test_split`` with ``prop_val`` from the config.
    Training monitors ``val_F2`` for checkpointing, learning-rate reduction
    and early stopping, and logs per-tag F2 and count metrics after every
    epoch.
    '''
    logger = logging.getLogger(funcname())
    cfg = self.config

    # Data setup.
    # NOTE(review): `rng` is not defined in this function; presumably a
    # module-level random state -- confirm.
    nb_imgs = len(listdir(cfg['imgs_dir_trn']))
    iidxs_trn, iidxs_val = train_test_split(
        np.arange(nb_imgs), test_size=cfg['prop_val'], random_state=rng)
    steps_trn = ceil(len(iidxs_trn) / cfg['batch_size_trn'])
    steps_val = ceil(len(iidxs_val) / cfg['batch_size_trn'])
    assert len(set(iidxs_trn).intersection(iidxs_val)) == 0
    assert steps_val < steps_trn
    gen_trn = self.batch_gen(iidxs_trn, steps_trn,
                             nb_augment_max=cfg['nb_augment_max'])
    gen_val = self.batch_gen(iidxs_val, steps_val, nb_augment_max=1)

    def print_tag_F2_metrics(epoch, logs):
        # One log line per tag: train/val F2 and per-batch tag counts.
        fmt = '%-6s F2 trn=%-6.3lf cnt=%-6.2lf F2 val=%-6.3lf cnt=%-6.2lf'
        for tag in self.TAGS_net_short:
            values = (tag,
                      logs['F2_%s' % tag], logs['cnt_%s' % tag],
                      logs['val_F2_%s' % tag], logs['val_cnt_%s' % tag])
            logger.info(fmt % values)

    metrics_printer = LambdaCallback(on_epoch_end=print_tag_F2_metrics)
    history_plot = HistoryPlot('%s/history.png' % self.cpdir)
    csv_logger = CSVLogger('%s/history.csv' % self.cpdir)
    checkpoint = ModelCheckpoint('%s/wvalF2.hdf5' % self.cpdir,
                                 monitor='val_F2', verbose=1,
                                 save_best_only=True, mode='max')
    reduce_lr = ReduceLROnPlateau(monitor='val_F2', factor=0.75, patience=10,
                                  min_lr=1e-4, epsilon=1e-2, verbose=1,
                                  mode='max')
    early_stop = EarlyStopping(monitor='val_F2', patience=30, verbose=1,
                               mode='max')
    cb = [metrics_printer, history_plot, csv_logger, checkpoint, reduce_lr,
          early_stop]

    self.net.fit_generator(gen_trn,
                           steps_per_epoch=steps_trn,
                           epochs=cfg['nb_epochs'],
                           verbose=1,
                           callbacks=cb,
                           validation_data=gen_val,
                           validation_steps=steps_val)
def get_mean_img(self, imgs_paths, mean_img_path):
    '''Return the image stored at *mean_img_path*, creating it if missing.

    When the file does not exist yet, the mean of all images in
    *imgs_paths* is accumulated (shape ``self.config['image_shape']``),
    saved to *mean_img_path*, and then read back.
    '''
    logger = logging.getLogger(funcname())

    if path.exists(mean_img_path):
        return imread(mean_img_path)

    nb_imgs = len(imgs_paths)
    mean_img = np.zeros(self.config['image_shape'], dtype=np.float32)
    for idx, img_path in enumerate(imgs_paths):
        # Add each image's share of the mean so no second pass is needed.
        img = imread(img_path, mode='RGB').astype(np.float32)
        mean_img += img / nb_imgs
        if idx % 1000 == 0:
            logger.info('%d/%d' % (idx, nb_imgs))
    imsave(mean_img_path, mean_img)

    return imread(mean_img_path)
def objective(w, self=self, yt_trn=yt_trn):
    '''Hyperopt objective scoring one flat weight vector for all members.

    *w* is reshaped to ``(N_trn, NUM_OUTPUTS)`` and evaluated with
    ``get_weighted_optimized_results``. The returned result dict carries
    the negated F2 as the loss, plus the thresholds and weights.
    '''
    weights = np.array(w).reshape((N_trn, NUM_OUTPUTS))
    f2_opt, thresh_opt, weights = get_weighted_optimized_results(
        yt_trn, self.yp_trn_all, weights)

    # Report improvements once there is an earlier trial to compare with.
    has_history = len(self.trials.trials) > 1
    if has_history and f2_opt > -self.trials.best_trial['result']['loss']:
        logger = logging.getLogger(funcname())
        logger.info('hyperopt f2 improved from %lf to %lf' %
                    (-self.trials.best_trial['result']['loss'], f2_opt))

    result = {'loss': -f2_opt, 'status': STATUS_OK}
    result['thresholds'] = thresh_opt
    result['weights'] = weights
    return result
def train(self):
    '''Randomly split the training images and fit ``self.net``.

    Image indexes are shuffled, then the first ``trn_prop_trn`` fraction
    is used for training and the last ``trn_prop_val`` fraction for
    validation. Training monitors ``val_F2`` for checkpointing,
    learning-rate reduction and early stopping.
    '''
    logger = logging.getLogger(funcname())

    rng = np.random
    nb_imgs = len(listdir(self.config['trn_imgs_dir']))

    # Shuffle WITHOUT replacement. Sampling with replacement (rng.choice)
    # duplicated indexes, so the same image could land in both the
    # training and the validation split, and others were never used.
    imgs_idxs = rng.permutation(nb_imgs)
    nb_trn = int(nb_imgs * self.config['trn_prop_trn'])
    nb_val = int(nb_imgs * self.config['trn_prop_val'])
    imgs_idxs_trn = imgs_idxs[:nb_trn]
    # Explicit start index: a `[-nb_val:]` slice returns the WHOLE array
    # when nb_val is 0.
    imgs_idxs_val = imgs_idxs[max(nb_imgs - nb_val, 0):]

    gen_trn = self.batch_gen(self.config['trn_imgs_csv'],
                             self.config['trn_imgs_dir'],
                             imgs_idxs_trn,
                             transform=self.config['trn_transform'],
                             balanced=True)
    gen_val = self.batch_gen(self.config['trn_imgs_csv'],
                             self.config['trn_imgs_dir'],
                             imgs_idxs_val, False)

    def print_tag_F2_metrics(epoch, logs):
        # One log line per tag: train/val F2 and per-batch tag counts.
        for tag in TAGS_short:
            f2_trn = logs['F2_%s' % tag]
            f2_val = logs['val_F2_%s' % tag]
            cnt_trn = logs['cnt_%s' % tag]
            cnt_val = logs['val_cnt_%s' % tag]
            logger.info('%-6s F2 trn=%-6.3lf cnt=%-6.2lf F2 val=%-6.3lf cnt=%-6.2lf' %
                        (tag, f2_trn, cnt_trn, f2_val, cnt_val))

    cb = [
        LambdaCallback(on_epoch_end=print_tag_F2_metrics),
        HistoryPlot('%s/history.png' % self.cpdir),
        CSVLogger('%s/history.csv' % self.cpdir),
        ModelCheckpoint('%s/weights_val_F2.hdf5' % self.cpdir, monitor='val_F2',
                        verbose=1, save_best_only=True, mode='max'),
        ReduceLROnPlateau(monitor='val_F2', factor=0.5, patience=2, min_lr=1e-4,
                          epsilon=1e-2, verbose=1, mode='max'),
        EarlyStopping(monitor='val_F2', patience=30, verbose=1, mode='max')
    ]

    nb_steps_trn = ceil(len(imgs_idxs_trn) * 1. / self.config['batch_size_trn'])
    nb_steps_val = ceil(len(imgs_idxs_val) * 1. / self.config['batch_size_trn'])

    self.net.fit_generator(gen_trn,
                           steps_per_epoch=nb_steps_trn,
                           epochs=self.config['trn_nb_epochs'],
                           verbose=1,
                           callbacks=cb,
                           workers=3,
                           pickle_safe=True,
                           validation_data=gen_val,
                           validation_steps=nb_steps_val)
def __init__(self, ensemble_def_file):
    '''Load the train/test prediction matrices of all ensemble members.

    Args:
        ensemble_def_file: CSV file with a ``yp_trn_glob`` column; each
            value is a glob pattern matching train-prediction ``.npy`` files.

    The matching test file is found by replacing ``'trn'`` with ``'tst'``
    in each train path. A member is skipped (with a warning) when its test
    file is missing or either matrix has an unexpected shape. The kept
    members end up in ``self.paths_yp_trn``, ``self.yp_trn_all`` and
    ``self.yp_tst_all`` (stacked along a new first axis).
    '''
    logger = logging.getLogger(funcname())

    # Read spreadsheet values to get prediction paths.
    df = pd.read_csv(ensemble_def_file)
    paths_yp_trn_initial = []
    for n, gtrn in enumerate(df['yp_trn_glob'].values):
        paths = sorted(glob(os.path.expanduser(gtrn)))
        if len(paths) == 0:
            # Logger.warn is deprecated; use warning() like the rest of
            # this method.
            logger.warning("No files found for glob %d: %s" % (n, gtrn))
        paths_yp_trn_initial += paths

    # Read and stack all of the yp_trn and yp_tst files into two arrays.
    logger.info("Loading predictions...")
    self.paths_yp_trn = []
    self.yp_trn_all = []
    self.yp_tst_all = []
    for p_trn in paths_yp_trn_initial:
        logger.info(p_trn)

        p_tst = p_trn.replace('trn', 'tst')
        if not os.path.exists(p_tst):
            logger.warning("Skipping %s because it doesn't have a matching test file." % p_trn)
            continue

        yp_trn = np.load(p_trn)
        if yp_trn.shape != (NUM_IMAGES_TRN, NUM_OUTPUTS):
            logger.warning("Skipping %s because the trn shape is incorrect" % p_trn)
            continue

        yp_tst = np.load(p_tst)
        if yp_tst.shape != (NUM_IMAGES_TST, NUM_OUTPUTS):
            logger.warning("Skipping %s because the tst shape is incorrect" % p_tst)
            continue

        self.yp_trn_all.append(yp_trn)
        self.yp_tst_all.append(yp_tst)
        self.paths_yp_trn.append(p_trn)

    self.yp_trn_all = np.array(self.yp_trn_all)
    self.yp_tst_all = np.array(self.yp_tst_all)
def train_batch_gen(self, csv_path='data/train_v2.csv', imgs_dir='data/train-jpg-v2'):
    '''Yield an endless stream of randomly sampled (images, tags) batches.

    Image names and space-separated tags are read from *csv_path*; every
    referenced ``.jpg`` file must exist under *imgs_dir*. Each yielded pair
    is ``(imgs_batch, tags_batch)`` where *imgs_batch* is float32 with shape
    ``[batch_size] + input_shape`` and *tags_batch* is uint8 with shape
    ``[batch_size] + output_shape`` (all taken from ``self.config``).
    Rows are drawn with ``self.rng.randint``, so a row may repeat in a batch.

    Raises:
        AssertionError: if an image file is missing or a row has no tags.
    '''

    def scale(x):
        # Linearly map the pixel range of x onto [-1, 1]. A constant image
        # has zero range; return zeros instead of dividing by zero (which
        # would fill the whole batch entry with NaN).
        lo, hi = np.min(x), np.max(x)
        if hi == lo:
            return np.zeros(np.shape(x), dtype=np.float32)
        return (x - lo) / (hi - lo) * 2 - 1

    # Read the CSV and error-check contents.
    df = pd.read_csv(csv_path)
    img_names = ['%s/%s.jpg' % (imgs_dir, n) for n in df['image_name'].values]
    tag_sets = [set(t.strip().split(' ')) for t in df['tags'].values]
    for img_name, tag_set in zip(img_names, tag_sets):
        assert path.exists(img_name), img_name
        assert len(tag_set) > 0, tag_set

    while True:
        # New arrays at each iteration so a batch the consumer still holds
        # is never overwritten.
        batch_size = self.config['batch_size']
        imgs_batch = np.zeros([batch_size, ] + self.config['input_shape'],
                              dtype=np.float32)
        tags_batch = np.zeros([batch_size, ] + self.config['output_shape'],
                              dtype=np.uint8)

        # Sample random rows and build the batch.
        for batch_idx in range(batch_size):
            data_idx = self.rng.randint(0, len(img_names))
            img = resize(imread(img_names[data_idx], mode='RGB'),
                         self.config['input_shape'][:2],
                         preserve_range=True, mode='constant')
            if self.config['trn_transform']:
                img = random_transforms(img, nb_min=0, nb_max=2)
            imgs_batch[batch_idx] = scale(img)
            tags_batch[batch_idx] = tagset_to_boolarray(tag_sets[data_idx])

        yield imgs_batch, tags_batch
def create_net(self, weights_path=None):
    '''Build and compile the InceptionV3-based tagging network.

    The network output concatenates a 4-way softmax (cloud-cover classes)
    with 13 single-column outputs, each being the second column of its own
    2-way softmax. The compiled model is stored in ``self.net``.

    Args:
        weights_path: optional path to weights loaded after compiling.
    '''
    logger = logging.getLogger(funcname())

    inputs = Input(shape=self.config['input_shape'])
    inception = InceptionV3(include_top=False, weights='imagenet',
                            input_tensor=inputs, pooling='max')
    features = Dropout(0.1)(inception.output)

    # Single softmax classifier for the four cloud-cover classes.
    x = BatchNormalization()(features)
    x = Dense(256)(x)
    x = PReLU()(x)
    x = Dropout(0.2)(x)
    x = Dense(4, activation='softmax', name='out_cc')(x)
    tc = x

    # Individual softmax classifiers for remaining classes; only the second
    # column of each 2-way softmax is kept.
    clsf_rest = []
    for clsf_idx in range(13):
        x = BatchNormalization()(features)
        x = Dense(256)(x)
        x = PReLU()(x)
        x = Dropout(0.2)(x)
        x = Dense(2, activation='softmax', name='out_%d' % clsf_idx)(x)
        x = Lambda(lambda x: x[:, 1:])(x)
        clsf_rest.append(x)

    # Concatenate the rest of the tags. (Nothing is nullified here; cloudy
    # conflicts are only penalized in the loss via cloudymult.)
    tr = concatenate(clsf_rest, axis=-1)

    # Combine tags into one tensor.
    tags_comb = concatenate([tc, tr])

    self.net = Model(inputs=inputs, outputs=tags_comb)

    def kldiv(yt, yp):
        '''KL divergence for each example in batch.'''
        yt = K.clip(yt, K.epsilon(), 1)
        yp = K.clip(yp, K.epsilon(), 1)
        return K.sum(yt * K.log(yt / yp), axis=1)

    def wlogloss(yt, yp):
        '''Weighted log loss for each example in batch.'''
        # Weight false negative errors. This should decrease as recall increases.
        meanpos = K.sum(yp * yt) / (K.sum(yt) + K.epsilon())
        wfnmax = 20.
        wfnmult = (1. - meanpos) * wfnmax
        # Weight false positive errors.
        wfp = 1.
        wmat = (yt * wfnmult) + (K.abs(yt - 1) * wfp)
        errmat = yt * K.log(yp + K.epsilon()) + (
            (1 - yt) * K.log(1 - yp + K.epsilon()))
        return -1 * errmat * wmat

    def cloudymult(ytc, ypr):
        '''Multiplier for cases where the cloudy tag is present but other tags are predicted.'''
        mult = K.clip(
            K.sum(ypr, axis=1, keepdims=True) * ytc[:, 1:2], 1, 13)
        return mult

    def cloudyerrors(yt, yp):
        '''Number of samples per batch with cloudy tag errors.'''
        ytc = yt[:, :4]
        ypr = yp[:, 4:]
        x = K.round(K.max(ypr, axis=1)) * ytc[:, 1]
        return K.sum(x)

    def combined_loss(yt, yp):
        ytc = yt[:, :4]
        ypc = yp[:, :4]
        ytr = yt[:, 4:]
        ypr = yp[:, 4:]
        # Losses computed per example in the batch.
        # NOTE(review): kldiv sums over axis 1 without keepdims while
        # cloudymult keeps dims; under standard broadcasting lc * cm is not
        # a per-example product -- confirm this is intended.
        lc = kldiv(ytc, ypc)
        lr = wlogloss(ytr, ypr)
        cm = cloudymult(ytc, ypr)
        # Return a single scalar.
        return K.mean(lc * cm) + K.mean(lr * cm)

    def ccsum(yt, yp):
        return K.mean(K.sum(yp[:, :4], axis=1))

    # Generate an F2 metric for each tag.
    tf2_metrics = []
    for i, t in enumerate(self.TAGS_net_short):
        @rename('F2_%s' % t)
        def tagF2(yt, yp, i=i):
            return F2(yt[:, i], yp[:, i])
        tf2_metrics.append(tagF2)

    # Generate a metric for each tag that tracks how often it occurs in a
    # batch. Uses the same tag list as the F2 metrics so that column i gets
    # the same tag name in both 'F2_<tag>' and 'cnt_<tag>'.
    tcnt_metrics = []
    for i, t in enumerate(self.TAGS_net_short):
        @rename('cnt_%s' % t)
        def tagcnt(yt, yp, i=i):
            return K.sum(yt[:, i])
        tcnt_metrics.append(tagcnt)

    self.net.compile(optimizer=Adam(0.0007, decay=0.0001),
                     metrics=[F2, prec, reca, cloudyerrors, ccsum] +
                     tf2_metrics + tcnt_metrics,
                     loss=combined_loss)
    self.net.summary()
    plot_model(self.net, to_file='%s/net.png' % self.cpdir)

    if weights_path is not None:
        logger.info('Loading weights from %s' % weights_path)
        self.net.load_weights(weights_path)
def fit(self, yt_trn, nb_iter=5, use_hyperopt=True, use_rand_search=False, max_parallel=cpu_count()):
    '''Search for ensemble weights that maximize F2 on the training set.

    Args:
        yt_trn: true training tags.
        nb_iter: number of search iterations (hyperopt evaluations and/or
            random-search draws).
        use_hyperopt: run the hyperopt-based search.
        use_rand_search: additionally run a random-weights search.
        max_parallel: maximum number of worker processes for the per-tag
            hyperopt search; ``1`` runs it serially in this process.

    Returns:
        List of ``(F2 score, tag thresholds, ensemble weights)`` tuples,
        starting with the equal-weights baseline.
    '''
    logger = logging.getLogger(funcname())
    np.set_printoptions(formatter={'float': '{: 0.3f}'.format})

    N_trn = len(self.paths_yp_trn)

    # results format: [(F2 score, tag thresholds, ensemble weights)]
    results, best_idx = [], 0

    # Try equal weights
    w = np.ones((N_trn, NUM_OUTPUTS), dtype=np.float16)
    f2_opt, thresh_opt, w = get_weighted_optimized_results(yt_trn, self.yp_trn_all, w)
    results.append((f2_opt, thresh_opt, w))
    logger.info('Equal weights: f2 = %f' % f2_opt)

    # Try using each member individually.
    try_individually = False
    if try_individually:
        for it in range(N_trn):
            # Fresh array per member: zeroing the previous `w` in place
            # would corrupt weights already stored in `results`.
            w = np.zeros((N_trn, NUM_OUTPUTS), dtype=np.float16)
            w[it] = 1
            f2_opt, thresh_opt, w = get_weighted_optimized_results(yt_trn, self.yp_trn_all, w)
            logger.info('%-70s f2 = %f' % (self.paths_yp_trn[it][-60:], f2_opt))
            if f2_opt > 0.88:
                results.append((f2_opt, thresh_opt, w))
                if f2_opt > results[best_idx][0]:
                    old_best = results[best_idx][0]
                    best_idx = len(results)-1
                    logger.info('f2 improved from %lf to %lf' % (old_best, f2_opt))

    if use_hyperopt:
        optimize_individually = True
        if optimize_individually:
            logging.getLogger("hyperopt.tpe").setLevel(logging.WARNING)
            # NOTE(review): the serial path passes four positional
            # arguments while Pool.map passes ONE tuple per call -- confirm
            # optimize_single_tag supports both calling conventions.
            if max_parallel == 1:
                tag_weights = []
                for tag_idx in range(NUM_OUTPUTS):
                    logger.info("Optimizing tag %d" % tag_idx)
                    tag_weights.append(optimize_single_tag(
                        tag_idx, self.yp_trn_all[:, :, tag_idx], yt_trn[:, tag_idx], nb_iter))
            else:
                p = Pool(min(max_parallel, NUM_OUTPUTS))
                try:
                    tag_weights = p.map(
                        optimize_single_tag,
                        [(tag_idx, self.yp_trn_all[:, :, tag_idx], yt_trn[:, tag_idx], nb_iter)
                         for tag_idx in range(NUM_OUTPUTS)])
                finally:
                    # Release the worker processes even if a worker raised.
                    p.close()
                    p.join()
            w = np.transpose(tag_weights)
            f2_opt, thresh_opt, w = get_weighted_optimized_results(yt_trn, self.yp_trn_all, w)
            logger.info("Optimized weights: f2 = %f" % f2_opt)
            results.append((f2_opt, thresh_opt, w))
        else:
            self.trials = Trials()

            def objective(w, self=self, yt_trn=yt_trn):
                w = np.array(w).reshape((N_trn, NUM_OUTPUTS))
                f2_opt, thresh_opt, w = get_weighted_optimized_results(yt_trn, self.yp_trn_all, w)
                if len(self.trials.trials) > 1:
                    if f2_opt > -self.trials.best_trial['result']['loss']:
                        logger = logging.getLogger(funcname())
                        logger.info('hyperopt f2 improved from %lf to %lf' %
                                    (-self.trials.best_trial['result']['loss'], f2_opt))
                return {
                    'loss': -f2_opt,
                    'status': STATUS_OK,
                    'thresholds': thresh_opt,
                    'weights': w,
                }

            # One uniform(0, 1) variable per (member, tag), each divided by
            # the sum of all of them.
            weights_space_all = [hp.uniform(str(i), 0, 1) for i in range(N_trn*NUM_OUTPUTS)]
            total = 0
            for s in weights_space_all:
                total += s
            for i in range(len(weights_space_all)):
                weights_space_all[i] /= total

            best = fmin(objective, space=weights_space_all, algo=tpe.suggest,
                        max_evals=nb_iter, trials=self.trials)
            sortedTrials = sorted(self.trials.trials, key=lambda x: x['result']['loss'])[:10]

            # Process hyperopt results into our results list
            for t in sortedTrials:
                f2_opt = -t['result']['loss']
                thresh_opt = t['result']['thresholds']
                w = t['result']['weights']
                if f2_opt > 0.88:
                    results.append((f2_opt, thresh_opt, w))
                    if f2_opt > results[best_idx][0]:
                        best_idx = len(results)-1

    if use_rand_search:
        # Randomly choose a weight for each tag across all member predictions.
        logger.info('Searching random weights...')
        f2_scores = []
        for it in range(nb_iter):
            w = np.random.rand(N_trn, NUM_OUTPUTS)
            f2_opt, thresh_opt, w = get_weighted_optimized_results(yt_trn, self.yp_trn_all, w)
            f2_scores.append(f2_opt)

            if f2_opt > 0.88:
                results.append((f2_opt, thresh_opt, w))
                if f2_opt > results[best_idx][0]:
                    old_best = results[best_idx][0]
                    best_idx = len(results)-1
                    logger.info('%-05.2lf: f2 improved from %lf to %lf:\n' %
                                ((it / nb_iter * 100), old_best, f2_opt))

            if it % 50 == 0:
                logger.info('%-05.2lf: f2 mean=%.4lf, min=%.4lf, max=%.4lf, stdv=%.4lf, unique=%d' %
                            ((it / nb_iter * 100), np.mean(f2_scores), np.min(f2_scores),
                             np.max(f2_scores), np.std(f2_scores), len(np.unique(f2_scores))))

    return results
def on_train_begin(self, logs):
    '''Mark the first ``self.frozen_up_to_idx`` layers of the model non-trainable.'''
    logger = logging.getLogger(funcname())
    frozen_layers = self.model.layers[:self.frozen_up_to_idx]
    layer_idx = 0
    while layer_idx < len(frozen_layers):
        frozen_layer = frozen_layers[layer_idx]
        frozen_layer.trainable = False
        logger.info('Freezing layer %d: %s' % (layer_idx, frozen_layer.name))
        layer_idx += 1
def on_epoch_end(self, epoch, logs):
    '''Evaluate the model on validation batches and record the metrics.

    1. Predict on ``self.nb_steps`` batches drawn from ``self.batch_gen``.
    2. Optimize per-tag thresholds and log precision, recall and F2 of
       each tag.
    3. Store the optimized overall metrics and the per-tag metric
       variances in *logs* (``val_F2``, ``val_prec``, ``val_reca``,
       ``val_f2_var``, ``val_prec_var``, ``val_reca_var``) and track the
       best F2 seen so far.
    '''
    logger = logging.getLogger(funcname())

    # Make predictions and store all true and predicted tags.
    nb_samples = self.batch_size * self.nb_steps
    yt = np.zeros((nb_samples, len(TAGS)))
    yp = np.zeros((nb_samples, len(TAGS)))
    for bidx in tqdm(range(self.nb_steps)):
        ib, tb = next(self.batch_gen)
        i0, i1 = bidx * self.batch_size, (bidx + 1) * self.batch_size
        yt[i0:i1] = tb
        yp[i0:i1] = self.model.predict_on_batch(ib)

    # Find optimal thresholds.
    thresholds = optimize_thresholds(yt, yp)
    yp_opt = (yp > thresholds).astype(np.uint8)

    # Print per-tag metrics with stress-inducing colors.
    tags_f2, tags_p, tags_r = [], [], []
    for tidx in range(len(TAGS)):
        f2, p, r = f2pr(yt[:, tidx], yp_opt[:, tidx])
        tags_f2.append(f2)
        tags_p.append(p)
        # Store recall here; precision was appended by mistake before,
        # which made val_reca_var a copy of val_prec_var.
        tags_r.append(r)
        s = '%-20s F2=%.3lf p=%.3lf r=%.3lf t=%.3lf' % (
            TAGS[tidx], f2, p, r, thresholds[tidx])
        if f2 > 0.9:
            s = colored(s, 'green')
        elif f2 > 0.8:
            s = colored(s, 'yellow')
        else:
            s = colored(s, 'red')
        logger.info(s)

    # Metric variance across tags.
    logs['val_f2_var'] = np.var(tags_f2)
    logs['val_prec_var'] = np.var(tags_p)
    logs['val_reca_var'] = np.var(tags_r)

    # Metrics on all predictions.
    f2, p, r = f2pr(yt, (yp > 0.5))
    logger.info('Unoptimized F2=%.3lf, p=%.3lf, r=%.3lf' % (f2, p, r))
    f2, p, r = f2pr(yt, yp_opt)
    logger.info('Optimized F2=%.3lf, p=%.3lf, r=%.3lf' % (f2, p, r))
    logger.info(
        'Variance F2=%.3lf, p=%.3lf, r=%.3lf' %
        (logs['val_f2_var'], logs['val_prec_var'], logs['val_reca_var']))

    # Record optimized metrics.
    logs['val_F2'] = f2
    logs['val_prec'] = p
    logs['val_reca'] = r

    # Let em know you improved.
    if f2 > self.best_metric:
        logger.info('val_F2 improved from %.3lf to %.3lf %s!' %
                    (self.best_metric, f2, u'\U0001F642'))
        self.best_metric = f2
        self.best_epoch = epoch
    else:
        logger.info('Last improvement %d epochs ago %s. Maybe next time!' %
                    ((epoch - self.best_epoch), u'\U0001F625'))
def _predict_augmented(model, imgs, out_shape, aug_func):
    """Predict on *imgs* batch-by-batch after resizing and applying *aug_func*.

    Returns a float16 array of shape *out_shape* holding the activations.
    """
    batch_size = model.cfg['tst_batch_size']
    nb_imgs = imgs.shape[0]
    yp = np.zeros(out_shape, dtype=np.float16)
    for i0 in tqdm(range(0, nb_imgs, batch_size)):
        i1 = min(i0 + batch_size, nb_imgs)
        ib = np.array([imresize(img[...], model.cfg['input_shape'])
                       for img in imgs[i0:i1]])
        yp[i0:i1] = model.predict_batch(aug_func(ib))
    return yp


def predict(model_class, args):
    """Instantiates the model and makes augmented predictions.

    For every test-time augmentation, and for their mean, the raw
    activations are saved as ``.npy`` files and CSV submissions are written
    with both the default (0.5) and the optimized thresholds. Thresholds
    are optimized on the training set and reused for the test set.

    Args:
        model_class: class instantiated as ``model_class(model_path=...)``.
        args: dict; ``args['model']`` is the path of the model file.
    """
    logger = logging.getLogger(funcname())

    # Generate ID from model file. Used for saving files.
    with open(args['model'], 'rb') as fp:
        MD5ID = md5(fp.read()).hexdigest()

    # Setup model.
    # NOTE(review): cfg['cpdir'] is set here but files are written to
    # model.cpdir -- presumably the model exposes cfg['cpdir'] as that
    # attribute; confirm.
    model = model_class(model_path=args['model'])
    model.cfg['cpdir'] = '/'.join(args['model'].split('/')[:-1])
    assert 'hdf5_path_trn' in model.cfg
    assert 'hdf5_path_tst' in model.cfg
    assert 'tst_batch_size' in model.cfg

    # Each augmentation operates on a whole batch: axis 0 is the batch axis,
    # flips and rotations act on axes 1 and 2.
    aug_funcs = [
        ('identity', lambda x: x),
        ('vflip', lambda x: x[:, ::-1, ...]),
        ('hflip', lambda x: x[:, :, ::-1]),
        ('rot90', lambda x: np.rot90(x, 1, axes=(1, 2))),
        ('rot180', lambda x: np.rot90(x, 2, axes=(1, 2))),
        ('rot270', lambda x: np.rot90(x, 3, axes=(1, 2))),
        ('rot90vflip', lambda x: np.rot90(x, 1, axes=(1, 2))[:, ::-1, ...]),
        ('rot90hflip', lambda x: np.rot90(x, 1, axes=(1, 2))[:, :, ::-1])
    ]

    # Load references to hdf5 data; both files are closed on exit, even on
    # error.
    with h5py.File(model.cfg['hdf5_path_trn'], 'r') as data_trn, \
            h5py.File(model.cfg['hdf5_path_tst'], 'r') as data_tst:
        imgs_trn, tags_trn = data_trn.get('images'), data_trn.get('tags')[...]
        imgs_tst, tags_tst = data_tst.get('images'), data_tst.get('tags')
        names_trn = data_trn.attrs['names'].split(',')
        names_tst = data_tst.attrs['names'].split(',')

        # Keep mean combination of all augmentations.
        yp_trn_all = np.zeros(tags_trn.shape, dtype=np.float16)
        yp_tst_all = np.zeros(tags_tst.shape, dtype=np.float16)

        # Make training and testing predictions batch-by-batch for multiple
        # augmentations. Serialize the matrix of activations for each
        # augmentation.
        for aug_name, aug_func in aug_funcs:
            logger.info('TTA: %s' % (aug_name))

            # Train set.
            yp = _predict_augmented(model, imgs_trn, tags_trn.shape, aug_func)

            # Optimize activation thresholds and print F2 as a sanity check.
            f2, p, r = f2pr(tags_trn, (yp > 0.5).astype(np.uint8))
            logger.info('Default f2=%.4lf, p=%.4lf, r=%.4lf' % (f2, p, r))
            thresh_opt = optimize_thresholds(tags_trn, yp)
            f2, p, r = f2pr(tags_trn, (yp > thresh_opt))
            logger.info('Optimized f2=%.4lf, p=%.4lf, r=%.4lf' % (f2, p, r))

            # Save csv submission with default and optimized thresholds.
            csv_path = '%s/submission_trn_%s_def_%s.csv' % (model.cpdir, aug_name, MD5ID)
            submission(names_trn, (yp > 0.5), csv_path)
            csv_path = '%s/submission_trn_%s_opt_%s.csv' % (model.cpdir, aug_name, MD5ID)
            submission(names_trn, (yp > thresh_opt), csv_path)

            # Save raw activations.
            npy_path = '%s/yp_trn_%s_%s.npy' % (model.cpdir, aug_name, MD5ID)
            np.save(npy_path, yp)
            logger.info('Saved %s.' % npy_path)

            # Add to mean combination.
            yp_trn_all += yp / len(aug_funcs)

            # Test set.
            yp = _predict_augmented(model, imgs_tst, tags_tst.shape, aug_func)

            # Save csv submission with default and optimized thresholds
            # (thresholds come from the train set of this augmentation).
            csv_path = '%s/submission_tst_%s_def_%s.csv' % (model.cpdir, aug_name, MD5ID)
            submission(names_tst, (yp > 0.5), csv_path)
            csv_path = '%s/submission_tst_%s_opt_%s.csv' % (model.cpdir, aug_name, MD5ID)
            submission(names_tst, (yp > thresh_opt), csv_path)

            # Save raw activations.
            npy_path = '%s/yp_tst_%s_%s.npy' % (model.cpdir, aug_name, MD5ID)
            np.save(npy_path, yp)
            logger.info('Saved %s.' % npy_path)

            # Add to mean combination.
            yp_tst_all += yp / len(aug_funcs)

        # Optimize activation thresholds for combined predictions.
        logger.info('TTA: mean')
        f2, p, r = f2pr(tags_trn, (yp_trn_all > 0.5))
        logger.info('Default f2 =%.4lf, p=%.4lf, r=%.4lf' % (f2, p, r))
        thresh_opt = optimize_thresholds(tags_trn, yp_trn_all)
        f2, p, r = f2pr(tags_trn, (yp_trn_all > thresh_opt))
        logger.info('Optimized f2 =%.4lf, p=%.4lf, r=%.4lf' % (f2, p, r))

        # Save train and test csv submission with default and optimized
        # thresholds.
        csv_path = '%s/submission_trn_%s_def_%s.csv' % (model.cpdir, 'mean_aug', MD5ID)
        submission(names_trn, (yp_trn_all > 0.5), csv_path)
        csv_path = '%s/submission_trn_%s_opt_%s.csv' % (model.cpdir, 'mean_aug', MD5ID)
        submission(names_trn, (yp_trn_all > thresh_opt), csv_path)
        csv_path = '%s/submission_tst_%s_def_%s.csv' % (model.cpdir, 'mean_aug', MD5ID)
        submission(names_tst, (yp_tst_all > 0.5), csv_path)
        csv_path = '%s/submission_tst_%s_opt_%s.csv' % (model.cpdir, 'mean_aug', MD5ID)
        submission(names_tst, (yp_tst_all > thresh_opt), csv_path)

        # Save train and test raw activations.
        npy_path = '%s/yp_trn_%s_%s.npy' % (model.cpdir, 'mean_aug', MD5ID)
        np.save(npy_path, yp_trn_all)
        logger.info('Saved %s.' % npy_path)
        npy_path = '%s/yp_tst_%s_%s.npy' % (model.cpdir, 'mean_aug', MD5ID)
        np.save(npy_path, yp_tst_all)
        logger.info('Saved %s.' % npy_path)