def make_cv(sep_time):
    """make cv train/valid csv files in preprocessed folder"""
    log_name = now()
    log_path = 'log/make_feature/'
    Path(log_path).mkdir(parents=True, exist_ok=True)
    log_fname = log_path + f'{log_name}.log'
    logger = Logger('make_feature', log_fname)
    df = load_csv(config.TRAIN_PATH,
                  dtypes=config.TRAIN_DTYPES,
                  parse_dates=config.TRAIN_PARSE_DATES)
    df = df.drop('attributed_time', axis=1)
    df['hour'] = df.click_time.dt.hour
    df['day'] = df.click_time.dt.day
    ce_cols = [['ip', 'day', 'hour'], ['ip', 'app'], ['ip', 'app', 'os'],
               ['ip', 'device'], ['app', 'channel']]
    with logger.interval_timer('compute'):
        df = to_parallel(df, make_ce, ce_cols)
    # use the sep_time argument rather than the global config constant
    split_gen = enumerate(timeseries_cv(df, sep_time))
    for num, (train_index, test_index) in split_gen:
        logger.info(f"fold {num} start")
        train_df, test_df = df.loc[train_index], df.loc[test_index]
        # note: every fold writes the same two files, so only the last
        # fold's split survives on disk
        train_df.to_csv('preprocessed/val/train.csv', index=False)
        test_df.to_csv('preprocessed/val/test.csv', index=False)
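
# Hedged usage sketch (not in the original source): make_cv is presumably
# invoked as a script step with the split timestamp from config. SEP_TIME is
# the repository's own constant; the guard below is illustrative.
if __name__ == '__main__':
    make_cv(config.SEP_TIME)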

def add_features(dir_name, dump_dir, debug=None):
    """add features to existing fold files"""
    log_name = now()
    log_path = 'log/make_feature/'
    Path(log_path).mkdir(parents=True, exist_ok=True)
    log_fname = log_path + f'{log_name}.log'
    logger = Logger('make_feature', log_fname)
    Path(dump_dir).mkdir(parents=True, exist_ok=True)
    cv = enumerate(generate_cv(dir_name, 4, debug))
    for num, (train_df, test_df) in cv:
        df = dd.concat([train_df, test_df])
        train_len = len(train_df)
        df['hour'] = df.click_time.dt.hour
        df['day'] = df.click_time.dt.day
        df = make_comp(df,
                       [['ip', 'day', 'hour'], ['ip', 'app'],
                        ['ip', 'app', 'os'], ['ip', 'device'],
                        ['app', 'channel']],
                       logger, count=True)
        df = make_comp(df,
                       [['app', 'channel'], ['app', 'device'], ['app', 'os'],
                        ['os', 'ip'], ['app', 'ip']],
                       logger)
        df = df.compute().reset_index()
        df.drop('index', axis=1, inplace=True)
        df = make_diff(df,
                       [['ip', 'app'], ['ip', 'app', 'os'], ['ip', 'device'],
                        ['app', 'channel'], ['ip', 'app', 'os', 'device']],
                       1, logger)
        # iloc, not loc: label-based slicing is inclusive and would put row
        # `train_len` into both halves
        train_df, test_df = df.iloc[:train_len], df.iloc[train_len:]
        train_df.to_csv(dump_dir + f'fold_{num}_train.csv', index=False)
        test_df.to_csv(dump_dir + f'fold_{num}_test.csv', index=False)
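
# Hedged usage sketch (assumed, not from the source): add_features consumes
# the fold CSVs written by make_cv and dumps augmented folds. Both paths
# below are illustrative placeholders.
if __name__ == '__main__':
    add_features('preprocessed/val/', 'preprocessed/features/')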
if results.GPUID is None:  # guard reconstructed; original condition not shown
    # find maximum number of available GPUs
    call = "nvidia-smi --list-gpus"
    pipe = Popen(call, shell=True, stdout=PIPE).stdout
    available_gpus = pipe.read().decode().splitlines()
    NUM_GPUS = len(available_gpus)
else:
    os.environ["CUDA_VISIBLE_DEVICES"] = str(results.GPUID)

num_channels = results.num_channels
plane = results.plane
num_epochs = 1000000
num_patches = results.num_patches
batch_size = results.batch_size
model = results.model
model_architecture = "unet"
start_time = utils.now()
experiment_details = start_time + "_" + model_architecture + "_" +\
    results.experiment_details
loss = results.loss
learning_rate = 1e-4

utils.save_args_to_csv(results, os.path.join("results", experiment_details))

WEIGHT_DIR = os.path.join("models", "weights", experiment_details)
TB_LOG_DIR = os.path.join("models", "tensorboard", start_time)

MODEL_NAME = model_architecture + "_model_" + experiment_details
MODEL_PATH = os.path.join(WEIGHT_DIR, MODEL_NAME + ".json")
HISTORY_PATH = os.path.join(WEIGHT_DIR, MODEL_NAME + "_history.json")
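
# Self-contained sketch of the GPU-discovery idiom above (assumption: an
# NVIDIA driver with nvidia-smi on PATH; `get_num_gpus` is a hypothetical
# helper name, not part of the original code).
from subprocess import PIPE, Popen

def get_num_gpus():
    # `nvidia-smi --list-gpus` prints exactly one line per visible GPU
    out = Popen("nvidia-smi --list-gpus", shell=True, stdout=PIPE).stdout
    return len(out.read().decode().splitlines())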
    # (tail of a per-image scoring loop; the loop header precedes this excerpt)
    max_idx, max_val = max(enumerate(pred), key=itemgetter(1))
    max_true, val_true = max(enumerate(ground_truth), key=itemgetter(1))
    pred_class = class_encodings[max_idx]
    gt_class = class_encodings[max_true]
    if max_idx != max_true:
        acc_count -= 1
    record_results(results.OUTFILE,
                   (os.path.basename(filename), gt_class, pred_class,
                    confidences))

print("{} of {} images correctly classified.\nAccuracy: {:.2f}\n".format(
    str(acc_count), str(total), acc_count / total * 100.))

with open(os.path.join(PRED_DIR, now() + "_results.txt"), 'w') as f:
    with open(os.path.join(PRED_DIR, now() + "_results_errors.txt"),
              'w') as e:
        for filename, pred, ground_truth in zip(filenames, preds, y):
            # find class of prediction via max
            max_idx, max_val = max(enumerate(pred), key=itemgetter(1))
            max_true, val_true = max(enumerate(ground_truth),
                                     key=itemgetter(1))
            pos = class_encodings[max_idx]

            # record confidences
            confidences = ", ".join(
                ["{:>5.2f}".format(x * 100) for x in pred])

            if max_idx == max_true:
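
# Hedged aside: max(enumerate(v), key=itemgetter(1)) is a pure-Python argmax;
# with NumPy arrays the same (index, value) pair comes from np.argmax.
# `argmax_with_value` is a hypothetical helper for illustration.
import numpy as np

def argmax_with_value(vec):
    """Return (index, value) of the largest entry, mirroring
    max(enumerate(vec), key=itemgetter(1))."""
    idx = int(np.argmax(vec))
    return idx, vec[idx]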
def experiment(train=None, test=None, seed=None):
    """run one LightGBM cross-validation experiment and write a submission"""
    cv_name = now()
    cv_log_path = f'cv/LightGBM/{cv_name}/'
    Path(cv_log_path).mkdir(parents=True, exist_ok=True)
    log_fname = cv_log_path + 'cv.log'
    cv_logger = Logger('CV_log', log_fname)
    cv_logger.info("Experiment Start")

    with cv_logger.interval_timer('load data'):
        if train:
            train_df = load_feather(train)
            # train_df = train_df.sample(100000)
        else:
            fs = Path('preprocessed/features').glob('train_*.csv')
            # fs = ['preprocessed/features/train_nextClick.csv',
            #       'preprocessed/features/train_ip_app_nextClick.csv']
            train_df = load_data(config.TRAIN_PATH, fs, cv_logger,
                                 dump='preprocessed/train.ftr')
        # offset = pd.to_datetime('2017-11-07 16:00:00')
        # train_df = train_df[train_df.click_time >= offset]
        gc.collect()
        if test:
            test_df = load_feather(test)
        else:
            fs = Path('preprocessed/features').glob('test_*.csv')
            # fs = ['preprocessed/features/test_nextClick.csv',
            #       'preprocessed/features/test_ip_app_nextClick.csv']
            test_df = load_data(config.TEST_PATH, fs, cv_logger,
                                dump='preprocessed/test.ftr')
        gc.collect()

    train_df = train_df.reset_index(drop=True)
    test_df = test_df.reset_index(drop=True)

    cv_logger.info(config.SEP_TIME)
    with cv_logger.interval_timer('split'):
        split_gen = enumerate(timeseries_cv(train_df, config.SEP_TIME))

    # dump configuration
    aucs = []
    # candidates not yet added: ip_day_hour_nunique,
    # app_device_channel_nextClick, ip_os_device_nextClick
    train_cols = [
        'app', 'app_channel_ce', 'channel', 'device', 'hour', 'ip_app_ce',
        'ip_app_channel_hour_mean', 'ip_app_channel_nextClick',
        'ip_app_device_os_channel_nextClick', 'ip_app_device_os_nextClick',
        'ip_app_nextClick', 'ip_app_nunique', 'ip_app_os_ce',
        'ip_app_os_nunique', 'ip_channel_nunique', 'ip_day_hour_ce',
        'ip_day_nunique', 'ip_day_hour_nunique', 'ip_device_nunique',
        'ip_device_os_app_cumcount', 'ip_nextClick',
        'ip_os_device_nextClick', 'app_device_channel_nextClick',
        'ip_os_cumcount', 'ip_os_device_app_nunique', 'os'
    ]
    # encode_list = config.ENCODE_LIST
    # threshold = config.TE_THR
    valid_time = [4, 5, 6, 9, 10, 11, 13, 14, 15]
    # public_time = [5, 6, 9, 10, 11, 13, 14, 15]
    train_df = proc_bf_cv(train_df)
    gc.collect()

    for num, (train_idx, valid_idx) in split_gen:
        cv_logger.kiritori()
        cv_logger.info(f"fold {num} start")
        with cv_logger.interval_timer('train test split'):
            cvtrain_df = train_df.loc[train_idx]
            valid_df = train_df.loc[valid_idx]
            valid_df2 = valid_df[valid_df.hour.isin(valid_time)]
        cv_logger.info(f'train size {cvtrain_df.shape}')
        cv_logger.info(f'valid size {valid_df2.shape}')
        # valid_df3 = valid_df[valid_df.hour == 4]
        # valid_df4 = valid_df[valid_df.hour.isin(public_time)]
        # with cv_logger.interval_timer('target encode'):
        #     cvtrain_df, valid_df, tes = custom_encode(cvtrain_df, valid_df,
        #                                               encode_list,
        #                                               threshold, cv_logger)
        # cvtrain_df = proc_bf_cv(cvtrain_df)
        # valid_df = proc_bf_cv(valid_df)
        # train_cols += [c for c in cvtrain_df.columns if '_te' in c]
        cv_logger.info("LGBM Baseline validation")

        eval_names = ['valid_lb']
        train_X, train_y = cvtrain_df[train_cols], cvtrain_df.is_attributed
        eval_set = []
        with cv_logger.interval_timer('valid make'):
            for df in [valid_df2]:
                X, y = df[train_cols], df.is_attributed
                eval_set.append((X, y))
        cv_logger.info(f'train size {train_X.shape}')
        cv_logger.info(f'valid size {eval_set[0][0].shape}')
        cv_logger.info(list(train_X.columns))
        gc.collect()

        lgbm = LGBMClassifier(n_estimators=1000,
                              learning_rate=0.1,
                              num_leaves=31,
                              max_depth=-1,
                              min_child_samples=20,
                              min_child_weight=5,
                              max_bin=255,
                              scale_pos_weight=200,
                              colsample_bytree=0.3,
                              subsample=0.6,
                              subsample_freq=0,
                              random_state=seed,
                              n_jobs=24)
        cv_logger.info(lgbm.get_params())
        lgbm.fit(train_X, train_y,
                 eval_metric="auc",
                 eval_set=eval_set,
                 eval_names=eval_names,
                 early_stopping_rounds=30,
                 verbose=10)
        auc = lgbm.best_score_
        aucs.append(auc)
        cv_logger.info(f"naive LGBM AUC : {auc}")
        cv_logger.info(pformat(lgbm.evals_result_))
        cv_logger.info("feature importance")
        fi = dict(zip(train_X.columns, lgbm.feature_importances_))
        cv_logger.info(pformat(fi))
        cv_logger.info(f"fold {num} end")

    del train_df
    cv_logger.double_kiritori()
    cv_logger.info("Cross Validation Done")
    cv_logger.info("Naive LGBM")
    cv_logger.info(f"AUC {auc}")
    cv_logger.info("Predict")
    # with cv_logger.interval_timer('all target encode'):
    #     for te in tes:
    #         test_df = te.transform(test_df)
    test_df = proc_bf_cv(test_df)
    test_X = test_df[train_cols]
    pred = lgbm.predict_proba(test_X, num_iteration=lgbm.best_iteration_)
    test_df['is_attributed'] = pred[:, 1]
    test_df['click_id'] = test_df.click_id.astype('uint32')
    sub = test_sup_merge(test_df)
    sub[['click_id', 'is_attributed']].to_csv(f'sub/{cv_name}_{seed}.csv',
                                              index=False)
    cv_logger.info("Experiment Done")
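
# Hedged usage sketch (assumed): one submission per seed, reusing the feather
# dumps that load_data writes above; the seed values are illustrative.
if __name__ == '__main__':
    for seed in (0, 1, 2):
        experiment(train='preprocessed/train.ftr',
                   test='preprocessed/test.ftr',
                   seed=seed)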
def main():
    ##############################################
    # Load Data
    ##############################################
    train_data = []
    test_data = []
    train_label = []
    test_label = []

    for fname in glob.glob(os.path.join(train_data_dir, '*.npy')):
        train_data.append(np.load(fname))
    miu, std = normalize(train_data)
    train_data = sort_by_length(train_data)
    for fname in glob.glob(os.path.join(train_label_dir, '*.npy')):
        train_label.append(np.load(fname))
    for fname in glob.glob(os.path.join(test_data_dir, '*.npy')):
        test_data.append(np.load(fname))
    normalize(test_data, miu=miu, std=std)
    for fname in glob.glob(os.path.join(test_label_dir, '*.npy')):
        test_label.append(np.load(fname))

    ##############################################
    # Preparation
    ##############################################
    max_seq_length = max([d.shape[1] for d in train_data + test_data])
    num_test_samples = len(test_data)
    # borrow create_batches to transform the test data / labels in one batch
    (test_data_tensor, test_seq_lengths), \
        (test_label_indices, test_label_vals, test_label_shape) = \
        list(create_batches(test_data, test_label, max_seq_length,
                            num_test_samples, range(num_test_samples)))[0]

    cur_unixtime = time.time()
    cur_checkpoint_path = os.path.join(CHECKPOINT_DIR,
                                       '{:.0f}'.format(cur_unixtime))
    if not os.path.exists(cur_checkpoint_path):
        os.makedirs(cur_checkpoint_path)
    cur_tb_summary_path = os.path.join(TENSORBOARD_LOG_DIR,
                                       '{:.0f}'.format(cur_unixtime))
    if not os.path.exists(cur_tb_summary_path):
        os.makedirs(cur_tb_summary_path)

    ##############################################
    # Build Graph
    ##############################################
    graph = tf.Graph()
    with graph.as_default():
        ##################
        # INPUT
        ##################
        X_train = tf.placeholder(tf.float32,
                                 shape=(max_seq_length, batch_size,
                                        num_features),
                                 name='X_train')
        X_test = tf.placeholder(tf.float32,
                                shape=(max_seq_length, num_test_samples,
                                       num_features),
                                name='X_test')

        y_train_indices = tf.placeholder(tf.int64, shape=(None, 2))
        y_train_vals = tf.placeholder(tf.int64)
        y_train_shape = tf.placeholder(tf.int64, shape=(2, ))
        y_train = tf.cast(
            tf.SparseTensor(y_train_indices, y_train_vals, y_train_shape),
            tf.int32)

        y_test_indices = tf.placeholder(tf.int64, shape=(None, 2))
        y_test_vals = tf.placeholder(tf.int64)
        y_test_shape = tf.placeholder(tf.int64, shape=(2, ))
        y_test = tf.cast(
            tf.SparseTensor(y_test_indices, y_test_vals, y_test_shape),
            tf.int32)

        seq_lengths_train = tf.placeholder(tf.int32, shape=(batch_size, ))
        seq_lengths_test = tf.placeholder(tf.int32,
                                          shape=(num_test_samples, ))

        ##################
        # BGRU
        ##################
        def brnn_layer(fw_cell, bw_cell, inputs, seq_lengths, scope=None):
            (output_fw, output_bw), _ = tf.nn.bidirectional_dynamic_rnn(
                fw_cell, bw_cell, inputs=inputs, dtype=tf.float32,
                sequence_length=seq_lengths, time_major=True, scope=scope)
            # sum the two directions instead of concatenating them
            brnn_combined_outputs = output_fw + output_bw
            return brnn_combined_outputs

        def multi_brnn_layer(inputs, seq_lengths, num_layers, is_training,
                             use_dropout=True, keep_prob=0.5):
            inner_outputs = inputs
            for n in range(num_layers):
                forward_cell = rnn_cell_fn(num_rnn_hidden,
                                           activation=rnn_cell_activation_fn)
                backward_cell = rnn_cell_fn(num_rnn_hidden,
                                            activation=rnn_cell_activation_fn)
                inner_outputs = brnn_layer(forward_cell, backward_cell,
                                           inner_outputs, seq_lengths,
                                           'brnn_{}'.format(n))
                if use_dropout:
                    inner_outputs = tf.contrib.layers.dropout(
                        inner_outputs, keep_prob=keep_prob,
                        is_training=is_training)
            return inner_outputs

        with tf.variable_scope('rlan') as scope:
            brnn_outputs_train = multi_brnn_layer(X_train, seq_lengths_train,
                                                  num_rnn_layers, True,
                                                  keep_prob=keep_prob)
            brnn_outputs_train = [
                tf.reshape(t, shape=(batch_size, num_rnn_hidden))
                for t in tf.split(brnn_outputs_train, max_seq_length, axis=0)
            ]
            scope.reuse_variables()
            brnn_outputs_test = multi_brnn_layer(X_test, seq_lengths_test,
                                                 num_rnn_layers, False,
                                                 keep_prob=keep_prob)
            brnn_outputs_test = [
                tf.reshape(t, shape=(num_test_samples, num_rnn_hidden))
                for t in tf.split(brnn_outputs_test, max_seq_length, axis=0)
            ]

        # TODO: Learning Rate Decay
        # TODO: Use better initialization
        # TODO: Add BatchNorm
        # TODO: Joint LM-acoustic Model
        # TODO: Implement Demo (audio => text)

        ##################
        # CTC
        ##################
        # with tf.name_scope('fc-layer'):
        fc_W = tf.get_variable('fc_W',
                               initializer=tf.truncated_normal(
                                   [num_rnn_hidden, num_classes]))
        fc_b = tf.get_variable('fc_b',
                               initializer=tf.truncated_normal([num_classes]))

        logits_train = [
            tf.matmul(output, fc_W) + fc_b for output in brnn_outputs_train
        ]
        logits3d_train = tf.stack(logits_train)
        logits_test = [
            tf.matmul(output, fc_W) + fc_b for output in brnn_outputs_test
        ]
        logits3d_test = tf.stack(logits_test)

        loss_train = tf.reduce_mean(
            tf.nn.ctc_loss(y_train, logits3d_train, seq_lengths_train))
        loss_test = tf.reduce_mean(
            tf.nn.ctc_loss(y_test, logits3d_test, seq_lengths_test))

        var_trainable_op = tf.trainable_variables()
        grads, _ = tf.clip_by_global_norm(
            tf.gradients(loss_train, var_trainable_op), grad_clip)
        optimizer = tf.train.AdamOptimizer(learning_rate).apply_gradients(
            zip(grads, var_trainable_op))

        pred_train = tf.to_int32(
            tf.nn.ctc_beam_search_decoder(logits3d_train, seq_lengths_train,
                                          merge_repeated=False)[0][0])
        err_rate_train = tf.reduce_mean(
            tf.edit_distance(pred_train, y_train, normalize=True))
        pred_test = tf.to_int32(
            tf.nn.ctc_beam_search_decoder(logits3d_test, seq_lengths_test,
                                          merge_repeated=False)[0][0])
        err_rate_test = tf.reduce_mean(
            tf.edit_distance(pred_test, y_test, normalize=True))

        # tf.summary.scalar('loss_train', loss_train)
        # tf.summary.scalar('loss_test', loss_test)
        # tf.summary.scalar('err_rate_train', err_rate_train)
        # tf.summary.scalar('err_rate_test', err_rate_test)
        # merged = tf.summary.merge_all()

    ##############################################
    # Run TF Session
    ##############################################
    # tb_file_writer = tf.summary.FileWriter(cur_tb_summary_path, graph)
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    with tf.Session(graph=graph, config=config) as sess:
        saver = tf.train.Saver(tf.global_variables(), max_to_keep=5,
                               keep_checkpoint_every_n_hours=1)
        tf.global_variables_initializer().run()
        num_processed_batches = 0
        for epoch in range(num_epochs):
            num_samples = len(train_data)
            perm = range(num_samples) if epoch == 0 else \
                np.random.permutation(num_samples)
            batches = create_batches(train_data, train_label, max_seq_length,
                                     batch_size, perm)
            for batch, ((batch_data, batch_seq_lengths),
                        (batch_indices, batch_vals,
                         batch_shape)) in enumerate(batches):
                num_processed_batches += 1
                if epoch > 0 and batch == 0:
                    # once per epoch, also evaluate on the held-out test set
                    _, batch_loss, batch_err_rate, batch_pred, \
                        cur_test_loss, cur_test_err_rate, cur_test_pred = \
                        sess.run(
                            [optimizer, loss_train, err_rate_train,
                             pred_train, loss_test, err_rate_test, pred_test],
                            feed_dict={X_train: batch_data,
                                       y_train_indices: batch_indices,
                                       y_train_vals: batch_vals,
                                       y_train_shape: batch_shape,
                                       seq_lengths_train: batch_seq_lengths,
                                       X_test: test_data_tensor,
                                       y_test_indices: test_label_indices,
                                       y_test_vals: test_label_vals,
                                       y_test_shape: test_label_shape,
                                       seq_lengths_test: test_seq_lengths})
                    merged_train_per = calc_PER(
                        SparseTensor(batch_pred.indices, batch_pred.values,
                                     batch_pred.dense_shape),
                        SparseTensor(batch_indices, batch_vals, batch_shape))
                    merged_test_per = calc_PER(
                        SparseTensor(cur_test_pred.indices,
                                     cur_test_pred.values,
                                     cur_test_pred.dense_shape),
                        SparseTensor(test_label_indices, test_label_vals,
                                     test_label_shape))
                    with open(result_file_name, 'a') as f:
                        f.write('{},{},{},{},{},{},{},{}\n'.format(
                            now(), epoch, batch, num_processed_batches,
                            batch_err_rate, merged_train_per,
                            cur_test_err_rate, merged_test_per))
                    # fixed: the original format string had a stray ')' and
                    # an unused fifth argument
                    print('[epoch: {}, batch: {}] err_train = {:.4f} '
                          '(phn_merged: {:.4f})'.format(
                              epoch, batch, batch_err_rate,
                              merged_train_per))
                else:
                    _, batch_loss, batch_err_rate, batch_pred = \
                        sess.run([optimizer, loss_train, err_rate_train,
                                  pred_train],
                                 feed_dict={X_train: batch_data,
                                            y_train_indices: batch_indices,
                                            y_train_vals: batch_vals,
                                            y_train_shape: batch_shape,
                                            seq_lengths_train:
                                                batch_seq_lengths})
                    merged_train_per = calc_PER(
                        SparseTensor(batch_pred.indices, batch_pred.values,
                                     batch_pred.dense_shape),
                        SparseTensor(batch_indices, batch_vals, batch_shape))
                    # test-metric columns stay empty on non-eval steps
                    with open(result_file_name, 'a') as f:
                        f.write('{},{},{},{},{},{},,\n'.format(
                            now(), epoch, batch, num_processed_batches,
                            batch_err_rate, merged_train_per))
            # tb_file_writer.add_summary(summary, num_processed_batches)
            saver.save(sess, os.path.join(cur_checkpoint_path, 'model'),
                       global_step=epoch)
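
# Hedged sketch of the (indices, values, shape) triple that the sparse label
# placeholders above expect; create_batches presumably builds the same thing.
# `dense_to_sparse_triple` and `dense_labels` are illustrative names.
def dense_to_sparse_triple(dense_labels):
    """Convert a list of label sequences into tf.SparseTensor feed values."""
    indices, vals = [], []
    for i, seq in enumerate(dense_labels):
        for t, label in enumerate(seq):
            indices.append([i, t])  # (batch index, time step)
            vals.append(label)
    shape = [len(dense_labels), max(len(s) for s in dense_labels)]
    return indices, vals, shape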
train_datagen = NIfTIImageDataGenerator()
test_datagen = NIfTIImageDataGenerator()

train_generator = train_datagen.flow_from_directory(TRAIN_DIR, **params,
                                                    **train_params)
validation_generator = test_datagen.flow_from_directory(VAL_DIR, **params,
                                                        **val_params)

############### CALLBACKS ###############
callbacks_list = []

# Checkpoint
WEIGHT_NAME = MODEL_NAME.replace("model", "weights") + "_" +\
    now() + "-epoch-{epoch:04d}-val_acc-{val_acc:.4f}.hdf5"
fpath = os.path.join(WEIGHT_DIR, WEIGHT_NAME)
checkpoint = ModelCheckpoint(fpath, monitor='val_acc', verbose=1,
                             save_best_only=True, mode='max',
                             save_weights_only=True)
callbacks_list.append(checkpoint)

# Early Stopping, used to quantify convergence
es = EarlyStopping(monitor='val_acc', min_delta=1e-4, patience=20)
callbacks_list.append(es)

############### TRAINING ###############
model.fit_generator(train_generator,
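# Hedged sketch (assumed arguments, not from the source): a typical completion
# of the truncated fit_generator call above with these generators and
# callbacks. The step counts are illustrative.
# model.fit_generator(train_generator,
#                     steps_per_epoch=len(train_generator),
#                     epochs=num_epochs,
#                     validation_data=validation_generator,
#                     validation_steps=len(validation_generator),
#                     callbacks=callbacks_list)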
############### DATA IMPORT ###############
# X, y, filenames, num_classes, img_shape = load_data(PREPROCESSED_DIR,
#                                                     classes)
# print("Finished data processing")
# load entire dataset into RAM (max 70)
# X, y, filenames, num_classes, img_shape = load_data(TRAIN_DIR, classes)
classes = results.classes.replace(" ", "").split(',')

############### MODEL SELECTION ###############
LR = 1e-4
LOAD_WEIGHTS = False

MODEL_NAME = "phinet_model_" + now()
MODEL_PATH = os.path.join(WEIGHT_DIR, MODEL_NAME + ".json")

if not os.path.exists(WEIGHT_DIR):
    os.makedirs(WEIGHT_DIR)

if LOAD_WEIGHTS:
    weight_files = os.listdir(WEIGHT_DIR)
    weight_files.sort()
    best_weights = os.path.join(WEIGHT_DIR, weight_files[-1])
    with open(MODEL_PATH) as json_data:
        model = model_from_json(json.load(json_data))
    model.load_weights(best_weights)
else:
    model = phinet(n_classes=len(classes), learning_rate=LR)
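
# Hedged aside: the lexicographic sort above picks the newest checkpoint only
# because the filenames embed a sortable timestamp; selecting by modification
# time is an alternative sketch (illustrative, not from the source):
# best_weights = max((os.path.join(WEIGHT_DIR, f)
#                     for f in os.listdir(WEIGHT_DIR)),
#                    key=os.path.getmtime)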
        model = model_from_json(json.load(json_data))
    model.load_weights(best_weights)
else:
    model = phinet(n_classes=num_classes, learning_rate=LR)

# save model architecture to file
json_string = model.to_json()
with open(MODEL_PATH, 'w') as f:
    json.dump(json_string, f)

############### CALLBACKS ###############
callbacks_list = []

# Checkpoint
WEIGHT_NAME = MODEL_NAME.replace("model", "weights") + "_" +\
    now() + "-epoch-{epoch:04d}-val_acc-{val_acc:.4f}.hdf5"
fpath = os.path.join(WEIGHT_DIR, WEIGHT_NAME)
checkpoint = ModelCheckpoint(fpath, monitor='val_acc', verbose=1,
                             save_best_only=True, mode='max',
                             save_weights_only=True)
callbacks_list.append(checkpoint)

# Dynamic Learning Rate
dlr = ReduceLROnPlateau(monitor="val_acc", factor=0.5, patience=5,
                        mode='max', verbose=1,
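# Hedged note: model.to_json() already returns a JSON string, so json.dump
# stores a doubly encoded document; the loader above
# (model_from_json(json.load(f))) undoes exactly that, so the pair
# round-trips. A minimal sanity check (illustrative):
# with open(MODEL_PATH) as f:
#     assert isinstance(json.load(f), str)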
num_channels = results.num_channels
num_epochs = 1000000
num_patches = results.num_patches  # 508257
batch_size = results.batch_size
model = results.model
experiment_details = results.experiment_details
loss = results.loss
learning_rate = 1e-4

MOUNT_POINT = os.path.join("..", "nihvandy", "ct_seg")
LOGFILE = os.path.join(MOUNT_POINT, "multisite_training_log.txt")
WEIGHT_DIR = os.path.join(MOUNT_POINT, "interleaved_weights",
                          experiment_details)
TB_LOG_DIR = os.path.join(MOUNT_POINT, "tensorboard", utils.now())
THIS_COMPUTER = open("host_id.cfg").read().split()[0]

MODEL_NAME = "inception_model_" + experiment_details
MODEL_PATH = os.path.join(WEIGHT_DIR, MODEL_NAME + ".json")

# files and paths
TRAIN_DIR = results.SRC_DIR

for d in [WEIGHT_DIR, TB_LOG_DIR]:
    if not os.path.exists(d):
        os.makedirs(d)

PATCH_SIZE = [int(x) for x in results.patch_size.split("x")]

# multi site ordering
if results.GPUID is None:  # guard reconstructed; original condition not shown
    # find maximum number of available GPUs
    call = "nvidia-smi --list-gpus"
    pipe = Popen(call, shell=True, stdout=PIPE).stdout
    available_gpus = pipe.read().decode().splitlines()
    NUM_GPUS = len(available_gpus)
else:
    os.environ["CUDA_VISIBLE_DEVICES"] = str(results.GPUID)

num_channels = results.num_channels
plane = results.plane
num_epochs = 1000000
num_patches = results.num_patches
batch_size = results.batch_size
model = results.model
model_architecture = "unet"
start_time = utils.now()
experiment_details = model_architecture + "_" +\
    results.experiment_details
loss = results.loss
learning_rate = 1e-4

utils.save_args_to_csv(results, os.path.join("results", experiment_details))

MOUNT_POINT = os.path.join("nihvandy", "ct_seg")
LOGFILE = os.path.join(MOUNT_POINT, "multisite_training_log.txt")
WEIGHT_DIR = os.path.join(MOUNT_POINT, "models", "msl_weights",
                          experiment_details)
TB_LOG_DIR = os.path.join(MOUNT_POINT, "models", "tensorboard", start_time)
THIS_COMPUTER = open("host_id.cfg").read().split()[0]