def on_epoch_end(self, epoch, logs=None):
    """Keras callback hook run after each epoch.

    Records train/val loss, saves a weights snapshot locally (and mirrors it
    to OBS when `train_url` is an s3 path), then prunes old local snapshots
    so at most `keep_weights_file_num` remain.

    Args:
        epoch: 0-based epoch index, used in the weights file name.
        logs: metrics dict from Keras; `loss`, `val_loss`, `val_acc` are read.
    """
    # Fix: the original used a mutable default argument (`logs={}`), which is
    # shared across calls; use a None sentinel instead.
    logs = logs or {}
    self.losses.append(logs.get('loss'))
    self.val_losses.append(logs.get('val_loss'))
    save_path = os.path.join(
        self.FLAGS.train_local,
        'weights_%03d_%.4f.h5' % (epoch, logs.get('val_acc')))
    self.model.save_weights(save_path)
    if self.FLAGS.train_url.startswith('s3://'):
        # NOTE(review): `file` appears to be the MoXing/OBS file utility
        # imported elsewhere in this module — confirm.
        save_url = os.path.join(
            self.FLAGS.train_url,
            'weights_%03d_%.4f.h5' % (epoch, logs.get('val_acc')))
        file.copy(save_path, save_url)
    print('save weights file', save_path)
    if self.FLAGS.keep_weights_file_num > -1:
        weights_files = glob(os.path.join(self.FLAGS.train_local, '*.h5'))
        if len(weights_files) >= self.FLAGS.keep_weights_file_num:
            # Newest first (by ctime); everything past the keep-count goes.
            weights_files.sort(
                key=lambda file_name: os.stat(file_name).st_ctime,
                reverse=True)
            for file_path in weights_files[self.FLAGS.keep_weights_file_num:]:
                os.remove(file_path)  # only remove weights files on local path
def save_model(save_dir, phase, name, epoch, f1score, model):
    """Save a CPU checkpoint under save_dir/<args.model>/<phase>/ and mirror it to OBS.

    Args:
        save_dir: root checkpoint directory.
        phase: sub-directory name (e.g. train phase identifier).
        name: checkpoint base name; a 'best' name with f1score > 0.3 also gets
            an epoch-tagged copy.
        epoch: epoch number stored in the checkpoint.
        f1score: score stored in the checkpoint.
        model: torch module whose state_dict is saved.
    """
    # Fix: replace three race-prone exists-then-mkdir chains with a single
    # makedirs(exist_ok=True), which creates the whole path atomically enough
    # for this purpose and also tolerates concurrent creation.
    save_dir = os.path.join(save_dir, args.model, phase)
    os.makedirs(save_dir, exist_ok=True)
    # Move every tensor to CPU so the checkpoint loads on CPU-only machines.
    state_dict = model.state_dict()
    for key in state_dict:
        state_dict[key] = state_dict[key].cpu()
    state_dict_all = {
        'state_dict': state_dict,
        'epoch': epoch,
        'f1score': f1score,
    }
    saveStr = '{:s}.ckpt'.format(name)
    torch.save(state_dict_all, os.path.join(save_dir, saveStr))
    # NOTE(review): `file` / `args` are module-level names defined elsewhere
    # (file looks like the MoXing OBS utility) — confirm.
    file.copy(os.path.join(save_dir, saveStr),
              os.path.join(args.save_dir_obs, saveStr))
    if 'best' in name and f1score > 0.3:
        # Keep an epoch-stamped copy of good "best" checkpoints as well.
        bestStr = '{:s}_{:s}.ckpt'.format(name, str(epoch))
        torch.save(state_dict_all, os.path.join(save_dir, bestStr))
        file.copy(os.path.join(save_dir, bestStr),
                  os.path.join(args.save_dir_obs, bestStr))
def train_model(FLAGS):
    """Train the model end-to-end: build data flow, compile, restore optional
    weights, fit with TensorBoard/LossHistory callbacks, then optionally
    export a PB model and evaluate on a test set, writing metric.json.

    Args:
        FLAGS: parsed config object (paths, batch size, epochs, etc.).
    """
    # data flow generator
    train_sequence, validation_sequence = data_flow(FLAGS.data_local, FLAGS.batch_size,
                                                    FLAGS.num_classes, FLAGS.input_size)
    optimizer = adam(lr=FLAGS.learning_rate, clipnorm=0.001)
    objective = 'binary_crossentropy'
    metrics = ['accuracy']
    model = model_fn(FLAGS, objective, optimizer, metrics)
    if FLAGS.restore_model_path != '' and file.exists(FLAGS.restore_model_path):
        if FLAGS.restore_model_path.startswith('s3://'):
            # Stage S3/OBS weights through /cache/tmp, then clean up.
            restore_model_name = FLAGS.restore_model_path.rsplit('/', 1)[1]
            file.copy(FLAGS.restore_model_path, '/cache/tmp/' + restore_model_name)
            model.load_weights('/cache/tmp/' + restore_model_name, by_name=True)
            os.remove('/cache/tmp/' + restore_model_name)
        else:
            model.load_weights(FLAGS.restore_model_path, by_name=True)
    if not os.path.exists(FLAGS.train_local):
        os.makedirs(FLAGS.train_local)
    tensorBoard = TensorBoard(log_dir=FLAGS.train_local)
    history = LossHistory(FLAGS)
    model.fit_generator(
        train_sequence,
        steps_per_epoch=len(train_sequence),
        epochs=FLAGS.max_epochs,
        verbose=1,
        callbacks=[history, tensorBoard],
        validation_data=validation_sequence,
        max_queue_size=10,
        # ~70% of cores for the data-loading workers.
        workers=int(multiprocessing.cpu_count() * 0.7),
        use_multiprocessing=True,
        shuffle=True
    )
    print('training done!')
    if FLAGS.deploy_script_path != '':
        from save_model import save_pb_model
        save_pb_model(FLAGS, model)
    if FLAGS.test_data_url != '':
        print('test dataset predicting...')
        from eval import load_test_data
        img_names, test_data, test_labels = load_test_data(FLAGS)
        predictions = model.predict(test_data, verbose=0)
        right_count = 0
        for index, pred in enumerate(predictions):
            # argmax over the class axis of each per-sample prediction vector.
            predict_label = np.argmax(pred, axis=0)
            test_label = test_labels[index]
            if predict_label == test_label:
                right_count += 1
        accuracy = right_count / len(img_names)
        print('accuracy: %0.4f' % accuracy)
        # Write the metric in the JSON shape the training platform expects.
        metric_file_name = os.path.join(FLAGS.train_local, 'metric.json')
        metric_file_content = '{"total_metric": {"total_metric_values": {"accuracy": %0.4f}}}' % accuracy
        with open(metric_file_name, "w") as f:
            f.write(metric_file_content + '\n')
    print('end')
def load_weights(model, weighs_file_path):
    """Load weights into `model` from a local file, staging S3/OBS paths
    through /cache/tmp first (the staged copy is removed afterwards).

    Prints a failure message instead of raising when the path is not a file.
    """
    # Guard clause: bail out early when the path is not an existing file.
    if not os.path.isfile(weighs_file_path):
        print('load weights failed! Please check weighs_file_path')
        return
    print('load weights from %s' % weighs_file_path)
    if weighs_file_path.startswith('s3://'):
        cached_path = '/cache/tmp/' + weighs_file_path.rsplit('/', 1)[1]
        file.copy(weighs_file_path, cached_path)
        model.load_weights(cached_path)
        os.remove(cached_path)
    else:
        model.load_weights(weighs_file_path)
    print('load weights success')
def save_pb_model(FLAGS, model):
    """Export `model` as a TF1 SavedModel ('model' subdir) and copy it plus
    the deploy config/service scripts to OBS when the target is an s3 path.

    Args:
        FLAGS: config with mode ('train' or 'save_pb'), paths and URLs.
        model: compiled Keras model whose input/output tensors are exported.
    """
    # Pick local/OBS output dirs based on run mode.
    if FLAGS.mode == 'train':
        pb_save_dir_local = FLAGS.train_local
        pb_save_dir_obs = FLAGS.train_url
    elif FLAGS.mode == 'save_pb':
        freeze_weights_file_dir = FLAGS.freeze_weights_file_path.rsplit('/', 1)[0]
        if freeze_weights_file_dir.startswith('s3://'):
            pb_save_dir_local = '/cache/tmp'
            pb_save_dir_obs = freeze_weights_file_dir
        else:
            pb_save_dir_local = freeze_weights_file_dir
            pb_save_dir_obs = pb_save_dir_local
    # NOTE(review): any other FLAGS.mode leaves pb_save_dir_local unbound and
    # the builder call below raises NameError — confirm modes are validated upstream.
    signature = tf.saved_model.signature_def_utils.predict_signature_def(
        inputs={'input_img': model.input},
        outputs={'output_score': model.output})
    builder = tf.saved_model.builder.SavedModelBuilder(
        os.path.join(pb_save_dir_local, 'model'))
    legacy_init_op = tf.group(tf.tables_initializer(), name='legacy_init_op')
    # Attach the current Keras session graph/variables under the SERVING tag.
    builder.add_meta_graph_and_variables(
        sess=backend.get_session(),
        tags=[tf.saved_model.tag_constants.SERVING],
        signature_def_map={
            'predict_images': signature,
        },
        legacy_init_op=legacy_init_op)
    builder.save()
    print('save pb to local path success')
    if pb_save_dir_obs.startswith('s3://'):
        # Mirror the SavedModel directory to OBS, then ship the deploy assets.
        file.copy_parallel(os.path.join(pb_save_dir_local, 'model'),
                           os.path.join(pb_save_dir_obs, 'model'))
        print('copy pb to %s success' % pb_save_dir_obs)
    file.copy(os.path.join(FLAGS.deploy_script_path, 'config.json'),
              os.path.join(pb_save_dir_obs, 'model/config.json'))
    file.copy(os.path.join(FLAGS.deploy_script_path, 'customize_service.py'),
              os.path.join(pb_save_dir_obs, 'model/customize_service.py'))
    if file.exists(os.path.join(pb_save_dir_obs, 'model/config.json')) and \
            file.exists(os.path.join(pb_save_dir_obs, 'model/customize_service.py')):
        print('copy config.json and customize_service.py success')
    else:
        print('copy config.json and customize_service.py failed')
# result_file.write('\n') if phase == 'test': continue result_file.close() import pandas as pd re = pd.read_csv(datadir+'/data/result/{:d}_{:s}_result.csv'.format(epoch, phase)) re.columns = ['target_file','text'] submit = pd.read_csv(datadir+'/submission.csv') submit = pd.merge(submit, re, how='left', on=['target_file']) submit = submit.drop(['target_file'], axis=1) submit = submit.replace(to_replace='None',value=20) submit = submit.fillna('上') submit.to_csv(datadir+'/predict.csv', header=True, index=None, encoding='utf-8') file.copy(datadir+'/predict.csv', args.data_dir_obs+'/predict.csv') def get_weight(labels): labels = labels.data.cpu().numpy() weights = np.zeros_like(labels) # weight_false = 1.0 / ((labels<0.5).sum() + 10e-20) # weight_true = 1.0 / ((labels>0.5).sum() + 10e-20) weight_false = 1.0 / ((labels<0.5).sum(0) + 10e-20) label_true = (labels>0.5).sum(0) for i in range(labels.shape[1]): label_i = labels[:,i] weight_i = np.ones(labels.shape[0]) * weight_false[i] # weight_i = np.ones(labels.shape[0]) * weight_false if label_true[i] > 0: weight_i[label_i>0.5] = 1.0 / label_true[i] weights[:,i] = weight_i
def train_model(FLAGS):
    """Train with Nadam + warmup cosine-decay LR schedule; then optionally
    export a PB model and evaluate on the test set, writing metric.json.

    Args:
        FLAGS: parsed config object (paths, batch size, epochs, LR, etc.).
    """
    # data flow generator
    train_sequence, validation_sequence = data_flow(FLAGS.data_local, FLAGS.batch_size,
                                                    FLAGS.num_classes, FLAGS.input_size)
    # optimizer = adam(lr=FLAGS.learning_rate, clipnorm=0.001)
    optimizer = Nadam(lr=FLAGS.learning_rate, beta_1=0.9, beta_2=0.999,
                      epsilon=1e-08, schedule_decay=0.004)
    # optimizer = SGD(lr=FLAGS.learning_rate, momentum=0.9)
    objective = 'categorical_crossentropy'
    metrics = ['accuracy']
    model = model_fn(FLAGS, objective, optimizer, metrics)
    if FLAGS.restore_model_path != '' and file.exists(
            FLAGS.restore_model_path):
        if FLAGS.restore_model_path.startswith('s3://'):
            # Stage S3/OBS weights through /cache/tmp, then clean up.
            restore_model_name = FLAGS.restore_model_path.rsplit('/', 1)[1]
            file.copy(FLAGS.restore_model_path,
                      '/cache/tmp/' + restore_model_name)
            model.load_weights('/cache/tmp/' + restore_model_name)
            os.remove('/cache/tmp/' + restore_model_name)
        else:
            model.load_weights(FLAGS.restore_model_path)
    if not os.path.exists(FLAGS.train_local):
        os.makedirs(FLAGS.train_local)
    tensorBoard = TensorBoard(log_dir=FLAGS.train_local)
    # reduce_lr = ks.callbacks.ReduceLROnPlateau(monitor='val_acc', factor=0.5, verbose=1, patience=1,
    #                                            min_lr=1e-7)
    # Cosine-annealing learning-rate schedule with linear warmup.
    # NOTE(review): sample_count already multiplies by batch_size, and the
    # step counts divide by batch_size again, so total_steps reduces to
    # epochs * len(train_sequence) (steps, not samples) — presumably intended.
    sample_count = len(train_sequence) * FLAGS.batch_size
    epochs = FLAGS.max_epochs
    warmup_epoch = 5
    batch_size = FLAGS.batch_size
    learning_rate_base = FLAGS.learning_rate
    total_steps = int(epochs * sample_count / batch_size)
    warmup_steps = int(warmup_epoch * sample_count / batch_size)
    warm_up_lr = WarmUpCosineDecayScheduler(
        learning_rate_base=learning_rate_base,
        total_steps=total_steps,
        warmup_learning_rate=0,
        warmup_steps=warmup_steps,
        hold_base_rate_steps=0,
    )
    history = LossHistory(FLAGS)
    model.fit_generator(train_sequence,
                        steps_per_epoch=len(train_sequence),
                        epochs=FLAGS.max_epochs,
                        verbose=1,
                        callbacks=[history, tensorBoard, warm_up_lr],
                        validation_data=validation_sequence,
                        max_queue_size=10,
                        # ~70% of cores for the data-loading workers.
                        workers=int(multiprocessing.cpu_count() * 0.7),
                        use_multiprocessing=True,
                        shuffle=True)
    print('training done!')
    if FLAGS.deploy_script_path != '':
        from save_model import save_pb_model
        save_pb_model(FLAGS, model)
    if FLAGS.test_data_url != '':
        print('test dataset predicting...')
        from eval import load_test_data
        img_names, test_data, test_labels = load_test_data(FLAGS)
        predictions = model.predict(test_data, verbose=0)
        right_count = 0
        for index, pred in enumerate(predictions):
            # argmax over the class axis of each per-sample prediction vector.
            predict_label = np.argmax(pred, axis=0)
            test_label = test_labels[index]
            if predict_label == test_label:
                right_count += 1
        accuracy = right_count / len(img_names)
        print('accuracy: %0.4f' % accuracy)
        # Write the metric in the JSON shape the training platform expects.
        metric_file_name = os.path.join(FLAGS.train_local, 'metric.json')
        metric_file_content = '{"total_metric": {"total_metric_values": {"accuracy": %0.4f}}}' % accuracy
        with open(metric_file_name, "w") as f:
            f.write(metric_file_content + '\n')
    print('end')