def on_epoch_end(self, epoch, logs=None):
    """Record losses and checkpoint the model at the end of each epoch.

    Appends 'loss'/'val_loss' to the history lists, saves a weights file
    locally, mirrors it to an s3:// train_url when configured, and prunes
    old local .h5 files so at most ``FLAGS.keep_weights_file_num`` remain.
    """
    # Bug fix: the original used the mutable default `logs={}`, which is
    # shared across calls; use None and normalize instead.
    logs = logs or {}
    self.losses.append(logs.get('loss'))
    self.val_losses.append(logs.get('val_loss'))

    # NOTE(review): logs.get('val_acc') may be None when validation did not
    # run this epoch, which would make the %-format raise — confirm callers.
    weights_name = 'weights_%03d_%.4f.h5' % (epoch, logs.get('val_acc'))
    save_path = os.path.join(self.FLAGS.train_local, weights_name)
    self.model.save_weights(save_path)
    if self.FLAGS.train_url.startswith('s3://'):
        # Mirror the checkpoint to remote storage (moxing `file` API).
        save_url = os.path.join(self.FLAGS.train_url, weights_name)
        file.copy(save_path, save_url)
    print('save weights file', save_path)

    if self.FLAGS.keep_weights_file_num > -1:
        weights_files = glob(os.path.join(self.FLAGS.train_local, '*.h5'))
        if len(weights_files) >= self.FLAGS.keep_weights_file_num:
            # Newest first (by creation time), then drop the excess.
            weights_files.sort(
                key=lambda path: os.stat(path).st_ctime,
                reverse=True)
            for file_path in weights_files[self.FLAGS.keep_weights_file_num:]:
                # only remove weights files on local path
                os.remove(file_path)
Example #2
0
def save_model(save_dir, phase, name, epoch, f1score, model):
    """Save CPU-mapped model weights plus epoch/f1 metadata as a checkpoint.

    The checkpoint is written to ``save_dir/args.model/phase/<name>.ckpt``
    and mirrored to ``args.save_dir_obs``.  A 'best' checkpoint with
    f1score > 0.3 additionally gets an epoch-tagged copy.
    """
    # Bug fix: replace the three racy exists()/mkdir() pairs with a single
    # makedirs(exist_ok=True), which also tolerates missing parents.
    save_dir = os.path.join(save_dir, args.model, phase)
    os.makedirs(save_dir, exist_ok=True)
    # Move every tensor to CPU so the checkpoint loads on CPU-only machines.
    state_dict = {key: value.cpu()
                  for key, value in model.state_dict().items()}
    state_dict_all = {
        'state_dict': state_dict,
        'epoch': epoch,
        'f1score': f1score,
    }
    save_name = '{:s}.ckpt'.format(name)
    torch.save(state_dict_all, os.path.join(save_dir, save_name))
    # Mirror to remote (OBS) storage via the moxing `file` API.
    file.copy(os.path.join(save_dir, save_name),
              os.path.join(args.save_dir_obs, save_name))
    if 'best' in name and f1score > 0.3:
        # Keep an epoch-tagged copy of good "best" checkpoints.
        best_name = '{:s}_{:s}.ckpt'.format(name, str(epoch))
        torch.save(state_dict_all, os.path.join(save_dir, best_name))
        file.copy(os.path.join(save_dir, best_name),
                  os.path.join(args.save_dir_obs, best_name))
Example #3
0
def train_model(FLAGS):
    """Train a binary classifier, then optionally export and evaluate it.

    Builds the train/validation generators, compiles the model, restores
    weights from ``FLAGS.restore_model_path`` when provided, trains with
    loss-history and TensorBoard callbacks, optionally exports a serving
    (pb) model, and reports accuracy on ``FLAGS.test_data_url``.
    """
    # data flow generator
    train_sequence, validation_sequence = data_flow(FLAGS.data_local, FLAGS.batch_size,
                                                    FLAGS.num_classes, FLAGS.input_size)

    # clipnorm bounds the gradient norm to stabilize training.
    optimizer = adam(lr=FLAGS.learning_rate, clipnorm=0.001)
    objective = 'binary_crossentropy'
    metrics = ['accuracy']
    model = model_fn(FLAGS, objective, optimizer, metrics)

    # Restore pretrained weights; s3:// paths are staged through local cache.
    if FLAGS.restore_model_path != '' and file.exists(FLAGS.restore_model_path):
        if FLAGS.restore_model_path.startswith('s3://'):
            restore_model_name = FLAGS.restore_model_path.rsplit('/', 1)[1]
            file.copy(FLAGS.restore_model_path, '/cache/tmp/' + restore_model_name)
            model.load_weights('/cache/tmp/' + restore_model_name, by_name=True)
            os.remove('/cache/tmp/' + restore_model_name)
        else:
            model.load_weights(FLAGS.restore_model_path, by_name=True)

    if not os.path.exists(FLAGS.train_local):
        os.makedirs(FLAGS.train_local)
    tensorBoard = TensorBoard(log_dir=FLAGS.train_local)
    history = LossHistory(FLAGS)
    model.fit_generator(
        train_sequence,
        steps_per_epoch=len(train_sequence),
        epochs=FLAGS.max_epochs,
        verbose=1,
        callbacks=[history, tensorBoard],
        validation_data=validation_sequence,
        max_queue_size=10,
        # Leave ~30% of cores free for the main process and I/O.
        workers=int(multiprocessing.cpu_count() * 0.7),
        use_multiprocessing=True,
        shuffle=True
    )

    print('training done!')

    if FLAGS.deploy_script_path != '':
        from save_model import save_pb_model
        save_pb_model(FLAGS, model)

    if FLAGS.test_data_url != '':
        print('test dataset predicting...')
        from eval import load_test_data
        img_names, test_data, test_labels = load_test_data(FLAGS)
        predictions = model.predict(test_data, verbose=0)

        right_count = 0
        for index, pred in enumerate(predictions):
            # NOTE(review): argmax over the prediction vector assumes one
            # score per class — confirm against model_fn's output head.
            predict_label = np.argmax(pred, axis=0)
            test_label = test_labels[index]
            if predict_label == test_label:
                right_count += 1
        # Bug fix: guard against an empty test set (ZeroDivisionError).
        total = len(img_names)
        accuracy = right_count / total if total else 0.0
        print('accuracy: %0.4f' % accuracy)
        metric_file_name = os.path.join(FLAGS.train_local, 'metric.json')
        metric_file_content = '{"total_metric": {"total_metric_values": {"accuracy": %0.4f}}}' % accuracy
        with open(metric_file_name, "w") as f:
            f.write(metric_file_content + '\n')
    print('end')
def load_weights(model, weighs_file_path):
    """Load weights into ``model`` from a local or s3:// path, if present.

    Remote (s3://) files are staged through /cache/tmp and removed after
    loading.  A missing path is reported but does not raise.
    """
    if not os.path.isfile(weighs_file_path):
        print('load weights failed! Please check weighs_file_path')
        return

    print('load weights from %s' % weighs_file_path)
    if weighs_file_path.startswith('s3://'):
        # Stage the remote file locally, load it, then clean up.
        local_name = weighs_file_path.rsplit('/', 1)[1]
        local_path = '/cache/tmp/' + local_name
        file.copy(weighs_file_path, local_path)
        model.load_weights(local_path)
        os.remove(local_path)
    else:
        model.load_weights(weighs_file_path)
    print('load weights success')
Example #5
0
def save_pb_model(FLAGS, model):
    """Export ``model`` as a TF SavedModel and copy serving files alongside it.

    Destination directories depend on ``FLAGS.mode``: 'train' uses
    train_local/train_url; 'save_pb' derives them from
    ``FLAGS.freeze_weights_file_path``.  The SavedModel is built locally,
    optionally mirrored to s3://, and config.json / customize_service.py
    from ``FLAGS.deploy_script_path`` are copied next to it.
    """
    if FLAGS.mode == 'train':
        pb_save_dir_local = FLAGS.train_local
        pb_save_dir_obs = FLAGS.train_url
    elif FLAGS.mode == 'save_pb':
        freeze_weights_file_dir = FLAGS.freeze_weights_file_path.rsplit(
            '/', 1)[0]
        if freeze_weights_file_dir.startswith('s3://'):
            pb_save_dir_local = '/cache/tmp'
            pb_save_dir_obs = freeze_weights_file_dir
        else:
            pb_save_dir_local = freeze_weights_file_dir
            pb_save_dir_obs = pb_save_dir_local
    else:
        # Bug fix: the original fell through with pb_save_dir_local unbound
        # (NameError) for any other mode; fail fast with a clear message.
        raise ValueError('save_pb_model: unsupported FLAGS.mode %r' % FLAGS.mode)

    # TF1-style SavedModel export: one 'predict_images' signature mapping
    # the Keras input tensor to the output score tensor.
    signature = tf.saved_model.signature_def_utils.predict_signature_def(
        inputs={'input_img': model.input},
        outputs={'output_score': model.output})
    builder = tf.saved_model.builder.SavedModelBuilder(
        os.path.join(pb_save_dir_local, 'model'))
    legacy_init_op = tf.group(tf.tables_initializer(), name='legacy_init_op')
    builder.add_meta_graph_and_variables(
        sess=backend.get_session(),
        tags=[tf.saved_model.tag_constants.SERVING],
        signature_def_map={
            'predict_images': signature,
        },
        legacy_init_op=legacy_init_op)
    builder.save()
    print('save pb to local path success')

    if pb_save_dir_obs.startswith('s3://'):
        file.copy_parallel(os.path.join(pb_save_dir_local, 'model'),
                           os.path.join(pb_save_dir_obs, 'model'))
        print('copy pb to %s success' % pb_save_dir_obs)

    # Ship the inference-service files next to the exported model.
    file.copy(os.path.join(FLAGS.deploy_script_path, 'config.json'),
              os.path.join(pb_save_dir_obs, 'model/config.json'))
    file.copy(os.path.join(FLAGS.deploy_script_path, 'customize_service.py'),
              os.path.join(pb_save_dir_obs, 'model/customize_service.py'))
    if file.exists(os.path.join(pb_save_dir_obs, 'model/config.json')) and \
            file.exists(os.path.join(pb_save_dir_obs, 'model/customize_service.py')):
        print('copy config.json and customize_service.py success')
    else:
        print('copy config.json and customize_service.py failed')
Example #6
0
        # result_file.write('\n')

        if phase == 'test':
            continue

    result_file.close()
    import pandas as pd
    re = pd.read_csv(datadir+'/data/result/{:d}_{:s}_result.csv'.format(epoch, phase))
    re.columns = ['target_file','text']
    submit = pd.read_csv(datadir+'/submission.csv')
    submit = pd.merge(submit, re, how='left', on=['target_file'])
    submit = submit.drop(['target_file'], axis=1)
    submit = submit.replace(to_replace='None',value=20)
    submit = submit.fillna('上')
    submit.to_csv(datadir+'/predict.csv', header=True, index=None, encoding='utf-8')
    file.copy(datadir+'/predict.csv', args.data_dir_obs+'/predict.csv')

def get_weight(labels):
    """Compute per-element class-balancing weights for multi-label targets.

    Args:
        labels: tensor of shape (batch, num_classes); entries are treated as
            positive when > 0.5 and negative when < 0.5.

    Returns:
        ndarray of the same shape: within each class column, negatives get
        weight 1/#negatives and positives 1/#positives, so both classes of
        each label contribute equally to a weighted loss.
    """
    labels = labels.data.cpu().numpy()
    weights = np.zeros_like(labels)
    # 10e-20 guards against division by zero for columns with no negatives.
    weight_false = 1.0 / ((labels < 0.5).sum(0) + 10e-20)
    label_true = (labels > 0.5).sum(0)
    for i in range(labels.shape[1]):
        label_i = labels[:, i]
        weight_i = np.ones(labels.shape[0]) * weight_false[i]
        if label_true[i] > 0:
            weight_i[label_i > 0.5] = 1.0 / label_true[i]
        weights[:, i] = weight_i
    # Bug fix: the original fell off the end and implicitly returned None,
    # discarding the computed weights.
    return weights
Example #7
0
def train_model(FLAGS):
    """Train a multi-class classifier with Nadam and a warmup cosine-decay LR.

    Builds the train/validation generators, compiles the model, restores
    weights from ``FLAGS.restore_model_path`` when provided, trains with
    loss-history, TensorBoard and warmup-cosine-decay callbacks, optionally
    exports a serving (pb) model, and reports accuracy on
    ``FLAGS.test_data_url``.
    """
    # data flow generator
    train_sequence, validation_sequence = data_flow(FLAGS.data_local,
                                                    FLAGS.batch_size,
                                                    FLAGS.num_classes,
                                                    FLAGS.input_size)

    optimizer = Nadam(lr=FLAGS.learning_rate,
                      beta_1=0.9,
                      beta_2=0.999,
                      epsilon=1e-08,
                      schedule_decay=0.004)
    objective = 'categorical_crossentropy'
    metrics = ['accuracy']
    model = model_fn(FLAGS, objective, optimizer, metrics)

    # Restore pretrained weights; s3:// paths are staged through local cache.
    if FLAGS.restore_model_path != '' and file.exists(
            FLAGS.restore_model_path):
        if FLAGS.restore_model_path.startswith('s3://'):
            restore_model_name = FLAGS.restore_model_path.rsplit('/', 1)[1]
            file.copy(FLAGS.restore_model_path,
                      '/cache/tmp/' + restore_model_name)
            model.load_weights('/cache/tmp/' + restore_model_name)
            os.remove('/cache/tmp/' + restore_model_name)
        else:
            model.load_weights(FLAGS.restore_model_path)

    if not os.path.exists(FLAGS.train_local):
        os.makedirs(FLAGS.train_local)
    tensorBoard = TensorBoard(log_dir=FLAGS.train_local)

    # Cosine-annealing learning-rate schedule with a linear warmup phase.
    # Since sample_count = len(train_sequence) * batch_size, total_steps
    # works out to epochs * steps_per_epoch.
    sample_count = len(train_sequence) * FLAGS.batch_size
    epochs = FLAGS.max_epochs
    warmup_epoch = 5
    batch_size = FLAGS.batch_size
    learning_rate_base = FLAGS.learning_rate
    total_steps = int(epochs * sample_count / batch_size)
    warmup_steps = int(warmup_epoch * sample_count / batch_size)

    warm_up_lr = WarmUpCosineDecayScheduler(
        learning_rate_base=learning_rate_base,
        total_steps=total_steps,
        warmup_learning_rate=0,
        warmup_steps=warmup_steps,
        hold_base_rate_steps=0,
    )
    history = LossHistory(FLAGS)
    model.fit_generator(train_sequence,
                        steps_per_epoch=len(train_sequence),
                        epochs=FLAGS.max_epochs,
                        verbose=1,
                        callbacks=[history, tensorBoard, warm_up_lr],
                        validation_data=validation_sequence,
                        max_queue_size=10,
                        # Leave ~30% of cores free for the main process.
                        workers=int(multiprocessing.cpu_count() * 0.7),
                        use_multiprocessing=True,
                        shuffle=True)

    print('training done!')

    if FLAGS.deploy_script_path != '':
        from save_model import save_pb_model
        save_pb_model(FLAGS, model)

    if FLAGS.test_data_url != '':
        print('test dataset predicting...')
        from eval import load_test_data
        img_names, test_data, test_labels = load_test_data(FLAGS)
        predictions = model.predict(test_data, verbose=0)

        right_count = 0
        for index, pred in enumerate(predictions):
            # argmax over the per-class score vector gives the class index.
            predict_label = np.argmax(pred, axis=0)
            test_label = test_labels[index]
            if predict_label == test_label:
                right_count += 1
        # Bug fix: guard against an empty test set (ZeroDivisionError).
        total = len(img_names)
        accuracy = right_count / total if total else 0.0
        print('accuracy: %0.4f' % accuracy)
        metric_file_name = os.path.join(FLAGS.train_local, 'metric.json')
        metric_file_content = '{"total_metric": {"total_metric_values": {"accuracy": %0.4f}}}' % accuracy
        with open(metric_file_name, "w") as f:
            f.write(metric_file_content + '\n')
    print('end')