Example 1
def finetune(model, output_path, log_dir):
    # nb_frozen_layers, learning_rate, monitor, patience, the data dirs and the
    # nb_* sample counts are assumed to be module-level settings
    for layer in model.layers[:nb_frozen_layers]:
        layer.trainable = False
    for layer in model.layers[nb_frozen_layers:]:
        layer.trainable = True

    model.compile(optimizer=SGD(lr=learning_rate, momentum=0.9),
                  loss='categorical_crossentropy',
                  metrics=['accuracy'])

    train_gen = data_generator(train_data_dir, augmentation=True)
    val_gen = data_generator(val_data_dir)

    tb = TensorBoard(log_dir=log_dir, write_graph=False)
    es = EarlyStopping(monitor=monitor, patience=patience)
    mc = ModelCheckpoint(output_path, monitor=monitor, save_best_only=True)

    # Keras 1.x API: fit_generator/samples_per_epoch/nb_epoch/nb_val_samples
    # became fit/steps_per_epoch/epochs/validation_steps in later Keras releases
    model.fit_generator(train_gen,
                        samples_per_epoch=nb_train_samples,
                        nb_epoch=nb_epoch,
                        callbacks=[tb, es, mc],
                        validation_data=val_gen,
                        nb_val_samples=nb_val_samples)

    return model
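Every example on this page calls a project-specific data_generator helper that the snippets never define. As a point of reference only, here is a minimal sketch of what the directory-based variant used in Examples 1, 3 and 4 might look like, assuming images stored one subdirectory per class; the target size, batch size and augmentation settings are placeholders:

from keras.preprocessing.image import ImageDataGenerator

def data_generator(data_dir, augmentation=False, shuffle=True):
    # hypothetical reconstruction: wraps ImageDataGenerator.flow_from_directory,
    # optionally with light augmentation for the training split
    if augmentation:
        idg = ImageDataGenerator(rescale=1. / 255, shear_range=0.2,
                                 zoom_range=0.2, horizontal_flip=True)
    else:
        idg = ImageDataGenerator(rescale=1. / 255)
    return idg.flow_from_directory(data_dir,
                                   target_size=(224, 224),
                                   batch_size=32,
                                   class_mode='categorical',
                                   shuffle=shuffle)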
Example 2
def evaluate_ubuntu(config, model, test_size=500000):
    assert test_size % config.test_batch_size == 0
    predict_result = model.predict_generator(
        generator=data_generator(config=config, is_train=False),
        steps=test_size // config.test_batch_size)  # steps must be an integer
    example_id_list, label_list = load_test_label(config)

    acc_right_num = acc_all_num = 0
    prediction_dict = {}
    for example_id, prediction, label in zip(example_id_list, predict_result,
                                             label_list):
        if example_id not in prediction_dict:
            prediction_dict[example_id] = [(prediction, label)]
        else:
            prediction_dict[example_id].append((prediction, label))

        if (prediction > 0.5 and label == 1) or (prediction < 0.5
                                                 and label == 0):
            acc_right_num += 1
        acc_all_num += 1

    print(
        f"acc evaluate acc_right_num: {acc_right_num}\tacc_all_num: {acc_all_num}\tacc: "
        f"{acc_right_num * 1.0 / acc_all_num}")

    recall_ten_at_one = recall_ten_at_two = recall_ten_at_five = 0
    recall_two_at_one = sample_all_num = 0
    for example_id in prediction_dict:
        prediction_list = prediction_dict[example_id]
        # the first of the ten candidates is assumed to be the ground-truth
        # response, so comparing it with the second one gives R2@1
        label_pred = prediction_list[0][0]
        sec_pred = prediction_list[1][0]

        if label_pred > sec_pred:
            recall_two_at_one += 1

        sorted_list = sorted(prediction_list, key=lambda x: x[0], reverse=True)
        sorted_label_list = [y for x, y in sorted_list]

        if sorted_label_list[0] == 1:
            recall_ten_at_one += 1
            recall_ten_at_two += 1
            recall_ten_at_five += 1
        elif 1 in sorted_label_list[:2]:
            recall_ten_at_two += 1
            recall_ten_at_five += 1
        elif 1 in sorted_label_list[:5]:
            recall_ten_at_five += 1
        sample_all_num += 1

    recall_two_at_one = recall_two_at_one * 1.0 / sample_all_num
    recall_ten_at_one = recall_ten_at_one * 1.0 / sample_all_num
    recall_ten_at_two = recall_ten_at_two * 1.0 / sample_all_num
    recall_ten_at_five = recall_ten_at_five * 1.0 / sample_all_num

    print(f"rank evaluate sample_all_num: {sample_all_num}\trecall_two_at_one: {recall_two_at_one}\t"
          f"recall_ten_at_one: {recall_ten_at_one}\trecall_ten_at_two: {recall_ten_at_two}\t"
          f"recall_ten_at_five: {recall_ten_at_five}")

    return recall_ten_at_one
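The R10@k bookkeeping above can be checked self-contained on a toy example: ten (score, label) pairs with a single positive, using made-up scores that rank the positive second.

# made-up scores; the single positive candidate sits at rank 2 after sorting
pairs = [(0.9, 0), (0.8, 1), (0.7, 0), (0.6, 0), (0.5, 0),
         (0.4, 0), (0.3, 0), (0.2, 0), (0.1, 0), (0.05, 0)]
labels = [y for _, y in sorted(pairs, key=lambda x: x[0], reverse=True)]
print(labels[0] == 1)   # R10@1 hit -> False
print(1 in labels[:2])  # R10@2 hit -> True
print(1 in labels[:5])  # R10@5 hit -> True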
Example 3
def predict(model, result_dir):
    # test_data_dir and nb_test_samples are assumed module-level settings;
    # DataFrame is pandas.DataFrame
    test_gen = data_generator(test_data_dir, shuffle=False)

    proba = model.predict_generator(test_gen, nb_test_samples)
    proba_df = DataFrame(proba, index=test_gen.filenames)

    proba_df.to_csv(os.path.join(result_dir, 'proba.csv'))
    proba_df.idxmax(axis=1).to_csv(os.path.join(result_dir, 'pred.csv'))
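For clarity, a self-contained illustration of what the two CSVs hold, on made-up probabilities: proba.csv has one row per filename and one column per class, while pred.csv maps each filename to its argmax class column.

from pandas import DataFrame

df = DataFrame([[0.1, 0.9], [0.7, 0.3]], index=['a.jpg', 'b.jpg'])
print(df.idxmax(axis=1))  # a.jpg -> 1, b.jpg -> 0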
Example 4
def train_top(model, output_path=None):
    model.compile(optimizer='rmsprop',
                  loss='categorical_crossentropy',
                  metrics=['accuracy'])

    train_gen = data_generator(train_data_dir)
    val_gen = data_generator(val_data_dir)

    model.fit_generator(train_gen,
                        samples_per_epoch=nb_train_samples,
                        nb_epoch=1,
                        validation_data=val_gen,
                        nb_val_samples=nb_val_samples)

    if output_path is not None:
        model.save(output_path)

    return model
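Examples 1 and 4 together suggest a two-phase transfer-learning workflow: fit the classifier head with a fast optimizer first, then unfreeze part of the base and fine-tune with SGD. A hypothetical driver, assuming a Keras 2-style VGG16 base and a global nb_classes, neither of which appears in the snippets (the snippets themselves use the older Keras 1.x argument names):

from keras.applications.vgg16 import VGG16
from keras.layers import Dense, Flatten
from keras.models import Model

# build an ImageNet-pretrained base with a fresh classification head
base = VGG16(weights='imagenet', include_top=False, input_shape=(224, 224, 3))
x = Flatten()(base.output)
out = Dense(nb_classes, activation='softmax')(x)  # nb_classes is assumed
model = Model(base.input, out)

model = train_top(model, output_path='top.h5')           # phase 1: head only
model = finetune(model, 'finetuned.h5', log_dir='logs')  # phase 2: unfreeze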
Example 5
def get_generator(trainingdir, spa, params):
    encoder = sampling_encoder(params)
    sampled_dsets, event_dsets = read_files(trainingdir, params)
    for sampled_dset, event_dset in zip(sampled_dsets, event_dsets):
        data_gen = data_generator(
            spa,
            sampled_dset.data,
            window_len=params['window_len'],
            labels=event_dset.data,
            encoder=encoder,
            batch_size=1,  # this is required for saving images
            amplitude_norm=params['amplitude_norm'],
            loop=False)
        yield from data_gen
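The yield from line is what chains the per-file generators into a single stream; a self-contained miniature of the same pattern:

def per_file(n):
    # stands in for one file's data_generator
    for i in range(n):
        yield i

def chained(sizes):
    # flattens several generators into one stream, as get_generator does
    for n in sizes:
        yield from per_file(n)

print(list(chained([2, 3])))  # [0, 1, 0, 1, 2]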
Example 6
def get_generators(dataset_name, dataset_videos, datasets_frames, fix_len, figure_size, force, classes=1, use_aug=False,
                   use_crop=True, crop_dark=True):
    (train_path, valid_path, test_path,
     train_y, valid_y, test_y,
     avg_length) = preprocess.createDataset(dataset_videos, dataset_name, datasets_frames, fix_len, force=force)

    if fix_len is not None:
        avg_length = fix_len
    crop_x_y = None
    if crop_dark:
        crop_x_y = (11, 38)

    batch_size = 2

    len_train, len_valid = len(train_path), len(valid_path)
    train_gen = preprocess.data_generator(train_path, train_y, batch_size, figure_size, avg_length,
                                          use_aug=use_aug, use_crop=use_crop, crop_x_y=crop_x_y, classes=classes)
    validate_gen = preprocess.data_generator(valid_path, valid_y, batch_size, figure_size, avg_length,
                                             use_aug=False, use_crop=False, crop_x_y=crop_x_y, classes=classes)
    test_x, test_y = preprocess.get_sequences(test_path, test_y, figure_size, avg_length,
                                              crop_x_y=crop_x_y, classes=classes)

    return train_gen, validate_gen, test_x, test_y, avg_length, len_train, len_valid
Example 7
def predict(model, sampled_dset, spa, p):
    # note: loop must be set to true to enable using keras's predict_generator
    data_gen = data_generator(spa,
                              sampled_dset.data,
                              window_len=p['window_len'],
                              encoder=p['encoder'],
                              batch_size=p['predict_batch_size'],
                              amplitude_norm=p['amplitude_norm'],
                              loop=True)
    n_step_size = spa._NFFT - spa._noverlap
    n_targets = int(np.ceil(len(sampled_dset.data) / n_step_size))
    # predict_generator expects an integer step count; np.ceil returns a float
    n_steps = int(np.ceil(n_targets / p['predict_batch_size']))
    print(n_steps)
    print(sampled_dset.name)
    y_est = model.predict_generator(data_gen, n_steps, verbose=1)
    # save outputs
    basename = '{}_{}'.format(
        os.path.splitext(sampled_dset.path)[0], p['model'])
    save(y_est, basename, p)
Example 8
def test(model, sampled_dset, event_dset, spa, p):
    data_gen = data_generator(spa,
                              sampled_dset.data,
                              window_len=p['window_len'],
                              labels=event_dset.data,
                              encoder=p['encoder'],
                              batch_size=p['batch_size'],
                              amplitude_norm=p['amplitude_norm'],
                              loop=True)
    n_step_size = spa._NFFT - spa._noverlap
    n_targets = int(np.ceil(len(sampled_dset.data) / n_step_size))
    # predict_generator expects an integer step count; np.ceil returns a float
    n_steps = int(np.ceil(n_targets / p['batch_size']))
    print(n_steps)
    print(sampled_dset.name)
    y_est = model.predict_generator(data_gen, n_steps, verbose=1)
    y_true = all_targets_from_events(event_dset.data, n_targets, n_step_size,
                                     p['encoder'], spa._rate)
    print("accuracy score:",
          accuracy_score(np.argmax(y_true, 1), np.argmax(y_est, 1)))
    print(classification_report(np.argmax(y_true, 1), np.argmax(y_est, 1)))
    # save outputs
    basename = '{}_{}'.format(
        os.path.splitext(sampled_dset.path)[0], p['model'])
    save(y_est, basename, p, y_true)
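The step arithmetic in Examples 7 and 8 is easier to see with concrete numbers. Assuming a made-up 512-point FFT with 256-sample overlap, 100000 input samples and batch size 32:

import numpy as np

n_step_size = 512 - 256                         # spa._NFFT - spa._noverlap
n_targets = int(np.ceil(100000 / n_step_size))  # 391 spectrogram frames
n_steps = int(np.ceil(n_targets / 32))          # 13 predict_generator steps
print(n_targets, n_steps)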
Example 9
def evaluate_douban(config, model, test_size=10000):
    assert test_size % config.test_batch_size == 0
    predict_result = model.predict_generator(
        generator=data_generator(config=config, is_train=False),
        steps=test_size // config.test_batch_size)  # steps must be an integer
    example_id_list, label_list = load_test_label(config)

    prediction_dict = {}
    for example_id, prediction, label in zip(example_id_list, predict_result,
                                             label_list):
        if example_id not in prediction_dict:
            prediction_dict[example_id] = [(prediction, label)]
        else:
            prediction_dict[example_id].append((prediction, label))

    # drop invalid examples whose ten candidates are all-positive or all-negative
    del_num = 0
    filtered_prediction_dict = {}

    for example_id in prediction_dict.keys():
        temp_list = prediction_dict[example_id]
        if len(temp_list) != 10:
            print(f'ERROR: example {example_id} has {len(temp_list)} '
                  'candidates instead of 10')
        label0_num = 0
        label1_num = 0

        for temp in temp_list:
            if temp[1] == 0:
                label0_num += 1
            if temp[1] == 1:
                label1_num += 1

        if label0_num == 10 or label1_num == 10:
            del_num += 1
        else:
            filtered_prediction_dict[example_id] = temp_list

    print(f'{del_num} invalid examples have been deleted')

    # now calculate each metric
    mrr_list = []
    map_list = []
    recall_1 = 0
    recall_2 = 0
    recall_5 = 0
    p1 = 0
    example_count = 0
    for example_id in filtered_prediction_dict.keys():
        prediction_list = filtered_prediction_dict[example_id]

        # (score, label)
        prediction_list = sorted(prediction_list,
                                 key=lambda x: x[0],
                                 reverse=True)

        total_positive = 0
        for prediction in prediction_list:
            if prediction[1] == 1:
                total_positive += 1
        if prediction_list[0][1] == 1:
            p1 += 1
            recall_1 += 1 * 1.0 / total_positive
        correct = 0
        for i in range(2):
            if prediction_list[i][1] == 1:
                correct += 1
        recall_2 += correct * 1.0 / total_positive
        correct = 0
        for i in range(5):
            if prediction_list[i][1] == 1:
                correct += 1
        recall_5 += correct * 1.0 / total_positive

        for i in range(len(prediction_list)):
            if prediction_list[i][1] == 1:
                mrr_list.append(1 * 1.0 / (i + 1))
                break

        correct_count = 1
        one_map_list = []
        for i in range(len(prediction_list)):
            if prediction_list[i][1] == 1:
                one_map_list.append(correct_count * 1.0 / (i + 1))
                correct_count += 1
        map_list.append(sum(one_map_list) / total_positive)

        example_count += 1

    MRR = sum(mrr_list) / example_count
    MAP = sum(map_list) / example_count
    P1 = p1 / example_count

    R10_1 = recall_1 / example_count
    R10_2 = recall_2 / example_count
    R10_5 = recall_5 / example_count

    print(
        f'rank evaluate total:{example_count}\tMRR:{MRR}\tMAP:{MAP}\tP1:{P1}\tR10@1:{R10_1}\tR10@2:{R10_2}\tR10@5:{R10_5}'
    )
    return MRR
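A toy sanity check of the MRR/MAP bookkeeping above: with positives at ranks 2 and 4 of a made-up five-candidate ranking, the reciprocal rank is 1/2 and the average precision is (1/2 + 2/4)/2 = 0.5.

ranked_labels = [0, 1, 0, 1, 0]  # made-up ranking, positives at ranks 2 and 4
rr = next(1.0 / (i + 1) for i, y in enumerate(ranked_labels) if y == 1)
hits, precisions = 0, []
for i, y in enumerate(ranked_labels):
    if y == 1:
        hits += 1
        precisions.append(hits / (i + 1))
ap = sum(precisions) / sum(ranked_labels)
print(rr, ap)  # 0.5 0.5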
Example 10
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--train_file",
                        default='./data/ubuntu/train_shuf.txt',
                        type=str,
                        help='train file')
    parser.add_argument("--test_file",
                        default='./data/ubuntu/test.txt',
                        type=str,
                        help='valid or test file')
    parser.add_argument('--output_dir',
                        default='./model/',
                        type=str,
                        help='a path to output')
    parser.add_argument('--config_file',
                        default=None,
                        type=str,
                        help='config file')
    parser.add_argument(
        '--model_weight_file',
        default='./model/model_epoch7_seed19565_prec0.79006.hdf5',
        type=str)
    parser.add_argument('--stopword_file',
                        default='./data/ubuntu/stopwords.txt',
                        type=str)

    parser.add_argument('--train_ubuntu',
                        action='store_true',
                        help='Whether to run training on the ubuntu dataset')
    parser.add_argument('--evaluate_ubuntu',
                        action='store_true',
                        help='Whether to run evaluation on the ubuntu dataset')
    parser.add_argument('--train_douban',
                        action='store_true',
                        help='Whether to run training on the douban dataset')
    parser.add_argument('--evaluate_douban',
                        action='store_true',
                        help='Whether to run evaluation on the douban dataset')

    parser.add_argument('--gpu_nums',
                        default=2,
                        type=int,
                        help='How many GPUs to use')
    parser.add_argument('--train_batch_size', default=64, type=int)
    parser.add_argument('--test_batch_size', default=100, type=int)
    parser.add_argument('--use_CuDNNRNN',
                        action='store_true',
                        help='Whether to use CuDNNGRU or CuDNNLSTM')

    parser.add_argument('--random_seed',
                        default=10000,
                        type=int,
                        help='random seed')

    parser.add_argument('--use_word_embeddings', action='store_true')
    parser.add_argument('--word_vocab_file',
                        default='./data/ubuntu/vocab_ubuntu',
                        type=str)
    parser.add_argument('--word_embeddings_file',
                        default='./data/ubuntu/glove_ubuntu.txt',
                        type=str)
    parser.add_argument('--word_embeddings_dim', default=200, type=int)
    parser.add_argument('--word_vocab_size', default=297989, type=int)
    parser.add_argument('--max_utterance_num', default=10, type=int)
    parser.add_argument('--max_utterance_len', default=50, type=int)
    parser.add_argument('--hidden_dim', default=200, type=int)
    parser.add_argument('--epochs', default=10, type=int)
    parser.add_argument('--dropout_rate', default=0.3, type=float)

    parser.add_argument('--use_char_embeddings', action='store_true')
    parser.add_argument('--char_vocab',
                        default='./data/ubuntu/char_vocab',
                        type=str)
    parser.add_argument('--char_vocab_size', default=300, type=int)
    parser.add_argument('--char_embeddings_dim', default=64, type=int)
    parser.add_argument('--max_token_len', default=15, type=int)
    parser.add_argument('--char_features_dim', default=200, type=int)
    parser.add_argument('--char_kernel_shape', default=3, type=int)

    args = parser.parse_args()

    if args.config_file:
        with codecs.open(args.config_file, 'r', encoding='utf-8') as f:
            settings = json.load(f)
            # iterating the dict directly would yield only keys
            for k, v in settings.items():
                if k not in ['config_file']:
                    args.__dict__[k] = v

    # check args validation
    experiment_on_ubuntu = args.train_ubuntu or args.evaluate_ubuntu
    experiment_on_douban = args.train_douban or args.evaluate_douban
    if experiment_on_ubuntu and experiment_on_douban:
        raise AssertionError(
            'Run the experiment on only one dataset at a time!')
    if not experiment_on_ubuntu and not experiment_on_douban:
        raise AssertionError(
            'Specify at least one of --train_*/--evaluate_*!')
    if os.path.exists(args.output_dir) and os.listdir(args.output_dir):
        logger.info("Output directory () already exists and is not empty.")
    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir, exist_ok=True)
    if not args.use_word_embeddings and not args.use_char_embeddings:
        raise AssertionError(
            'Specify at least one of --use_word_embeddings or --use_char_embeddings!')

    args.task = 'ubuntu' if experiment_on_ubuntu else 'douban'

    # set seed
    np.random.seed(args.random_seed)
    random.seed(args.random_seed)

    print(args)
    if args.train_ubuntu or args.train_douban:
        # init model
        with tf.device('/cpu:0'):
            model = TripleNetModel(args).model
            single_model = model
        if args.gpu_nums > 1:
            model = multi_gpu_model(model, args.gpu_nums)
        model.compile(loss='binary_crossentropy',
                      optimizer='Adamax',
                      metrics=['accuracy'])
        model_save_callback = SaveModelCallback(args, single_model)
        print('Training model.....')
        model.fit_generator(generator=data_generator(args, True),
                            # the training set is assumed to hold 1,000,000
                            # examples; steps_per_epoch must be an integer
                            steps_per_epoch=1000000 // args.train_batch_size,
                            epochs=args.epochs,
                            callbacks=[model_save_callback])

    if args.evaluate_ubuntu:
        with tf.device('/cpu:0'):
            model = TripleNetModel(args).model
        model.load_weights(args.model_weight_file)
        if args.gpu_nums > 1:
            model = multi_gpu_model(model, args.gpu_nums)
        evaluate_ubuntu(args, model)
    if args.evaluate_douban:
        with tf.device('/cpu:0'):
            model = TripleNetModel(args).model
        model.load_weights(args.model_weight_file)
        if args.gpu_nums > 1:
            model = multi_gpu_model(model, args.gpu_nums)
        evaluate_douban(args, model)
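When --config_file is given, main() overlays the JSON settings onto the parsed arguments, so the keys must match the argparse destination names. A hypothetical config written from Python; the key choices here are illustrative only:

import json

settings = {
    'train_ubuntu': True,         # same names as the argparse options above
    'use_word_embeddings': True,
    'train_batch_size': 32,
}
with open('config.json', 'w', encoding='utf-8') as f:
    json.dump(settings, f, indent=2)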