Example #1
def main(data, label_col, max_depth, n_trees, lr, mlflow_tracking_url,
         experiment_name, build_number):
    test_data, train_data = reader.load_data(data)

    pipeline_model = model.train_model(train_data, label_col, max_depth,
                                       n_trees, lr)

    rmse, mae, r2 = model.evaluate_model(pipeline_model, test_data, label_col)

    print("Model tree model (max_depth=%f, trees=%f, lr=%f):" %
          (max_depth, n_trees, lr))
    print("  RMSE: %s" % rmse)
    print("  MAE: %s" % mae)
    print("  R2: %s" % r2)

    with tracking.TrackML(mlflow_tracking_url, experiment_name,
                          build_number) as track:
        track.log_params({
            "max_depth": max_depth,
            "n_trees": n_trees,
            "lr": lr
        })
        track.log_metrics({"RMSE": rmse, "R2": r2, "MAE": mae})

        track.log_model("sklearn", pipeline_model, "retail_model")
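For context, tracking.TrackML is not shown in this example. A minimal sketch of such a context manager, assuming it is a thin wrapper over a standard MLflow run (the class and method names come from the call sites above; everything else is an assumption):

import mlflow
import mlflow.sklearn


class TrackML:
    """Hypothetical MLflow wrapper matching the call sites in Example #1."""

    def __init__(self, tracking_url, experiment_name, build_number):
        mlflow.set_tracking_uri(tracking_url)
        mlflow.set_experiment(experiment_name)
        self.build_number = build_number

    def __enter__(self):
        self.run = mlflow.start_run(run_name=str(self.build_number))
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        mlflow.end_run()

    def log_params(self, params):
        mlflow.log_params(params)

    def log_metrics(self, metrics):
        mlflow.log_metrics(metrics)

    def log_model(self, flavor, model, name):
        # Only the "sklearn" flavor used above is handled in this sketch
        if flavor == "sklearn":
            mlflow.sklearn.log_model(model, name)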
Example #2
def train_and_predict(data_path, data_filename, batch_size, n_epoch):
    """
    Create a model, load the data, and train it.
    """
    """
    Step 1: Load the data
    """
    hdf5_filename = os.path.join(data_path, data_filename)
    print("-" * 30)
    print("Loading the data from HDF5 file ...")
    print("-" * 30)

    imgs_train, msks_train, imgs_validation, msks_validation = \
        load_data(hdf5_filename)

    print("-" * 30)
    print("Creating and compiling model ...")
    print("-" * 30)
    """
    Step 2: Define the model
    """
    model = load_model(imgs_train.shape, msks_train.shape)

    model_filename, model_callbacks = get_callbacks()
    """
    Step 3: Train the model on the data
    """
    print("-" * 30)
    print("Fitting model with training data ...")
    print("-" * 30)

    history = model.fit(imgs_train,
                        msks_train,
                        batch_size=batch_size,
                        epochs=n_epoch,
                        validation_data=(imgs_validation, msks_validation),
                        verbose=1,
                        shuffle="batch",
                        callbacks=model_callbacks)

    # Append training log
    # with open("training.log","a+") as fp:
    #     fp.write("{}: {}\n".format(datetime.datetime.now(),
    #                              history.history["val_dice_coef"]))
    """
    Step 4: Evaluate the best model
    """
    print("-" * 30)
    print("Loading the best trained model ...")
    print("-" * 30)

    model = evaluate_model(model_filename, imgs_validation, msks_validation)
    """
    Step 5: Save the best model for inference
    """

    print("-" * 30)
    print("Saving the model for inference ...")
    print("-" * 30)
    save_inference_model(model, imgs_train.shape, msks_train.shape)
Example #3
def train_and_predict(data_path, data_filename, batch_size, n_epoch):
    """
    Create a model, load the data, and train it.
    """
    """
    Step 1: Load the data
    """
    hdf5_filename = os.path.join(data_path, data_filename)
    print("-" * 30)
    print("Loading the data from HDF5 file ...")
    print("-" * 30)

    # `args` here is assumed to be an argparse Namespace parsed at module level
    imgs_train, msks_train, imgs_validation, msks_validation = \
        load_data(hdf5_filename, args.batch_size,
                  [args.crop_dim, args.crop_dim])

    print("-" * 30)
    print("Creating and compiling model ...")
    print("-" * 30)
    """
    Step 2: Define the model
    """
    model = load_model(imgs_train.shape, msks_train.shape)

    model_filename, model_callbacks = get_callbacks()
    """
    Step 3: Train the model on the data
    """
    print("-" * 30)
    print("Fitting model with training data ...")
    print("-" * 30)

    model.fit(imgs_train,
              msks_train,
              batch_size=batch_size,
              epochs=n_epoch,
              validation_data=(imgs_validation, msks_validation),
              verbose=1,
              shuffle="batch",
              callbacks=model_callbacks)
    """
    Step 4: Evaluate the best model
    """
    print("-" * 30)
    print("Loading the best trained model ...")
    print("-" * 30)

    model = evaluate_model(model_filename, imgs_validation, msks_validation)
Example #4
    def test_evaluate_model(self):
        """
        Test boundaries of loss and accuracy
        """
        _, (mock_X_test, mock_y_test) = datasets.imdb.load_data(num_words=2000)
        mock_X_test = sequence.pad_sequences(mock_X_test, maxlen=500)
        mock_trained_model = load_model("models/", "mock_trained_model.h5")

        loss, acc = evaluate_model(mock_trained_model,
                                   (mock_X_test, mock_y_test))

        # Loss
        self.assertIsNotNone(loss, "loss not computed")
        self.assertGreaterEqual(loss, 0., "loss is negative")

        # Accuracy
        self.assertIsNotNone(acc, "accuracy not computed")
        self.assertGreaterEqual(acc, 0., "accuracy is negative")
        self.assertLessEqual(acc, 1., "accuracy is greater than 1")
Example #5
    def test_evaluate_model(self):
        """
        Test boundaries of loss and accuracy
        """
        texts, labels = preprocess_labels(data_dir_path="data/mock_aclImdb",
                                          dataset="test")
        vectorized_texts, word_index = tokenize_data(texts)
        mock_test_set = (vectorized_texts, labels)
        mock_trained_model = load_model("models/", "mock_trained_model.h5")

        loss, acc = evaluate_model(mock_trained_model, mock_test_set)

        # Loss
        self.assertIsNotNone(loss, "loss not computed")
        self.assertGreaterEqual(loss, 0., "loss is negative")

        # Accuracy
        self.assertIsNotNone(acc, "accuracy not computed")
        self.assertGreaterEqual(acc, 0., "accuracy is negative")
        self.assertLessEqual(acc, 1., "accuracy is greater than 1")
Example #6
from model import evaluate_model

training_data_folder = '/home/girish/dev/sdcnd/CarND-Behavioral-Cloning-P3/train4/'
csv_file_name = 'driving_log.csv'
model_file_name = "model.h5"

csv_file_path = training_data_folder + csv_file_name

test_loss = evaluate_model(model_file_name, training_data_folder,
                           csv_file_path)

print(test_loss)
Example #7
    def test_is_float(self):
        assert isinstance(model.evaluate_model(), float)
Example #8
    def test_is_probability(self):
        assert 0 <= model.evaluate_model() <= 1
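Examples #7 and #8 only assert on the return value. A minimal sketch of a model.evaluate_model that would satisfy both tests, assuming a fitted classifier and a held-out test split are available at module level (clf, X_test and y_test are placeholder names, not part of the original code):

from sklearn.metrics import accuracy_score


def evaluate_model():
    # clf, X_test and y_test are assumed to be defined elsewhere in the module
    predictions = clf.predict(X_test)
    # accuracy_score returns a value between 0 and 1, so both tests pass
    return float(accuracy_score(y_test, predictions))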
Example #9
def main():
    download_dataset()
    vectorize_dataset()
    train_model()
    evaluate_model()
Example #10
def main(_):
    if not os.path.exists(FLAGS.save_dir):
        os.makedirs(FLAGS.save_dir)

    with open(os.path.join(FLAGS.save_dir, 'params.json'), 'w') as f:
        f.write(json.dumps(flags.FLAGS.__flags, indent=1))

    tr_infoboxes = os.path.join(FLAGS.data_dir, 'train', 'train.box')
    tr_sentences = os.path.join(FLAGS.data_dir, 'train', 'train_in.sent')

    te_infoboxes = os.path.join(FLAGS.data_dir, 'test', 'test.box')
    te_sentences = os.path.join(FLAGS.data_dir, 'test', 'test_in.sent')

    va_infoboxes = os.path.join(FLAGS.data_dir, 'valid', 'valid.box')
    va_sentences = os.path.join(FLAGS.data_dir, 'valid', 'valid_in.sent')

    # Checkpoint directory
    checkpoint_dir = os.path.join(FLAGS.save_dir, 'checkpoints')
    if not os.path.exists(checkpoint_dir):
        os.makedirs(checkpoint_dir)

    batch_size = FLAGS.batch_size
    max_source_len = FLAGS.max_source_len
    sum_seq_len = FLAGS.sum_seq_length

    print("Building the word index")
    start = time.time()
    if FLAGS.load_pretrained == 1:
        targ_path = os.path.join(FLAGS.data_dir, 'target_words.txt')
        embed_path = os.path.join(FLAGS.data_dir, 'words.txt')
        if FLAGS.trainable == 1:
            trainable = True
        else:
            trainable = False
        if os.path.exists(targ_path) and os.path.exists(embed_path):
            word_to_id, init_embed = build_index(targ_path, embed_path,
                                                 FLAGS.vocab_size)
            load_init = True
        else:
            print('Pretrained embedding not found in the data folder')
            return
    else:
        word_to_id = build_vocabulary(tr_infoboxes, tr_sentences,
                                      FLAGS.vocab_size, FLAGS.min_field_freq,
                                      FLAGS.fields_per_box,
                                      FLAGS.sum_seq_length)
        init_embed = None  # init_embed is unused when pretrained embeddings are not loaded
        trainable = True
        load_init = False
    id_to_word = dict(zip(word_to_id.values(), word_to_id.keys()))
    vocab_size = len(word_to_id)
    duration = time.time() - start
    print("Built index in %.3f s" % (duration))

    print("Building the training dataset object")
    start = time.time()
    train_dataset = BaseDataset(tr_infoboxes, tr_sentences,
                                FLAGS.tokens_per_field, max_source_len,
                                sum_seq_len, word_to_id, batch_size)
    duration = time.time() - start
    print("Built train dataset in %.5f s" % (duration))

    print("Building the test dataset object")
    start = time.time()
    test_dataset = BaseDataset(te_infoboxes, te_sentences,
                               FLAGS.tokens_per_field, max_source_len,
                               sum_seq_len, word_to_id, batch_size)
    duration = time.time() - start
    print("Built test dataset in %.5f s" % (duration))

    print("Building the valid dataset object")
    start = time.time()
    valid_dataset = BaseDataset(va_infoboxes, va_sentences,
                                FLAGS.tokens_per_field, max_source_len,
                                sum_seq_len, word_to_id, batch_size)
    duration = time.time() - start
    print("Built valid dataset in %.5f s" % (duration))

    with tf.Graph().as_default():
        tf.set_random_seed(1234)

        model = BaselineSeq2Seq(vocab_size, FLAGS.embedding_size,
                                FLAGS.learning_rate, FLAGS.optimizer,
                                FLAGS.rnn_size, init_embed, load_init,
                                trainable)

        enc_inputs = tf.placeholder(tf.int32,
                                    shape=(batch_size, max_source_len),
                                    name="encoder_inputs")
        dec_inputs = tf.placeholder(tf.int32,
                                    shape=(batch_size, sum_seq_len + 1),
                                    name="decoder_inputs")
        dec_weights = tf.placeholder(tf.float32,
                                     shape=(batch_size, sum_seq_len),
                                     name="decoder_weights")
        feed_previous = tf.placeholder(tf.bool, name="feed_previous")

        logits_op = model.inference_s2s_att(enc_inputs, dec_inputs,
                                            feed_previous)
        loss_op = model.loss(logits_op, dec_inputs, dec_weights)
        train_op = model.training(loss_op)

        saver = tf.train.Saver()

        gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.7)
        sess = tf.Session(config=tf.ConfigProto(gpu_options=gpu_options))

        init = tf.initialize_all_variables()
        sess.run(init)

        print "Trainable variables"
        print '\n'.join([v.name for v in tf.trainable_variables()])

        while train_dataset.epochs_done < FLAGS.num_epochs:
            epochs_done = train_dataset.epochs_done
            start_e = time.time()
            for step in range(train_dataset.num_batches):
                benc_ins, bdec_ins, bdec_wts = train_dataset.next_batch()
                if train_dataset.epochs_done >= FLAGS.true_feed:
                    train_feed = True
                else:
                    train_feed = False
                feed_dict = {
                    enc_inputs: benc_ins,
                    dec_inputs: bdec_ins,
                    dec_weights: bdec_wts,
                    feed_previous: train_feed
                }
                _, loss_val = sess.run([train_op, loss_op],
                                       feed_dict=feed_dict)
                perplexity = np.exp(
                    float(loss_val)) if loss_val < 300 else float('inf')

                if step % FLAGS.print_every == 0:
                    with open(
                            os.path.join(
                                FLAGS.save_dir,
                                str(train_dataset.epochs_done + 1) +
                                '_train_loss.log'), 'a') as log_f:
                        log_f.write(
                            'epoch %d batch %d: loss = %.3f perplexity = %.2f\n'
                            % (train_dataset.epochs_done + 1, step, loss_val,
                               perplexity))
                    print('epoch %d batch %d: loss = %.3f perplexity = %.2f ' %
                          (train_dataset.epochs_done + 1, step, loss_val,
                           perplexity))
                    sys.stdout.flush()

                if step % FLAGS.valid_every == 0 and step != 0:
                    v_loss, v_perp = evaluate_model(sess, valid_dataset,
                                                    loss_op, enc_inputs,
                                                    dec_inputs, dec_weights,
                                                    feed_previous)
                    with open(os.path.join(FLAGS.save_dir, 'valid.log'),
                              'a') as log_f:
                        log_f.write(
                            'valid : epoch %d batch %d : loss = %0.3f perplexity = %0.3f\n'
                            % (train_dataset.epochs_done + 1, step, v_loss,
                               v_perp))
                    print(
                        'valid : epoch %d batch %d : loss = %0.3f perplexity = %0.3f\n'
                        %
                        (train_dataset.epochs_done + 1, step, v_loss, v_perp))
                    sys.stdout.flush()

                if step % FLAGS.test_every == 0 and step != 0:
                    t_loss, t_perp = evaluate_model(sess, test_dataset,
                                                    loss_op, enc_inputs,
                                                    dec_inputs, dec_weights,
                                                    feed_previous)
                    with open(os.path.join(FLAGS.save_dir, 'test.log'),
                              'a') as log_f:
                        log_f.write(
                            'test : epoch %d batch %d : loss = %0.3f perplexity = %0.3f\n'
                            % (train_dataset.epochs_done + 1, step, t_loss,
                               t_perp))
                    print(
                        'test : epoch %d batch %d : loss = %0.3f perplexity = %0.3f\n'
                        %
                        (train_dataset.epochs_done + 1, step, t_loss, t_perp))
                    sys.stdout.flush()

                if step % FLAGS.train_step_every == 0 and step != 0:
                    epochs_done = train_dataset.epochs_done
                    index_in_epoch = train_dataset.index_in_epoch
                    benc_ins, bdec_ins, bdec_wts, sents = train_dataset.next_batch_gen(
                    )
                    feed_dict = {
                        enc_inputs: benc_ins,
                        dec_inputs: bdec_ins,
                        dec_weights: bdec_wts,
                        feed_previous: True
                    }
                    logits = np.array(sess.run(logits_op, feed_dict=feed_dict))
                    logits = np.reshape(
                        logits,
                        (FLAGS.batch_size, FLAGS.sum_seq_length, vocab_size))

                    save_path = os.path.join(
                        FLAGS.save_dir,
                        str(train_dataset.epochs_done + 1) + '_' + str(step) +
                        '_train.gen')
                    true_path = os.path.join(
                        FLAGS.save_dir,
                        str(train_dataset.epochs_done + 1) + '_' + str(step) +
                        '_train.true')

                    with open(save_path, 'a') as save_f:
                        for idx in xrange(batch_size):
                            words = []
                            for l in xrange(FLAGS.sum_seq_length):
                                tokenid = np.argmax(logits[idx, l])
                                words.append(id_to_word[tokenid])
                            save_f.write(' '.join(words) + '\n')
                    with open(true_path, 'a') as true_f:
                        for sent in sents:
                            true_f.write(sent)
                    train_dataset.reset_batch(index_in_epoch, epochs_done)

                if step % FLAGS.test_step_every == 0 and step != 0:
                    benc_ins, bdec_ins, bdec_wts, sents = test_dataset.next_batch_gen(
                    )
                    feed_dict = {
                        enc_inputs: benc_ins,
                        dec_inputs: bdec_ins,
                        dec_weights: bdec_wts,
                        feed_previous: True
                    }
                    logits = np.array(sess.run(logits_op, feed_dict=feed_dict))
                    logits = np.reshape(
                        logits,
                        (FLAGS.batch_size, FLAGS.sum_seq_length, vocab_size))

                    save_path = os.path.join(
                        FLAGS.save_dir,
                        str(train_dataset.epochs_done + 1) + '_' + str(step) +
                        '_test.gen')
                    true_path = os.path.join(
                        FLAGS.save_dir,
                        str(train_dataset.epochs_done + 1) + '_' + str(step) +
                        '_test.true')

                    with open(save_path, 'a') as save_f:
                        for idx in xrange(batch_size):
                            words = []
                            for l in xrange(FLAGS.sum_seq_length):
                                tokenid = np.argmax(logits[idx, l])
                                words.append(id_to_word[tokenid])
                            save_f.write(' '.join(words) + '\n')
                    with open(true_path, 'a') as true_f:
                        for sent in sents:
                            true_f.write(sent)
                    test_dataset.reset_batch()

                if step % FLAGS.valid_step_every == 0 and step != 0:
                    benc_ins, bdec_ins, bdec_wts, sents = valid_dataset.next_batch_gen(
                    )
                    feed_dict = {
                        enc_inputs: benc_ins,
                        dec_inputs: bdec_ins,
                        dec_weights: bdec_wts,
                        feed_previous: True
                    }
                    logits = np.array(sess.run(logits_op, feed_dict=feed_dict))
                    logits = np.reshape(
                        logits,
                        (FLAGS.batch_size, FLAGS.sum_seq_length, vocab_size))

                    save_path = os.path.join(
                        FLAGS.save_dir,
                        str(train_dataset.epochs_done + 1) + '_' + str(step) +
                        '_valid.gen')
                    true_path = os.path.join(
                        FLAGS.save_dir,
                        str(train_dataset.epochs_done + 1) + '_' + str(step) +
                        '_valid.true')

                    with open(save_path, 'a') as save_f:
                        for idx in xrange(batch_size):
                            words = []
                            for l in xrange(FLAGS.sum_seq_length):
                                tokenid = np.argmax(logits[idx, l])
                                words.append(id_to_word[tokenid])
                            save_f.write(' '.join(words) + '\n')
                    with open(true_path, 'a') as true_f:
                        for sent in sents:
                            true_f.write(sent)
                    valid_dataset.reset_batch()

            if train_dataset.epochs_done != epochs_done + 1:
                train_dataset.reset_batch(epochs_completed=epochs_done + 1)

            duration_e = time.time() - start_e
            with open(os.path.join(FLAGS.save_dir, 'time_taken.txt'),
                      'a') as time_f:
                time_f.write('Epoch : %d\tTime taken : %0.5f\n' %
                             (train_dataset.epochs_done, duration_e))

            if train_dataset.epochs_done % FLAGS.save_every_epochs == 0:
                modelfile = os.path.join(
                    checkpoint_dir,
                    str(train_dataset.epochs_done) + '.ckpt')
                saver.save(sess, modelfile)

            if train_dataset.epochs_done % FLAGS.gen_valid_every == 0:
                true_path = os.path.join(
                    FLAGS.save_dir,
                    str(train_dataset.epochs_done) + '_valid.true')
                save_path = os.path.join(
                    FLAGS.save_dir,
                    str(train_dataset.epochs_done) + '_valid.gen')
                generate_sentences(sess,
                                   valid_dataset,
                                   logits_op,
                                   enc_inputs,
                                   dec_inputs,
                                   dec_weights,
                                   feed_previous,
                                   id_to_word,
                                   FLAGS.batch_size,
                                   FLAGS.sum_seq_length,
                                   vocab_size,
                                   save_path,
                                   true_path,
                                   only_num_batches=None)

            if train_dataset.epochs_done % FLAGS.gen_train_every == 0:
                true_path = os.path.join(
                    FLAGS.save_dir,
                    str(train_dataset.epochs_done) + '_train.true')
                save_path = os.path.join(
                    FLAGS.save_dir,
                    str(train_dataset.epochs_done) + '_train.gen')
                generate_sentences(sess,
                                   train_dataset,
                                   logits_op,
                                   enc_inputs,
                                   dec_inputs,
                                   dec_weights,
                                   feed_previous,
                                   id_to_word,
                                   FLAGS.batch_size,
                                   FLAGS.sum_seq_length,
                                   vocab_size,
                                   save_path,
                                   true_path,
                                   only_num_batches=10)

            if train_dataset.epochs_done % FLAGS.gen_test_every == 0:
                true_path = os.path.join(
                    FLAGS.save_dir,
                    str(train_dataset.epochs_done) + '_test.true')
                save_path = os.path.join(
                    FLAGS.save_dir,
                    str(train_dataset.epochs_done) + '_test.gen')
                generate_sentences(sess,
                                   test_dataset,
                                   logits_op,
                                   enc_inputs,
                                   dec_inputs,
                                   dec_weights,
                                   feed_previous,
                                   id_to_word,
                                   FLAGS.batch_size,
                                   FLAGS.sum_seq_length,
                                   vocab_size,
                                   save_path,
                                   true_path,
                                   only_num_batches=None)
Example #11
if __name__ == "__main__":
    config = Config()
    train_file = "data_labeled"
    if len(sys.argv) > 1:
        train_file = sys.argv[1]

    dataset = Dataset(config)
    lines = dataset.read_lines(train_file)
    triples = dataset.get_data(lines)
    sentence_batches, action_batches = dataset.get_action_batches(
        triples, config.batches_num)

    model = Transformer(config, len(dataset.vocab))
    if torch.cuda.is_available():
        model.cuda()
    model.train()
    optimizer = optim.Adam(model.parameters(), lr=config.lr)
    # Multi-class cross-entropy (negative log-likelihood) loss
    NLLLoss = nn.NLLLoss()
    # Attach the optimizer and loss function to the model
    model.add_optimizer(optimizer)
    model.add_loss_op(NLLLoss)

    train_losses = []
    for i in range(config.max_epochs):
        print("Epoch: {}".format(i))
        train_loss = model.run_epoch(sentence_batches, action_batches, i)
        train_losses.append(train_loss)

    train_acc = evaluate_model(model, sentence_batches, action_batches)
    print("Final Training Dataset Accuracy: {.4f}".format(train_acc))
Example #12
    '''Faster computation on CPU (only if using tensorflow-gpu)'''
    if TENSORFLOW_BACKEND:
        print('Switching to TensorFlow for CPU...')
        os.environ['CUDA_VISIBLE_DEVICES'] = '-1'

    if len(sys.argv) == 8:
        train_stock_name, val_stock_name, window_size, batch_size, episode_count, model_name, pretrained = sys.argv[1], \
            sys.argv[2], int(sys.argv[3]), int(sys.argv[4]), int(sys.argv[5]), sys.argv[6], bool(int(sys.argv[7]))
    else:
        print(
            'Usage: python train.py [train stock] [val stock] [window] [batch size] [episodes] [model] [pretrained (0/1)]'
        )
        exit(0)

    agent = Agent(window_size, pretrained=pretrained, model_name=model_name)
    train_data = get_stock_data(train_stock_name)
    test_data = get_stock_data(val_stock_name)

    initial_offset = test_data[1] - test_data[0]

    for episode in range(1, episode_count + 1):
        train_result = train_model(agent,
                                   episode,
                                   train_data,
                                   episode_count=episode_count,
                                   batch_size=batch_size,
                                   window_size=window_size)
        val_result = evaluate_model(agent, test_data, window_size=window_size)
        show_train_result(train_result, val_result, initial_offset)

    print('Done Training!')
Example #13
def main(cfg):
    """
    # Paths to the training, validation, and test data
    train_list = make_datapath_list(
        csv_file=cfg.csv.train, data_id=cfg.csv.id, data_dir=cfg.data.train_dir
    )
    val_list = make_datapath_list(
        csv_file=cfg.csv.val, data_id=cfg.csv.id, data_dir=cfg.data.val_dir
    )
    test_list = make_datapath_list(
        csv_file=cfg.csv.test, data_id=cfg.csv.id, data_dir=cfg.data.test_dir
    )

    # Display an image for a quick check
    
    size = 224
    mean = (0.485, 0.456, 0.406)
    std = (0.229, 0.224, 0.225)
    img = train_list[100]
    img = Image.open(img)
    plt.imshow(img)
    plt.show()

    transform = ImageTransform(size, mean, std)
    img_transformed = transform(img)

    img_transformed = img_transformed.numpy().transpose((1, 2, 0))
    img_transformed = np.clip(img_transformed, 0, 1)
    plt.imshow(img_transformed)
    plt.show()

    
    # Create the datasets
    train_dataset = IsicDataset(
        file_list=train_list,
        transform=ImageTransform(cfg.image.size, cfg.image.mean, cfg.image.std),
        phase="train",
        csv_file=cfg.csv.train,
        label_name=cfg.csv.label,
    )
    val_dataset = IsicDataset(
        file_list=val_list,
        transform=ImageTransform(cfg.image.size, cfg.image.mean, cfg.image.std),
        phase="val",
        csv_file=cfg.csv.val,
        label_name=cfg.csv.label,
    )

    test_dataset = IsicDataset(
        file_list=test_list,
        transform=ImageTransform(cfg.image.size, cfg.image.mean, cfg.image.std),
        phase="test",
        csv_file=cfg.csv.test,
        label_name=cfg.csv.label,
    )
    """

    # Build datasets from image folders (comment this out if not used)
    # Training dataset
    train_dataset = make_trainset(
        dataroot=cfg.data.train_dir,
        resize=cfg.image.size,
        mean=cfg.image.mean,
        std=cfg.image.std,
    )
    # Validation dataset
    val_dataset = make_testset(
        dataroot=cfg.data.val_dir,
        resize=cfg.image.size,
        mean=cfg.image.mean,
        std=cfg.image.std,
    )
    # Test dataset
    test_dataset = make_testset(
        dataroot=cfg.data.test_dir,
        resize=cfg.image.size,
        mean=cfg.image.mean,
        std=cfg.image.std,
    )

    # Create a dict of dataloaders keyed by 'train', 'val' and 'test'
    dataloaders_dict = create_dataloader(
        batch_size=cfg.image.batch_size,
        train_dataset=train_dataset,
        val_dataset=val_dataset,
        test_dataset=test_dataset,
    )

    # Sanity-check one batch
    batch_iterator = iter(dataloaders_dict["train"])
    inputs, labels = next(batch_iterator)

    print(inputs.size())
    print(labels)

    # Load the network model
    net = models.vgg16_bn(pretrained=True)
    log.info(net)

    net.classifier[6] = nn.Linear(in_features=4096, out_features=2)
    net.classifier[2] = nn.Dropout(p=0.6)
    net.classifier[5] = nn.Dropout(p=0.6)
    net.train()

    # Loss function

    criterion = nn.CrossEntropyLoss()
    # criterion = nn.BCELoss()
    log.info(net)

    # Parameters to fine-tune
    params_to_update = []

    update_param_names = cfg.train.update_param_names

    # Only update parameters listed in update_param_names
    for name, param in net.named_parameters():
        if name in update_param_names:
            param.requires_grad = True
            params_to_update.append(param)
            log.info(name)
        else:
            param.requires_grad = False

    # Log the names of the parameters being updated
    log.info(params_to_update)

    # Optimizer
    optimizer = optim.SGD(params=params_to_update,
                          lr=cfg.optimizer.lr,
                          momentum=cfg.optimizer.momentum)
    log.info(optimizer)

    # Read the number of epochs from the config file
    num_epochs = cfg.train.num_epochs

    # GPU setup
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    print("Device in use:", device)

    # Move the network to the GPU
    net.to(device)
    torch.backends.cudnn.benchmark = True

    # Lists to track loss and accuracy
    train_loss = []
    train_acc = []
    test_loss = []
    test_acc = []

    # Training and validation loop
    for epoch in range(num_epochs):

        log.info("Epoch {} / {}".format(epoch + 1, num_epochs))
        log.info("----------")

        # Train
        train_history = train_model(net, dataloaders_dict["train"], criterion,
                                    optimizer)

        # Record training loss and accuracy
        train_loss.append(train_history["train_loss"])
        train_acc.append(train_history["train_acc"])

        # Validate
        test_history = test_model(net, dataloaders_dict["val"], criterion)

        # Record validation loss and accuracy
        test_loss.append(test_history["test_loss"])
        test_acc.append(test_history["test_acc"])

    # Create figure and axes instances
    fig_loss, ax_loss = plt.subplots(figsize=(10, 10))
    ax_loss.plot(range(1, num_epochs + 1, 1), train_loss, label="train_loss")
    ax_loss.plot(range(1, num_epochs + 1, 1), test_loss, label="test_loss")
    ax_loss.set_xlabel("epoch")
    ax_loss.legend()
    fig_loss.savefig("loss.png")

    fig_acc, ax_acc = plt.subplots(figsize=(10, 10))
    ax_acc.plot(range(1, num_epochs + 1, 1), train_acc, label="train_acc")
    ax_acc.plot(range(1, num_epochs + 1, 1), test_acc, label="test_acc")
    ax_acc.legend()
    ax_acc.set_xlabel("epoch")
    fig_acc.savefig("acc.png")
    """
    # Load saved PyTorch network parameters
    # Get the current directory
    current_dir = pathlib.Path(__file__).resolve().parent
    print(current_dir)
    # Use this when loading previously trained weights
    load_path = str(current_dir) + "/weights_fine_tuning.pth"
    load_weights = torch.load(load_path)
    net.load_state_dict(load_weights)
    """

    evaluate_history = evaluate_model(net, dataloaders_dict["test"], criterion)
    print(evaluate_history["confusion_matrix"])

    # Compute evaluation metrics (accuracy, precision, recall, F1)
    efficienct = calculate_efficiency(evaluate_history["confusion_matrix"])

    log.info("正解率: " + str(efficienct["accuracy"]))
    log.info("適合率: " + str(efficienct["precision"]))
    log.info("再現率: " + str(efficienct["recall"]))
    log.info("f1値 :" + str(efficienct["f1"]))

    # Build and plot the confusion matrix
    fig_conf, ax_conf = plt.subplots(figsize=(10, 10))
    sns.heatmap(
        evaluate_history["confusion_matrix"],
        annot=True,
        fmt="d",
        cmap="Reds",
    )
    ax_conf.set_title("confusion_matrix")
    ax_conf.set_xlabel("Predicted label")
    ax_conf.set_ylabel("True label")
    fig_conf.savefig("confusion_matrix.png")

    # Save the model parameters
    save_path = "./melanoma_nevi_classifier.pth"
    torch.save(net.state_dict(), save_path)
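calculate_efficiency is only called here. A minimal sketch under the assumption of a 2x2 confusion matrix whose rows are true labels and whose columns are predictions, with class 1 treated as the positive class:

def calculate_efficiency(confusion_matrix):
    # Assumed layout: rows = true labels, columns = predictions, class 1 = positive
    tn, fp = confusion_matrix[0][0], confusion_matrix[0][1]
    fn, tp = confusion_matrix[1][0], confusion_matrix[1][1]
    accuracy = (tp + tn) / (tp + tn + fp + fn)
    precision = tp / (tp + fp) if (tp + fp) else 0.0
    recall = tp / (tp + fn) if (tp + fn) else 0.0
    f1 = (2 * precision * recall / (precision + recall)
          if (precision + recall) else 0.0)
    return {"accuracy": accuracy, "precision": precision,
            "recall": recall, "f1": f1}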
Example #14
"""
    The main driver script
"""

from __future__ import print_function
import sys
from model import evaluate_model


if len(sys.argv) != 1:
    print("The correct syntax for running the script is: python main.py")
    sys.exit(1)

evaluate_model()
Example #15
def run_validation(data_path, n_times, num_epochs, train_df, valid_df):

    for i in range(n_times):

        print('RUN {}'.format(i))

        train_input = RNNInput(train_df)
        config = RNNModelConfig.init_random()
        print(config)

        tf.reset_default_graph()
        with tf.variable_scope("Model", reuse=None):
            model = RNNModel(config)

        with tf.Session() as sess:
            sess.run(tf.global_variables_initializer())

            # Train model
            start = time.time()
            try:
                train_losses = train_model(sess, model, train_input, num_epochs)

            except ValueError:
                train_loss = np.nan
                valid_loss = np.nan
                train_time = 0.0
                valid_preds = []

            else:
                train_time = time.time() - start
                # Dump figure
                learning_fig = plt.semilogy(train_losses)[0].figure
                fig_fname = '{}.png'.format(i)
                learning_fig.savefig(os.path.join(data_path, fig_fname))
                plt.close(learning_fig)

                # Evaluate model
                train_loss, _ = evaluate_model(sess, model, train_input)
                valid_input = RNNInput(valid_df)
                valid_loss, valid_preds = evaluate_model(sess, model, valid_input)

                print "Training loss: {}".format(train_loss)
                print
                print "Validation loss: {}".format(valid_loss)
                print

            finally:
                print('Dumping...')
                # Dump results
                res = {
                    'train_loss': float(train_loss),
                    'train_time': float(train_time),
                    'valid_loss': float(valid_loss),
                    'config': config.as_dict(),
                    'valid_preds': np.array(valid_preds).astype(float).tolist()
                }
                res_fname = '{}.txt'.format(i)
                with open(os.path.join(data_path, res_fname), 'w') as f:
                    json.dump(res, f)

            print('Done!')
            print()
Example #16
    if len(sys.argv) == 3:
        stock_name, window_size = sys.argv[1], int(sys.argv[2])
        model_name = None
    elif len(sys.argv) == 4:
        stock_name, window_size, model_name = sys.argv[1], int(
            sys.argv[2]), sys.argv[3]
    else:
        print('Usage: python evaluate.py [stock] [window] [model (optional)]')
        print(
            'NOTE - All models in "models/" dir will be evaluated if no pretrained model is provided'
        )
        exit(0)

    data = get_stock_data(stock_name)
    initial_offset = data[1] - data[0]

    if model_name is not None:
        '''Single Model Evaluation'''
        agent = Agent(window_size, pretrained=True, model_name=model_name)
        profit = evaluate_model(agent, data, window_size=window_size)
        show_eval_result(model_name, profit, initial_offset)
        del agent
    else:
        '''Multiple Model Evaluation'''
        for model in os.listdir('models'):
            if not os.path.isdir('models/{}'.format(model)):
                agent = Agent(window_size, pretrained=True, model_name=model)
                profit = evaluate_model(agent, data, window_size=window_size)
                show_eval_result(model, profit, initial_offset)
                del agent

    print('Done Evaluating!')
    plt.gcf().savefig('loss.png')
Example #17
if __name__ == '__main__':
    # Load the dataset and split them into training and test sets
    X_train, X_test, Y_train, Y_test = get_dataset()

    # Create the model and compile it
    model = create_model()
    compile_model(model)

    print(model.summary())
    print()

    print('Training model...')
    training_history = fit_model(model, X_train, Y_train)
    print()

    print('Evaluating model...')
    metrics = evaluate_model(model, X_test, Y_test)
    print()

    print('Loss on test set is:', metrics[0])
    print('Accuracy on test set is:', metrics[-1])
    print()

    # Uncomment to see the plot of the training and validation losses (loss.png)
    # print('Plotting training history...')
    # plot_training_history(training_history)
    # print('Done')
Example #18
def main(filename):
    print('loading data')
    # Establish database connection
    with open(
            '/data/groups/schools1/mlpolicylab_fall20_schools1/pipeline/db_info.yaml',
            'r') as f:
        db_params = yaml.safe_load(f)['db']

    engine = create_engine('postgres://:@{host}:{port}/{dbname}'.format(
        host=db_params['host'],
        port=db_params['port'],
        dbname=db_params['dbname'],
    ))
    # Load data from database to dataframe
    df = load_data(filename, engine)

    # Split dataframe into train and test data.
    splits, years_reference = train_test_split(df)

    for i, (train_df, test_df) in enumerate(splits):
        print(f'processing split {i}')

        # Explore data for each of the cohort
        explore_data(train_df)

        # Process train and test data separately
        updated_df_train = process_data(train_df)
        updated_df_test = process_data(test_df)

        # Upload the train and test data to the database for future reference and easy retrieval
        updated_df_train.columns = [
            col.replace('(', '').replace(')', '').replace(' ', '_').replace('/', '_')
            for col in updated_df_train.columns
        ]
        updated_df_test.columns = [
            col.replace('(', '').replace(')', '').replace(' ', '_').replace('/', '_')
            for col in updated_df_test.columns
        ]

        # `timestamp` is assumed to be a run-timestamp string defined at module level
        table_name = timestamp + '_' + str(years_reference[i][1]) + '_' + str(
            years_reference[i][0])

        df_to_db(table_name, 'processed_data', updated_df_train,
                 updated_df_test, engine)

        # Retrieve train and test data from the database
        processed_train, processed_test = db_to_df(table_name,
                                                   'processed_data', engine)

        updated_df_train_f = processed_train.copy()
        updated_df_train_l = processed_train.copy()
        updated_df_test_f = processed_test.copy()
        updated_df_test_l = processed_test.copy()

        # Create features for test and train data
        features_train, train_student_ids = create_features(updated_df_train_f)
        features_test, test_student_ids = create_features(updated_df_test_f)

        # Create labels
        label_train = create_label(updated_df_train_l)
        label_test = create_label(updated_df_test_l)

        # Concatenating features and labels to save in the database
        train_concat = pd.concat([features_train, label_train],
                                 axis=1,
                                 sort=False)
        test_concat = pd.concat([features_test, label_test],
                                axis=1,
                                sort=False)

        # Calculate a baseline precision (grade 9 GPA heuristic) and the base rate
        baseline_precision = baseline(test_concat, years_reference[i])
        base_rate = sum(train_concat.not_graduated) / len(train_concat)

        # Saving and reading from database
        df_to_db(table_name, 'model_data', train_concat, test_concat, engine)
        model_train, model_test = db_to_df(table_name, 'model_data', engine)

        features_train = model_train.iloc[:, :-1]
        label_train = model_train.iloc[:, -1]
        features_test = model_test.iloc[:, :-1]
        label_test = model_test.iloc[:, -1]

        # Build model
        algos = ["Logistic", "SVM", "RandomForest", "DecisionTree"]
        gs_params = {
            "Logistic":
            ParameterGrid({
                'solver': ['lbfgs', 'liblinear', 'saga'],
                'C': [0.001, 0.01, 0.1, 1, 2, 5, 10]
            }),
            "SVM":
            ParameterGrid({
                'C': [0.01, 1, 2, 5, 10],
                'kernel': ['rbf', 'sigmoid']
            }),
            "RandomForest":
            ParameterGrid({
                'n_estimators': [30, 50, 100, 500, 1000, 10000],
                'max_depth': [5, 10, 20, 50],
                'min_samples_split': [5, 10, 15],
                'max_features': ['auto', 'log2', 'sqrt']
            }),
            "DecisionTree":
            ParameterGrid({
                'criterion': ['gini', 'entropy'],
                'max_depth': [5, 10, 20, 50],
                'min_samples_split': [5, 10, 15]
            })
        }

        print('performing model grid search')
        for model_name in algos:
            params = gs_params[model_name]
            for param in params:
                model = build_model(features_train, label_train, model_name,
                                    param)

                # Perform prediction
                pred_proba_train = prediction(features_train, model)
                pred_proba_test = prediction(features_test, model)

                # Convert prediction probabilities to dataframes for further processing
                pred_train_df = pd.DataFrame(pred_proba_train,
                                             columns=['probability'])
                pred_test_df = pd.DataFrame(pred_proba_test,
                                            columns=['probability'])

                # Retrieve hyperparameters for processing
                hyperparameters = ' '.join(
                    ["{}: {}".format(key, param[key]) for key in param.keys()])

                pred_train_df['model'] = model_name
                pred_train_df['params'] = hyperparameters
                pred_test_df['model'] = model_name
                pred_test_df['params'] = hyperparameters

                # Get the prediction scores for test and train data
                predictions_train = pd.concat(
                    [train_student_ids, pred_train_df], axis=1, sort=False)
                predictions_test = pd.concat([test_student_ids, pred_test_df],
                                             axis=1,
                                             sort=False)

                # Calculate the bias metrics
                TPR_gender, FDR_gender = bias_metrics(predictions_test,
                                                      processed_test, 'gender')
                TPR_disadvantagement, FDR_disadvantagement = bias_metrics(
                    predictions_test, processed_test, 'disadvantagement')

                # Load the prediction results to database for creating visualizations
                df_to_db(table_name, 'predictions', predictions_train,
                         predictions_test, engine)

                # Evaluate model
                metric = evaluate_model(features_test,
                                        label_test,
                                        model,
                                        model_name,
                                        baseline_precision,
                                        hyperparameters,
                                        columns=model_train.columns[:-1])

                # saving results
                df_summary = pd.DataFrame({
                    'test_year': years_reference[i][1],
                    'train_since': years_reference[i][0],
                    'algorithm': model_name,
                    'hyperparameters': hyperparameters,
                    'baserate': base_rate,
                    'baseline': [baseline_precision],
                    'precision': metric,
                    'TPR_gender': TPR_gender,
                    'FDR_gender': FDR_gender,
                    'TPR_disadvantagement': TPR_disadvantagement,
                    'FDR_disadvantagement': FDR_disadvantagement
                })
                df_summary.to_sql(name=timestamp,
                                  schema='performance_metrics',
                                  con=engine,
                                  if_exists='append',
                                  index=False)
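bias_metrics is used in Example #18 but not shown. A rough sketch of per-group true positive rate and false discovery rate, assuming the processed frame carries the group column and a binary not_graduated label aligned row-for-row with the prediction frame, and a fixed 0.5 decision threshold (all of these are assumptions):

def bias_metrics(predictions, processed, group_col, threshold=0.5):
    labels = processed['not_graduated'].reset_index(drop=True)
    groups = processed[group_col].reset_index(drop=True)
    predicted = (predictions['probability'].reset_index(drop=True)
                 >= threshold).astype(int)

    tpr, fdr = {}, {}
    for group in groups.unique():
        mask = groups == group
        tp = int(((predicted == 1) & (labels == 1) & mask).sum())
        fn = int(((predicted == 0) & (labels == 1) & mask).sum())
        fp = int(((predicted == 1) & (labels == 0) & mask).sum())
        tpr[group] = tp / (tp + fn) if (tp + fn) else 0.0  # true positive rate
        fdr[group] = fp / (fp + tp) if (fp + tp) else 0.0  # false discovery rate
    return tpr, fdr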