Example #1
 def run_experiment(_run):
     assert _run.info.get("tensorflow", None) is None
     with tf.Session() as s:
         with LogFileWriter(ex):
             swr = tf.summary.FileWriter(logdir=TEST_LOG_DIR, graph=s.graph)
         assert swr is not None
         assert _run.info["tensorflow"]["logdirs"] == [TEST_LOG_DIR]
         tf.summary.FileWriter(TEST_LOG_DIR2, s.graph)
         assert _run.info["tensorflow"]["logdirs"] == [TEST_LOG_DIR, TEST_LOG_DIR2]
Example #2
 def run_experiment(_run):
     assert _run.info.get("tensorflow", None) is None
     with tf.Session() as s:
         # The log directory should only be captured within the scope of the context manager
         try:
             with LogFileWriter(ex):
                 swr = tf.summary.FileWriter(logdir=TEST_LOG_DIR, graph=s.graph)
                 assert swr is not None
                 assert _run.info["tensorflow"]["logdirs"] == [TEST_LOG_DIR]
                 raise ValueError("I want to be raised!")
         except ValueError:
             pass
         # This should not be captured:
         tf.summary.FileWriter("/tmp/whatever", s.graph)
         assert _run.info["tensorflow"]["logdirs"] == [TEST_LOG_DIR]
Example #3
def train(ds, arch, ms, cuda, log_dir, seed, save_model, save_best_only,
          early_stop, unsupervized, _run):
    # fix seed
    print("seed: ", seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)

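    # Build the log directory: per-run (run id / experiment name) when no fold is given,
    # otherwise per-fold, with the run config dumped next to the fold directories as config.json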
    if ds['fold'] is None:
        ds['fold'] = 0  # pick first and only column in csv files
        log_dir = os.path.join(log_dir, str(_run._id),
                               _run.experiment_info['name'])
    else:
        import pathlib
        log_dir = os.path.join(log_dir, 'fold' + str(ds['fold']))
        pathlib.Path(log_dir).mkdir(parents=True, exist_ok=True)
        cfg_path = os.path.abspath(
            os.path.join(log_dir, os.pardir, 'config.json'))
        with open(cfg_path, 'w') as outfile:
            json.dump(_run.config, outfile)
        print("log_dir JSON: ", log_dir)
        if 'expert_models' in ms.keys():
            ms['expert_models'] = [
                os.path.join(x, f"fold{ds['fold']}", "checkpoint.pth.tar")
                for x in ms['expert_models']
            ]

    if _run._id is not None:
        # Capture TensorBoard logs with sacred
        with LogFileWriter(ex):
            logger = Logger(log_dir, _run)
    else:
        logger = Logger(log_dir, None)

    # Fit the model
    clf = Base(logger=logger, cuda=cuda, verbose=True)
    clf.fit(arch, ms, **ds, early_stop=early_stop, unsupervized=unsupervized)

    _run.info['modelstr'] = str(clf.model)

    if save_model:
        clf.save_checkpoint_(save_best_only=save_best_only)

    return clf.best_acc_
Example #4
    def run_experiment(_run):
        assert _run.info.get("tensorflow", None) is None
        with tf.Session() as s:
            # Without using the LogFileWriter context manager, nothing should change
            swr = tf.summary.FileWriter(logdir=TEST_LOG_DIR, graph=s.graph)
            assert swr is not None
            assert _run.info.get("tensorflow", None) is None

            # The log directory should only be captured within the scope of the context manager
            with LogFileWriter(ex):
                swr = tf.summary.FileWriter(logdir=TEST_LOG_DIR, graph=s.graph)
                assert swr is not None
                assert _run.info["tensorflow"]["logdirs"] == [TEST_LOG_DIR]
                tf.summary.FileWriter(TEST_LOG_DIR2, s.graph)
                assert _run.info["tensorflow"]["logdirs"] == [TEST_LOG_DIR, TEST_LOG_DIR2]

            # This should not be captured:
            tf.summary.FileWriter("/tmp/whatever", s.graph)
            assert _run.info["tensorflow"]["logdirs"] == [TEST_LOG_DIR, TEST_LOG_DIR2]
Example #5
def train_model(model, data_train, data_val, generator, lr_val, num_epochs,
                batch_size, logdir, ex_name, val_epochs, modelpath,
                learning_rate, epochs_pretrain, latent_dim, num_clusters):
    """Trains the VarPSOM model.
    Args:
        model (VarPSOM): VarPSOM model to train.
        data_train (np.array): Training set.
        data_val (np.array): Validation/test set.
        generator (generator): Data generator for the batches.
        lr_val (tf.Tensor): Placeholder for the learning rate value.
        num_epochs (int): Number of training epochs.
        batch_size (int): Batch size for the training.
        logdir (path): Directory for the experiment logs.
        ex_name (string): Unique name of this particular run.
        val_epochs (bool): If "True", clustering results are saved every 10 epochs to the default output files.
        modelpath (path): Path for the model checkpoints.
        learning_rate (float): Learning rate for the optimization.
        epochs_pretrain (int): Number of VAE pretraining epochs.
        latent_dim (int): Dimensionality of the VarPSOM's latent space.
        num_clusters (int): Number of clusters.
    """

    epochs = 0
    iterations = 0
    train_gen = generator("train", batch_size)
    val_gen = generator("val", batch_size)
    len_data_train = len(data_train)
    len_data_val = len(data_val)
    num_batches = len_data_train // batch_size

    saver = tf.train.Saver(max_to_keep=1)
    summaries = tf.summary.merge_all()

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        test_losses = []
        test_losses_mean = []
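        # Creating the FileWriters inside LogFileWriter(ex) lets sacred record the TensorBoard log directories in the run info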
        with LogFileWriter(ex):
            train_writer = tf.summary.FileWriter(logdir + "/train", sess.graph)
            test_writer = tf.summary.FileWriter(logdir + "/test", sess.graph)
        train_step_SOMVAE, train_step_ae = model.optimize
        x = model.inputs
        p = model.p
        is_training = model.is_training
        graph = tf.get_default_graph()
        z = graph.get_tensor_by_name("reconstruction_e/decoder/z_e:0")

        print("\n**********Starting job {}********* \n".format(ex_name))
        pbar = tqdm(total=(num_epochs + epochs_pretrain) * (num_batches))

        print("\n\nAutoencoder Pretraining...\n")
        a = np.zeros((batch_size, num_clusters))
        dp = {p: a, is_training: True, z: np.zeros((batch_size, latent_dim))}
        for epoch in range(epochs_pretrain):
            for i in range(num_batches):
                batch_data, _, _ = next(train_gen)
                f_dic = {x: batch_data, lr_val: learning_rate}
                f_dic.update(dp)
                train_step_ae.run(feed_dict=f_dic)
                if i % 100 == 0:
                    batch_val, _, _ = next(val_gen)
                    f_dic = {x: batch_val}
                    f_dic.update(dp)
                    test_loss, summary = sess.run(
                        [model.loss_reconstruction_ze, summaries],
                        feed_dict=f_dic)
                    test_writer.add_summary(
                        summary, tf.train.global_step(sess, model.global_step))
                    f_dic = {x: batch_data}
                    f_dic.update(dp)
                    train_loss, summary = sess.run(
                        [model.loss_reconstruction_ze, summaries],
                        feed_dict=f_dic)
                    train_writer.add_summary(
                        summary, tf.train.global_step(sess, model.global_step))
                pbar.set_postfix(epoch=epoch,
                                 train_loss=train_loss,
                                 test_loss=test_loss,
                                 refresh=False)
                pbar.update(1)

        print("\nClusters initialization...\n")
        z_e = []
        for t in range(9):
            z_e.extend(
                sess.run(model.sample_z_e,
                         feed_dict={
                             x:
                             data_train[int(len(data_train) / 10) *
                                        t:int(len(data_train) / 10) * (t + 1)],
                             is_training:
                             True,
                             z:
                             np.zeros((int(len(data_train) / 10), latent_dim))
                         }))
        z_e.extend(
            sess.run(model.sample_z_e,
                     feed_dict={
                         x: data_train[int(len(data_train) / 10) * 9:],
                         is_training: True,
                         z: np.zeros((int(len(data_train) / 10), latent_dim))
                     }))
        z_e = np.array(z_e)
        assign_mu_op = model.get_assign_cluster_centers_op(features=z_e)
        _ = sess.run(assign_mu_op)

        print("\nTraining...\n")
        for epoch in range(num_epochs):
            epochs += 1
            # Compute initial soft probabilities between data points and centroids
            q = []
            for t in range(9):
                q.extend(
                    sess.run(
                        model.q,
                        feed_dict={
                            x:
                            data_train[int(len(data_train) / 10) *
                                       t:int(len(data_train) / 10) * (t + 1)],
                            is_training:
                            True,
                            z:
                            np.zeros((int(len(data_train) / 10), latent_dim))
                        }))
            q.extend(
                sess.run(model.q,
                         feed_dict={
                             x: data_train[int(len(data_train) / 10) * 9:],
                             is_training: True,
                             z: np.zeros(
                                 (int(len(data_train) / 10), latent_dim))
                         }))
            q = np.array(q)
            ppt = model.target_distribution(q)
            q = sess.run(model.q,
                         feed_dict={
                             x: data_val,
                             is_training: True,
                             z: np.zeros((len(data_val), latent_dim))
                         })
            ppv = model.target_distribution(q)

            # Train
            for i in range(num_batches):
                iterations += 1
                batch_data, _, ii = next(train_gen)
                ftrain = {
                    p: ppt[ii * batch_size:(ii + 1) * batch_size],
                    is_training: True,
                    z: np.zeros((batch_size, latent_dim))
                }
                f_dic = {x: batch_data, lr_val: learning_rate}
                f_dic.update(ftrain)
                train_step_SOMVAE.run(feed_dict=f_dic)
                batch_val, _, ii = next(val_gen)
                fval = {
                    p: ppv[ii * batch_size:(ii + 1) * batch_size],
                    is_training: True,
                    z: np.zeros((batch_size, latent_dim))
                }
                f_dic = {x: batch_val}
                f_dic.update(fval)
                test_loss, summary = sess.run([model.loss, summaries],
                                              feed_dict=f_dic)
                test_losses.append(test_loss)
                if i % 100 == 0:
                    test_writer.add_summary(
                        summary, tf.train.global_step(sess, model.global_step))
                    f_dic = {x: batch_data}
                    f_dic.update(ftrain)
                    train_loss, summary = sess.run([model.loss, summaries],
                                                   feed_dict=f_dic)
                    train_writer.add_summary(
                        summary, tf.train.global_step(sess, model.global_step))
                if i % 1000 == 0:
                    test_loss_mean = np.mean(test_losses)
                    test_losses_mean.append(test_loss_mean)
                    test_losses = []

                if len(test_losses_mean) > 0:
                    test_s = test_losses_mean[-1]
                else:
                    test_s = test_losses_mean

                pbar.set_postfix(epoch=epoch,
                                 train_loss=train_loss,
                                 test_loss=test_s,
                                 refresh=False)
                pbar.update(1)
            saver.save(sess, modelpath)

            if val_epochs and epochs % 10 == 0:
                saver.save(sess, modelpath)
                results = evaluate_model(model, generator, len_data_val, x,
                                         modelpath, epochs)
                if results is None:
                    return None

        saver.save(sess, modelpath)
        results = evaluate_model(model, generator, len_data_val, x, modelpath,
                                 epochs)
    return results
Example #6
def train_model(model, data_train, data_val, endpoints_total_val, lr_val,
                num_epochs, patience, batch_size, logdir, modelpath,
                learning_rate, interactive, only_evaluate, benchmark,
                train_ratio):
    """Trains the SOM-VAE model.
    
    Args:
        model (SOM-VAE): SOM-VAE model to train.
        x (tf.Tensor): Input tensor or placeholder.
        lr_val (tf.Tensor): Placeholder for the learning rate value.
        num_epochs (int): Number of epochs to train.
        patience (int): Patience parameter for the early stopping.
        batch_size (int): Batch size for the training generator.
        logdir (path): Directory for saving the logs.
        modelpath (path): Path for saving the model checkpoints.
        learning_rate (float): Learning rate for the optimization.
        interactive (bool): Indicator if we want to have an interactive
            progress bar for training.
        generator (generator): Generator for the data batches.
        only_evaluate (bool): Do not actually perform train but just load the model
    """

    len_data_val = len(data_val)
    val_gen = batch_generator(data_train,
                              data_val,
                              endpoints_total_val,
                              mode="val")
    x = model.inputs

    if benchmark:
        times_per_epoch = []

    # Train the model
    if not only_evaluate:
        len_data_train = len(data_train)
        num_batches = len_data_train // batch_size
        train_gen = batch_generator(data_train,
                                    data_val,
                                    endpoints_total_val,
                                    mode="train")
        saver = tf.train.Saver(keep_checkpoint_every_n_hours=0.5)
        summaries = tf.summary.merge_all()

        with tf.Session() as sess:
            sess.run(tf.global_variables_initializer())
            patience_count = 0
            test_losses = []
            with LogFileWriter(ex):
                train_writer = tf.summary.FileWriter(logdir + "/train",
                                                     sess.graph)
                test_writer = tf.summary.FileWriter(logdir + "/test",
                                                    sess.graph)
            print("Training...")
            train_step_SOMVAE, train_step_prob = model.optimize
            if interactive:
                pbar = tqdm(total=num_epochs * (num_batches))
            if benchmark:
                t_begin_all = timeit.default_timer()

            for epoch in range(num_epochs):

                if benchmark:
                    t_begin = timeit.default_timer()

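                # Once per epoch: compute the validation loss on one batch, checkpoint on improvement,
                # and stop early once patience is exhausted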
                batch_val, _ = next(val_gen)
                test_loss, summary = sess.run(
                    [model.loss, summaries],
                    feed_dict={
                        x: batch_val,
                        model.is_training: True,
                        model.prediction_input: np.zeros(batch_size)
                    })
                test_losses.append(test_loss)
                test_writer.add_summary(
                    summary, tf.train.global_step(sess, model.global_step))
                if test_losses[-1] == min(test_losses):
                    saver.save(sess, modelpath, global_step=epoch)
                    patience_count = 0
                else:
                    patience_count += 1
                if patience_count >= patience:
                    break
                for i in range(num_batches):
                    batch_data = next(train_gen)
                    if i % 100 == 0:
                        train_loss, summary = sess.run(
                            [model.loss, summaries],
                            feed_dict={
                                x: batch_data,
                                model.is_training: True,
                                model.prediction_input: np.zeros(batch_size)
                            })
                        train_writer.add_summary(
                            summary,
                            tf.train.global_step(sess, model.global_step))
                    train_step_SOMVAE.run(
                        feed_dict={
                            x: batch_data,
                            lr_val: learning_rate,
                            model.is_training: True,
                            model.prediction_input: np.zeros(batch_size)
                        })
                    train_step_prob.run(
                        feed_dict={
                            x: batch_data,
                            lr_val: learning_rate * 100,
                            model.is_training: True,
                            model.prediction_input: np.zeros(batch_size)
                        })
                    if interactive:
                        pbar.set_postfix(epoch=epoch,
                                         train_loss=train_loss,
                                         test_loss=test_loss,
                                         refresh=False)
                        pbar.update(1)

                if benchmark:
                    t_end = timeit.default_timer()
                    times_per_epoch.append(t_end - t_begin)

            if benchmark:
                t_end_all = timeit.default_timer()
                ttime_all = t_end_all - t_begin_all

            saver.save(sess, modelpath)
            if interactive:
                pbar.close()

    if benchmark:
        print("Total time series: {}/{}".format(train_ratio, len(data_train)))
        print("Fitting time per epoch: {:.3f}".format(
            np.mean(times_per_epoch)))
        print("Total fitting time: {:.3f}".format(ttime_all))
        sys.exit(0)

    # Evaluate the model in any case
    with tf.Session() as sess:
        results = evaluate_model(model, x, val_gen, len_data_val, modelpath)
    return results
Example #7
def train_model(model, x, lr_val, num_epochs, patience, batch_size, logdir,
        modelpath, learning_rate, interactive, generator):
    """Trains the SOM-VAE model.
    
    Args:
        model (SOM-VAE): SOM-VAE model to train.
        x (tf.Tensor): Input tensor or placeholder.
        lr_val (tf.Tensor): Placeholder for the learning rate value.
        num_epochs (int): Number of epochs to train.
        patience (int): Patience parameter for the early stopping.
        batch_size (int): Batch size for the training generator.
        logdir (path): Directory for saving the logs.
        modelpath (path): Path for saving the model checkpoints.
        learning_rate (float): Learning rate for the optimization.
        interactive (bool): Indicator if we want to have an interactive
            progress bar for training.
        generator (generator): Generator for the data batches.
    """
    train_gen = generator("train", batch_size)
    val_gen = generator("val", batch_size)

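    # NOTE: data_train is not an argument of this function; it presumably comes from the
    # enclosing scope (e.g. a sacred captured config value)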
    num_batches = len(data_train)//batch_size

    saver = tf.train.Saver(keep_checkpoint_every_n_hours=2.)
    summaries = tf.summary.merge_all()

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        patience_count = 0
        test_losses = []
        with LogFileWriter(ex):
            train_writer = tf.summary.FileWriter(logdir+"/train", sess.graph)
            test_writer = tf.summary.FileWriter(logdir+"/test", sess.graph)
        print("Training...")
        train_step_SOMVAE, train_step_prob = model.optimize
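        # Training can be interrupted with Ctrl+C; the finally block still saves a final checkpoint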
        try:
            if interactive:
                pbar = tqdm(total=num_epochs*(num_batches)) 
            for epoch in range(num_epochs):
                batch_val = next(val_gen)
                test_loss, summary = sess.run([model.loss, summaries], feed_dict={x: batch_val})
                test_losses.append(test_loss)
                test_writer.add_summary(summary, tf.train.global_step(sess, model.global_step))
                if test_losses[-1] == min(test_losses):
                    saver.save(sess, modelpath, global_step=epoch)
                    patience_count = 0
                else:
                    patience_count += 1
                if patience_count >= patience:
                    break
                for i in range(num_batches):
                    batch_data = next(train_gen)
                    if i%100 == 0:
                        train_loss, summary = sess.run([model.loss, summaries], feed_dict={x: batch_data})
                        train_writer.add_summary(summary, tf.train.global_step(sess, model.global_step))
                    train_step_SOMVAE.run(feed_dict={x: batch_data, lr_val:learning_rate})
                    train_step_prob.run(feed_dict={x: batch_data, lr_val:learning_rate*100})
                    if interactive:
                        pbar.set_postfix(epoch=epoch, train_loss=train_loss, test_loss=test_loss, refresh=False)
                        pbar.update(1)

        except KeyboardInterrupt:
            pass
        finally:
            saver.save(sess, modelpath)
            if interactive:
                pbar.close()
Example #8
def train_model(model, data_train, data_val, endpoints_total_val, lr_val,
                num_epochs, batch_size, latent_dim, som_dim, learning_rate,
                epochs_pretrain, ex_name, logdir, modelpath, val_epochs,
                save_pretrain, use_saved_pretrain, benchmark, train_ratio):
    """Trains the T-DPSOM model.
        Params:
            model (T-DPSOM): T-DPSOM model to train.
            data_train (np.array): Training set.
            data_val (np.array): Validation/test set.
            endpoints_total_val (np.array): Validation/test labels.
            lr_val (tf.Tensor): Placeholder for the learning rate value.
            num_epochs (int): Number of training epochs.
            batch_size (int): Batch size for the training.
            latent_dim (int): Dimensionality of the T-DPSOM's latent space.
            som_dim (list): Dimensionality of the self-organizing map.
            learning_rate (float): Learning rate for the optimization.
            epochs_pretrain (int): Number of VAE pretraining epochs.
            ex_name (string): Unique name of this particular run.
            logdir (path): Directory for the experiment logs.
            modelpath (path): Path for the model checkpoints.
            val_epochs (bool): If "True" clustering results are saved every 10 epochs on default output files.
        """

    max_n_step = 72
    epochs = 0
    iterations = 0
    pretrainpath = "../models/pretrain/LSTM"
    len_data_train = len(data_train)
    len_data_val = len(data_val)
    num_batches = len_data_train // batch_size
    train_gen = batch_generator(data_train,
                                data_val,
                                endpoints_total_val,
                                mode="train")
    val_gen = batch_generator(data_train,
                              data_val,
                              endpoints_total_val,
                              mode="val")

    saver = tf.train.Saver(max_to_keep=50)
    summaries = tf.summary.merge_all()
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        test_losses = []
        test_losses_mean = []
        with LogFileWriter(ex):
            train_writer = tf.summary.FileWriter(logdir + "/train", sess.graph)
            test_writer = tf.summary.FileWriter(logdir + "/test", sess.graph)
        train_step_SOMVAE, train_step_ae, train_step_som, train_step_prob = model.optimize
        x = model.inputs
        p = model.p
        is_training = model.is_training
        graph = tf.get_default_graph()
        init_1 = graph.get_tensor_by_name("prediction/next_state/init_state:0")
        z_e_p = graph.get_tensor_by_name("prediction/next_state/input_lstm:0")
        z_e_rec = graph.get_tensor_by_name('reconstruction_e/decoder/z_e:0')
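        # Base feed values (zeros) for the prediction-LSTM tensors fetched by name above; reused by every training phase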
        training_dic = {
            is_training: True,
            z_e_p: np.zeros((max_n_step * batch_size, latent_dim)),
            init_1: np.zeros((2, batch_size, 100)),
            z_e_rec: np.zeros((max_n_step * batch_size, latent_dim))
        }

        pbar = tqdm(total=(num_epochs + epochs_pretrain * 3) * (num_batches))

        print("\n**********Starting job {}********* \n".format(ex_name))
        a = np.zeros((batch_size * 72, som_dim[0] * som_dim[1]))
        dp = {p: a}
        dp.update(training_dic)

        if benchmark:
            ttime_per_epoch = []
            ttime_ae_per_epoch = []
            ttime_som_per_epoch = []
            ttime_pred_per_epoch = []

        if use_saved_pretrain:
            print("\n\nUsing Saved Pretraining...\n")
            saver.restore(sess, pretrainpath)
        else:
            print("\n\nAutoencoder Pretraining...\n")
            if benchmark:
                t_begin_all = timeit.default_timer()

            for epoch in range(epochs_pretrain):
                if benchmark:
                    t_begin = timeit.default_timer()
                for i in range(num_batches):
                    batch_data, ii = next(train_gen)
                    f_dic = {x: batch_data, lr_val: learning_rate}
                    f_dic.update(dp)
                    train_step_ae.run(feed_dict=f_dic)
                    if i % 100 == 0:
                        batch_val, _, ii = next(val_gen)
                        f_dic = {x: batch_val}
                        f_dic.update(dp)
                        test_loss, summary = sess.run(
                            [model.loss_reconstruction_ze, summaries],
                            feed_dict=f_dic)
                        test_writer.add_summary(
                            summary,
                            tf.train.global_step(sess, model.global_step))
                        f_dic = {x: batch_data}
                        f_dic.update(dp)
                        train_loss, summary = sess.run(
                            [model.loss_reconstruction_ze, summaries],
                            feed_dict=f_dic)
                        train_writer.add_summary(
                            summary,
                            tf.train.global_step(sess, model.global_step))
                    pbar.set_postfix(epoch=epoch,
                                     train_loss=train_loss,
                                     test_loss=test_loss,
                                     refresh=False)
                    pbar.update(1)
                if benchmark:
                    t_end = timeit.default_timer()
                    ttime_ae_per_epoch.append(t_end - t_begin)

            if benchmark:
                t_end_all = timeit.default_timer()
                ttime_ae_pretrain = t_end_all - t_begin_all

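            # SOM initialization runs in three stages of epochs_pretrain // 3 epochs each;
            # the SOM learning rate drops from 0.1 to 0.01 after the first stage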
            print("\n\nSOM initialization...\n")
            if benchmark:
                t_begin_all = timeit.default_timer()

            for epoch in range(epochs_pretrain // 3):
                if benchmark:
                    t_begin = timeit.default_timer()
                for i in range(num_batches):
                    batch_data, ii = next(train_gen)
                    f_dic = {x: batch_data, lr_val: 0.1}
                    f_dic.update(dp)
                    train_step_som.run(feed_dict=f_dic)
                    if i % 100 == 0:
                        batch_val, _, ii = next(val_gen)
                        f_dic = {x: batch_val}
                        f_dic.update(dp)
                        test_loss, summary = sess.run(
                            [model.loss_a, summaries], feed_dict=f_dic)
                        test_writer.add_summary(
                            summary,
                            tf.train.global_step(sess, model.global_step))
                        f_dic = {x: batch_data}
                        f_dic.update(dp)
                        train_loss, summary = sess.run(
                            [model.loss_a, summaries], feed_dict=f_dic)
                        train_writer.add_summary(
                            summary,
                            tf.train.global_step(sess, model.global_step))
                    pbar.set_postfix(epoch=epoch,
                                     train_loss=train_loss,
                                     test_loss=test_loss,
                                     refresh=False)
                    pbar.update(1)
                if benchmark:
                    t_end = timeit.default_timer()
                    ttime_som_per_epoch.append(t_end - t_begin)

            for epoch in range(epochs_pretrain // 3):
                if benchmark:
                    t_begin = timeit.default_timer()
                for i in range(num_batches):
                    batch_data, ii = next(train_gen)
                    f_dic = {x: batch_data, lr_val: 0.01}
                    f_dic.update(dp)
                    train_step_som.run(feed_dict=f_dic)
                    if i % 100 == 0:
                        batch_val, _, ii = next(val_gen)
                        f_dic = {x: batch_val}
                        f_dic.update(dp)
                        test_loss, summary = sess.run(
                            [model.loss_a, summaries], feed_dict=f_dic)
                        test_writer.add_summary(
                            summary,
                            tf.train.global_step(sess, model.global_step))
                        f_dic = {x: batch_data}
                        f_dic.update(dp)
                        train_loss, summary = sess.run(
                            [model.loss_a, summaries], feed_dict=f_dic)
                        train_writer.add_summary(
                            summary,
                            tf.train.global_step(sess, model.global_step))
                    pbar.set_postfix(epoch=epoch,
                                     train_loss=train_loss,
                                     test_loss=test_loss,
                                     refresh=False)
                    pbar.update(1)
                if benchmark:
                    t_end = timeit.default_timer()
                    ttime_som_per_epoch.append(t_end - t_begin)

            for epoch in range(epochs_pretrain // 3):
                if benchmark:
                    t_begin = timeit.default_timer()
                for i in range(num_batches):
                    batch_data, ii = next(train_gen)
                    f_dic = {x: batch_data, lr_val: 0.01}
                    f_dic.update(dp)
                    train_step_som.run(feed_dict=f_dic)
                    if i % 100 == 0:
                        batch_val, _, ii = next(val_gen)
                        f_dic = {x: batch_val}
                        f_dic.update(dp)
                        test_loss, summary = sess.run(
                            [model.loss_a, summaries], feed_dict=f_dic)
                        test_writer.add_summary(
                            summary,
                            tf.train.global_step(sess, model.global_step))
                        f_dic = {x: batch_data}
                        f_dic.update(dp)
                        train_loss, summary = sess.run(
                            [model.loss_a, summaries], feed_dict=f_dic)
                        train_writer.add_summary(
                            summary,
                            tf.train.global_step(sess, model.global_step))
                    pbar.set_postfix(epoch=epoch,
                                     train_loss=train_loss,
                                     test_loss=test_loss,
                                     refresh=False)
                    pbar.update(1)
                if benchmark:
                    t_end = timeit.default_timer()
                    ttime_som_per_epoch.append(t_end - t_begin)

            if benchmark:
                t_end_all = timeit.default_timer()
                ttime_som = t_end_all - t_begin_all

            if save_pretrain:
                saver.save(sess, pretrainpath)

        print("\n\nTraining...\n")

        if benchmark:
            t_begin_all = timeit.default_timer()

        for epoch in range(num_epochs):
            if benchmark:
                t_begin = timeit.default_timer()
            epochs += 1
            print(epochs)
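            # Recompute the target distributions p for the training and validation sets at the
            # start of each epoch, embedding the data in chunks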
            f_dic = {x: data_train}
            f_dic.update(training_dic)
            q = []
            for t in range(19):
                q.extend(
                    sess.run(
                        model.q,
                        feed_dict={
                            x:
                            data_train[int(len(data_train) / 20) *
                                       t:int(len(data_train) / 20) * (t + 1)]
                        }))
            q.extend(
                sess.run(
                    model.q,
                    feed_dict={x:
                               data_train[int(len(data_train) / 20) * 19:]}))
            q = np.array(q)
            ppt = model.target_distribution(q)
            q = []
            f_dic = {x: data_val}
            f_dic.update(training_dic)
            for t in range(9):
                q.extend(
                    sess.run(model.q,
                             feed_dict={
                                 x:
                                 data_val[int(len(data_val) / 10) *
                                          t:int(len(data_val) / 10) * (t + 1)]
                             }))
            q.extend(
                sess.run(model.q,
                         feed_dict={x:
                                    data_val[int(len(data_val) / 10) * 9:]}))
            q = np.array(q)
            ppv = model.target_distribution(q)

            for i in range(num_batches):
                iterations += 1
                batch_data, ii = next(train_gen)
                ftrain = {
                    p: ppt[ii * batch_size * 72:(ii + 1) * batch_size * 72]
                }
                f_dic = {x: batch_data, lr_val: learning_rate}
                f_dic.update(ftrain)
                f_dic.update(training_dic)
                train_step_SOMVAE.run(feed_dict=f_dic)
                train_step_prob.run(feed_dict=f_dic)

                batch_val, _, ii = next(val_gen)
                fval = {
                    p: ppv[ii * batch_size * 72:(ii + 1) * batch_size * 72]
                }
                f_dic = {x: batch_val}
                f_dic.update(fval)
                f_dic.update(training_dic)
                test_loss, summary = sess.run([model.loss, summaries],
                                              feed_dict=f_dic)
                test_losses.append(test_loss)
                if i % 100 == 0:
                    test_writer.add_summary(
                        summary, tf.train.global_step(sess, model.global_step))
                    f_dic = {x: batch_data}
                    f_dic.update(ftrain)
                    f_dic.update(training_dic)
                    train_loss, summary = sess.run([model.loss, summaries],
                                                   feed_dict=f_dic)
                    if math.isnan(train_loss):
                        return None
                    train_writer.add_summary(
                        summary, tf.train.global_step(sess, model.global_step))
                if i % 1000 == 0:
                    test_loss_mean = np.mean(test_losses)
                    test_losses_mean.append(test_loss_mean)
                    test_losses = []

                if len(test_losses_mean) > 0:
                    test_s = test_losses_mean[-1]
                else:
                    test_s = test_losses_mean

                pbar.set_postfix(epoch=epoch,
                                 train_loss=train_loss,
                                 test_loss=test_s,
                                 refresh=False)
                pbar.update(1)

            if val_epochs and epoch % 5 == 0:
                path = "../models/exp/exp" + str(epoch) + "/LSTM"
                saver.save(sess, path)
                #results = evaluate_model(model, x, val_gen, len_data_val, modelpath, epochs)

            if benchmark:
                t_end = timeit.default_timer()
                ttime_per_epoch.append(t_end - t_begin)

        if benchmark:
            t_end_all = timeit.default_timer()
            ttime_training = t_end_all - t_begin_all

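        # Finetune only the prediction step (train_step_prob) for 200 extra epochs at 10x the base learning rate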
        print("\n\nPrediction Finetuning...\n")
        if benchmark:
            t_begin_all = timeit.default_timer()

        for epoch in range(200):
            if benchmark:
                t_begin = timeit.default_timer()
            for i in range(num_batches):
                batch_data, ii = next(train_gen)
                f_dic = {x: batch_data, lr_val: learning_rate * 10}
                f_dic.update(dp)
                train_step_prob.run(feed_dict=f_dic)
                if i % 100 == 0:
                    batch_val, _, ii = next(val_gen)
                    f_dic = {x: batch_val}
                    f_dic.update(dp)
                    test_loss, summary = sess.run(
                        [model.loss_prediction, summaries], feed_dict=f_dic)
                    test_writer.add_summary(
                        summary, tf.train.global_step(sess, model.global_step))
                    f_dic = {x: batch_data}
                    f_dic.update(dp)
                    train_loss, summary = sess.run(
                        [model.loss_prediction, summaries], feed_dict=f_dic)
                    train_writer.add_summary(
                        summary, tf.train.global_step(sess, model.global_step))
                pbar.set_postfix(epoch=epoch,
                                 train_loss=train_loss,
                                 test_loss=test_loss,
                                 refresh=False)
                pbar.update(1)

            if benchmark:
                t_end = timeit.default_timer()
                ttime_pred_per_epoch.append(t_end - t_begin)

        if benchmark:
            t_end_all = timeit.default_timer()
            ttime_pred = t_end_all - t_begin_all

        saver.save(sess, modelpath)
        results = evaluate_model(model, x, val_gen, len_data_val, modelpath,
                                 epochs)
        pbar.close()

        if benchmark:
            print("\nNumber of time series in train: {} %, {}".format(
                train_ratio, len(data_train)))
            print("SOM init time: {:.3f}".format(ttime_som))
            print("SOM init time per epoch: {:.3f}".format(
                np.mean(ttime_som_per_epoch)))
            print("AE pretrain time: {:.3f}".format(ttime_ae_pretrain))
            print("AE pretrain time per epoch: {:.3f}".format(
                np.mean(ttime_ae_per_epoch)))
            print("Training time: {:.3f}".format(ttime_training))
            print("Training time per epoch: {:.3f}".format(
                np.mean(ttime_per_epoch)))
            print("Pred finetuning time: {:.3f}".format(ttime_pred))
            print("Pred finetuning time per epoch: {:.3f}".format(
                np.mean(ttime_pred_per_epoch)))
            sys.exit(0)

        return results
Example #9
def train_model(model, data_train, data_val, generator, lr_val, num_epochs,
                batch_size, logdir, ex_name, validation, val_epochs, modelpath,
                learning_rate, epochs_pretrain, som_dim, latent_dim,
                use_saved_pretrain, learning_rate_pretrain, save_pretrain):
    """Trains the DPSOM model.
    Args:
        model (DPSOM): DPSOM model to train.
        data_train (np.array): Training set.
        data_val (np.array): Validation/test set.
        generator (generator): Data generator for the batches.
        lr_val (tf.Tensor): Placeholder for the learning rate value.
        num_epochs (int): Number of training epochs.
        batch_size (int): Batch size for the training.
        logdir (path): Directory for the experiment logs.
        ex_name (string): Unique name of this particular run.
        val_epochs (bool): If "True", clustering results are saved every 10 epochs to the default output files.
        modelpath (path): Path for the model checkpoints.
        learning_rate (float): Learning rate for the optimization.
        epochs_pretrain (int): Number of VAE pretraining epochs.
        som_dim (list): Dimensionality of the self-organizing map.
        latent_dim (int): Dimensionality of the DPSOM's latent space.
    """
    epochs = 0
    iterations = 0
    train_gen = generator("train", batch_size)
    if validation:
        val_gen = generator("val", batch_size)
    else:
        val_gen = generator("test", batch_size)
    len_data_train = len(data_train)
    len_data_val = len(data_val)
    num_batches = len_data_train // batch_size

    saver = tf.train.Saver(max_to_keep=5)
    summaries = tf.summary.merge_all()

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        test_losses = []
        test_losses_mean = []
        pretrainpath = "../models/pretrainVAE/VAE"
        with LogFileWriter(ex):
            train_writer = tf.summary.FileWriter(logdir + "/train", sess.graph)
            test_writer = tf.summary.FileWriter(logdir + "/test", sess.graph)
        train_step_VARPSOM, train_step_vae, train_step_som = model.optimize
        x = model.inputs
        p = model.p
        is_training = model.is_training
        graph = tf.get_default_graph()
        z = graph.get_tensor_by_name("reconstruction_e/decoder/z_e:0")

        print("\n**********Starting job {}********* \n".format(ex_name))
        pbar = tqdm(total=(num_epochs + epochs_pretrain + 40) * num_batches)

        if use_saved_pretrain:
            print("\n\nUsing Saved Pretraining...\n")
            saver.restore(sess, pretrainpath)
        else:
            print("\n\nAutoencoder Pretraining...\n")
            a = np.zeros((batch_size, som_dim[0] * som_dim[1]))
            dp = {
                p: a,
                is_training: True,
                z: np.zeros((batch_size, latent_dim))
            }
            for epoch in range(epochs_pretrain):
                for i in range(num_batches):
                    batch_data, _, _ = next(train_gen)
                    f_dic = {x: batch_data, lr_val: learning_rate_pretrain}
                    f_dic.update(dp)
                    train_step_vae.run(feed_dict=f_dic)
                    if i % 100 == 0:
                        batch_val, _, _ = next(val_gen)
                        f_dic = {x: batch_val}
                        f_dic.update(dp)
                        test_loss, summary = sess.run(
                            [model.loss_reconstruction_ze, summaries],
                            feed_dict=f_dic)
                        test_writer.add_summary(
                            summary,
                            tf.train.global_step(sess, model.global_step))
                        f_dic = {x: batch_data}
                        f_dic.update(dp)
                        train_loss, summary = sess.run(
                            [model.loss_reconstruction_ze, summaries],
                            feed_dict=f_dic)
                        train_writer.add_summary(
                            summary,
                            tf.train.global_step(sess, model.global_step))
                    pbar.set_postfix(epoch=epoch,
                                     train_loss=train_loss,
                                     test_loss=test_loss,
                                     refresh=False)
                    pbar.update(1)

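            # SOM initialization runs in four stages of 5 epochs each with a decreasing SOM learning rate (0.9, 0.3, 0.1, 0.01)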
            print("\n\nSOM initialization...\n")
            for epoch in range(5):
                for i in range(num_batches):
                    batch_data, _, ii = next(train_gen)
                    f_dic = {x: batch_data, lr_val: 0.9}
                    f_dic.update(dp)
                    train_step_som.run(feed_dict=f_dic)
                    if i % 100 == 0:
                        batch_val, _, ii = next(val_gen)
                        f_dic = {x: batch_val}
                        f_dic.update(dp)
                        test_loss, summary = sess.run(
                            [model.loss_a, summaries], feed_dict=f_dic)
                        test_writer.add_summary(
                            summary,
                            tf.train.global_step(sess, model.global_step))
                        f_dic = {x: batch_data}
                        f_dic.update(dp)
                        train_loss, summary = sess.run(
                            [model.loss_a, summaries], feed_dict=f_dic)
                        train_writer.add_summary(
                            summary,
                            tf.train.global_step(sess, model.global_step))
                    pbar.set_postfix(epoch=epoch,
                                     train_loss=train_loss,
                                     test_loss=test_loss,
                                     refresh=False)
                    pbar.update(1)
            for epoch in range(5):
                for i in range(num_batches):
                    batch_data, _, ii = next(train_gen)
                    f_dic = {x: batch_data, lr_val: 0.3}
                    f_dic.update(dp)
                    train_step_som.run(feed_dict=f_dic)
                    if i % 100 == 0:
                        batch_val, _, ii = next(val_gen)
                        f_dic = {x: batch_val}
                        f_dic.update(dp)
                        test_loss, summary = sess.run(
                            [model.loss_a, summaries], feed_dict=f_dic)
                        test_writer.add_summary(
                            summary,
                            tf.train.global_step(sess, model.global_step))
                        f_dic = {x: batch_data}
                        f_dic.update(dp)
                        train_loss, summary = sess.run(
                            [model.loss_a, summaries], feed_dict=f_dic)
                        train_writer.add_summary(
                            summary,
                            tf.train.global_step(sess, model.global_step))
                    pbar.set_postfix(epoch=epoch,
                                     train_loss=train_loss,
                                     test_loss=test_loss,
                                     refresh=False)
                    pbar.update(1)
            for epoch in range(5):
                for i in range(num_batches):
                    batch_data, _, ii = next(train_gen)
                    f_dic = {x: batch_data, lr_val: 0.1}
                    f_dic.update(dp)
                    train_step_som.run(feed_dict=f_dic)
                    if i % 100 == 0:
                        batch_val, _, ii = next(val_gen)
                        f_dic = {x: batch_val}
                        f_dic.update(dp)
                        test_loss, summary = sess.run(
                            [model.loss_a, summaries], feed_dict=f_dic)
                        test_writer.add_summary(
                            summary,
                            tf.train.global_step(sess, model.global_step))
                        f_dic = {x: batch_data}
                        f_dic.update(dp)
                        train_loss, summary = sess.run(
                            [model.loss_a, summaries], feed_dict=f_dic)
                        train_writer.add_summary(
                            summary,
                            tf.train.global_step(sess, model.global_step))
                    pbar.set_postfix(epoch=epoch,
                                     train_loss=train_loss,
                                     test_loss=test_loss,
                                     refresh=False)
                    pbar.update(1)
            for epoch in range(5):
                for i in range(num_batches):
                    batch_data, _, ii = next(train_gen)
                    f_dic = {x: batch_data, lr_val: 0.01}
                    f_dic.update(dp)
                    train_step_som.run(feed_dict=f_dic)
                    if i % 100 == 0:
                        batch_val, _, ii = next(val_gen)
                        f_dic = {x: batch_val}
                        f_dic.update(dp)
                        test_loss, summary = sess.run(
                            [model.loss_a, summaries], feed_dict=f_dic)
                        test_writer.add_summary(
                            summary,
                            tf.train.global_step(sess, model.global_step))
                        f_dic = {x: batch_data}
                        f_dic.update(dp)
                        train_loss, summary = sess.run(
                            [model.loss_a, summaries], feed_dict=f_dic)
                        train_writer.add_summary(
                            summary,
                            tf.train.global_step(sess, model.global_step))
                    pbar.set_postfix(epoch=epoch,
                                     train_loss=train_loss,
                                     test_loss=test_loss,
                                     refresh=False)
                    pbar.update(1)
            if save_pretrain:
                saver.save(sess, pretrainpath)

        print("\n\nTraining...\n")
        lratios = []
        l2ratios = []
        l3ratios = []
        for epoch in range(num_epochs):
            epochs += 1
            # Compute initial soft probabilities between data points and centroids
            q = []
            for t in range(9):
                q.extend(
                    sess.run(
                        model.q,
                        feed_dict={
                            x:
                            data_train[int(len(data_train) / 10) *
                                       t:int(len(data_train) / 10) * (t + 1)],
                            is_training:
                            True,
                            z:
                            np.zeros((int(len(data_train) / 10), latent_dim))
                        }))
            q.extend(
                sess.run(model.q,
                         feed_dict={
                             x: data_train[int(len(data_train) / 10) * 9:],
                             is_training: True,
                             z: np.zeros(
                                 (int(len(data_train) / 10), latent_dim))
                         }))
            q = np.array(q)
            ppt = model.target_distribution(q)
            q = sess.run(model.q,
                         feed_dict={
                             x: data_val,
                             is_training: True,
                             z: np.zeros((len(data_val), latent_dim))
                         })
            ppv = model.target_distribution(q)

            # Train
            for i in range(num_batches):
                iterations += 1
                batch_data, _, ii = next(train_gen)
                ftrain = {
                    p: ppt[ii * batch_size:(ii + 1) * batch_size],
                    is_training: True,
                    z: np.zeros((batch_size, latent_dim))
                }
                f_dic = {x: batch_data, lr_val: learning_rate}
                f_dic.update(ftrain)
                train_step_VARPSOM.run(feed_dict=f_dic)
                batch_val, _, ii = next(val_gen)
                fval = {
                    p: ppv[ii * batch_size:(ii + 1) * batch_size],
                    is_training: True,
                    z: np.zeros((batch_size, latent_dim))
                }
                f_dic = {x: batch_val}
                f_dic.update(fval)
                test_loss, summary = sess.run([model.loss, summaries],
                                              feed_dict=f_dic)
                test_losses.append(test_loss)
                if i % 100 == 0:
                    test_writer.add_summary(
                        summary, tf.train.global_step(sess, model.global_step))
                    f_dic = {x: batch_data}
                    f_dic.update(ftrain)
                    train_loss, summary = sess.run([model.loss, summaries],
                                                   feed_dict=f_dic)
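                    # Evaluate the weighted loss terms (ELBO reconstruction, CAH commitment, SOM)
                    # and track their ratios to monitor the balance between them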
                    elbo_loss = sess.run(
                        [model.theta * model.loss_reconstruction_ze],
                        feed_dict=f_dic)
                    cah_loss = sess.run([model.gamma * model.loss_commit],
                                        feed_dict=f_dic)
                    ssom_loss = sess.run([model.beta * model.loss_som],
                                         feed_dict=f_dic)
                    cah_ssom_ratio = cah_loss[0] / ssom_loss[0]
                    vae_cah_ratio = elbo_loss[0] / cah_loss[0]
                    clust_vae_ratio = elbo_loss[0] / (ssom_loss[0] +
                                                      cah_loss[0])
                    lratios.append(cah_ssom_ratio)
                    l2ratios.append(vae_cah_ratio)
                    l3ratios.append(clust_vae_ratio)
                    train_writer.add_summary(
                        summary, tf.train.global_step(sess, model.global_step))
                if i % 1000 == 0:
                    test_loss_mean = np.mean(test_losses)
                    test_losses_mean.append(test_loss_mean)
                    test_losses = []

                if len(test_losses_mean) > 0:
                    test_s = test_losses_mean[-1]
                else:
                    test_s = test_losses_mean

                pbar.set_postfix(epoch=epoch,
                                 train_loss=train_loss,
                                 test_loss=test_s,
                                 ssom=ssom_loss,
                                 cah=cah_loss,
                                 vae=elbo_loss,
                                 cs_ratio=np.mean(lratios),
                                 vc_ratio=np.mean(l2ratios),
                                 cr_ratio=np.mean(l3ratios),
                                 refresh=False)
                pbar.update(1)
            saver.save(sess, modelpath)

            if val_epochs and epochs % 10 == 0:
                saver.save(sess, modelpath)
                results = evaluate_model(model, generator, len_data_val, x,
                                         modelpath, epochs)
                if results is None:
                    return None

        saver.save(sess, modelpath)
        results = evaluate_model(model, generator, len_data_val, x, modelpath,
                                 epochs)
    return results