Example #1
def train_cnn():
    """Training CNN model."""

    # Load sentences, labels, and training parameters
    # logger.info('✔︎ Loading data...')

    # logger.info('✔︎ Training data processing...')
    # train_data = dh.load_data_and_labels(FLAGS.training_data_file, FLAGS.embedding_dim)

    # logger.info('✔︎ Validation data processing...')
    # validation_data = dh.load_data_and_labels(FLAGS.validation_data_file, FLAGS.embedding_dim)

    # logger.info('Recommended padding Sequence length is: {0}'.format(FLAGS.pad_seq_len))

    # logger.info('✔︎ Training data padding...')
    # x_train_front, x_train_behind, y_train = dh.pad_data(train_data, FLAGS.pad_seq_len)

    # logger.info('✔︎ Validation data padding...')
    # x_validation_front, x_validation_behind, y_validation = dh.pad_data(validation_data, FLAGS.pad_seq_len)

    # Build vocabulary
    # VOCAB_SIZE = dh.load_vocab_size(FLAGS.embedding_dim)
    # pretrained_word2vec_matrix = dh.load_word2vec_matrix(VOCAB_SIZE, FLAGS.embedding_dim)
    pretrained_word2vec_matrix = None

    # Build a graph and cnn object
    with tf.Graph().as_default():
        session_conf = tf.ConfigProto(
            allow_soft_placement=True,
            log_device_placement=False)
        session_conf.gpu_options.allow_growth = FLAGS.gpu_options_allow_growth
        sess = tf.Session(config=session_conf)
        timestamp = str(int(time.time()))
        with sess.as_default():
            batch_loader = MulBatchLoader("data/train.data", FLAGS.batch_size, "runs/"+timestamp+"/")
            if FLAGS.mode == "cnn":
                cnn = TwoLangTextCNN(
                    sequence_length=batch_loader.max_len,
                    num_classes=FLAGS.num_classes,
                    vocab_size_en=batch_loader.vocab_size_en,
                    vocab_size_zh=batch_loader.vocab_size_zh,
                    fc_hidden_size=FLAGS.fc_hidden_size,
                    embedding_size=FLAGS.embedding_dim,
                    embedding_type=FLAGS.embedding_type,
                    filter_sizes=list(map(int, FLAGS.filter_sizes.split(","))),
                    num_filters=FLAGS.num_filters,
                    l2_reg_lambda=FLAGS.l2_reg_lambda,
                    pretrained_embedding=pretrained_word2vec_matrix)
            elif FLAGS.mode == "rnn":
                cnn = SiameseLSTM(
                    sequence_length=batch_loader.max_len,
                    num_classes=FLAGS.num_classes,
                    vocab_size_en=batch_loader.vocab_size_en,
                    vocab_size_zh=batch_loader.vocab_size_zh,
                    fc_hidden_size=FLAGS.fc_hidden_size,
                    embedding_size=FLAGS.embedding_dim,
                    embedding_type=FLAGS.embedding_type,
                    l2_reg_lambda=FLAGS.l2_reg_lambda,
                    pretrained_embedding=pretrained_word2vec_matrix)

            # Define training procedure
            # learning_rate = tf.train.exponential_decay(learning_rate=FLAGS.learning_rate, global_step=cnn.global_step,
            #                                            decay_steps=FLAGS.decay_steps, decay_rate=FLAGS.decay_rate,
            #                                            staircase=True)
            with tf.control_dependencies(tf.get_collection(tf.GraphKeys.UPDATE_OPS)):
                optimizer = tf.train.AdamOptimizer(FLAGS.learning_rate)
                grads, vars = zip(*optimizer.compute_gradients(cnn.loss))
                grads, _ = tf.clip_by_global_norm(grads, clip_norm=FLAGS.norm_ratio)
                train_op = optimizer.apply_gradients(zip(grads, vars), global_step=cnn.global_step, name="train_op")

            # Keep track of gradient values and sparsity (optional)
            # grad_summaries = []
            # for g, v in zip(grads, vars):
            #     if g is not None:
            #         grad_hist_summary = tf.summary.histogram("{0}/grad/hist".format(v.name), g)
            #         sparsity_summary = tf.summary.scalar("{0}/grad/sparsity".format(v.name), tf.nn.zero_fraction(g))
            #         grad_summaries.append(grad_hist_summary)
            #         grad_summaries.append(sparsity_summary)
            # grad_summaries_merged = tf.summary.merge(grad_summaries)

            # Output directory for models and summaries
            if FLAGS.train_or_restore == 'R':
                MODEL = input("☛ Please input the checkpoints model you want to restore, "
                              "it should be like(1490175368): ")  # The model you want to restore

                while not (MODEL.isdigit() and len(MODEL) == 10):
                    MODEL = input('✘ The format of your input is illegal, please re-input: ')
                logger.info('✔︎ The format of your input is legal, now loading to next step...')

                checkpoint_dir = 'runs/' + MODEL + '/checkpoints/'

                out_dir = os.path.abspath(os.path.join(os.path.curdir, "runs", MODEL))
                logger.info("✔︎ Writing to {0}\n".format(out_dir))
            else:
                out_dir = os.path.abspath(os.path.join(os.path.curdir, "runs", timestamp))
                logger.info("✔︎ Writing to {0}\n".format(out_dir))

            # Summaries for loss and accuracy
            loss_summary = tf.summary.scalar("loss", cnn.loss)
            acc_summary = tf.summary.scalar("accuracy", cnn.accuracy)

            # Train summaries
            # train_summary_op = tf.summary.merge([loss_summary, acc_summary, grad_summaries_merged])
            train_summary_op = tf.summary.merge([loss_summary, acc_summary])
            train_summary_dir = os.path.join(out_dir, "summaries", "train")
            train_summary_writer = tf.summary.FileWriter(train_summary_dir, sess.graph)

            # Validation summaries
            validation_summary_op = tf.summary.merge([loss_summary, acc_summary])
            validation_summary_dir = os.path.join(out_dir, "summaries", "validation")
            validation_summary_writer = tf.summary.FileWriter(validation_summary_dir, sess.graph)

            saver = tf.train.Saver(tf.global_variables(), max_to_keep=FLAGS.num_checkpoints)

            if FLAGS.train_or_restore == 'R':
                # Load cnn model
                logger.info("✔ Loading model...")
                checkpoint_file = tf.train.latest_checkpoint(checkpoint_dir)
                logger.info(checkpoint_file)

                # Load the saved meta graph and restore variables
                saver = tf.train.import_meta_graph("{0}.meta".format(checkpoint_file))
                saver.restore(sess, checkpoint_file)
            else:
                checkpoint_dir = os.path.abspath(os.path.join(out_dir, "checkpoints"))
                if not os.path.exists(checkpoint_dir):
                    os.makedirs(checkpoint_dir)
                sess.run(tf.global_variables_initializer())
                sess.run(tf.local_variables_initializer())

                # Embedding visualization config
                config = projector.ProjectorConfig()
                embedding_conf = config.embeddings.add()
                embedding_conf.tensor_name = 'embedding'
                embedding_conf.metadata_path = FLAGS.metadata_file

                projector.visualize_embeddings(train_summary_writer, config)
                projector.visualize_embeddings(validation_summary_writer, config)

                # Save the embedding visualization
                saver.save(sess, os.path.join(out_dir, 'embedding', 'embedding.ckpt'))

            current_step = sess.run(cnn.global_step)

            def train_step(x_batch_front, x_batch_behind, y_batch, epoch):
                """A single training step"""
                feed_dict = {
                    cnn.input_x_en: x_batch_front,
                    cnn.input_x_zh: x_batch_behind,
                    cnn.input_y: y_batch,
                    cnn.dropout_keep_prob: FLAGS.dropout_keep_prob,
                    cnn.is_training: True
                }
                _, step, summaries, loss, accuracy = sess.run(
                    [train_op, cnn.global_step, train_summary_op, cnn.loss, cnn.accuracy], feed_dict)
                logger.info("epoch/step {}/{}: loss {:5.4f}, acc {:5.4f}".format(epoch+1, step, loss, accuracy))
                train_summary_writer.add_summary(summaries, step)

            def validation_step(writer=None):
                """Evaluates model on a validation set"""
                total_step = 0
                total_loss = 0
                total_accuracy = 0
                total_recall = 0
                total_precision = 0
                total_f1 = 0
                total_auc = 0
                for x_batch_front, x_batch_behind, y_batch in batch_loader.gen_dev_batch():
                    feed_dict = {
                        cnn.input_x_en: x_batch_front,
                        cnn.input_x_zh: x_batch_behind,
                        cnn.input_y: y_batch,
                        cnn.dropout_keep_prob: 1.0,
                        cnn.is_training: False
                    }
                    step, summaries, loss, accuracy, recall, precision, f1, auc = sess.run(
                        [cnn.global_step, validation_summary_op, cnn.loss, cnn.accuracy,
                         cnn.recall, cnn.precision, cnn.F1, cnn.AUC], feed_dict)
                    total_step += 1
                    total_loss += loss
                    total_accuracy += accuracy
                    total_recall += recall
                    total_precision += precision
                    total_f1 += f1
                    # total_auc += auc
                def get_div(a, b):
                    """Safely divide, returning 0 when the denominator is 0."""
                    return a / b if b else 0
                avg_loss = get_div(total_loss, total_step)
                avg_accuracy = get_div(total_accuracy, total_step)
                avg_recall = get_div(total_recall, total_step)
                avg_precision = get_div(total_precision, total_step)
                avg_f1 = get_div(total_f1, total_step)
                # avg_auc = get_div(total_auc, total_step)
                # logger.info("total_step {0}: avg_loss {1:g}, avg_acc {2:g}, avg_recall {3:g}, avg_precision {4:g}, avg_f1 {5:g}, avg_AUC {6}"
                #             .format(total_step, avg_loss, avg_accuracy, avg_recall, avg_precision, avg_f1, avg_auc))
                logger.info("total_step {0}: avg_loss {1:g}, avg_acc {2:g}, avg_recall {3:g}, avg_precision {4:g}, avg_f1 {5:g}"
                            .format(total_step, avg_loss, avg_accuracy, avg_recall, avg_precision, avg_f1))
                avg_summaries = tf.Summary()
                loss_val = avg_summaries.value.add()
                loss_val.tag = "loss"
                loss_val.simple_value = avg_loss
                accuracy_val = avg_summaries.value.add()
                accuracy_val.tag = "accuracy"
                accuracy_val.simple_value = avg_accuracy
                if writer:
                    writer.add_summary(avg_summaries, sess.run(cnn.global_step))

            # Training loop. For each batch...
            for epoch in range(20):
                for en_batch, zh_batch, y_batch in batch_loader.gen_batch():
                    train_step(en_batch, zh_batch, y_batch, epoch)
                    current_step = tf.train.global_step(sess, cnn.global_step)

                    if current_step % FLAGS.checkpoint_every == 0:
                        checkpoint_prefix = os.path.join(checkpoint_dir, "model")
                        path = saver.save(sess, checkpoint_prefix, global_step=current_step)
                        logger.info("✔︎ Saved model checkpoint to {0}\n".format(path))

                    if current_step % FLAGS.evaluate_every == 0:
                        logger.info("\nEvaluation:")
                        validation_step(writer=validation_summary_writer)

    logger.info("✔︎ Done.")
Example #2
    def train(self, dataset_list, config):
        """
        Args:
            dataset_list (<StockDataSet>)
            config (tf.app.flags.FLAGS)
        """
        assert len(dataset_list) > 0
        self.merged_sum = tf.summary.merge_all()

        # Set up the logs folder
        self.writer = tf.summary.FileWriter(
            os.path.join("./logs", self.model_name))
        self.writer.add_graph(self.sess.graph)

        if self.use_embed:
            # Set up embedding visualization
            # Format: tensorflow/tensorboard/plugins/projector/projector_config.proto
            projector_config = projector.ProjectorConfig()

            # You can add multiple embeddings. Here we add only one.
            added_embed = projector_config.embeddings.add()
            added_embed.tensor_name = self.embed_matrix.name
            # Link this tensor to its metadata file (e.g. labels).
            shutil.copyfile(os.path.join(self.logs_dir, "metadata.tsv"),
                            os.path.join(self.model_logs_dir, "metadata.tsv"))
            added_embed.metadata_path = "metadata.tsv"

            # The next line writes a projector_config.pbtxt in the LOG_DIR. TensorBoard will
            # read this file during startup.
            projector.visualize_embeddings(self.writer, projector_config)

        tf.global_variables_initializer().run()

        # Merged test data of different stocks.
        merged_test_X = []
        merged_test_y = []
        merged_test_labels = []

        for label_, d_ in enumerate(dataset_list):
            merged_test_X += list(d_.test_X)
            merged_test_y += list(d_.test_y)
            merged_test_labels += [[label_]] * len(d_.test_X)

        merged_test_X = np.array(merged_test_X)
        merged_test_y = np.array(merged_test_y)
        merged_test_labels = np.array(merged_test_labels)

        print "len(merged_test_X) =", len(merged_test_X)
        print "len(merged_test_y) =", len(merged_test_y)
        print "len(merged_test_labels) =", len(merged_test_labels)

        test_data_feed = {
            self.learning_rate: 0.0,
            self.inputs: merged_test_X,
            self.targets: merged_test_y,
            self.symbols: merged_test_labels,
        }

        global_step = 0

        num_batches = sum(len(d_.train_X)
                          for d_ in dataset_list) // config.batch_size
        random.seed(time.time())

        # Select samples for plotting.
        sample_labels = range(min(config.sample_size, len(dataset_list)))
        sample_indices = {}
        for l in sample_labels:
            sym = dataset_list[l].stock_sym
            target_indices = np.array([
                i for i, sym_label in enumerate(merged_test_labels)
                if sym_label[0] == l
            ])
            sample_indices[sym] = target_indices
        print(sample_indices)

        print("Start training for stocks:", [d.stock_sym for d in dataset_list])
        for epoch in range(config.max_epoch):
            epoch_step = 0
            learning_rate = config.init_learning_rate * (
                config.learning_rate_decay**max(
                    float(epoch + 1 - config.init_epoch), 0.0))

            for label_, d_ in enumerate(dataset_list):
                for batch_X, batch_y in d_.generate_one_epoch(
                        config.batch_size):
                    global_step += 1
                    epoch_step += 1
                    batch_labels = np.array([[label_]] * len(batch_X))
                    train_data_feed = {
                        self.learning_rate: learning_rate,
                        self.inputs: batch_X,
                        self.targets: batch_y,
                        self.symbols: batch_labels,
                    }
                    train_loss, _, train_merged_sum = self.sess.run(
                        [self.loss, self.optim, self.merged_sum],
                        train_data_feed)
                    self.writer.add_summary(train_merged_sum,
                                            global_step=global_step)

                    if np.mod(global_step,
                              len(dataset_list) * 100 //
                              config.input_size) == 1:
                        test_loss, test_pred = self.sess.run(
                            [self.loss, self.pred], test_data_feed)

                        print "Step:%d [Epoch:%d] [Learning rate: %.6f] train_loss:%.6f test_loss:%.6f" % (
                            global_step, epoch, learning_rate, train_loss,
                            test_loss)

                        # Plot samples
                        for sample_sym, indices in sample_indices.items():
                            image_path = os.path.join(
                                self.model_plots_dir,
                                "{}_epoch{:02d}_step{:04d}.png".format(
                                    sample_sym, epoch, epoch_step))
                            sample_preds = test_pred[indices]
                            sample_truth = merged_test_y[indices]
                            self.plot_samples(sample_preds,
                                              sample_truth,
                                              image_path,
                                              stock_sym=sample_sym)

                        self.save(global_step)

        final_pred, final_loss = self.sess.run([self.pred, self.loss],
                                               test_data_feed)

        # Save the final model
        self.save(global_step)
        return final_pred
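
# StockDataSet is not shown in this snippet. A minimal sketch of the interface
# train() relies on; the attribute and method names come from the call sites
# above, while the shapes and bodies are assumptions.
import numpy as np

class StockDataSet(object):
    def __init__(self, stock_sym, train_X, train_y, test_X, test_y):
        self.stock_sym = stock_sym  # e.g. "GOOG"
        self.train_X, self.train_y = np.asarray(train_X), np.asarray(train_y)
        self.test_X, self.test_y = np.asarray(test_X), np.asarray(test_y)

    def generate_one_epoch(self, batch_size):
        """Yield (batch_X, batch_y) pairs covering one shuffled epoch."""
        indices = np.random.permutation(len(self.train_X))
        for start in range(0, len(indices) - batch_size + 1, batch_size):
            batch = indices[start:start + batch_size]
            yield self.train_X[batch], self.train_y[batch]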
Example #3
def create_tensorboard_visualizations(model,
                                      loc,
                                      labels=None,
                                      write_metadata=True,
                                      export_tsv_embeddings=True):
    """Export embeddings to Tensorboard.

    This function exports embeddings to disk in a format used by
    `TensorBoard <https://www.tensorflow.org/tensorboard>`_ and
    `TensorBoard Embedding Projector <https://projector.tensorflow.org>`_.
    The function exports:

    * A number of checkpoint and graph embedding files in the provided location that will allow
      you to visualize embeddings using Tensorboard. This is generally for use with a
      `local Tensorboard instance <https://www.tensorflow.org/tensorboard/r1/overview>`_.
    * A tab-separated file of embeddings ``embeddings_projector.tsv``. This is generally used to
      visualize embeddings by uploading to `TensorBoard Embedding Projector <https://projector.tensorflow.org>`_.
    * Embeddings metadata (i.e. the embeddings labels from the original knowledge graph), saved to ``metadata.tsv``.
      Such a file can be used in TensorBoard or uploaded to TensorBoard Embedding Projector.

    The content of ``loc`` will look like: ::

        tensorboard_files/
            ├── checkpoint
            ├── embeddings_projector.tsv
            ├── graph_embedding.ckpt.data-00000-of-00001
            ├── graph_embedding.ckpt.index
            ├── graph_embedding.ckpt.meta
            ├── metadata.tsv
            └── projector_config.pbtxt

    .. Note ::
        A TensorBoard guide is available at `this address <https://www.tensorflow.org/tensorboard/r1/overview>`_.

    .. Note ::
        Uploading ``embeddings_projector.tsv`` and ``metadata.tsv`` to
        `TensorBoard Embedding Projector <https://projector.tensorflow.org>`_ will give a result
        similar to the picture below:

        .. image:: ../img/embeddings_projector.png

    Examples
    --------
    >>> import numpy as np
    >>> from ampligraph.latent_features import TransE
    >>> from ampligraph.utils import create_tensorboard_visualizations
    >>>
    >>> X = np.array([['a', 'y', 'b'],
    >>>               ['b', 'y', 'a'],
    >>>               ['a', 'y', 'c'],
    >>>               ['c', 'y', 'a'],
    >>>               ['a', 'y', 'd'],
    >>>               ['c', 'y', 'd'],
    >>>               ['b', 'y', 'c'],
    >>>               ['f', 'y', 'e']])
    >>>
    >>> model = TransE(batches_count=1, seed=555, epochs=20, k=10, loss='pairwise',
    >>>                loss_params={'margin':5})
    >>> model.fit(X)
    >>>
    >>> create_tensorboard_visualizations(model, 'tensorboard_files')


    Parameters
    ----------
    model: EmbeddingModel
        A trained neural knowledge graph embedding model, the model must be an instance of TransE,
        DistMult, ComplEx, or HolE.
    loc: string
        Directory where the files are written.
    labels: pd.DataFrame
        Label(s) for each embedding point in the Tensorboard visualization.
        Default behaviour is to use the embeddings labels included in the model.
    export_tsv_embeddings: bool (Default: True)
         If True, will generate a tab-separated file of embeddings at the given path. This is generally used to
         visualize embeddings by uploading to `TensorBoard Embedding Projector <https://projector.tensorflow.org>`_.
    write_metadata: bool (Default: True)
        If True, will write a file named 'metadata.tsv' in the ``loc`` directory.

    """

    # Create loc if it doesn't exist
    if not os.path.exists(loc):
        logger.debug('Creating Tensorboard visualization directory: %s' % loc)
        os.mkdir(loc)

    if not model.is_fitted:
        raise ValueError('Cannot write embeddings if model is not fitted.')

    # If no label data supplied, use model ent_to_idx keys as labels
    if labels is None:
        logger.info(
            'Using model entity dictionary to create Tensorboard metadata.tsv')
        labels = list(model.ent_to_idx.keys())
    else:
        if len(labels) != len(model.ent_to_idx):
            raise ValueError(
                'Label data rows must equal number of embeddings.')

    if write_metadata:
        logger.debug('Writing metadata.tsv to: %s' % loc)
        write_metadata_tsv(loc, labels)

    if export_tsv_embeddings:
        tsv_filename = "embeddings_projector.tsv"
        logger.info('Writing embeddings tsv to: %s' %
                    os.path.join(loc, tsv_filename))
        np.savetxt(os.path.join(loc, tsv_filename),
                   model.trained_model_params[0],
                   delimiter='\t')

    checkpoint_path = os.path.join(loc, 'graph_embedding.ckpt')

    # Create embeddings Variable
    embedding_var = tf.Variable(model.trained_model_params[0],
                                name='graph_embedding')

    with tf.Session() as sess:
        saver = tf.train.Saver([embedding_var])

        sess.run(embedding_var.initializer)

        saver.save(sess, checkpoint_path)

        config = projector.ProjectorConfig()

        # One can add multiple embeddings.
        embedding = config.embeddings.add()
        embedding.tensor_name = embedding_var.name

        # Link this tensor to its metadata file (e.g. labels).
        embedding.metadata_path = 'metadata.tsv'

        # Saves a config file that TensorBoard will read during startup.
        projector.visualize_embeddings(tf.summary.FileWriter(loc), config)
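
# write_metadata_tsv() is called above but not shown here. A minimal sketch,
# assuming it writes one label per line to <loc>/metadata.tsv:
import os

def write_metadata_tsv(loc, data):
    """Write embedding labels to a metadata.tsv file in directory ``loc``."""
    with open(os.path.join(loc, 'metadata.tsv'), 'w') as f:
        for label in data:
            f.write('{}\n'.format(label))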
Example #4
        loss_summary = tf.summary.scalar("loss", cnn.loss)
        acc_summary = tf.summary.scalar("accuracy", cnn.accuracy)
        prec_summary = tf.summary.scalar("precision", cnn.precision)
        recl_summary = tf.summary.scalar("recall", cnn.recall)

        # Train summaries
        # (grad_summaries_merged is assumed to be defined earlier in the full
        # source of this example.)
        train_summary_op = tf.summary.merge([
            loss_summary, acc_summary, prec_summary, recl_summary,
            grad_summaries_merged
        ])
        train_summary_dir = out_dir  # os.path.join(out_dir, "summaries", "train")
        train_summary_writer = tf.summary.FileWriter(train_summary_dir,
                                                     sess.graph)

        config_pro = projector.ProjectorConfig()
        embedding = config_pro.embeddings.add()
        embedding.tensor_name = cnn.embedding.name
        embedding.metadata_path = os.path.join(out_dir, 'vocab_raw')
        projector.visualize_embeddings(train_summary_writer, config_pro)

        # Dev summaries
        dev_summary_op = tf.summary.merge([loss_summary, acc_summary])
        dev_summary_dir = os.path.join(out_dir, "summaries", "dev")
        dev_summary_writer = tf.summary.FileWriter(dev_summary_dir,
                                                   sess.graph)

        # Checkpoint directory. Tensorflow assumes this directory already exists so we need to create it
        checkpoint_dir = out_dir  #os.path.abspath(os.path.join(out_dir, "checkpoints"))
        checkpoint_prefix = os.path.join(checkpoint_dir, "model")
        if not os.path.exists(checkpoint_dir):
            os.makedirs(checkpoint_dir)
Example #5
def word2vec_basic(log_dir):
    """Example of building, training and visualizing a word2vec model."""
    # Create the directory for TensorBoard variables if it does not exist.
    if not os.path.exists(log_dir):
        os.makedirs(log_dir)

    # Step 1: Download the data.
    url = 'http://mattmahoney.net/dc/'

    # pylint: disable=redefined-outer-name
    def maybe_download(filename, expected_bytes):
        """Download a file if not present, and make sure it's the right size."""
        local_filename = os.path.join(gettempdir(), filename)
        if not os.path.exists(local_filename):
            local_filename, _ = urllib.request.urlretrieve(
                url + filename, local_filename)
        statinfo = os.stat(local_filename)
        if statinfo.st_size == expected_bytes:
            print('Found and verified', filename)
        else:
            print(statinfo.st_size)
            raise Exception('Failed to verify ' + local_filename +
                            '. Can you get to it with a browser?')
        return local_filename

    filename = maybe_download('text8.zip', 31344016)
    # filename = "/Users/cherry/MachineLearning/tf-scaffold/mnist_datasets/text8.zip"  # use a local copy instead, if present

    # Read the data into a list of strings.
    def read_data(filename):
        """Extract the first file enclosed in a zip file as a list of words."""
        with zipfile.ZipFile(filename) as f:
            data = tf.compat.as_str(f.read(f.namelist()[0])).split()
        return data

    vocabulary = read_data(filename)
    print('Data size', len(vocabulary))

    # Step 2: Build the dictionary and replace rare words with UNK token.
    vocabulary_size = 50000

    def build_dataset(words, n_words):
        """Process raw inputs into a dataset."""
        count = [['UNK', -1]]
        count.extend(collections.Counter(words).most_common(n_words - 1))
        dictionary = {}
        for word, _ in count:
            dictionary[word] = len(dictionary)
        data = []
        unk_count = 0
        for word in words:
            index = dictionary.get(word, 0)
            if index == 0:  # dictionary['UNK']
                unk_count += 1
            data.append(index)
        count[0][1] = unk_count
        reversed_dictionary = dict(zip(dictionary.values(), dictionary.keys()))
        return data, count, dictionary, reversed_dictionary

    # Filling 4 global variables:
    # data - list of codes (integers from 0 to vocabulary_size - 1).
    #   This is the original text, but words are replaced by their codes.
    # count - list of [word, count] pairs for the most common words (plus UNK).
    # dictionary - map of words (strings) to their codes (integers).
    # reverse_dictionary - map of codes (integers) to words (strings).
    data, count, unused_dictionary, reverse_dictionary = build_dataset(
        vocabulary, vocabulary_size)
    del vocabulary  # Hint to reduce memory.
    print('Most common words (+UNK)', count[:5])
    print('Sample data', data[:10], [reverse_dictionary[i] for i in data[:10]])

    # Step 3: Function to generate a training batch for the skip-gram model.
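    # NOTE: generate_batch() advances data_index, a module-level cursor into
    # `data`; the original script initializes it with `data_index = 0` at
    # module scope (not shown in this snippet).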
    def generate_batch(batch_size, num_skips, skip_window):
        global data_index
        assert batch_size % num_skips == 0
        assert num_skips <= 2 * skip_window
        batch = np.ndarray(shape=(batch_size), dtype=np.int32)
        labels = np.ndarray(shape=(batch_size, 1), dtype=np.int32)
        span = 2 * skip_window + 1  # [ skip_window target skip_window ]
        buffer = collections.deque(maxlen=span)  # pylint: disable=redefined-builtin
        if data_index + span > len(data):
            data_index = 0
        buffer.extend(data[data_index:data_index + span])
        data_index += span
        for i in range(batch_size // num_skips):
            context_words = [w for w in range(span) if w != skip_window]
            words_to_use = random.sample(context_words, num_skips)
            for j, context_word in enumerate(words_to_use):
                batch[i * num_skips + j] = buffer[skip_window]
                labels[i * num_skips + j, 0] = buffer[context_word]
            if data_index == len(data):
                buffer.extend(data[0:span])
                data_index = span
            else:
                buffer.append(data[data_index])
                data_index += 1
        # Backtrack a little bit to avoid skipping words in the end of a batch
        data_index = (data_index + len(data) - span) % len(data)
        return batch, labels

    batch, labels = generate_batch(batch_size=8, num_skips=2, skip_window=1)
    for i in range(8):
        print(batch[i], reverse_dictionary[batch[i]], '->', labels[i, 0],
              reverse_dictionary[labels[i, 0]])

    # Step 4: Build and train a skip-gram model.

    batch_size = 128
    embedding_size = 128  # Dimension of the embedding vector.
    skip_window = 1  # How many words to consider left and right.
    num_skips = 2  # How many times to reuse an input to generate a label.
    num_sampled = 64  # Number of negative examples to sample.

    # We pick a random validation set to sample nearest neighbors. Here we limit
    # the validation samples to the words that have a low numeric ID, which by
    # construction are also the most frequent. These 3 variables are used only for
    # displaying model accuracy, they don't affect calculation.
    valid_size = 16  # Random set of words to evaluate similarity on.
    valid_window = 100  # Only pick dev samples in the head of the distribution.
    valid_examples = np.random.choice(valid_window, valid_size, replace=False)

    graph = tf.Graph()

    with graph.as_default():

        # Input data.
        with tf.name_scope('inputs'):
            train_inputs = tf.placeholder(tf.int32, shape=[batch_size])
            train_labels = tf.placeholder(tf.int32, shape=[batch_size, 1])
            valid_dataset = tf.constant(valid_examples, dtype=tf.int32)

        # Ops and variables pinned to the CPU because of missing GPU implementation
        with tf.device('/cpu:0'):
            # Look up embeddings for inputs.
            with tf.name_scope('embeddings'):
                embeddings = tf.Variable(
                    tf.random_uniform([vocabulary_size, embedding_size], -1.0,
                                      1.0))
                embed = tf.nn.embedding_lookup(embeddings, train_inputs)

            # Construct the variables for the NCE loss
            with tf.name_scope('weights'):
                nce_weights = tf.Variable(
                    tf.truncated_normal([vocabulary_size, embedding_size],
                                        stddev=1.0 /
                                        math.sqrt(embedding_size)))
            with tf.name_scope('biases'):
                nce_biases = tf.Variable(tf.zeros([vocabulary_size]))

        # Compute the average NCE loss for the batch.
        # tf.nce_loss automatically draws a new sample of the negative labels each
        # time we evaluate the loss.
        # Explanation of the meaning of NCE loss:
        #   http://mccormickml.com/2016/04/19/word2vec-tutorial-the-skip-gram-model/
        with tf.name_scope('loss'):
            loss = tf.reduce_mean(
                tf.nn.nce_loss(weights=nce_weights,
                               biases=nce_biases,
                               labels=train_labels,
                               inputs=embed,
                               num_sampled=num_sampled,
                               num_classes=vocabulary_size))

        # Add the loss value as a scalar to summary.
        tf.summary.scalar('loss', loss)

        # Construct the SGD optimizer using a learning rate of 1.0.
        with tf.name_scope('optimizer'):
            optimizer = tf.train.GradientDescentOptimizer(1.0).minimize(loss)

        # Compute the cosine similarity between minibatch examples and all
        # embeddings.
        norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings), 1, keepdims=True))
        normalized_embeddings = embeddings / norm
        valid_embeddings = tf.nn.embedding_lookup(normalized_embeddings,
                                                  valid_dataset)
        similarity = tf.matmul(valid_embeddings,
                               normalized_embeddings,
                               transpose_b=True)

        # Merge all summaries.
        merged = tf.summary.merge_all()

        # Add variable initializer.
        init = tf.global_variables_initializer()

        # Create a saver.
        saver = tf.train.Saver()

    # Step 5: Begin training.
    num_steps = 100001

    with tf.Session(graph=graph) as session:
        # Open a writer to write summaries.
        writer = tf.summary.FileWriter(log_dir, session.graph)

        # We must initialize all variables before we use them.
        init.run()
        print('Initialized')

        average_loss = 0
        for step in xrange(num_steps):
            batch_inputs, batch_labels = generate_batch(
                batch_size, num_skips, skip_window)
            feed_dict = {
                train_inputs: batch_inputs,
                train_labels: batch_labels
            }

            # Define metadata variable.
            run_metadata = tf.RunMetadata()

            # We perform one update step by evaluating the optimizer op (including it
            # in the list of returned values for session.run()
            # Also, evaluate the merged op to get all summaries from the returned
            # "summary" variable. Feed metadata variable to session for visualizing
            # the graph in TensorBoard.
            _, summary, loss_val = session.run([optimizer, merged, loss],
                                               feed_dict=feed_dict,
                                               run_metadata=run_metadata)
            average_loss += loss_val

            # Add returned summaries to writer in each step.
            writer.add_summary(summary, step)
            # Add metadata to visualize the graph for the last run.
            if step == (num_steps - 1):
                writer.add_run_metadata(run_metadata, 'step%d' % step)

            if step % 2000 == 0:
                if step > 0:
                    average_loss /= 2000
                # The average loss is an estimate of the loss over the last 2000
                # batches.
                print('Average loss at step ', step, ': ', average_loss)
                average_loss = 0

            # Note that this is expensive (~20% slowdown if computed every 500 steps)
            if step % 10000 == 0:
                sim = similarity.eval()
                for i in xrange(valid_size):
                    valid_word = reverse_dictionary[valid_examples[i]]
                    top_k = 8  # number of nearest neighbors
                    nearest = (-sim[i, :]).argsort()[1:top_k + 1]
                    log_str = 'Nearest to %s:' % valid_word
                    for k in xrange(top_k):
                        close_word = reverse_dictionary[nearest[k]]
                        log_str = '%s %s,' % (log_str, close_word)
                    print(log_str)
        final_embeddings = normalized_embeddings.eval()

        # Write corresponding labels for the embeddings.
        with open(log_dir + '/metadata.tsv', 'w') as f:
            for i in xrange(vocabulary_size):
                f.write(reverse_dictionary[i] + '\n')

        # Save the model for checkpoints.
        saver.save(session, os.path.join(log_dir, 'model.ckpt'))

        # Create a configuration for visualizing embeddings with the labels in
        # TensorBoard.
        config = projector.ProjectorConfig()
        embedding_conf = config.embeddings.add()
        embedding_conf.tensor_name = embeddings.name
        embedding_conf.metadata_path = os.path.join(log_dir, 'metadata.tsv')
        projector.visualize_embeddings(writer, config)

    writer.close()

    # Step 6: Visualize the embeddings.

    # pylint: disable=missing-docstring
    # Function to draw visualization of distance between embeddings.
    def plot_with_labels(low_dim_embs, labels, filename):
        assert low_dim_embs.shape[0] >= len(
            labels), 'More labels than embeddings'
        plt.figure(figsize=(18, 18))  # in inches
        for i, label in enumerate(labels):
            x, y = low_dim_embs[i, :]
            plt.scatter(x, y)
            plt.annotate(label,
                         xy=(x, y),
                         xytext=(5, 2),
                         textcoords='offset points',
                         ha='right',
                         va='bottom')

        plt.savefig(filename)

    try:
        # pylint: disable=g-import-not-at-top
        from sklearn.manifold import TSNE
        import matplotlib.pyplot as plt

        tsne = TSNE(perplexity=30,
                    n_components=2,
                    init='pca',
                    n_iter=5000,
                    method='exact')
        plot_only = 500
        low_dim_embs = tsne.fit_transform(final_embeddings[:plot_only, :])
        labels = [reverse_dictionary[i] for i in xrange(plot_only)]
        plot_with_labels(low_dim_embs, labels,
                         os.path.join(gettempdir(), 'tsne.png'))

    except ImportError as ex:
        print(
            'Please install sklearn, matplotlib, and scipy to show embeddings.'
        )
        print(ex)
Example #6
def main():

    cfg = TrainConfig().parse()
    print(cfg.name)
    result_dir = os.path.join(
        cfg.result_root,
        cfg.name + '_' + datetime.strftime(datetime.now(), '%Y%m%d-%H%M%S'))
    if not os.path.isdir(result_dir):
        os.makedirs(result_dir)
    utils.write_configure_to_file(cfg, result_dir)
    np.random.seed(seed=cfg.seed)

    # prepare dataset
    feat_train = np.load('/mnt/work/CUB_200_2011/data/feat_train.npy')
    val_feats = np.load('/mnt/work/CUB_200_2011/data/feat_test.npy')
    label_train = np.load('/mnt/work/CUB_200_2011/data/label_train.npy')
    label_train -= 1  # make labels start from 0
    val_labels = np.load('/mnt/work/CUB_200_2011/data/label_test.npy')

    class_idx_dict = {}
    for i, l in enumerate(label_train):
        l = int(l)
        if l not in class_idx_dict:
            class_idx_dict[l] = [i]
        else:
            class_idx_dict[l].append(i)
    C = len(list(class_idx_dict.keys()))

    val_triplet_idx = select_triplets_random(val_labels, 1000)

    # generate metadata.tsv for visualize embedding
    with open(os.path.join(result_dir, 'metadata_val.tsv'), 'w') as fout:
        for l in val_labels:
            fout.write('{}\n'.format(int(l)))

    # construct the graph
    with tf.Graph().as_default():
        tf.set_random_seed(cfg.seed)
        global_step = tf.Variable(0, trainable=False)
        lr_ph = tf.placeholder(tf.float32, name='learning_rate')

        # load backbone model
        model_emb = networks.CUBLayer(n_input=1024, n_output=cfg.emb_dim)
        #model_emb = networks.OutputLayer(n_input=1024, n_output=cfg.emb_dim)

        # get the embedding
        input_ph = tf.placeholder(tf.float32, shape=[None, 1024])
        dropout_ph = tf.placeholder(tf.float32, shape=[])
        model_emb.forward(input_ph, dropout_ph)
        if cfg.normalized:
            embedding = tf.nn.l2_normalize(model_emb.logits,
                                           axis=-1,
                                           epsilon=1e-10)
        else:
            embedding = model_emb.logits

        # variable for visualizing the embeddings
        emb_var = tf.Variable([0.0], name='embeddings')
        set_emb = tf.assign(emb_var, embedding, validate_shape=False)

        # calculated for monitoring all-pair embedding distance
        #        diffs = utils.all_diffs_tf(embedding, embedding)
        #        all_dist = utils.cdist_tf(diffs)
        #        tf.summary.histogram('embedding_dists', all_dist)

        # split embedding into anchor, positive and negative and calculate triplet loss
        anchor, positive, negative = tf.unstack(
            tf.reshape(embedding, [-1, 3, cfg.emb_dim]), 3, 1)
        metric_loss = networks.triplet_loss(anchor, positive, negative,
                                            cfg.alpha)

        regularization_loss = tf.reduce_sum(
            tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES))
        total_loss = metric_loss + regularization_loss * cfg.lambda_l2

        tf.summary.scalar('learning_rate', lr_ph)
        train_op = utils.optimize(total_loss, global_step, cfg.optimizer,
                                  lr_ph, tf.global_variables())

        saver = tf.train.Saver(max_to_keep=10)

        summary_op = tf.summary.merge_all()

        # Start running the graph
        if cfg.gpu:
            os.environ['CUDA_VISIBLE_DEVICES'] = cfg.gpu

        gpu_options = tf.GPUOptions(allow_growth=True)
        sess = tf.Session(config=tf.ConfigProto(gpu_options=gpu_options))

        summary_writer = tf.summary.FileWriter(result_dir, sess.graph)

        with sess.as_default():

            sess.run(tf.global_variables_initializer())

            ################## Training loop ##################
            for epoch in range(cfg.max_epochs):

                # learning rate schedule, reference: "In defense of Triplet Loss"
                if epoch < cfg.static_epochs:
                    learning_rate = cfg.learning_rate
                else:
                    learning_rate = cfg.learning_rate * \
                            0.001**((epoch-cfg.static_epochs)/(cfg.max_epochs-cfg.static_epochs))

                # sample images
                class_in_batch = set()
                idx_batch = np.array([], dtype=np.int32)
                while len(idx_batch) < cfg.batch_size:
                    sampled_class = np.random.choice(
                        list(class_idx_dict.keys()))
                    if not sampled_class in class_in_batch:
                        class_in_batch.add(sampled_class)
                        subsample_size = np.random.choice(range(5, 11))
                        subsample = np.random.permutation(
                            class_idx_dict[sampled_class])[:subsample_size]
                        idx_batch = np.append(idx_batch, subsample)
                idx_batch = idx_batch[:cfg.batch_size]

                feat_batch = feat_train[idx_batch]
                lab_batch = label_train[idx_batch]

                emb = sess.run(embedding,
                               feed_dict={
                                   input_ph: feat_batch,
                                   dropout_ph: 1.0
                               })

                # get distance for all pairs
                all_diff = utils.all_diffs(emb, emb)
                triplet_input_idx, active_count = select_triplets_facenet(
                    lab_batch,
                    utils.cdist(all_diff, metric=cfg.metric),
                    cfg.triplet_per_batch,
                    cfg.alpha,
                    num_negative=cfg.num_negative)

                if triplet_input_idx is not None:
                    triplet_input = feat_batch[triplet_input_idx]

                    # perform training on the selected triplets
                    err, _, step, summ = sess.run(
                        [total_loss, train_op, global_step, summary_op],
                        feed_dict={
                            input_ph: triplet_input,
                            dropout_ph: cfg.keep_prob,
                            lr_ph: learning_rate
                        })

                    print ("%s\tEpoch: %d\tImages num: %d\tTriplet num: %d\tLoss %.4f" % \
                            (cfg.name, epoch+1, feat_batch.shape[0], triplet_input.shape[0]//3, err))

                    summary = tf.Summary(value=[
                        tf.Summary.Value(tag="train_loss", simple_value=err),
                        tf.Summary.Value(tag="active_count",
                                         simple_value=active_count),
                        tf.Summary.Value(tag="images_num",
                                         simple_value=feat_batch.shape[0]),
                        tf.Summary.Value(tag="triplet_num",
                                         simple_value=triplet_input.shape[0] //
                                         3)
                    ])
                    summary_writer.add_summary(summary, step)
                    summary_writer.add_summary(summ, step)

                # validation on val_set
                if (epoch + 1) % 100 == 0:
                    print("Evaluating on validation set...")
                    val_err = sess.run(total_loss,
                                       feed_dict={
                                           input_ph:
                                           val_feats[val_triplet_idx],
                                           dropout_ph: 1.0
                                       })

                    summary = tf.Summary(value=[
                        tf.Summary.Value(tag="Valiation loss",
                                         simple_value=val_err),
                    ])
                    print("Epoch: [%d]\tloss: %.4f" % (epoch + 1, val_err))

                    if (epoch + 1) % 1000 == 0:
                        val_embeddings, _ = sess.run([embedding, set_emb],
                                                     feed_dict={
                                                         input_ph: val_feats,
                                                         dropout_ph: 1.0
                                                     })
                        mAP, mPrec, recall = utils.evaluate_simple(
                            val_embeddings, val_labels)
                        summary = tf.Summary(value=[
                            tf.Summary.Value(tag="Validation mAP",
                                             simple_value=mAP),
                            tf.Summary.Value(tag="Validation Recall@1",
                                             simple_value=recall),
                            tf.Summary.Value(tag="Validation mPrec",
                                             simple_value=mPrec)
                        ])
                        print("Epoch: [%d]\tmAP: %.4f\trecall: %.4f" %
                              (epoch + 1, mAP, recall))

                        # config for embedding visualization
                        config = projector.ProjectorConfig()
                        visual_embedding = config.embeddings.add()
                        visual_embedding.tensor_name = emb_var.name
                        visual_embedding.metadata_path = os.path.join(
                            result_dir, 'metadata_val.tsv')
                        projector.visualize_embeddings(summary_writer, config)

                    summary_writer.add_summary(summary, step)

                    # save model
                    saver.save(sess,
                               os.path.join(result_dir, cfg.name + '.ckpt'),
                               global_step=step)
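
# select_triplets_random() (and select_triplets_facenet()) are referenced in
# main() but not defined in this snippet. A minimal sketch of the random
# selector, assuming it returns a flat index array laid out as
# [anchor, positive, negative, anchor, positive, negative, ...] and that the
# labels contain at least two classes with two or more samples each:
import numpy as np

def select_triplets_random(labels, num_triplets):
    labels = np.asarray(labels).astype(int)
    idx_by_class = {c: np.where(labels == c)[0] for c in np.unique(labels)}
    # classes that can supply an (anchor, positive) pair
    classes = [c for c, idx in idx_by_class.items() if len(idx) >= 2]
    triplets = []
    for _ in range(num_triplets):
        pos_class = np.random.choice(classes)
        neg_class = np.random.choice([c for c in classes if c != pos_class])
        a, p = np.random.choice(idx_by_class[pos_class], 2, replace=False)
        n = np.random.choice(idx_by_class[neg_class])
        triplets.extend([a, p, n])
    return np.array(triplets)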
Example #7
    def train_model(self, sentences):
        sentences, words_to_indices, indices_to_words, words = flatten_and_build_indices(
            sentences)

        # Global position within sentences array
        sentence_index = 0
        vocabulary_size = len(
            set(words))  # Number of unique words in our vocabulary

        logging.logger.debug("number of training sentences: " +
                             str(len(sentences)))
        logging.logger.debug("words: " + str(len(words)))
        logging.logger.debug("vocabulary size: " + str(vocabulary_size))

        graph = tf.Graph()

        with graph.as_default():
            with tf.name_scope('inputs'):
                # Placeholders are structures for feeding input values
                train_inputs = tf.placeholder(tf.int32,
                                              shape=[self.batch_size])
                train_labels = tf.placeholder(tf.int32,
                                              shape=[self.batch_size, 1])

            # Ops and variables pinned to the CPU because of missing GPU implementation
            with tf.device('/cpu:0'):
                # Define embedding matrix variable
                # Variables are the parameters of the model that are being optimized
                with tf.name_scope('embeddings'):
                    embeddings = tf.Variable(tf.random_uniform(
                        [vocabulary_size, self.embedding_size], -1.0, 1.0),
                                             name="embeddings")
                    # Take an input vector of integer indices,
                    # and "look up" these indices in the supplied embeddings tensor.
                    embed = tf.nn.embedding_lookup(embeddings, train_inputs)

                # Construct the variables for the NCE loss
                with tf.name_scope('weights'):
                    nce_weights = tf.Variable(tf.truncated_normal(
                        [vocabulary_size, self.embedding_size],
                        stddev=1.0 / math.sqrt(self.embedding_size)),
                                              name="weights")
                with tf.name_scope('biases'):
                    nce_biases = tf.Variable(tf.zeros([vocabulary_size]),
                                             name="biases")

            # Compute the average NCE loss for the batch.
            # tf.nce_loss automatically draws a new sample of the negative labels each
            # time we evaluate the loss.
            with tf.name_scope('loss'):
                loss = tf.reduce_mean(
                    tf.nn.nce_loss(weights=nce_weights,
                                   biases=nce_biases,
                                   labels=train_labels,
                                   inputs=embed,
                                   num_sampled=self.num_sampled,
                                   num_classes=vocabulary_size))

            # Add the loss value as a scalar to summary.
            tf.summary.scalar('loss', loss)

            # Construct the SGD optimizer using a learning rate of 1.0.
            with tf.name_scope('optimizer'):
                optimizer = tf.train.GradientDescentOptimizer(1.0).minimize(
                    loss)

            # Compute the cosine similarity between minibatch examples and all embeddings.
            norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings),
                                         1,
                                         keepdims=True),
                           name="norm")
            # normalized_embeddings = embeddings / norm

            # Merge all summaries.
            merged = tf.summary.merge_all()

            # Add variable initializer.
            init = tf.global_variables_initializer()

            # Add saver
            # Save only latest model
            saver = tf.train.Saver(max_to_keep=1, save_relative_paths=True)

        # BEGIN TRAINING
        logging.logger.info("training word2vec model using " +
                            str(len(sentences)) + " samples")
        logging.init_ticker(total_steps=self.num_steps,
                            desc=self.model_name +
                            " - training word2vec model")

        with tf.Session(graph=graph) as session:
            # Open a writer to write summaries.
            writer = tf.summary.FileWriter(self.log_dir, session.graph)

            # We must initialize all variables before we use them.
            init.run()
            logging.logger.debug('Initialized all variables')
            logging.logger.debug(norm)

            average_loss = 0
            average_historical_loss = list()

            step = 0
            while step < self.num_steps:
                logging.tick()

                batch_inputs, batch_labels, sentence_index = generate_batch(
                    self.batch_size, self.skip_window, sentences,
                    sentence_index)
                feed_dict = {
                    train_inputs: batch_inputs,
                    train_labels: batch_labels
                }

                # Define metadata variable.
                run_metadata = tf.RunMetadata()

                # We perform one update step by evaluating the optimizer op (including it in the list of returned values for session.run())
                # Also, evaluate the merged op to get all summaries from the returned "summary" variable.
                # Feed metadata variable to session for visualizing the graph in TensorBoard.
                _, summary, loss_val = session.run([optimizer, merged, loss],
                                                   feed_dict=feed_dict,
                                                   run_metadata=run_metadata)
                average_loss += loss_val

                # Add returned summaries to writer in each step.
                writer.add_summary(summary, step)
                # Add metadata to visualize the graph for the last run.
                if step == (self.num_steps - 1):
                    writer.add_run_metadata(run_metadata, 'step%d' % step)

                if step % 1000 == 0:
                    if step > 0:
                        average_loss /= 1000
                        average_historical_loss.append(average_loss)
                    # The average loss is an estimate of the loss over the last 1000 steps.
                    logging.logger.info('average loss at step ' + str(step) +
                                        ': ' + str(average_loss))
                    average_loss = 0

                    # Check if historical loss is showing signs of improvement
                    if len(average_historical_loss) >= 10:
                        if np.std(average_historical_loss[-10:]) < 1:
                            logging.logger.info(
                                "loss seems to have stabilized, stopping training process"
                            )
                            step = self.num_steps - 1

                if step % self.save_every == 0:
                    saver.save(session, self.meta_graph_dir)

                step = step + 1

            # Save used embeddings together with the model
            with open(self.words_to_indices_filename, "w") as f:
                json.dump(words_to_indices, f)

            with open(self.indices_to_words_filename, "w") as f:
                json.dump(indices_to_words, f)

            # Write corresponding labels for the embeddings.
            with open(self.metadata_filename, 'w') as f:
                for i in range(vocabulary_size):
                    f.write(indices_to_words[i] + '\n')

            # Create a configuration for visualizing embeddings with the labels in TensorBoard.
            config = projector.ProjectorConfig()
            embedding_conf = config.embeddings.add()
            embedding_conf.tensor_name = embeddings.name
            embedding_conf.metadata_path = self.metadata_filename
            projector.visualize_embeddings(writer, config)

            writer.close()
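
# flatten_and_build_indices() is used by train_model() but not shown. A
# minimal sketch of the contract the code above assumes: token sentences in,
# index sentences out, plus both lookup tables and the flat token list.
def flatten_and_build_indices(sentences):
    words = [w for sentence in sentences for w in sentence]
    words_to_indices = {}
    for w in words:
        if w not in words_to_indices:
            words_to_indices[w] = len(words_to_indices)
    indices_to_words = {i: w for w, i in words_to_indices.items()}
    indexed = [[words_to_indices[w] for w in sentence] for sentence in sentences]
    return indexed, words_to_indices, indices_to_words, words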
    def trainer_initial(self):
        graph = tf.Graph()
        with graph.as_default():

            # logging
            self.logger = tf.summary.FileWriter(self.log_dir)

            with tf.name_scope("embedding"):
                batch_inputs = tf.placeholder(tf.int64, shape=[None])
                batch_labels = tf.placeholder(tf.int64, shape=[None, 1])

                graph_embeddings = tf.Variable(tf.random_uniform(
                    [self.num_words, self.embedding_size],
                    -0.5 / self.embedding_size, 0.5 / self.embedding_size),
                                               name='word_embedding')

                batch_graph_embeddings = tf.nn.embedding_lookup(
                    graph_embeddings, batch_inputs)  # hidden layer

                weights = tf.Variable(
                    tf.truncated_normal(
                        [self.num_words, self.embedding_size],
                        stddev=1.0 /
                        math.sqrt(self.embedding_size)))  # output layer weights
                biases = tf.Variable(tf.zeros(
                    self.num_words))  # output layer biases

                # Negative sampling via NCE loss.
                loss = tf.reduce_mean(
                    tf.nn.nce_loss(
                        weights=weights,
                        biases=biases,
                        labels=batch_labels,
                        inputs=batch_graph_embeddings,
                        num_sampled=self.neg_sampling,
                        num_classes=self.num_words,
                        sampled_values=tf.nn.fixed_unigram_candidate_sampler(
                            true_classes=batch_labels,
                            num_true=1,
                            num_sampled=self.neg_sampling,
                            unique=True,
                            range_max=self.num_words,
                            distortion=0.75,
                            # self.unigrams holds the frequency of each word
                            # in the vocabulary.
                            unigrams=self.unigrams
                        )
                    ))
                # L2-normalize the embeddings (sum of squares, not mean).
                norm = tf.sqrt(
                    tf.reduce_sum(tf.square(graph_embeddings),
                                  1,
                                  keepdims=True))
                normalized_embeddings = graph_embeddings / norm

                # summary
                tf.summary.histogram("weights", weights)
                tf.summary.histogram("biases", biases)
                tf.summary.scalar("loss", loss)

                config = projector.ProjectorConfig()
                emb = config.embeddings.add()
                emb.tensor_name = normalized_embeddings.name
                emb.metadata_path = os.path.join(self.log_dir, 'vocab.tsv')
                projector.visualize_embeddings(self.logger, config)

            with tf.name_scope('descent'):
                global_step = tf.Variable(0, trainable=False)
                learning_rate = tf.train.exponential_decay(
                    self.learning_rate,
                    global_step,
                    100000,
                    0.96,
                    staircase=True)  # exponential staircase decay over time

                learning_rate = tf.maximum(
                    learning_rate, 0.001
                )  # floor at 0.001 so learning never stops entirely

                optimizer = tf.train.GradientDescentOptimizer(
                    learning_rate).minimize(loss, global_step=global_step)

                self.logger.add_graph(graph)
        return graph, batch_inputs, batch_labels, normalized_embeddings, loss, optimizer
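# --- Aside: a standalone TF1-style sketch of the frequency-based negative
# sampler used in trainer_initial. The five-word vocabulary and its counts
# are hypothetical.
import tensorflow as tf  # TensorFlow 1.x, as in the snippet above

unigrams = [100.0, 40.0, 25.0, 10.0, 5.0]  # hypothetical word frequencies
true_classes = tf.constant([[0], [3]], dtype=tf.int64)
sampled, _, _ = tf.nn.fixed_unigram_candidate_sampler(
    true_classes=true_classes,
    num_true=1,
    num_sampled=3,
    unique=True,
    range_max=len(unigrams),
    distortion=0.75,  # flattens the distribution, as in word2vec
    unigrams=unigrams)

with tf.Session() as sess:
    # Negatives are drawn with probability proportional to freq ** 0.75.
    print(sess.run(sampled))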
def main(argv=None):
    maybe_download()
    graph = load_graph()

    basedir = os.path.dirname(__file__)

    # ensure log directory exists
    logs_path = os.path.join(basedir, LOG_DIR)
    if not os.path.exists(logs_path):
        os.makedirs(logs_path)

    with tf.Session(graph=graph) as sess:

        pool3 = sess.graph.get_tensor_by_name('pool_3:0')
        jpeg_data = tf.placeholder(tf.string)
        thumbnail = tf.cast(
            tf.image.resize_images(tf.image.decode_jpeg(jpeg_data, channels=3),
                                   [100, 100]), tf.uint8)

        outputs = []
        images = []

        # Create metadata
        metadata_path = os.path.join(basedir, LOG_DIR, 'metadata.tsv')
        metadata = open(metadata_path, 'w')
        metadata.write("Name\tLabels\n")

        for folder_name in os.listdir(IMAGE_DIR):
            for file_name in os.listdir(os.path.join(IMAGE_DIR, folder_name)):
                if not file_name.endswith('.jpg'):
                    continue
                print('process %s...' % file_name)

                with open(
                        os.path.join(basedir, IMAGE_DIR, folder_name,
                                     file_name), 'rb') as f:
                    data = f.read()
                    results = sess.run([pool3, thumbnail], {
                        'DecodeJpeg/contents:0': data,
                        jpeg_data: data
                    })
                    outputs.append(results[0])
                    images.append(results[1])
                    metadata.write('{}\t{}\n'.format(file_name, folder_name))
        metadata.close()

        embedding_var = tf.Variable(tf.stack([tf.squeeze(x) for x in outputs],
                                             axis=0),
                                    trainable=False,
                                    name='embed')

        # prepare projector config
        config = projector.ProjectorConfig()
        embedding = config.embeddings.add()
        embedding.tensor_name = embedding_var.name
        summary_writer = tf.summary.FileWriter(os.path.join(basedir, LOG_DIR))

        # link metadata
        embedding.metadata_path = metadata_path

        # write to sprite image file
        image_path = os.path.join(basedir, LOG_DIR, 'sprite.jpg')
        size = int(math.sqrt(len(images))) + 1
        while len(images) < size * size:
            images.append(np.zeros((100, 100, 3), dtype=np.uint8))
        rows = []
        for i in range(size):
            rows.append(tf.concat(images[i * size:(i + 1) * size], 1))
        jpeg = tf.image.encode_jpeg(tf.concat(rows, 0))
        with open(image_path, 'wb') as f:
            f.write(sess.run(jpeg))

        embedding.sprite.image_path = image_path
        embedding.sprite.single_image_dim.extend([100, 100])

        # save embedding_var
        projector.visualize_embeddings(summary_writer, config)
        sess.run(tf.variables_initializer([embedding_var]))

        saver = tf.train.Saver()
        saver.save(sess, os.path.join(basedir, LOG_DIR, 'model.ckpt'))
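For reference, the sprite grid above can also be assembled without TensorFlow ops. A minimal sketch using NumPy and Pillow (an assumption; the snippet itself uses tf.concat and tf.image.encode_jpeg):

import math
import numpy as np
from PIL import Image  # assumption: Pillow is available

def build_sprite(thumbnails, thumb_size=100):
    """Tile equally sized thumbnails row-major into one square sprite."""
    size = int(math.sqrt(len(thumbnails))) + 1
    thumbnails = list(thumbnails)
    # Pad with black tiles so the grid is exactly size x size, as above.
    while len(thumbnails) < size * size:
        thumbnails.append(np.zeros((thumb_size, thumb_size, 3), dtype=np.uint8))
    rows = [np.concatenate(thumbnails[i * size:(i + 1) * size], axis=1)
            for i in range(size)]
    return Image.fromarray(np.concatenate(rows, axis=0))

# build_sprite(images).save('sprite.jpg')  # drop-in for the encode_jpeg path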
Example #10
0
def create_embedding():
    print('***** Config *****')
    print('***** Building Point {}...'.format(MODEL_NAME))
    print('** num_frames: {}'.format(cfg.num_frames))
    print('** num_classes: {}'.format(cfg.num_classes))
    print('** batch_size: {}'.format(cfg.batch_size))
    print('** epoch: {}'.format(cfg.epoch))
    print('** init_learning_rate: {}'.format(cfg.init_learning_rate))
    print('** decay_step: {}'.format(cfg.decay_step))
    print('** decay_rate: {}'.format(cfg.decay_rate))
    print('** weight_decay: {}'.format(cfg.weight_decay))

    with tf.Graph().as_default():
        anchor, labels = placeholder_inputs(cfg.batch_size, cfg.num_frames)
        # input_negative, labels = placeholder_inputs(
        #     cfg.batch_size, cfg.num_frames)
        # input_positive, labels = placeholder_inputs(
        #     cfg.batch_size, cfg.num_frames)
        is_training_pl = tf.placeholder(tf.bool, shape=())
        keep_prob = tf.placeholder(tf.float32)

        global_step = tf.Variable(0, dtype=tf.int64)

        bn_decay = True

        # tf.summary.scalar('bn_decay', bn_decay)

        # Get model and loss
        anchor_embed, pred = build_graph(anchor,
                                         is_training_pl,
                                         keep_prob,
                                         weight_decay=cfg.weight_decay,
                                         bn_decay=bn_decay,
                                         reuse_layers=False)

        print("\nplaceholders loaded...")

        # %% restore a previous model
        sess = tf.InteractiveSession()
        # load_model_path = LOGDIR+'/model_epoch_{}'.format(cfg.load_model_epoch)
        # load_model_path = '/media/tjosh/vault/MSRAction3D/trained_models/logdir_multitask_lowlr_simple_ff_5_96/model_epoch_200'
        load_model_path = '/media/tjosh/vault/MSRAction3D/trained_models/logdir_multitask_lowlr_128_simple_ff_5_96/model_epoch_65'
        saver = tf.train.Saver()
        saver.restore(sess, load_model_path)

        print("\nModel restored...", load_model_path)

        # Count the number of trainable parameters
        t_vars = tf.trainable_variables()
        num_params = np.sum([np.prod(v.get_shape().as_list()) for v in t_vars])
        print(
            '************ The Number of Trainable Parameters: {} ************'.
            format(num_params))
        num_g_params = np.sum(
            [np.prod(v.get_shape().as_list()) for v in tf.global_variables()])
        print('************ The Number of Global Parameters: {} ************'.
              format(num_g_params))

        print("\nEmbedding session initialized...\n")

        # class_list = [2, 4, 5, 6, 7, 9, 10, 11, 12, 13, 14, 16, 17, 19, 20]
        class_list = [1, 3, 8, 15, 18]

        # load datasets:
        test_dataset = np.load(
            '/media/tjosh/vault/MSRAction3D/one_shot_test_for_known.npy')
        # '/media/tjosh/vault/MSRAction3D/one_shot_train.npy')
        # '/media/tjosh/vault/MSRAction3D/one_shot_test_for_unknown.npy')

        test_data_gen = NewTripletGenerator(test_dataset,
                                            classes=class_list,
                                            batch_size=cfg.batch_size)

        current_data, current_label = next(test_data_gen.generator)

        embedding_var = tf.Variable(tf.zeros(
            (cfg.batch_size, anchor_embed.get_shape()[1].value)),
                                    name=VISUALIZE_NAME)
        embedding_assign = embedding_var.assign(anchor_embed)
        summary_writer = tf.summary.FileWriter(EMBED_LOG_DIR)

        # create embedding projector
        config = projector.ProjectorConfig()
        embedding = config.embeddings.add()
        embedding.tensor_name = embedding_var.name

        # Specify where you find the metadata
        embedding.metadata_path = 'metadata.tsv'

        # # Specify where you find the sprite (we don't need this in our case)
        # embedding.sprite.image_path = path_for_mnist_sprites #'mnistdigits.png'
        # embedding.sprite.single_image_dim.extend([28,28])

        # Say that you want to visualise the embeddings
        projector.visualize_embeddings(summary_writer, config)

        # run session to evaluate the embedding tensor
        current_data = np.array(current_data)
        embedding_feed_dict = {
            anchor: current_data[:, 0, :, :],
            keep_prob: 1.0,
            is_training_pl: False
        }

        sess.run(embedding_assign, feed_dict=embedding_feed_dict)

        # save the data and checkpoint
        new_saver = tf.train.Saver()
        new_saver.save(sess, "embed/model.ckpt", 1)
        print("logdir: ", "embed/model.ckpt")

        # save the metadata
        with open(METADATA_PATH, 'w') as f:
            f.write("Index\tLabel\n")
            for index, label in enumerate(current_label):
                f.write("%d\t%d\n" % (index, label[0]))
Example #11
0
def main(_):
    # Create training directories
    now = datetime.datetime.now()
    train_dir_name = now.strftime('alexnet_%Y%m%d_%H%M%S')
    train_dir = os.path.join(FLAGS.train_root_dir, train_dir_name)
    checkpoint_dir = os.path.join(train_dir, 'checkpoint')
    tensorboard_dir = os.path.join(train_dir, 'tensorboard')
    tensorboard_train_dir = os.path.join(tensorboard_dir, 'train')
    tensorboard_val_dir = os.path.join(tensorboard_dir, 'val')

    for directory in (FLAGS.train_root_dir, train_dir, checkpoint_dir,
                      tensorboard_dir, tensorboard_train_dir,
                      tensorboard_val_dir):
        if not os.path.isdir(directory):
            os.mkdir(directory)

    # Write flags to txt
    flags_file_path = os.path.join(train_dir, 'flags.txt')
    with open(flags_file_path, 'w') as flags_file:
        flags_file.write('learning_rate={}\n'.format(FLAGS.learning_rate))
        flags_file.write('dropout_keep_prob={}\n'.format(FLAGS.dropout_keep_prob))
        flags_file.write('num_epochs={}\n'.format(FLAGS.num_epochs))
        flags_file.write('batch_size={}\n'.format(FLAGS.batch_size))
        flags_file.write('train_layers={}\n'.format(FLAGS.train_layers))
        flags_file.write('multi_scale={}\n'.format(FLAGS.multi_scale))
        flags_file.write('train_root_dir={}\n'.format(FLAGS.train_root_dir))
        flags_file.write('log_step={}\n'.format(FLAGS.log_step))

    adlamb = tf.placeholder(tf.float32, name='adlamb')
    sntglamb = tf.placeholder(tf.float32, name='sntglamb')
    decay_learning_rate = tf.placeholder(tf.float32)
    dropout_keep_prob = tf.placeholder(tf.float32)
    is_training = tf.placeholder(tf.bool)
    weightlamb = tf.placeholder(tf.float32)

    # Model
    train_layers = FLAGS.train_layers.split(',')
    model = LeNetModel(num_classes=NUM_CLASSES,
                       image_size=32,
                       is_training=is_training,
                       dropout_keep_prob=dropout_keep_prob)
    # Placeholders
    if FLAGS.source == "svhn":
        x_s = tf.placeholder(tf.float32, [None, 32, 32, 3], name='x')
    elif FLAGS.source == "mnist":
        x_s = tf.placeholder(tf.float32, [None, 28, 28, 1], name='x')
    else:
        x_s = tf.placeholder(tf.float32, [None, 16, 16, 1], name='x')
    if FLAGS.target == "mnist":
        x_t = tf.placeholder(tf.float32, [None, 28, 28, 1], name='xt')
    else:
        x_t = tf.placeholder(tf.float32, [None, 16, 16, 1], name='xt')
    x = preprocessing(x_s, model)
    xt = preprocessing(x_t, model)
    tf.summary.image('Source Images', x)
    tf.summary.image('Target Images', xt)
    print('x_s ', x_s.get_shape())
    print('x ', x.get_shape())
    print('x_t ', x_t.get_shape())
    print('xt ', xt.get_shape())
    y = tf.placeholder(tf.float32, [None, NUM_CLASSES], name='y')
    yt = tf.placeholder(tf.float32, [None, NUM_CLASSES], name='yt')
    print('y ', y.get_shape())
    y_predict, loss = model.loss(x, y)
    # Training accuracy of the model
    source_correct_pred = tf.equal(tf.argmax(y_predict, 1), tf.argmax(y, 1))
    source_correct = tf.reduce_sum(tf.cast(source_correct_pred, tf.float32))
    source_accuracy = tf.reduce_mean(tf.cast(source_correct_pred, tf.float32))

    G_loss, D_loss = model.adloss(x, xt, y, yt, FLAGS.LAMBDA, weightlamb)

    # Testing accuracy of the model
    with tf.variable_scope('reuse_inference') as scope:
        scope.reuse_variables()
        target_feature_test = model.g(xt, training=False)
    with tf.variable_scope('reuse_inference') as scope:
        scope.reuse_variables()
        target_pred_test = model.f(target_feature_test, training=False)
        correct_pred = tf.equal(tf.argmax(target_pred_test, 1),
                                tf.argmax(yt, 1))
        correct = tf.reduce_sum(tf.cast(correct_pred, tf.float32))
        accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))

    update_op = model.optimize(decay_learning_rate, train_layers, adlamb,
                               sntglamb, FLAGS.unbalance, FLAGS.revgrad)

    D_op = model.adoptimize(decay_learning_rate, train_layers)
    optimizer = tf.group(update_op, D_op)

    train_writer = tf.summary.FileWriter(tensorboard_dir)
    train_writer.add_graph(tf.get_default_graph())
    config = projector.ProjectorConfig()
    embedding = config.embeddings.add()
    embedding.tensor_name = model.feature.name
    embedding.metadata_path = 'domain.csv'
    projector.visualize_embeddings(train_writer, config)
    tf.summary.scalar('G_loss', model.G_loss)
    tf.summary.scalar('D_loss', model.D_loss)
    tf.summary.scalar('C_loss', model.loss)
    tf.summary.scalar('Training Accuracy', source_accuracy)
    tf.summary.scalar('Testing Accuracy', accuracy)
    merged = tf.summary.merge_all()

    print(
        '============================GLOBAL TRAINABLE VARIABLES ============================'
    )
    print(tf.trainable_variables())
    # print('============================GLOBAL VARIABLES ======================================')
    # print(tf.global_variables())

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        saver = tf.train.Saver()
        train_writer.add_graph(sess.graph)

        print("{} Start training...".format(datetime.datetime.now()))
        gd = 0
        step = 1
        max_acc = 0
        times = -time.time()
        error_list = []
        for epoch in range(40000):
            # Start training
            gd += 1
            lamb = adaptation_factor(gd * 1.0 / 40000)
            if FLAGS.unbalance == 1.:
                lamb2 = math.exp(-(1 - min(
                    (gd - 5000) * 1.0 / 10000, 1.)) * 10) if gd >= 5000 else 0.
                if FLAGS.source == "mnist" and FLAGS.target == "usps":
                    lamb2 = math.exp(-(1 - min((gd - 2000) * 1.0 / 5000, 1.)) *
                                     10) if gd >= 2000 else 0.
            else:
                lamb2 = math.exp(-(1 - min(
                    (gd - 2000) * 1.0 / 2000, 1.)) * 10) if gd >= 2000 else 0.
            #rate=decay(FLAGS.learning_rate,gd,MAX_STEP)
            rate = FLAGS.learning_rate
            batch_xs, batch_ys = TRAIN.next_batch(FLAGS.batch_size)
            Tbatch_xs, Tbatch_ys = VALID.next_batch(FLAGS.batch_size)

            summary, _, closs, gloss, dloss, sn_loss, chose_rate_ = sess.run(
                [
                    merged, optimizer, model.loss, model.G_loss, model.D_loss,
                    model.sntg_loss, model.chose_rate
                ],
                feed_dict={
                    x_s: batch_xs,
                    x_t: Tbatch_xs,
                    decay_learning_rate: rate,
                    adlamb: lamb,
                    y: batch_ys,
                    yt: Tbatch_ys,
                    sntglamb: lamb2
                })
            train_writer.add_summary(summary, gd)

            step += 1
            if gd % 250 == 0:
                epoch = gd / (72357 / 100)  # approximate epoch from step count
                print(
                    'Epoch {} time {}s Step {} lambda {:.4f} lamb2 {:.4f} rate {:.4f} C_loss {:.4f} G_loss {:.4f} D_loss {:.4f} SNTG_loss {:.4f} chose_rate {:.4f}'
                    .format(epoch, times + time.time(), gd, lamb, lamb2, rate,
                            closs, gloss, dloss, sn_loss, chose_rate_))

                test_acc = 0.
                test_count = 0
                tt_embs = []
                tt_y = []
                for _ in range(int((len(TEST.labels)) / 100)):
                    batch_tx, batch_ty = TEST.next_batch(100)
                    # print(TEST.pointer, '   ', TEST.shuffle)
                    acc, t_embs = sess.run([correct, model.feature],
                                           feed_dict={
                                               x_t: batch_tx,
                                               yt: batch_ty
                                           })
                    tt_embs.append(t_embs)
                    tt_y.append(batch_ty)
                    test_acc += acc
                    test_count += 100

                test_acc /= test_count
                max_acc = max(max_acc, test_acc)
                print("Validation Accuracy = {:.4f} Max_Accuracy = {:.4f}".
                      format(test_acc, max_acc))
                error_list.append(test_acc)

                if gd % 5000 == 0 and False:  # disabled: t-SNE embedding plots
                    tt_embs_s = []
                    tt_y_s = []
                    TRAIN.reset_pointer()
                    for _ in range(len(TRAIN.labels) // 100):
                        batch_tx, batch_ty = TRAIN.next_batch(100)
                        # print(TEST.pointer, '   ', TEST.shuffle)
                        t_embs = sess.run(model.source_feature,
                                          feed_dict={
                                              x_s: batch_tx,
                                              y: batch_ty
                                          })
                        tt_embs_s.append(t_embs)
                        tt_y_s.append(batch_ty)

                    #np.savez("features_{}.npz".format(FLAGS.tag), x1=np.vstack(tt_embs_s), y1=np.argmax(np.vstack(tt_y_s), axis=1), x2=np.vstack(tt_embs), y2=np.argmax(np.vstack(tt_y), axis=1))
                    if True:
                        if NUM_CLASSES == 10:
                            test_h_s = np.vstack(tt_embs_s)[:5000]
                            y_test_s = np.ones((test_h_s.shape[0], )) * 8

                            test_h = np.concatenate(
                                [np.vstack(tt_embs)[:5000], test_h_s], 0)
                            y_test = np.argmax(np.vstack(tt_y)[:5000], axis=1)
                            y_test[y_test == 8] = 10
                            y_test = np.concatenate([y_test, y_test_s], 0)
                            z_dev_2D = TSNE().fit_transform(test_h)
                            scatter(data=z_dev_2D,
                                    label=y_test,
                                    dir="./embedding",
                                    file_name='s{}_{}_epoch{:03d}.png'.format(
                                        FLAGS.source, FLAGS.target, gd // 5000))
                        else:
                            test_h_s = np.vstack(tt_embs_s)
                            y_test_s = np.argmax(np.vstack(tt_y_s), axis=1)
                            y_test_s[y_test_s == 0] = 2
                            y_test_s[y_test_s == 1] = 3

                            test_h = np.concatenate(
                                [np.vstack(tt_embs), test_h_s], 0)
                            y_test = np.argmax(np.vstack(tt_y), axis=1)
                            y_test = np.concatenate([y_test, y_test_s], 0)
                            z_dev_2D = TSNE().fit_transform(test_h)
                            scatter(
                                data=z_dev_2D,
                                label=y_test,
                                dir="./embedding",
                                file_name='cattsne_manifold_epoch{:03d}.png'.
                                format(gd // 5000))

                if gd % 5000 == 0 and gd > 0:
                    print()  #error_list

                times = -time.time()
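The lamb2 schedule above is an exponential ramp: it stays at 0 until a warm-up step, then rises smoothly toward 1. A small, framework-free sketch of the default branch (the start and length constants vary per source/target pair, as in the snippet):

import math

def sntg_ramp(gd, start=2000, length=2000):
    """0 before `start`; exp(-10 * (1 - p)) where p ramps from 0 to 1."""
    if gd < start:
        return 0.
    p = min((gd - start) / length, 1.)
    return math.exp(-(1 - p) * 10)

for gd in (0, 2000, 3000, 4000, 10000):
    print(gd, round(sntg_ramp(gd), 4))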
Example #12
0
    def buildDict(self, train_raw):
        def summ(x, y): return x+y
        allwords = reduce(
            summ, [reduce(summ, h) + q + a for h, q, a in train_raw])
        vocab = collections.Counter(allwords)

        vocab_sort = sorted(vocab, key=vocab.get, reverse=True)
        # print vocabulary to file
        with open(self.fvocab, 'w') as f:
            for word in vocab_sort:
                print('\t'.join([word, str(vocab[word])]), file=f)
        print('===============================================')
        print('written vocabulary to', self.fvocab)
        self.vocab_size = (self.vocab_size if self.vocab_size != -1
                           else len(vocab_sort) + 2)
        vocab = sorted(
            zip(vocab_sort[0:self.vocab_size - 2], range(1, self.vocab_size - 1)))
        filename = maybe_download('text8.zip', 31344016)
        vocabulary = read_data(filename)
        print('Data size', len(vocabulary))

        data, count, dictionary, reverse_dictionary = build_dataset(
            vocabulary, len(vocabulary))


        # del vocab  # Hint to reduce memory.

        # print('Most common words (+UNK)', count[:5])
        # print('Sample data', data[:10], [
        #       reverse_dictionary[i] for i in data[:10]])

        # print("DICTIONARY", dictionary)
        # print("REVERSED DICTIONARY", reverse_dictionary)

        batch, labels = generate_batch(
            data, batch_size=8, num_skips=2, skip_window=1)
        for i in range(8):
            print(batch[i], reverse_dictionary[batch[i]], '->', labels[i, 0],
                  reverse_dictionary[labels[i, 0]])

        batch_size = 128
        embedding_size = 128  # Dimension of the embedding vector.
        skip_window = 1  # How many words to consider left and right.
        num_skips = 2  # How many times to reuse an input to generate a label.
        num_sampled = 64  # Number of negative examples to sample.

        # We pick a random validation set to sample nearest neighbors. Here we
        # limit the validation samples to words with a low numeric ID, which by
        # construction are also the most frequent. These three variables are
        # used only for displaying model accuracy; they don't affect training.
        valid_size = 16  # Random set of words to evaluate similarity on.
        valid_window = 100  # Only pick dev samples in the head of the distribution.
        valid_examples = np.random.choice(valid_window, valid_size, replace=False)

        graph = tf.Graph()

        with graph.as_default():

            # Input data.
            with tf.name_scope('inputs'):
                train_inputs = tf.placeholder(tf.int32, shape=[batch_size])
                train_labels = tf.placeholder(tf.int32, shape=[batch_size, 1])
                valid_dataset = tf.constant(valid_examples, dtype=tf.int32)

            # Ops and variables pinned to the CPU because of missing GPU implementation
            with tf.device('/cpu:0'):
                # Look up embeddings for inputs.
                with tf.name_scope('embeddings'):
                    embeddings = tf.Variable(
                        tf.random_uniform([self.vocab_size, embedding_size], -1.0, 1.0))
                    embed = tf.nn.embedding_lookup(embeddings, train_inputs)

                # Construct the variables for the NCE loss
                with tf.name_scope('weights'):
                    nce_weights = tf.Variable(
                        tf.truncated_normal(
                            [self.vocab_size, embedding_size],
                            stddev=1.0 / math.sqrt(embedding_size)))
                with tf.name_scope('biases'):
                    nce_biases = tf.Variable(tf.zeros([self.vocab_size]))

            # Compute the average NCE loss for the batch.
            # tf.nn.nce_loss automatically draws a new sample of the negative
            # labels each time we evaluate the loss.
            # Explanation of the meaning of NCE loss:
            #   http://mccormickml.com/2016/04/19/word2vec-tutorial-the-skip-gram-model/
            with tf.name_scope('loss'):
                loss = tf.reduce_mean(
                    tf.nn.nce_loss(
                        weights=nce_weights,
                        biases=nce_biases,
                        labels=train_labels,
                        inputs=embed,
                        num_sampled=num_sampled,
                        num_classes=self.vocab_size))

            # Add the loss value as a scalar to summary.
            tf.summary.scalar('loss', loss)

            # Construct the SGD optimizer using a learning rate of 1.0.
            with tf.name_scope('optimizer'):
                optimizer = tf.train.GradientDescentOptimizer(1.0).minimize(loss)

            # Compute the cosine similarity between minibatch examples and all embeddings.
            norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings), 1, keepdims=True))
            normalized_embeddings = embeddings / norm
            valid_embeddings = tf.nn.embedding_lookup(normalized_embeddings,
                                                      valid_dataset)
            similarity = tf.matmul(
                valid_embeddings, normalized_embeddings, transpose_b=True)

            # Merge all summaries.
            merged = tf.summary.merge_all()

            # Add variable initializer.
            init = tf.global_variables_initializer()

            # Create a saver.
            saver = tf.train.Saver()

        # Step 5: Begin training.
        num_steps = 100001

        with tf.Session(graph=graph) as session:
            # Open a writer to write summaries.
            writer = tf.summary.FileWriter(FLAGS.log_dir, session.graph)

            # We must initialize all variables before we use them.
            init.run()
            # print('Initialized')

            average_loss = 0
            for step in xrange(num_steps):
                # The corpus was already downloaded and turned into `data`
                # above; only a fresh batch is needed per step.
                batch_inputs, batch_labels = generate_batch(
                    data, batch_size, num_skips, skip_window)
                feed_dict = {train_inputs: batch_inputs, train_labels: batch_labels}

                # Define metadata variable.
                run_metadata = tf.RunMetadata()

                # print('run_metadata', feed_dict);

                # We perform one update step by evaluating the optimizer op
                # (including it in the list of returned values of session.run()).
                # Also evaluate the merged op to get all summaries in the
                # returned "summary" variable, and feed the metadata variable
                # to the session for visualizing the graph in TensorBoard.
                _, summary, loss_val = session.run(
                    [optimizer, merged, loss],
                    feed_dict=feed_dict,
                    run_metadata=run_metadata)
                average_loss += loss_val

                # Add returned summaries to writer in each step.
                writer.add_summary(summary, step)
                # Add metadata to visualize the graph for the last run.
                if step == (num_steps - 1):
                    writer.add_run_metadata(run_metadata, 'step%d' % step)

                if step % 2000 == 0:
                    if step > 0:
                        average_loss /= 2000
                    # The average loss is an estimate of the loss over the last 2000 batches.
                    print('Average loss at step ', step, ': ', average_loss)
                    average_loss = 0

                # Note that this is expensive (~20% slowdown if computed every 500 steps)
                if step % 10000 == 0:
                    sim = similarity.eval()
                    for i in xrange(valid_size):
                        valid_word = reverse_dictionary[valid_examples[i]]
                        top_k = 8  # number of nearest neighbors
                        nearest = (-sim[i, :]).argsort()[1:top_k + 1]
                        log_str = 'Nearest to %s:' % valid_word
                        for k in xrange(top_k):
                            close_word = reverse_dictionary[nearest[k]]
                            log_str = '%s %s,' % (log_str, close_word)
                        print(log_str)
            final_embeddings = normalized_embeddings.eval()

            # Write corresponding labels for the embeddings.
            with open(FLAGS.log_dir + '/metadata.tsv', 'w') as f:
                for i in xrange(self.vocab_size):
                    f.write(reverse_dictionary[i] + '\n')

            # Save the model for checkpoints.
            saver.save(session, os.path.join(FLAGS.log_dir, 'model.ckpt'))

            # Create a configuration for visualizing embeddings with the labels in TensorBoard.
            config = projector.ProjectorConfig()
            embedding_conf = config.embeddings.add()
            embedding_conf.tensor_name = embeddings.name
            embedding_conf.metadata_path = os.path.join(FLAGS.log_dir, 'metadata.tsv')
            projector.visualize_embeddings(writer, config)

        writer.close()

        # add <unk> and <nil> to vocabulary

        vocab.append(('<nil>', 0))
        vocab.append(('<unk>', self.vocab_size - 1))
        assert self.vocab_size == len(vocab)
        print('vocabulary size:', self.vocab_size)
        return dict(vocab)
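The word2vec helper generate_batch used above pairs each center word with randomly chosen context words inside a sliding window. A minimal, framework-free sketch of that pairing logic:

import collections
import random

def skipgram_pairs(data, num_skips=2, skip_window=1):
    """Yield (center, context) index pairs the way generate_batch pairs them."""
    span = 2 * skip_window + 1
    buffer = collections.deque(data[:span], maxlen=span)
    for idx in range(span, len(data) + 1):
        context = [w for w in range(span) if w != skip_window]
        for j in random.sample(context, num_skips):
            yield buffer[skip_window], buffer[j]
        if idx < len(data):
            buffer.append(data[idx])

print(list(skipgram_pairs([0, 1, 2, 3, 4])))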
Example #13
0
def restore(checkpoint_file=MODEL_FILE):
    print("restoring session")
    with tf.Session() as session:
        restore_saver = tf.train.import_meta_graph(MODEL_FILE + ".meta")
        restore_saver.restore(session, tf.train.latest_checkpoint(MODEL_DIR))
        graph = tf.get_default_graph()

        labels = graph.get_tensor_by_name("output:0")
        inputs = graph.get_tensor_by_name("input:0")

        in_embedding_var = tf.Variable(tf.zeros([batch_size, feature_size]),
                                       name="in_embedding")
        in_embedding_op = in_embedding_var.assign(inputs)

        l1 = graph.get_tensor_by_name("dnn/hiddenlayer_0/hiddenlayer_0/Relu:0")
        l1_embedding_var = tf.Variable(tf.zeros([batch_size, 1024]),
                                       name="l1_embedding")
        l1_embedding_op = l1_embedding_var.assign(l1)

        l2 = graph.get_tensor_by_name("dnn/hiddenlayer_1/hiddenlayer_1/Relu:0")
        l2_embedding_var = tf.Variable(tf.zeros([batch_size, 1024]),
                                       name="l2_embedding")
        l2_embedding_op = l2_embedding_var.assign(l2)

        l3 = graph.get_tensor_by_name("dnn/hiddenlayer_2/hiddenlayer_2/Relu:0")
        l3_embedding_var = tf.Variable(tf.zeros([batch_size, 1024]),
                                       name="l3_embedding")
        l3_embedding_op = l3_embedding_var.assign(l3)

        session.run(
            tf.variables_initializer([
                in_embedding_var, l1_embedding_var, l2_embedding_var,
                l3_embedding_var
            ]))

        [Train_index, Val_index, Test_index] = restore_index(WORKINDEX_FILE)
        print('building filtered batch set')
        # whitelist follows layout of target
        # white_list = [[],[KLINKER],[],[],[],[],[],[],[],[b'a', b'A:', b'A+', b'A', b'A~']]
        white_list = []
        batch = get_filtered_batch(Test_index,
                                   batch_size,
                                   white_listed=white_list)
        test_features, test_targets, test_labels, test_spectra = batch
        test_labels = add_prediction(test_features, test_labels)

        print('running tensorflow with embeddings')
        session.run([
            in_embedding_op, l1_embedding_op, l2_embedding_op, l3_embedding_op
        ], {
            inputs: test_features,
            labels: test_targets
        })
        saver = tf.train.Saver()
        saver.save(session, EMBEDDING_FILE, 2)
        graph_new = tf.get_default_graph()

        create_metafile(test_labels, EMBEDDING_LABEL_FILE)

        print("creating embedding projector")
        # Format: tensorflow/tensorboard/plugins/projector/projector_config.proto
        config = projector.ProjectorConfig()

        embedding_in = config.embeddings.add()
        embedding_in.tensor_name = in_embedding_var.name
        embedding_in.metadata_path = EMBEDDING_LABEL_FILE

        embedding_l1 = config.embeddings.add()
        embedding_l1.tensor_name = l1_embedding_var.name
        embedding_l1.metadata_path = EMBEDDING_LABEL_FILE

        embedding_l2 = config.embeddings.add()
        embedding_l2.tensor_name = l2_embedding_var.name
        embedding_l2.metadata_path = EMBEDDING_LABEL_FILE

        embedding_l3 = config.embeddings.add()
        embedding_l3.tensor_name = l3_embedding_var.name
        embedding_l3.metadata_path = EMBEDDING_LABEL_FILE

        if USE_SPECTRO:
            plot_width, plot_height = sprite.create_sprite(
                test_spectra, SPRITE_FILE)

            embedding_in.sprite.image_path = SPRITE_FILE
            embedding_in.sprite.single_image_dim.extend(
                [plot_width, plot_height])

            embedding_l1.sprite.image_path = SPRITE_FILE
            embedding_l1.sprite.single_image_dim.extend(
                [plot_width, plot_height])

            embedding_l2.sprite.image_path = SPRITE_FILE
            embedding_l2.sprite.single_image_dim.extend(
                [plot_width, plot_height])

            embedding_l3.sprite.image_path = SPRITE_FILE
            embedding_l3.sprite.single_image_dim.extend(
                [plot_width, plot_height])

        writer = tf.summary.FileWriter(EMBEDDING_DIR)
        writer.add_graph(graph_new)
        # The next line writes a projector_config.pbtxt in the LOG_DIR. TensorBoard will
        # read this file during startup.
        projector.visualize_embeddings(writer, config)
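The restore-and-probe pattern above generalizes to any checkpointed graph. A minimal sketch; the checkpoint path and tensor names are hypothetical and must match the saved model (graph.get_operations() lists the candidates):

import tensorflow as tf  # TensorFlow 1.x

with tf.Session() as sess:
    saver = tf.train.import_meta_graph('model.ckpt.meta')  # hypothetical file
    saver.restore(sess, tf.train.latest_checkpoint('.'))
    graph = tf.get_default_graph()
    inputs = graph.get_tensor_by_name('input:0')
    hidden = graph.get_tensor_by_name('dnn/hiddenlayer_0/hiddenlayer_0/Relu:0')
    # activations = sess.run(hidden, {inputs: batch})  # probe any layer by name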
Example #14
0
def run_training(netname, hidden_units, data_sets, epoch_iterations,
                 initial_learning_rate, num_epochs_per_decay,
                 learning_rate_decay_factor):
    """Train neuralnet for a number of steps."""
    #data_sets = input_data.read_data_sets(FLAGS.input_data_dir, FLAGS.fake_data)
    epoch_length = data_sets.train.num_examples
    # Tell TensorFlow that the model will be built into the default Graph.
    with tf.Graph().as_default():
        # Generate placeholders for the images and labels.
        images_placeholder, labels_placeholder = placeholder_inputs(
            FLAGS.batch_size)

        # Build a Graph that computes predictions from the inference model.
        if not CIFAR_MODE:
            logits = neuralnet.inference(images_placeholder, hidden_units)
        else:
            logits = neuralnet.CIFAR_inference(images_placeholder,
                                               hidden_units[2])

        # Add to the Graph the Ops for loss calculation.
        loss = neuralnet.loss(logits, labels_placeholder)

        # Add to the Graph the Ops that calculate and apply gradients.
        train_op, learning_rate = neuralnet.training(
            loss, epoch_length, initial_learning_rate, num_epochs_per_decay,
            learning_rate_decay_factor)

        # Add the Op to compare the logits to the labels during evaluation.
        eval_correct = neuralnet.evaluation(logits, labels_placeholder)

        # Embedding: copy the training images into the embedding matrix.
        N = epoch_length
        EMB = np.asarray(data_sets.train.images[:N, :neuralnet.IMAGE_PIXELS],
                         dtype='float32')

        # The embedding variable, which needs to be stored.
        # Note: this must be a Variable, not a Tensor!
        embedding_var = tf.Variable(EMB, name='Embedding_%s' % (netname))

        # Build the summary Tensor based on the TF collection of Summaries.
        summary = tf.summary.merge_all()

        # Create a session for running Ops on the Graph.
        sess = tf.Session()

        best_pres = 0
        last_pres = 0

        # Add the variable initializer Op.
        init = tf.global_variables_initializer()
        sess.run(init)

        # Instantiate a SummaryWriter to output summaries and the Graph.
        summary_writer = tf.summary.FileWriter(FLAGS.log_dir, sess.graph)

        config = projector.ProjectorConfig()
        embedding = config.embeddings.add()
        embedding.tensor_name = embedding_var.name

        names = [
            '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12'
        ]
        metadata_file = open(
            os.path.join(FLAGS.log_dir, 'metadata_%s.tsv' % (netname)), 'w')
        metadata_file.write('Name\tClass\n')
        for i in range(N):
            metadata_file.write('%06d\t%s\n' %
                                (i, names[data_sets.train.labels[i]]))
        metadata_file.close()

        # Comment out if you don't have metadata
        embedding.metadata_path = os.path.join(FLAGS.log_dir,
                                               'metadata_%s.tsv' % (netname))

        projector.visualize_embeddings(summary_writer, config)
        saver = tf.train.Saver()
        saver.save(sess, os.path.join(FLAGS.log_dir,
                                      'model%s.ckpt' % (netname)), 1)

        #run_testing(0, logits, images_placeholder, sess)
        accuracy_array = np.zeros([epoch_iterations])
        test_ind = 0
        for step in range(epoch_length * num_epochs_per_decay *
                          epoch_iterations):
            start_time = time.time()

            # Fill a feed dictionary with the actual set of images and labels
            # for this particular training step.
            feed_dict = fill_feed_dict(data_sets.train, images_placeholder,
                                       labels_placeholder)

            # Run one step of the model.  The return values are the activations
            # from the `train_op` (which is discarded) and the `loss` Op.  To
            # inspect the values of your Ops or variables, you may include them
            # in the list passed to sess.run() and the value tensors will be
            # returned in the tuple from the call.
            _, loss_value = sess.run([train_op, loss], feed_dict=feed_dict)

            duration = time.time() - start_time

            # Write the summaries and print an overview fairly often.
            if step % 100 == 0:
                # Print status to stdout.
                lr_value = sess.run(learning_rate)
                #print('[%s] Step %d: loss = %.8f (%.3f sec) lr = %.6f  bp = %0.04f  lp = %0.04f' % (netname, step, loss_value, duration, lr_value, best_pres, last_pres))
                # Update the events file.
                summary_str = sess.run(summary, feed_dict=feed_dict)
                summary_writer.add_summary(summary_str, step)
                summary_writer.flush()

            # Save a checkpoint and evaluate the model periodically.
            if (step + 1) % (epoch_length * num_epochs_per_decay) == 0 or (
                    step + 1
            ) == epoch_length * num_epochs_per_decay * epoch_iterations:
                print(
                    '[%s] CHECKPOINT Step %d: loss = %.8f (%.3f sec) lr = %.6f  bp = %0.04f  lp = %0.04f'
                    % (netname, step, loss_value, duration, lr_value,
                       best_pres, last_pres))
                checkpoint_file = os.path.join(FLAGS.log_dir,
                                               'model%s.ckpt' % (netname))
                saver.save(sess, checkpoint_file, global_step=step)
                # Evaluate against the training set.
                print('Training Data Eval:')
                do_eval_accuracy(sess, eval_correct, images_placeholder,
                                 labels_placeholder, data_sets.train)

                confmatr = do_eval_confmatr(sess, logits, hidden_units[-1],
                                            images_placeholder,
                                            labels_placeholder,
                                            data_sets.train)

                prec_rec = do_eval_prec_rec(confmatr, hidden_units[-1])

                f1score = do_eval_f1(prec_rec, hidden_units[-1])

                # Evaluate against the test set.
                print('Test Data Eval (accuracy):')
                accuracy_array[test_ind] = do_eval_accuracy(
                    sess, eval_correct, images_placeholder, labels_placeholder,
                    data_sets.test)

                print('Test Data Eval (confmatr):')
                confmatr = do_eval_confmatr(sess, logits, hidden_units[-1],
                                            images_placeholder,
                                            labels_placeholder, data_sets.test)

                prec_rec = do_eval_prec_rec(confmatr, hidden_units[-1])

                f1score = do_eval_f1(prec_rec, hidden_units[-1])

                print("Test #%d ended" % (test_ind))
                test_ind += 1

    print("Accuracy array: ")
    print(accuracy_array)

    checkpoint_file = os.path.join(FLAGS.log_dir,
                                   'model%s_last.ckpt' % (netname))
    saver.save(sess, checkpoint_file)
    return {'logits': logits, 'images_ph': images_placeholder, 'sess': sess}
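do_eval_prec_rec and do_eval_f1 are not shown in this example; one plausible reading, assuming a confusion matrix with true classes as rows and predictions as columns, is the NumPy sketch below:

import numpy as np

def prec_rec_f1(confmatr):
    """Per-class precision/recall/F1 from a (true x predicted) confusion matrix."""
    confmatr = np.asarray(confmatr, dtype=np.float64)
    tp = np.diag(confmatr)
    precision = tp / np.maximum(confmatr.sum(axis=0), 1e-12)
    recall = tp / np.maximum(confmatr.sum(axis=1), 1e-12)
    f1 = 2 * precision * recall / np.maximum(precision + recall, 1e-12)
    return precision, recall, f1

print(prec_rec_f1([[5, 1], [2, 4]]))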
Example #15
0
def main(_):
    df_train = pd.read_csv(os.getenv('PREPARED_TRAINING'))
    df_valid = pd.read_csv(os.getenv('PREPARED_VALIDATING'))
    df_test = pd.read_csv(os.getenv('PREPARED_TESTING'))

    feature_cols = list(df_train.columns[:-1])
    target_col = df_train.columns[-1]

    X_train = df_train[feature_cols].values
    y_train = df_train[target_col].values

    X_valid = df_valid[feature_cols].values
    y_valid = df_valid[target_col].values

    X_test = df_test[feature_cols].values

    dimensions = (5, 5)
    single = 5
    sprites = [None] * 2
    sprites[0] = [[1, 1, 1, 1, 1], [1, 1, 1, 1, 1], [1, 0, 0, 0, 1],
                  [1, 1, 1, 1, 1], [1, 1, 1, 1, 1]]
    sprites[1] = [[1, 1, 1, 1, 1], [1, 1, 0, 1, 1], [1, 1, 0, 1, 1],
                  [1, 1, 0, 1, 1], [1, 1, 1, 1, 1]]
    sprites[0] = Image.fromarray(np.uint8(sprites[0]) * 0xFF)
    sprites[1] = Image.fromarray(np.uint8(sprites[1]) * 0xFF)
    count = X_train.shape[0]
    size = int(math.ceil(math.sqrt(count)))
    image = Image.new('1', (size * single, size * single))
    logdir = os.path.join(os.getenv('STORING'), 'logs',
                          'adversarial_{}'.format(int(time.time())))
    os.makedirs(logdir, exist_ok=True)
    handle = open(os.path.join(logdir, 'metadata.tsv'), 'wb')
    for i in range(count):
        location = ((i % size) * single, (i // size) * single)
        label = int(y_train[i])
        image.paste(sprites[label], location)
        handle.write(b'1\n' if label == 1 else b'0\n')
    handle.close()
    image.save(os.path.join(logdir, 'sprites.png'))

    num_features = len(feature_cols)
    features = tf.placeholder(tf.float32,
                              shape=[None, num_features],
                              name='features')
    targets = tf.placeholder(tf.int32, shape=[None], name='targets')

    with tf.name_scope('training'):
        with tf.variable_scope('adversarial'):
            train_model = Model(num_features,
                                features,
                                targets,
                                is_training=True)

    with tf.name_scope('evaluation'):
        with tf.variable_scope('adversarial', reuse=True):
            test_model = Model(num_features,
                               features,
                               targets,
                               is_training=False)

    summary_op = tf.summary.merge_all()
    supervisor = tf.train.Supervisor(logdir=logdir, summary_op=None)
    with supervisor.managed_session() as sess:
        print('Training model with {} parameters...'.format(
            train_model.num_parameters))
        optimize_d, optimize_g = True, True
        # Create the writer once; re-creating it every epoch would leak file
        # handles and split the event log across files.
        summary_writer = tf.summary.FileWriter(logdir, sess.graph)
        with tqdm(total=FLAGS.num_epochs) as pbar:
            for epoch in range(FLAGS.num_epochs):

                X_train_epoch, y_train_epoch = shuffle(X_train, y_train)

                losses_d, losses_g = [], []
                if optimize_d:
                    _, loss_d = sess.run([
                        train_model.train_step_d,
                        train_model.loss_d,
                    ],
                                         feed_dict={
                                             features: X_train_epoch,
                                             targets: y_train_epoch,
                                         })
                else:
                    loss_d = sess.run(train_model.loss_d,
                                      feed_dict={
                                          features: X_train_epoch,
                                          targets: y_train_epoch,
                                      })

                if optimize_g:
                    _, loss_g = sess.run([
                        train_model.train_step_g,
                        train_model.loss_g,
                    ],
                                         feed_dict={
                                             features: X_train_epoch,
                                             targets: y_train_epoch,
                                         })
                else:
                    loss_g = sess.run(train_model.loss_g,
                                      feed_dict={
                                          features: X_train_epoch,
                                          targets: y_train_epoch,
                                      })

                losses_d.append(loss_d)
                losses_g.append(loss_g)

                loss_train_d = np.mean(losses_d)
                loss_train_g = np.mean(losses_g)

                summary_str = sess.run(summary_op,
                                       feed_dict={
                                           features: X_valid,
                                           targets: y_valid,
                                       })

                optimize_d = epoch % 2 == 0
                optimize_g = True

                if not optimize_d and not optimize_g:
                    optimize_d = True
                    optimize_g = True

                summary_writer.add_summary(summary_str, epoch)
                summary_writer.flush()

                pbar.set_description(
                    '[{}] loss_train_d ({}): {:.8f}, loss_train_g ({}): {:.8f}'
                    .format(epoch, optimize_d, loss_train_d, optimize_g,
                            loss_train_g))
                pbar.update()

        summary_writer.add_graph(sess.graph)

        loss_valid_d, loss_valid_g, summary_str = sess.run(
            [test_model.loss_d, test_model.loss_g, summary_op],
            feed_dict={features: X_valid, targets: y_valid})
        print('Validation loss (d): {:.8f}, loss (g): {:.8f}'.format(
            loss_valid_d, loss_valid_g))

        z_train = sess.run(test_model.z, feed_dict={features: X_train})
        z_valid = sess.run(test_model.z, feed_dict={features: X_valid})
        z_test = sess.run(test_model.z, feed_dict={features: X_test})

        summary_writer.flush()
        summary_writer.close()

        np.savez(os.path.join(os.getenv('STORING'), 'adversarial.npz'),
                 z_train=z_train,
                 z_valid=z_valid,
                 z_test=z_test)

    # Start from a fresh default graph for the projector-only session.
    # (A bare tf.Graph().as_default() without a `with` block has no effect.)
    tf.reset_default_graph()
    embedding_variable = tf.Variable(z_train, name='adversarial_embedding')
    summary_writer = tf.summary.FileWriter(logdir)
    config = projector.ProjectorConfig()
    embedding = config.embeddings.add()
    embedding.tensor_name = embedding_variable.name
    embedding.metadata_path = os.path.join(logdir, 'metadata.tsv')
    embedding.sprite.image_path = os.path.join(logdir, 'sprites.png')
    embedding.sprite.single_image_dim.extend(dimensions)
    projector.visualize_embeddings(summary_writer, config)
    session = tf.InteractiveSession()
    session.run(tf.global_variables_initializer())
    saver = tf.train.Saver()
    saver.save(session, os.path.join(logdir, 'model.ckpt'), 0)
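The training loop above updates the generator every epoch and the discriminator only every second epoch after the first two. A tiny sketch that just replays that alternation:

def update_schedule(num_epochs=6):
    """Replay the D/G alternation used above."""
    optimize_d, optimize_g = True, True
    for epoch in range(num_epochs):
        yield epoch, optimize_d, optimize_g
        optimize_d = epoch % 2 == 0  # D trains when the previous epoch was even
        optimize_g = True            # G trains every epoch

for epoch, d, g in update_schedule():
    print(epoch, 'D' if d else '-', 'G' if g else '-')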
Example #16
0
def word2vec_basic(log_dir):
    """Example of building, training and visualizing a word2vec model."""
    # Create the directory for TensorBoard variables if there is not.
    if not os.path.exists(log_dir):
        os.makedirs(log_dir)

    # Step 1: Analyze word frequencies.
    # Add all filenames
    def add_all_files():
        global need_to_handle_dirs
        files = [
            os.path.join(d, file) for d in need_to_handle_dirs
            for file in os.listdir(d)
        ]
        file_queue = tf.train.string_input_producer(files)
        reader = tf.TextLineReader()
        return reader.read(file_queue)

    def traverse_all_files(handler, param=None):
        key, value = add_all_files()
        key_map = dict()
        with tf.Session() as sess:
            # Start populating the filename queue.
            coord = tf.train.Coordinator()
            threads = tf.train.start_queue_runners(coord=coord)
            running = True
            while running:
                kama, vamp = sess.run([key, value])
                key_map[kama] = 1 if key_map.get(
                    kama) is None else key_map[kama] + 1
                if key_map[kama] > 1:
                    running = False
                else:
                    handler(key_map[kama],
                            tf.compat.as_str(vamp.decode("utf-8")).split(),
                            sess, param)
            coord.request_stop()
            coord.join(threads)

    def count_vocabulary(_, words, sess, param):
        for word in words:
            global vocabulary_count_table
            vocabulary_count_table[word] = 1 if vocabulary_count_table.get(word) is None \
                else vocabulary_count_table[word] + 1

    count_map_file = os.path.join(out_put_dir, "count_map.txt")
    frequency_file = os.path.join(out_put_dir, "frequency.txt")
    global all_retrain
    global append_new_data_source
    global vocabulary_count_table
    if all_retrain:
        # delete all cache file:
        if os.path.exists(count_map_file):
            os.remove(count_map_file)
        if os.path.exists(frequency_file):
            os.remove(frequency_file)
        append_new_data_source = False
    if not os.path.exists(frequency_file) or append_new_data_source:
        if not os.path.exists(count_map_file):
            traverse_all_files(count_vocabulary)
            # Without a count_map file we cannot be in append mode.
            append_new_data_source = False
            with open(count_map_file, "w", encoding="utf-8") as f:
                for (k, v) in vocabulary_count_table.items():
                    f.write(k + " " + str(v))
                    f.write("\n")

        vocabulary_count_table = {}
        with open(count_map_file, "r", encoding="utf-8") as f:
            line = f.readline()
            while line:
                if line != "":
                    strs = line.split(" ")
                    vocabulary_count_table[strs[0]] = int(strs[1])
                line = f.readline()
        # In append mode, traverse all newly added files and update the
        # word-frequency table vocabulary_count_table.
        if append_new_data_source:
            traverse_all_files(count_vocabulary)
            with open(count_map_file, "w", encoding="utf-8") as f:
                for (k, v) in vocabulary_count_table.items():
                    f.write(k + " " + str(v))
                    f.write("\n")
            print("append " + str(need_to_handle_dirs) + "finished.")
        print("vocabulary_count_table loaded.")
        #
        # # Read the data into a list of strings.
        # def read_data(filename):
        #     """Extract the first file enclosed in a zip file as a list of words."""
        #     with zipfile.ZipFile(filename) as f:
        #         data = tf.compat.as_str(f.read(f.namelist()[0])).split()
        #     return data
        #
        # vocabulary = read_data(filename)
        # print('Data size', len(vocabulary))
        #
        # # Step 2: Build the dictionary and replace rare words with UNK token.
        vocabulary_size = 200000

        #
        def build_dataset(words, n_words):
            """Process raw inputs into a dataset."""
            count = [['UNK', -1]]
            count.extend(collections.Counter(words).most_common(n_words - 1))
            dictionary = dict()
            for word, _ in count:
                dictionary[word] = len(dictionary)
            # data = list()
            # unk_count = 0
            # for word in words:
            #     index = dictionary.get(word, 0)
            #     if index == 0:  # dictionary['UNK']
            #         unk_count += 1
            #     data.append(index)
            # count[0][1] = unk_count
            reversed_dictionary = dict(
                zip(dictionary.values(), dictionary.keys()))
            global vocabulary_size
            vocabulary_size = len(dictionary)
            return count, dictionary, reversed_dictionary

        # build_dataset returns three structures:
        # count - list of (word, occurrence count) pairs
        # dictionary - map of words (strings) to their codes (integers)
        # reversed_dictionary - maps codes (integers) back to words (strings)
        count, unused_dictionary, reverse_dictionary = build_dataset(
            vocabulary_count_table, vocabulary_size)
        del vocabulary_count_table  # Hint to reduce memory.

        with open(frequency_file, "w", encoding="utf-8") as f:
            for (k, v) in reverse_dictionary.items():
                f.write(str(k) + " " + str(v))
                f.write("\n")
        del count, unused_dictionary, reverse_dictionary

    with open(frequency_file, "r", encoding="utf-8") as f:
        line = f.readline()
        if line.endswith("\n"):
            line = line[:-1]
        global frequency
        global frequency_reverse
        while line:
            if "" is not line:
                line_tuple = line.split(" ")
                frequency[int(line_tuple[0])] = line_tuple[1]
                frequency_reverse[line_tuple[1]] = int(line_tuple[0])
            line = f.readline()
            line = line[:-1]
    # print(frequency_reverse)
    print("frequency loaded.")
    print("start training.")

    def generate_batch(batch_size, num_skips, skip_window, data):
        global data_index
        assert batch_size % num_skips == 0
        assert num_skips <= 2 * skip_window
        batch = np.ndarray(shape=(batch_size), dtype=np.int32)
        labels = np.ndarray(shape=(batch_size, 1), dtype=np.int32)
        span = 2 * skip_window + 1  # [ skip_window target skip_window ]
        buffer = collections.deque(maxlen=span)  # pylint: disable=redefined-builtin
        if data_index + span > len(data):
            data_index = 0
            return None, None
        buffer.extend(data[data_index:data_index + span])
        data_index += span
        for i in range(batch_size // num_skips):
            context_words = [w for w in range(span) if w != skip_window]
            words_to_use = random.sample(context_words, num_skips)
            for j, context_word in enumerate(words_to_use):
                batch[i * num_skips + j] = buffer[skip_window]
                labels[i * num_skips + j, 0] = buffer[context_word]
            if data_index == len(data):
                buffer.extend(data[0:span])
                # data_index = span
            else:
                buffer.append(data[data_index])
                data_index += 1
        # Backtrack a little bit to avoid skipping words in the end of a batch
        # data_index = (data_index + len(data) - span) % len(data)
        return batch, labels
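    # A minimal usage sketch (hypothetical toy data; the real inputs are the
    # per-document word-index sequences built below):
    #
    #   data_index = 0
    #   toy_data = [0, 1, 2, 3, 4, 5, 6, 7]
    #   b, l = generate_batch(batch_size=8, num_skips=2, skip_window=1,
    #                         data=toy_data)
    #   # b[i] is a center-word code and l[i, 0] one of its context codes;
    #   # a (None, None) return signals that the sequence is exhausted.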

    def train_each_doc(filename, words_sequence, sess, param):
        words_sequence_filtered = [
            'UNK' if frequency_reverse.get(w) is None else w
            for w in words_sequence
        ]
        word_indexs_sequence = [
            frequency_reverse.get(w) for w in words_sequence_filtered
        ]
        batch, labels = generate_batch(batch_size, num_skips, skip_window,
                                       word_indexs_sequence)
        while batch is not None:
            # print([frequency[w] + ":"+frequency[e[0]] for w, e in zip(batch, labels)])
            feed_dict = {train_inputs: batch, train_labels: labels}

            # Define metadata variable.
            global run_metadata
            run_metadata = tf.RunMetadata()

            # We perform one update step by evaluating the optimizer op (including
            # it in the list of returned values for session.run()).
            # Also, evaluate the merged op to get all summaries from the returned
            # "summary" variable. Feed metadata variable to session for visualizing
            # the graph in TensorBoard.
            _, summary, loss_val = session.run([optimizer, merged, loss],
                                               feed_dict=feed_dict,
                                               run_metadata=run_metadata)
            global average_loss, step
            average_loss += loss_val

            # Add returned summaries to writer in each step.
            writer.add_summary(summary, step)
            step = step + 1
            if step % 2000 == 0:
                if step > 0:
                    average_loss /= 2000
                # The average loss is an estimate of the loss over the last 2000
                # batches.
                print('Average loss at step ', step, ': ', average_loss)
                if average_loss > 300:
                    print('Warning: average loss is unusually high.')
                average_loss = 0

            # Note that this is expensive (~20% slowdown if computed every 500 steps)
            if step % 10000 == 0:
                similarity = param[0]
                sim = session.run(similarity)
                for i in xrange(valid_size):
                    valid_word = frequency[valid_examples[i]]
                    top_k = 8  # number of nearest neighbors
                    nearest = (-sim[i, :]).argsort()[1:top_k + 1]
                    log_str = 'Nearest to %s:' % valid_word
                    for k in xrange(top_k):
                        close_word = frequency[nearest[k]]
                        log_str = '%s %s,' % (log_str, close_word)
                    print(log_str)
            batch, labels = generate_batch(batch_size, num_skips, skip_window,
                                           word_indexs_sequence)

    # We pick a random validation set to sample nearest neighbors. Here we limit
    # the validation samples to the words that have a low numeric ID, which by
    # construction are also the most frequent.

    graph = tf.Graph()

    with graph.as_default():

        # Input data.
        with tf.name_scope('inputs'):
            train_inputs = tf.placeholder(tf.int32, shape=[batch_size])
            train_labels = tf.placeholder(tf.int32, shape=[batch_size, 1])
            valid_dataset = tf.constant(valid_examples, dtype=tf.int32)

        # Ops and variables pinned to the CPU because of missing GPU implementation
        with tf.device('/cpu:0'):
            # Look up embeddings for inputs.
            with tf.name_scope('embeddings'):
                embeddings = tf.Variable(
                    tf.random_uniform([vocabulary_size, embedding_size], -1.0,
                                      1.0))
                embed = tf.nn.embedding_lookup(embeddings, train_inputs)

            # Construct the variables for the NCE loss
            with tf.name_scope('weights'):
                nce_weights = tf.Variable(
                    tf.truncated_normal([vocabulary_size, embedding_size],
                                        stddev=1.0 /
                                        math.sqrt(embedding_size)))
            with tf.name_scope('biases'):
                nce_biases = tf.Variable(tf.zeros([vocabulary_size]))

        # Compute the average NCE loss for the batch.
        # tf.nce_loss automatically draws a new sample of the negative labels each
        # time we evaluate the loss.
        # Explanation of the meaning of NCE loss:
        #   http://mccormickml.com/2016/04/19/word2vec-tutorial-the-skip-gram-model/
        with tf.name_scope('loss'):
            loss = tf.reduce_mean(
                tf.nn.nce_loss(weights=nce_weights,
                               biases=nce_biases,
                               labels=train_labels,
                               inputs=embed,
                               num_sampled=num_sampled,
                               num_classes=vocabulary_size))

        # Add the loss value as a scalar to summary.
        tf.summary.scalar('loss', loss)

        # Construct the SGD optimizer using a learning rate of 1.0.
        with tf.name_scope('optimizer'):
            optimizer = tf.train.GradientDescentOptimizer(1.0).minimize(loss)

        # Compute the cosine similarity between minibatch examples and all
        # embeddings.
        norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings), 1, keepdims=True))
        normalized_embeddings = embeddings / norm
        valid_embeddings = tf.nn.embedding_lookup(normalized_embeddings,
                                                  valid_dataset)
        similarity = tf.matmul(valid_embeddings,
                               normalized_embeddings,
                               transpose_b=True)
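        # since each row of normalized_embeddings is unit length, this matmul
        # yields exact cosine similarities: cos(u, v) = u.v / (|u||v|)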

        # Merge all summaries.
        merged = tf.summary.merge_all()

        # Add variable initializer.
        init = tf.global_variables_initializer()

        # Create a saver.
        saver = tf.train.Saver()

    # Step 5: Begin training.
    # num_steps = 1000001

    with tf.Session(graph=graph) as session:
        # Open a writer to write summaries.
        writer = tf.summary.FileWriter(log_dir, session.graph)

        # We must initialize all variables before we use them.
        init.run()
        print('Initialized')

        # for step in xrange(num_steps):
        traverse_all_files(train_each_doc, [similarity, session])
        global run_metadata, step
        writer.add_run_metadata(run_metadata, 'step%d' % step)
        final_embeddings = normalized_embeddings.eval()

        # Write corresponding labels for the embeddings.
        with open(log_dir + '/metadata.tsv', 'w', encoding='utf-8') as f:
            for i in xrange(vocabulary_size):
                f.write(frequency[i] + '\n')

        # Save the model for checkpoints.
        saver.save(session, os.path.join(log_dir, 'model.ckpt'))

        # Create a configuration for visualizing embeddings with the labels in
        # TensorBoard.
        config = projector.ProjectorConfig()
        embedding_conf = config.embeddings.add()
        embedding_conf.tensor_name = embeddings.name
        embedding_conf.metadata_path = os.path.join(log_dir, 'metadata.tsv')
        projector.visualize_embeddings(writer, config)

    writer.close()

    # Step 6: Visualize the embeddings.

    # pylint: disable=missing-docstring
    # Function to draw visualization of distance between embeddings.
    def plot_with_labels(low_dim_embs, labels, filename):
        assert low_dim_embs.shape[0] >= len(
            labels), 'More labels than embeddings'
        plt.rcParams['font.sans-serif'] = ['SimHei']  # so Chinese labels render correctly
        plt.rcParams['axes.unicode_minus'] = False
        plt.figure(figsize=(18, 18))  # in inches
        for i, label in enumerate(labels):
            x, y = low_dim_embs[i, :]
            plt.scatter(x, y)
            plt.annotate(label,
                         xy=(x, y),
                         xytext=(5, 2),
                         textcoords='offset points',
                         ha='right',
                         va='bottom')

        plt.savefig(filename)

    try:
        # pylint: disable=g-import-not-at-top
        from sklearn.manifold import TSNE
        import matplotlib.pyplot as plt

        tsne = TSNE(perplexity=30,
                    n_components=2,
                    init='pca',
                    n_iter=5000,
                    method='exact')
        plot_only = 500
        low_dim_embs = tsne.fit_transform(final_embeddings[:plot_only, :])
        labels = [frequency[i] for i in xrange(plot_only)]
        plot_with_labels(low_dim_embs, labels, os.path.join('./', 'tsne.png'))

    except ImportError as ex:
        print(
            'Please install sklearn, matplotlib, and scipy to show embeddings.'
        )
        print(ex)
Beispiel #17
0
def main():

    cfg = TrainConfig().parse()
    print(cfg.name)
    result_dir = os.path.join(
        cfg.result_root,
        cfg.name + '_' + datetime.strftime(datetime.now(), '%Y%m%d-%H%M%S'))
    if not os.path.isdir(result_dir):
        os.makedirs(result_dir)
    utils.write_configure_to_file(cfg, result_dir)
    np.random.seed(seed=cfg.seed)

    # prepare dataset
    train_session = cfg.train_session
    train_set = prepare_multimodal_dataset(cfg.feature_root, train_session,
                                           cfg.feat, cfg.label_root)
    if cfg.task == "supervised":  # fully supervised task
        train_set = train_set[:cfg.label_num]
    batch_per_epoch = len(train_set) // cfg.sess_per_batch
    labeled_session = train_session[:cfg.label_num]

    val_session = cfg.val_session
    val_set = prepare_multimodal_dataset(cfg.feature_root, val_session,
                                         cfg.feat, cfg.label_root)

    # construct the graph
    with tf.Graph().as_default():
        tf.set_random_seed(cfg.seed)
        global_step = tf.Variable(0, trainable=False)
        lr_ph = tf.placeholder(tf.float32, name='learning_rate')

        ####################### Load models here ########################
        sensors_emb_dim = 32
        segment_emb_dim = 32

        with tf.variable_scope("modality_core"):
            # load backbone model
            if cfg.network == "convtsn":
                model_emb = networks.ConvTSN(n_seg=cfg.num_seg,
                                             emb_dim=cfg.emb_dim)
            elif cfg.network == "convrtsn":
                model_emb = networks.ConvRTSN(n_seg=cfg.num_seg,
                                              emb_dim=cfg.emb_dim)
            elif cfg.network == "convbirtsn":
                model_emb = networks.ConvBiRTSN(n_seg=cfg.num_seg,
                                                emb_dim=cfg.emb_dim)
            else:
                raise NotImplementedError

            input_ph = tf.placeholder(
                tf.float32, shape=[None, cfg.num_seg, None, None, None])
            dropout_ph = tf.placeholder(tf.float32, shape=[])
            model_emb.forward(input_ph, dropout_ph)  # for the LSTM this creates its variable scope

        with tf.variable_scope("modality_sensors"):
            model_emb_sensors = networks.RTSN(n_seg=cfg.num_seg,
                                              emb_dim=sensors_emb_dim)
            model_pairsim_sensors = networks.PDDM(n_input=sensors_emb_dim)

            input_sensors_ph = tf.placeholder(tf.float32,
                                              shape=[None, cfg.num_seg, 8])
            model_emb_sensors.forward(input_sensors_ph, dropout_ph)

            var_list = {}
            for v in tf.global_variables():
                if v.op.name.startswith("modality_sensors"):
                    var_list[v.op.name.replace("modality_sensors/", "")] = v
            restore_saver_sensors = tf.train.Saver(var_list)

        with tf.variable_scope("modality_segment"):
            model_emb_segment = networks.RTSN(n_seg=cfg.num_seg,
                                              emb_dim=segment_emb_dim,
                                              n_input=357)
            model_pairsim_segment = networks.PDDM(n_input=segment_emb_dim)

            input_segment_ph = tf.placeholder(tf.float32,
                                              shape=[None, cfg.num_seg, 357])
            model_emb_segment.forward(input_segment_ph, dropout_ph)

            var_list = {}
            for v in tf.global_variables():
                if v.op.name.startswith("modality_segment"):
                    var_list[v.op.name.replace("modality_segment/", "")] = v
            restore_saver_segment = tf.train.Saver(var_list)

        ############################# Forward Pass #############################

        # Core branch
        if cfg.normalized:
            embedding = tf.nn.l2_normalize(model_emb.hidden,
                                           axis=-1,
                                           epsilon=1e-10)
        else:
            embedding = model_emb.hidden

        # get the number of multimodal triplets (x3)
        mul_num_ph = tf.placeholder(tf.int32, shape=[])

        # variable for visualizing the embeddings
        emb_var = tf.Variable([0.0], name='embeddings')
        set_emb = tf.assign(emb_var, embedding, validate_shape=False)

        # calculated for monitoring all-pair embedding distance
        diffs = utils.all_diffs_tf(embedding, embedding)
        all_dist = utils.cdist_tf(diffs)
        tf.summary.histogram('embedding_dists', all_dist)

        # split embedding into anchor, positive and negative and calculate triplet loss
        anchor, positive, negative = tf.unstack(
            tf.reshape(embedding[:(tf.shape(embedding)[0] - mul_num_ph)],
                       [-1, 3, cfg.emb_dim]), 3, 1)
        anchor_mul, positive_mul, negative_mul = tf.unstack(
            tf.reshape(embedding[-mul_num_ph:], [-1, 3, cfg.emb_dim]), 3, 1)

        # Sensors branch
        emb_sensors = model_emb_sensors.hidden
        A_sensors, B_sensors, C_sensors = tf.unstack(
            tf.reshape(emb_sensors, [-1, 3, sensors_emb_dim]), 3, 1)
        model_pairsim_sensors.forward(tf.stack([A_sensors, B_sensors], axis=1))
        pddm_AB_sensors = model_pairsim_sensors.prob[:, 1]
        model_pairsim_sensors.forward(tf.stack([A_sensors, C_sensors], axis=1))
        pddm_AC_sensors = model_pairsim_sensors.prob[:, 1]

        # Segment branch
        emb_segment = model_emb_segment.hidden
        A_segment, B_segment, C_segment = tf.unstack(
            tf.reshape(emb_segment, [-1, 3, segment_emb_dim]), 3, 1)
        model_pairsim_segment.forward(tf.stack([A_segment, B_segment], axis=1))
        pddm_AB_segment = model_pairsim_segment.prob[:, 1]
        model_pairsim_segment.forward(tf.stack([A_segment, C_segment], axis=1))
        pddm_AC_segment = model_pairsim_segment.prob[:, 1]

        # fuse prob from all modalities
        prob_AB = 0.5 * (pddm_AB_sensors + pddm_AB_segment)
        prob_AC = 0.5 * (pddm_AC_sensors + pddm_AC_segment)
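        # simple late fusion: average the PDDM similarity probabilities from
        # the sensors and segment modalities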

        ############################# Calculate loss #############################

        # triplet loss for labeled inputs
        metric_loss1 = networks.triplet_loss(anchor, positive, negative,
                                             cfg.alpha)

        # weighted triplet loss for multimodal inputs
        #        if cfg.weighted:
        #            metric_loss2, _ = networks.weighted_triplet_loss(anchor_mul, positive_mul, negative_mul, prob_AB, prob_AC, cfg.alpha)
        #        else:
        metric_loss2 = networks.triplet_loss(anchor_mul, positive_mul,
                                             negative_mul, cfg.alpha)
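        # A minimal sketch of the usual FaceNet-style margin formulation,
        # assuming networks.triplet_loss follows it (the project's actual
        # implementation is not shown here):
        #
        #   def triplet_loss_sketch(anchor, positive, negative, alpha):
        #       pos_dist = tf.reduce_sum(tf.square(anchor - positive), axis=1)
        #       neg_dist = tf.reduce_sum(tf.square(anchor - negative), axis=1)
        #       return tf.reduce_mean(tf.maximum(pos_dist - neg_dist + alpha, 0.0))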

        # whether to apply joint optimization
        if cfg.no_joint:
            unimodal_var_list = [
                v for v in tf.global_variables()
                if v.op.name.startswith("modality_core")
            ]
            train_var_list = unimodal_var_list
        else:
            multimodal_var_list = [
                v for v in tf.global_variables()
                if not (v.op.name.startswith("modality_sensors/RTSN")
                        or v.op.name.startswith("modality_segment/RTSN"))
            ]
            train_var_list = multimodal_var_list

        regularization_loss = tf.reduce_sum(
            tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES))
        total_loss = tf.cond(
            tf.greater(mul_num_ph, 0),
            lambda: tf.cond(
                tf.equal(mul_num_ph, tf.shape(embedding)[0]),
                lambda: metric_loss2 * cfg.lambda_multimodal +
                        regularization_loss * cfg.lambda_l2,
                lambda: metric_loss1 + metric_loss2 * cfg.lambda_multimodal +
                        regularization_loss * cfg.lambda_l2),
            lambda: metric_loss1 + regularization_loss * cfg.lambda_l2)

        tf.summary.scalar('learning_rate', lr_ph)
        train_op = utils.optimize(total_loss, global_step, cfg.optimizer,
                                  lr_ph, train_var_list)

        saver = tf.train.Saver(max_to_keep=10)
        # not logging histograms of variables because they cause problems
        # when only unimodal_train_op is called
        summary_op = tf.summary.merge_all()

        summ_prob_AB = tf.summary.histogram('Prob_AB_histogram', prob_AB)
        summ_prob_AC = tf.summary.histogram('Prob_AC_histogram', prob_AC)
        #        summ_weights = tf.summary.histogram('Weights_histogram', weights)

        #########################################################################

        # session iterator for session sampling
        feat_paths_ph = tf.placeholder(tf.string,
                                       shape=[None, cfg.sess_per_batch])
        feat2_paths_ph = tf.placeholder(tf.string,
                                        shape=[None, cfg.sess_per_batch])
        feat3_paths_ph = tf.placeholder(tf.string,
                                        shape=[None, cfg.sess_per_batch])
        label_paths_ph = tf.placeholder(tf.string,
                                        shape=[None, cfg.sess_per_batch])
        train_data = multimodal_session_generator(
            feat_paths_ph,
            feat2_paths_ph,
            feat3_paths_ph,
            label_paths_ph,
            sess_per_batch=cfg.sess_per_batch,
            num_threads=2,
            shuffled=False,
            preprocess_func=[
                model_emb.prepare_input, model_emb_sensors.prepare_input,
                model_emb_segment.prepare_input
            ])
        train_sess_iterator = train_data.make_initializable_iterator()
        next_train = train_sess_iterator.get_next()

        # prepare validation data
        val_sess = []
        val_feats = []
        val_feats2 = []
        val_feats3 = []
        val_labels = []
        val_boundaries = []
        for session in val_set:
            session_id = os.path.basename(session[1]).split('_')[0]
            eve_batch, lab_batch, boundary = load_data_and_label(
                session[0], session[-1], model_emb.prepare_input_test
            )  # use prepare_input_test for testing time
            val_feats.append(eve_batch)
            val_labels.append(lab_batch)
            val_sess.extend([session_id] * eve_batch.shape[0])
            val_boundaries.extend(boundary)

            eve2_batch, _, _ = load_data_and_label(
                session[1], session[-1], model_emb_sensors.prepare_input_test)
            val_feats2.append(eve2_batch)

            eve3_batch, _, _ = load_data_and_label(
                session[2], session[-1], model_emb_segment.prepare_input_test)
            val_feats3.append(eve3_batch)
        val_feats = np.concatenate(val_feats, axis=0)
        val_feats2 = np.concatenate(val_feats2, axis=0)
        val_feats3 = np.concatenate(val_feats3, axis=0)
        val_labels = np.concatenate(val_labels, axis=0)
        print("Shape of val_feats: ", val_feats.shape)

        # generate metadata.tsv for visualizing embeddings
        with open(os.path.join(result_dir, 'metadata_val.tsv'), 'w') as fout:
            fout.write('id\tlabel\tsession_id\tstart\tend\n')
            for i in range(len(val_sess)):
                fout.write('{0}\t{1}\t{2}\t{3}\t{4}\n'.format(
                    i, val_labels[i, 0], val_sess[i], val_boundaries[i][0],
                    val_boundaries[i][1]))

        # compute pairwise embedding distance for each class on validation set
        # FIXME: don't use np.max
        #        dist_dict = {}
        #        for i in range(1, np.max(val_labels)+1):
        #            temp_feat = val_feats[np.where(val_labels==i)[0]]
        #            dist_dict[i] = [np.mean(utils.cdist(utils.all_diffs(temp_feat, temp_feat),
        #                                metric=cfg.metric))]
        #        pdb.set_trace()

        #########################################################################

        # Start running the graph
        if cfg.gpu:
            os.environ['CUDA_VISIBLE_DEVICES'] = cfg.gpu

        gpu_options = tf.GPUOptions(allow_growth=True)
        sess = tf.Session(config=tf.ConfigProto(gpu_options=gpu_options))

        summary_writer = tf.summary.FileWriter(result_dir, sess.graph)

        with sess.as_default():

            sess.run(tf.global_variables_initializer())

            # load pretrain model, if needed
            if cfg.model_path:
                print("Restoring pretrained model: %s" % cfg.model_path)
                saver.restore(sess, cfg.model_path)

            print("Restoring sensors model: %s" % cfg.sensors_path)
            restore_saver_sensors.restore(sess, cfg.sensors_path)
            print("Restoring segment model: %s" % cfg.segment_path)
            restore_saver_segment.restore(sess, cfg.segment_path)

            ################## Training loop ##################
            epoch = -1
            while epoch < cfg.max_epochs - 1:
                step = sess.run(global_step, feed_dict=None)
                epoch = step // batch_per_epoch

                # learning rate schedule, reference: "In defense of Triplet Loss"
                if epoch < cfg.static_epochs:
                    learning_rate = cfg.learning_rate
                else:
                    learning_rate = cfg.learning_rate * \
                            0.01**((epoch-cfg.static_epochs)/(cfg.max_epochs-cfg.static_epochs))
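                # with hypothetical values learning_rate=1e-3,
                # static_epochs=10, max_epochs=60: epoch 35 gives
                # 1e-3 * 0.01**0.5 = 1e-4 and epoch 60 gives 1e-5
                # (exponential decay to 1% over the remaining epochs)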

                # prepare data for this epoch
                random.shuffle(train_set)

                paths = list(zip(*[iter(train_set)] * cfg.sess_per_batch))
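                # the zip(*[iter(...)]*n) idiom above groups train_set into
                # consecutive chunks of sess_per_batch sessions, dropping any
                # remainder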

                feat_paths = [[p[0] for p in path] for path in paths]
                feat2_paths = [[p[1] for p in path] for path in paths]
                feat3_paths = [[p[2] for p in path] for path in paths]
                label_paths = [[p[-1] for p in path] for path in paths]

                sess.run(train_sess_iterator.initializer,
                         feed_dict={
                             feat_paths_ph: feat_paths,
                             feat2_paths_ph: feat2_paths,
                             feat3_paths_ph: feat3_paths,
                             label_paths_ph: label_paths
                         })

                # for each epoch
                batch_count = 1
                while True:
                    try:
                        ##################### Data loading ########################
                        start_time = time.time()
                        eve, eve_sensors, eve_segment, lab, batch_sess = sess.run(
                            next_train)

                        # to limit memory use, at most cfg.event_per_batch events are kept
                        if eve.shape[0] > cfg.event_per_batch:
                            idx = np.random.permutation(
                                eve.shape[0])[:cfg.event_per_batch]
                            eve = eve[idx]
                            eve_sensors = eve_sensors[idx]
                            eve_segment = eve_segment[idx]
                            lab = lab[idx]
                            batch_sess = batch_sess[idx]
                        load_time = time.time() - start_time

                        ##################### Triplet selection #####################
                        start_time = time.time()
                        # Get the embeddings of all events
                        eve_embedding = np.zeros((eve.shape[0], cfg.emb_dim),
                                                 dtype='float32')
                        for start, end in zip(
                                range(0, eve.shape[0], cfg.batch_size),
                                range(cfg.batch_size,
                                      eve.shape[0] + cfg.batch_size,
                                      cfg.batch_size)):
                            end = min(end, eve.shape[0])
                            emb = sess.run(embedding,
                                           feed_dict={
                                               input_ph: eve[start:end],
                                               dropout_ph: 1.0
                                           })
                            eve_embedding[start:end] = np.copy(emb)

                        # sample triplets within sampled sessions
                        all_diff = utils.all_diffs(eve_embedding,
                                                   eve_embedding)
                        triplet_input_idx, active_count = utils.select_triplets_facenet(
                            lab,
                            utils.cdist(all_diff, metric=cfg.metric),
                            cfg.triplet_per_batch,
                            cfg.alpha,
                            num_negative=cfg.num_negative)
                        if len(triplet_input_idx) == 0:
                            continue

                        triplet_count = len(triplet_input_idx) // 3
                        multimodal_count = 0
                        if epoch >= cfg.multimodal_epochs:
                            # Get the similarity of all events
                            sim_prob = np.full((eve.shape[0], eve.shape[0]),
                                               np.nan, dtype='float32')
                            comb = list(
                                itertools.combinations(range(eve.shape[0]), 2))
                            for start, end in zip(
                                    range(0, len(comb), cfg.batch_size),
                                    range(cfg.batch_size,
                                          len(comb) + cfg.batch_size,
                                          cfg.batch_size)):
                                end = min(end, len(comb))
                                comb_idx = []
                                for c in comb[start:end]:
                                    comb_idx.extend([c[0], c[1], c[1]])
                                sim = sess.run(prob_AB,
                                               feed_dict={
                                                   input_sensors_ph:
                                                   eve_sensors[comb_idx],
                                                   input_segment_ph:
                                                   eve_segment[comb_idx],
                                                   dropout_ph:
                                                   1.0
                                               })
                                for i in range(sim.shape[0]):
                                    sim_prob[comb[start + i][0],
                                             comb[start + i][1]] = sim[i]
                                    sim_prob[comb[start + i][1],
                                             comb[start + i][0]] = sim[i]

                            # sample triplets from the similarity predictions;
                            # their number must not exceed the number of
                            # triplets from the facenet selection above

                            # hard samples mining
                            triplet_input_idx, triplet_count, multimodal_count1 = select_triplets_mul_hard(
                                triplet_input_idx, lab, sim_prob,
                                cfg.triplet_per_batch, 3, 0.8, 0.2)

                            sensors_input = eve_sensors[
                                triplet_input_idx[-(3 * multimodal_count1):]]
                            segment_input = eve_segment[
                                triplet_input_idx[-(3 * multimodal_count1):]]

                            # add up all multimodal triplets
                            multimodal_count = multimodal_count1

                        print(triplet_count, multimodal_count)
                        triplet_input = eve[triplet_input_idx]

                        select_time = time.time() - start_time

                        if len(triplet_input.shape) > 5:  # debugging
                            pdb.set_trace()

                        ##################### Start training  ########################

                        # supervised initialization
                        if multimodal_count == 0:
                            if triplet_count == 0:
                                continue
                            err, metric_err1, _, step, summ = sess.run(
                                [
                                    total_loss, metric_loss1, train_op,
                                    global_step, summary_op
                                ],
                                feed_dict={
                                    input_ph: triplet_input,
                                    dropout_ph: cfg.keep_prob,
                                    mul_num_ph: 0,
                                    lr_ph: learning_rate
                                })
                            metric_err2 = 0
                        else:
                            err, metric_err1, metric_err2, _, step, summ, s_AB, s_AC = sess.run(
                                [
                                    total_loss, metric_loss1, metric_loss2,
                                    train_op, global_step, summary_op,
                                    summ_prob_AB, summ_prob_AC
                                ],
                                feed_dict={
                                    input_ph: triplet_input,
                                    input_sensors_ph: sensors_input,
                                    input_segment_ph: segment_input,
                                    mul_num_ph: multimodal_count * 3,
                                    dropout_ph: cfg.keep_prob,
                                    lr_ph: learning_rate
                                })
                            summary_writer.add_summary(s_AB, step)
                            summary_writer.add_summary(s_AC, step)


                        print ("%s\tEpoch: [%d][%d/%d]\tEvent num: %d\tTriplet num: %d\tLoad time: %.3f\tSelect time: %.3f\tLoss %.4f" % \
                                (cfg.name, epoch+1, batch_count, batch_per_epoch, eve.shape[0], triplet_count+multimodal_count, load_time, select_time, err))

                        summary = tf.Summary(value=[
                            tf.Summary.Value(tag="train_loss",
                                             simple_value=err),
                            tf.Summary.Value(tag="active_count",
                                             simple_value=active_count),
                            tf.Summary.Value(tag="triplet_count",
                                             simple_value=triplet_count),
                            tf.Summary.Value(tag="multimodal_count",
                                             simple_value=multimodal_count),
                            tf.Summary.Value(tag="metric_loss1",
                                             simple_value=metric_err1),
                            tf.Summary.Value(tag="metric_loss2",
                                             simple_value=metric_err2)
                        ])

                        summary_writer.add_summary(summary, step)
                        summary_writer.add_summary(summ, step)

                        batch_count += 1

                    except tf.errors.OutOfRangeError:
                        print("Epoch %d done!" % (epoch + 1))
                        break

                # validation on val_set
                print("Evaluating on validation set...")
                val_embeddings, _ = sess.run([embedding, set_emb],
                                             feed_dict={
                                                 input_ph: val_feats,
                                                 dropout_ph: 1.0
                                             })
                mAP, mPrec, recall = utils.evaluate_simple(
                    val_embeddings, val_labels)
                summary = tf.Summary(value=[
                    tf.Summary.Value(tag="Valiation mAP", simple_value=mAP),
                    tf.Summary.Value(tag="Validation Recall@1",
                                     simple_value=recall),
                    tf.Summary.Value(tag="Validation [email protected]",
                                     simple_value=mPrec)
                ])
                summary_writer.add_summary(summary, step)
                print("Epoch: [%d]\tmAP: %.4f\tmPrec: %.4f" %
                      (epoch + 1, mAP, mPrec))

                # config for embedding visualization
                config = projector.ProjectorConfig()
                visual_embedding = config.embeddings.add()
                visual_embedding.tensor_name = emb_var.name
                visual_embedding.metadata_path = os.path.join(
                    result_dir, 'metadata_val.tsv')
                projector.visualize_embeddings(summary_writer, config)

                # save model
                saver.save(sess,
                           os.path.join(result_dir, cfg.name + '.ckpt'),
                           global_step=step)
def train_model(model, batch_gen, num_train_steps, weights_fld):
    # defaults to saving all variables - in this case embed_matrix, nce_weight, nce_bias
    saver = tf.train.Saver()

    initial_step = 0
    utils.make_dir('checkpoints')
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        ckpt = tf.train.get_checkpoint_state(
            os.path.dirname('checkpoints/checkpoint'))
        # if that checkpoint exists, restore from checkpoint
        if ckpt and ckpt.model_checkpoint_path:
            saver.restore(sess, ckpt.model_checkpoint_path)

        else:

            total_loss = 0.0  # we use this to calculate the average loss over the last SKIP_STEP steps
            writer = tf.summary.FileWriter(
                'improved_graph/lr' + str(LEARNING_RATE), sess.graph)
            initial_step = model.global_step.eval()
            for index in range(initial_step, initial_step + num_train_steps):
                centers, targets = next(batch_gen)
                feed_dict = {
                    model.center_words: centers,
                    model.target_words: targets
                }
                loss_batch, _, summary = sess.run(
                    [model.loss, model.optimizer, model.summary_op],
                    feed_dict=feed_dict)
                writer.add_summary(summary, global_step=index)
                total_loss += loss_batch
                if (index + 1) % SKIP_STEP == 0:
                    print('Average loss at step {}: {:5.1f}'.format(
                        index, total_loss / SKIP_STEP))
                    total_loss = 0.0
                    saver.save(sess, 'checkpoints/skip-gram', index)

        ####################
        # code to visualize the embeddings
        # run "tensorboard --logdir=processed" to see the embeddings
        final_embed_matrix = sess.run(model.embed_matrix)

        # it has to be a variable; constants don't work here, and you can't reuse model.embed_matrix
        embedding_var = tf.Variable(final_embed_matrix[:1000],
                                    name='embedding')
        sess.run(embedding_var.initializer)

        config = projector.ProjectorConfig()
        summary_writer = tf.summary.FileWriter('processed')

        # add embedding to the config file
        embedding = config.embeddings.add()
        embedding.tensor_name = embedding_var.name

        # link this tensor to its metadata file, in this case the first 1000 words of vocab
        # embedding.metadata_path = 'processed/vocab_1000.tsv'
        embedding.metadata_path = 'vocab_1000.tsv'

        # saves a configuration file that TensorBoard will read during startup.
        projector.visualize_embeddings(summary_writer, config)
        saver_embed = tf.train.Saver([embedding_var])
        saver_embed.save(sess, 'processed/model3.ckpt', 1)
Beispiel #19
0
    def set_model(self, model):
        self.model = model
        if K.backend() == 'tensorflow':
            self.sess = K.get_session()
        if self.histogram_freq and self.merged is None:
            for layer in self.model.layers:

                for weight in layer.weights:
                    mapped_weight_name = weight.name.replace(':', '_')
                    tf.summary.histogram(mapped_weight_name, weight)
                    if self.write_grads:
                        grads = model.optimizer.get_gradients(
                            model.total_loss, weight)

                        def is_indexed_slices(grad):
                            return type(grad).__name__ == 'IndexedSlices'

                        grads = [
                            grad.values if is_indexed_slices(grad) else grad
                            for grad in grads
                        ]
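                        # embedding-style layers produce tf.IndexedSlices
                        # gradients; taking .values turns them into dense
                        # tensors that tf.summary.histogram can accept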
                        tf.summary.histogram(
                            '{}_grad'.format(mapped_weight_name), grads)
                    if self.write_images:
                        w_img = tf.squeeze(weight)
                        shape = K.int_shape(w_img)
                        if len(shape) == 2:  # dense layer kernel case
                            if shape[0] > shape[1]:
                                w_img = tf.transpose(w_img)
                                shape = K.int_shape(w_img)
                            w_img = tf.reshape(w_img,
                                               [1, shape[0], shape[1], 1])
                        elif len(shape) == 3:  # convnet case
                            if K.image_data_format() == 'channels_last':
                                # switch to channels_first to display
                                # every kernel as a separate image
                                w_img = tf.transpose(w_img, perm=[2, 0, 1])
                                shape = K.int_shape(w_img)
                            w_img = tf.reshape(
                                w_img, [shape[0], shape[1], shape[2], 1])
                        elif len(shape) == 1:  # bias case
                            w_img = tf.reshape(w_img, [1, shape[0], 1, 1])
                        else:
                            # not possible to handle 3D convnets etc.
                            continue

                        shape = K.int_shape(w_img)
                        assert len(shape) == 4 and shape[-1] in [1, 3, 4]
                        tf.summary.image(mapped_weight_name, w_img)
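                        # e.g. a channels_last conv kernel of shape (3, 3, 32)
                        # is transposed to (32, 3, 3) and reshaped to
                        # (32, 3, 3, 1): 32 single-channel images, one per
                        # filter, which tf.summary.image can display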

                if hasattr(layer, 'output'):
                    tf.summary.histogram('{}_out'.format(layer.name),
                                         layer.output)
        self.merged = tf.summary.merge_all()

        if self.write_graph:
            self.writer = tf.summary.FileWriter(self.log_dir, self.sess.graph)
        else:
            self.writer = tf.summary.FileWriter(self.log_dir)

        if self.embeddings_freq:
            embeddings_layer_names = self.embeddings_layer_names

            if not embeddings_layer_names:
                embeddings_layer_names = [
                    layer.name for layer in self.model.layers
                    if type(layer).__name__ == 'Embedding'
                ]

            embeddings = {
                layer.name: layer.weights[0]
                for layer in self.model.layers
                if layer.name in embeddings_layer_names
            }

            self.saver = tf.train.Saver(list(embeddings.values()))

            embeddings_metadata = {}

            if not isinstance(self.embeddings_metadata, str):
                embeddings_metadata = self.embeddings_metadata
            else:
                embeddings_metadata = {
                    layer_name: self.embeddings_metadata
                    for layer_name in embeddings.keys()
                }

            config = projector.ProjectorConfig()
            self.embeddings_ckpt_path = os.path.join(self.log_dir,
                                                     'keras_embedding.ckpt')

            for layer_name, tensor in embeddings.items():
                embedding = config.embeddings.add()
                embedding.tensor_name = tensor.name

                if layer_name in embeddings_metadata:
                    embedding.metadata_path = embeddings_metadata[layer_name]

            projector.visualize_embeddings(self.writer, config)
Beispiel #20
0
    def fit(self):
        parameters = self.parameters
        conf_parameters = self.conf_parameters
        dataset_filepaths = self.dataset_filepaths
        dataset = self.dataset
        dataset_brat_folders = self.dataset_brat_folders
        sess = self.sess
        model = self.model
        transition_params_trained = self.transition_params_trained
        stats_graph_folder, experiment_timestamp = self._create_stats_graph_folder(
            parameters)

        # Initialize and save execution details
        start_time = time.time()
        results = {}
        results['epoch'] = {}
        results['execution_details'] = {}
        results['execution_details']['train_start'] = start_time
        results['execution_details']['time_stamp'] = experiment_timestamp
        results['execution_details']['early_stop'] = False
        results['execution_details']['keyboard_interrupt'] = False
        results['execution_details']['num_epochs'] = 0
        results['model_options'] = copy.copy(parameters)

        model_folder = os.path.join(stats_graph_folder, 'model')
        utils.create_folder_if_not_exists(model_folder)
        with open(os.path.join(model_folder, 'parameters.ini'),
                  'w') as parameters_file:
            conf_parameters.write(parameters_file)
        pickle.dump(dataset,
                    open(os.path.join(model_folder, 'dataset.pickle'), 'wb'))

        tensorboard_log_folder = os.path.join(stats_graph_folder,
                                              'tensorboard_logs')
        utils.create_folder_if_not_exists(tensorboard_log_folder)
        tensorboard_log_folders = {}
        for dataset_type in dataset_filepaths.keys():
            tensorboard_log_folders[dataset_type] = os.path.join(
                stats_graph_folder, 'tensorboard_logs', dataset_type)
            utils.create_folder_if_not_exists(
                tensorboard_log_folders[dataset_type])

        # Instantiate the writers for TensorBoard
        writers = {}
        for dataset_type in dataset_filepaths.keys():
            writers[dataset_type] = tf.summary.FileWriter(
                tensorboard_log_folders[dataset_type], graph=sess.graph)
        # embedding_writer has to write in model_folder, otherwise TensorBoard
        # won't be able to view the embeddings
        embedding_writer = tf.summary.FileWriter(model_folder)

        embeddings_projector_config = projector.ProjectorConfig()
        tensorboard_token_embeddings = embeddings_projector_config.embeddings.add()
        tensorboard_token_embeddings.tensor_name = model.token_embedding_weights.name
        token_list_file_path = os.path.join(model_folder,
                                            'tensorboard_metadata_tokens.tsv')
        tensorboard_token_embeddings.metadata_path = os.path.relpath(
            token_list_file_path, '..')

        tensorboard_character_embeddings = embeddings_projector_config.embeddings.add()
        tensorboard_character_embeddings.tensor_name = model.character_embedding_weights.name
        character_list_file_path = os.path.join(
            model_folder, 'tensorboard_metadata_characters.tsv')
        tensorboard_character_embeddings.metadata_path = os.path.relpath(
            character_list_file_path, '..')

        projector.visualize_embeddings(embedding_writer,
                                       embeddings_projector_config)

        # Write metadata for TensorBoard embeddings
        token_list_file = codecs.open(token_list_file_path, 'w', 'UTF-8')
        for token_index in range(dataset.vocabulary_size):
            token_list_file.write('{0}\n'.format(
                dataset.index_to_token[token_index]))
        token_list_file.close()

        character_list_file = codecs.open(character_list_file_path, 'w',
                                          'UTF-8')
        for character_index in range(dataset.alphabet_size):
            if character_index == dataset.PADDING_CHARACTER_INDEX:
                character_list_file.write('PADDING\n')
            else:
                character_list_file.write('{0}\n'.format(
                    dataset.index_to_character[character_index]))
        character_list_file.close()

        # Start training + evaluation loop. Each iteration corresponds to 1 epoch.
        bad_counter = 0  # number of epochs with no improvement on the validation set in terms of F1-score
        previous_best_valid_f1_score = 0
        epoch_number = -1
        try:
            while True:
                step = 0
                epoch_number += 1
                print('\nStarting epoch {0}'.format(epoch_number))

                epoch_start_time = time.time()

                if epoch_number != 0:
                    # Train model: loop over all sequences of training set with shuffling
                    sequence_numbers = list(
                        range(len(dataset.token_indices['train'])))
                    random.shuffle(sequence_numbers)
                    for sequence_number in sequence_numbers:
                        transition_params_trained = train.train_step(
                            sess, dataset, sequence_number, model, parameters)
                        step += 1
                        if step % 10 == 0:
                            print('Training {0:.2f}% done'.format(
                                step / len(sequence_numbers) * 100),
                                  end='\r',
                                  flush=True)

                epoch_elapsed_training_time = time.time() - epoch_start_time
                print('Training completed in {0:.2f} seconds'.format(
                    epoch_elapsed_training_time),
                      flush=True)

                y_pred, y_true, output_filepaths = train.predict_labels(
                    sess, model, transition_params_trained, parameters,
                    dataset, epoch_number, stats_graph_folder,
                    dataset_filepaths)

                # Evaluate model: save and plot results
                evaluate.evaluate_model(results, dataset, y_pred, y_true,
                                        stats_graph_folder, epoch_number,
                                        epoch_start_time, output_filepaths,
                                        parameters)

                if parameters['use_pretrained_model'] and not parameters[
                        'train_model']:
                    conll_to_brat.output_brat(output_filepaths,
                                              dataset_brat_folders,
                                              stats_graph_folder)
                    break

                # Save model
                model.saver.save(
                    sess,
                    os.path.join(model_folder,
                                 'model_{0:05d}.ckpt'.format(epoch_number)))

                # Save TensorBoard logs
                summary = sess.run(model.summary_op, feed_dict=None)
                writers['train'].add_summary(summary, epoch_number)
                writers['train'].flush()
                utils.copytree(writers['train'].get_logdir(), model_folder)

                # Early stop
                valid_f1_score = results['epoch'][epoch_number][0]['valid'][
                    'f1_score']['micro']
                if valid_f1_score > previous_best_valid_f1_score:
                    bad_counter = 0
                    previous_best_valid_f1_score = valid_f1_score
                    conll_to_brat.output_brat(output_filepaths,
                                              dataset_brat_folders,
                                              stats_graph_folder,
                                              overwrite=True)
                    self.transition_params_trained = transition_params_trained
                else:
                    bad_counter += 1
                print(
                    "The last {0} epochs have not shown improvements on the validation set."
                    .format(bad_counter))

                if bad_counter >= parameters['patience']:
                    print('Early Stop!')
                    results['execution_details']['early_stop'] = True
                    break

                if epoch_number >= parameters['maximum_number_of_epochs']:
                    break

        except KeyboardInterrupt:
            results['execution_details']['keyboard_interrupt'] = True
            print('Training interrupted')

        print('Finishing the experiment')
        end_time = time.time()
        results['execution_details']['train_duration'] = end_time - start_time
        results['execution_details']['train_end'] = end_time
        evaluate.save_results(results, stats_graph_folder)
        for dataset_type in dataset_filepaths.keys():
            writers[dataset_type].close()
Beispiel #21
0
def train():
    train_data      = load_file("train.csv")
    validation_data = load_file("validation.csv")



    m = model.JointEmbeddingModelForBinaryClassification(conf.embedded_word_size, conf.init_stddev)

    checkpoint_base_path = conf.log_path + "/" + conf.run_name + "/checkpoint"
    vocab_size = utils.get_vocab_size(conf.data_dir)

    with tf.Session() as sess:
        model_params = m.graph(
                conf.minibatch_size,
                vocab_size,
                conf.word_vector_size,
                conf.conv_size,
                conf.conv_stride,
                conf.conv_features,
                conf.weights_reg_scale,
                conf.activity_reg_scale,
                conf.embedding_reg_scale
                )

        config    = projector.ProjectorConfig()
        embedding = config.embeddings.add()
        embedding.tensor_name = model_params.word_vectors.name
        embedding.metadata_path = os.path.join(conf.data_dir, "vocab.txt")

        summary_op = tf.summary.merge_all()
        optimiser  = tf.train.AdamOptimizer(conf.learning_rate)
        train_op   = optimiser.minimize(model_params.loss, var_list=tf.trainable_variables())

        if not os.path.exists(conf.log_path):
            os.makedirs(conf.log_path)

        writer = tf.summary.FileWriter(conf.log_path + "/" + conf.run_name, sess.graph)
        saver  = tf.train.Saver()

        projector.visualize_embeddings(writer, config)

        latest_checkpoint = tf.train.latest_checkpoint(conf.log_path)

        if conf.reuse_checkpoints and latest_checkpoint is not None:
            print("Restoring checkpoint...: " + latest_checkpoint)
            saver.restore(sess, latest_checkpoint)
            starting_iteration = int(latest_checkpoint.split('-')[-1]) + 1
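            # e.g. a checkpoint named ".../checkpoint-1500" resumes at
            # iteration 1501, since saver.save below appends global_step
            # to the checkpoint name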
        else:
            print("Initialising new model...")
            sess.run(tf.global_variables_initializer())
            starting_iteration = 0


        for i in range(starting_iteration, conf.iterations + 1):

            X1, X2, Y, M1, M2, S1, S2, _ = get_random_datapoints(train_data)
            data = {model_params.wordset_1: X1,
                    model_params.wordset_2: X2,
                    model_params.probs: Y,
                    model_params.wordset_1_masks: M1,
                    model_params.wordset_2_masks: M2,
                    model_params.wordset_1_lengths: S1,
                    model_params.wordset_2_lengths: S2}

            _, summary_value, alpha, loss_value = sess.run([train_op, summary_op, model_params.alpha, model_params.loss], feed_dict=data)

            writer.add_summary(summary_value, i)
            writer.flush()

            if i % conf.report_frequency == 0:
                print("Iteration #{}, Loss: {}, α: {}.".format(i, loss_value, alpha))


            if i % conf.checkpoint_frequency == 0:
                checkpoint_path = saver.save(sess, checkpoint_base_path, global_step=i)
                print("Checkpointed: {}.".format(checkpoint_path))


            if i % conf.validation_frequency == 0:
                X1, X2, Y, M1, M2, S1, S2, _ = get_random_datapoints(validation_data)
                data = {model_params.wordset_1: X1,
                        model_params.wordset_2: X2,
                        model_params.probs: Y,
                        model_params.wordset_1_masks: M1,
                        model_params.wordset_2_masks: M2,
                        model_params.wordset_1_lengths: S1,
                        model_params.wordset_2_lengths: S2}

                accuracy_value = sess.run(model_params.accuracy, feed_dict=data)

                print("Iteration #{}, Validation-set accuracy: {}.".format(i, accuracy_value))

        if not os.path.exists(conf.save_path):
            os.makedirs(conf.save_path)
            
        saver.save(sess, conf.save_path + "/last_checkpoint")
Beispiel #22
0
    def __init__(self,
                 filename,
                 batch_size=2000,
                 doc_embed_dim=0,
                 wrd_embed_dim=64,
                 wrd_size_max=10000,
                 loss_type='sampled_softmax_loss',
                 optimizer_type='Adagrad',
                 learning_rate=1.0,
                 n_neg_samples=5,
                 eval_words=None):

        self.docs, self.doc2id, self.id2doc, self.wrd2id, self.id2wrd, self.cat2id = build_dataset(
            filename, wrd_size_max)
        self.doc_size = len(self.doc2id)
        self.wrd_size = len(self.wrd2id)

        # bind params to class
        self.batch_size = batch_size
        self.doc_embed_dim = doc_embed_dim
        self.wrd_embed_dim = wrd_embed_dim
        self.loss_type = loss_type
        self.optimizer_type = optimizer_type

        self.learning_rate = learning_rate
        self.n_neg_samples = n_neg_samples
        self.eval_examples = [self.wrd2id[wrd] for wrd in eval_words] if eval_words \
            else np.random.choice(self.wrd_size, size=10, replace=False)

        self._init_graph()
        self.sess = tf.Session(graph=self.graph)
        self.step = 0
        self.epoch = 0
        self.doc_idx = 0  # fetch training batch

        print('doc size {}, word size {}, doc dim {}, word dim {}'.format(
            self.doc_size, self.wrd_size, self.doc_embed_dim,
            self.wrd_embed_dim))
        print('Sample doc: doc id {}, word id {}\n words {}'.format(
            self.id2doc[0], self.docs[0],
            [self.id2wrd[wrd] for wrd in self.docs[0]]))

        # embedding projector
        if not os.path.exists(FLAGS.checkpoint_dir):
            os.makedirs(FLAGS.checkpoint_dir)
        meta_path = os.path.join(FLAGS.checkpoint_dir, 'metadata.tsv')
        with open(meta_path, 'w') as f:
            f.write('word\tlabel\n')
            for idx in range(self.wrd_size):
                f.write('{}\t{}\n'.format(self.id2wrd[idx],
                                          self.id2wrd[idx].split('#')[0]))

        from tensorflow.contrib.tensorboard.plugins import projector
        # Use the same LOG_DIR where you stored your checkpoint.
        summary_writer = tf.summary.FileWriter(FLAGS.checkpoint_dir)

        # Format: tensorflow/contrib/tensorboard/plugins/projector/projector_config.proto
        config = projector.ProjectorConfig()

        # You can add multiple embeddings. Here we add only one.
        embedding = config.embeddings.add()
        embedding.tensor_name = self.normalized.name
        embedding.metadata_path = meta_path

        embedding2 = config.embeddings.add()
        embedding2.tensor_name = self.wrd_embeddings.name
        embedding2.metadata_path = meta_path

        # Saves a configuration file that TensorBoard will read during startup.
        projector.visualize_embeddings(summary_writer, config)
Beispiel #23
0
    def set_model(self, model):
        self.model = model
        self.sess = K.get_session()
        if self.histogram_freq and self.merged is None:
            for layer in self.model.layers:

                for weight in layer.weights:
                    tf.summary.histogram(weight.name, weight)
                    if self.write_images:
                        w_img = tf.squeeze(weight)
                        shape = w_img.get_shape()
                        if len(shape) > 1 and shape[0] > shape[1]:
                            w_img = tf.transpose(w_img)
                        if len(shape) == 1:
                            w_img = tf.expand_dims(w_img, 0)
                        w_img = tf.expand_dims(tf.expand_dims(w_img, 0), -1)
                        tf.summary.image(weight.name, w_img)

                if hasattr(layer, 'output'):
                    tf.summary.histogram('{}_out'.format(layer.name),
                                         layer.output)
        self.merged = tf.summary.merge_all()

        if self.write_graph:
            self.writer = tf.summary.FileWriter(self.log_dir, self.sess.graph)
        else:
            self.writer = tf.summary.FileWriter(self.log_dir)

        if self.embeddings_freq:
            self.saver = tf.train.Saver()

            embeddings_layer_names = self.embeddings_layer_names

            if not embeddings_layer_names:
                embeddings_layer_names = [
                    layer.name for layer in self.model.layers
                    if type(layer).__name__ == 'Embedding'
                ]

            embeddings = {
                layer.name: layer.weights[0]
                for layer in self.model.layers
                if layer.name in embeddings_layer_names
            }

            embeddings_metadata = {}

            if not isinstance(self.embeddings_metadata, str):
                embeddings_metadata = self.embeddings_metadata
            else:
                embeddings_metadata = {
                    layer_name: self.embeddings_metadata
                    for layer_name in embeddings.keys()
                }

            config = projector.ProjectorConfig()
            self.embeddings_logs = []

            for layer_name, tensor in embeddings.items():
                embedding = config.embeddings.add()
                embedding.tensor_name = tensor.name

                self.embeddings_logs.append(
                    os.path.join(self.log_dir, layer_name + '.ckpt'))

                if layer_name in embeddings_metadata:
                    embedding.metadata_path = embeddings_metadata[layer_name]

            projector.visualize_embeddings(self.writer, config)
Beispiel #24
0
def train_skip_gram(V, data_folder, data_folders, dataset_size,
                    reverse_dictionary, param, valid_examples, log_dir,
                    vocab_metada_file, embeddings_pickle, ckpt_saver_file,
                    ckpt_saver_file_init, ckpt_saver_file_final,
                    restore_variables):
    """
    Train embeddings (Skip-Gram model)
    :param V: vocabulary size
    :param data_folder: string containing the path to the parent directory of raw data sub-folders
    :param data_folders: list of sub-folders containing pre-processed LLVM IR code
    :param dataset_size: number of data pairs in total in the training data set
    :param reverse_dictionary: [keys=statement index, values=statement]
    :param param: parameters of the inst2vec training
    :param valid_examples: statements to be used as validation examples (list of indices)
    :param log_dir: logging directory for Tensorboard output
    :param vocab_metada_file: vocabulary metadata file for Tensorboard
    :param embeddings_pickle: file in which to pickle embeddings
    :param ckpt_saver_file: checkpoint saver file (intermediate states of training)
    :param ckpt_saver_file_init: checkpoint saver file (initial state of training)
    :param ckpt_saver_file_final: checkpoint saver file (final state of training)
    :param restore_variables: boolean: whether to restore variables from a previous training
    :return: embeddings matrix
    """
    ####################################################################################################################
    # Extract parameters from dictionary "param"
    N = param['embedding_size']
    mini_batch_size = param['mini_batch_size']
    num_sampled = param['num_sampled']
    num_epochs = param['num_epochs']
    learning_rate = param['learning_rate']
    l2_reg_scale = param['beta']
    freq_print_loss = param['freq_print_loss']
    step_print_neighbors = param['step_print_neighbors']
    context_width = param['context_width']

    ####################################################################################################################
    # Set up for analogies
    #analogies, analogy_types, n_questions_total, n_questions_relevant = i2v_eval.load_analogies(data_folder)
    folder_evaluation = embeddings_pickle.replace('.p', '') + 'eval'
    if not os.path.exists(folder_evaluation):
        os.makedirs(folder_evaluation)
    analogy_evaluation_file = os.path.join(folder_evaluation,
                                           "analogy_results")

    config = None
    options = None
    metadata = None
    if FLAGS.profile:
        options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)
        metadata = tf.RunMetadata()
    if FLAGS.xla:
        config = tf.ConfigProto()
        config.graph_options.optimizer_options.global_jit_level = tf.OptimizerOptions.ON_1

    ####################################################################################################################
    # Read data using Tensorflow's data API
    data_files = get_data_pair_files(data_folders, context_width)
    print('\ttraining with data from files:', data_files)
    with tf.name_scope("Reader") as scope:

        random.shuffle(data_files)
        dataset_raw = tf.data.FixedLengthRecordDataset(
            filenames=data_files,
            record_bytes=8)  # <TFRecordDataset shapes: (), types: tf.string>
        dataset = dataset_raw.map(record_parser)
        dataset = dataset.shuffle(int(1e5))
        dataset_batched = dataset.apply(
            tf.contrib.data.batch_and_drop_remainder(mini_batch_size))
        dataset_batched = dataset_batched.prefetch(int(100000000))
        iterator = dataset_batched.make_initializable_iterator()
        saveable_iterator = tf.contrib.data.make_saveable_from_iterator(
            iterator)
        next_batch = iterator.get_next()  # Tensor("Shape:0", shape=(2,), dtype=int32)

    ####################################################################################################################
    # Tensorflow computational graph
    # Placeholders for inputs
    with tf.name_scope("Input_Data") as scope:
        train_inputs = next_batch[:, 0]
        train_labels = tf.reshape(next_batch[:, 1],
                                  shape=[mini_batch_size, 1],
                                  name="training_labels")

    # (input) Embedding matrix
    with tf.name_scope("Input_Layer") as scope:
        W_in = tf.Variable(tf.random_uniform([V, N], -1.0, 1.0),
                           name="input-embeddings")

        # Look up the vector representing each source word in the batch (fetches rows of the embedding matrix)
        h = tf.nn.embedding_lookup(W_in,
                                   train_inputs,
                                   name="input_embedding_vectors")

    # Normalized embedding matrix
    with tf.name_scope("Embeddings_Normalized") as scope:
        normalized_embeddings = tf.nn.l2_normalize(
            W_in, name="embeddings_normalized")

    # (output) Embedding matrix ("output weights")
    with tf.name_scope("Output_Layer") as scope:
        if FLAGS.softmax:
            W_out = tf.Variable(tf.truncated_normal([N, V],
                                                    stddev=1.0 / math.sqrt(N)),
                                name="output_embeddings")
        else:
            W_out = tf.Variable(tf.truncated_normal([V, N],
                                                    stddev=1.0 / math.sqrt(N)),
                                name="output_embeddings")

        # Biases between hidden layer and output layer
        b_out = tf.Variable(tf.zeros([V]), name="nce_bias")

    # Optimization
    with tf.name_scope("Optimization_Block") as scope:
        # Loss function
        if FLAGS.softmax:
            logits = tf.layers.dense(inputs=h, units=V)
            # Flatten labels from [batch, 1] to [batch] so the one-hot tensor
            # matches the [batch, V] logits.
            onehot = tf.one_hot(tf.reshape(train_labels, [-1]), V)
            loss_tensor = tf.nn.softmax_cross_entropy_with_logits_v2(
                labels=onehot, logits=logits)
        else:
            loss_tensor = tf.nn.nce_loss(weights=W_out,
                                         biases=b_out,
                                         labels=train_labels,
                                         inputs=h,
                                         num_sampled=num_sampled,
                                         num_classes=V)
        train_loss = tf.reduce_mean(loss_tensor, name="nce_loss")

        # Regularization (optional)
        if l2_reg_scale > 0:
            tf.add_to_collection(tf.GraphKeys.REGULARIZATION_LOSSES, W_in)
            tf.add_to_collection(tf.GraphKeys.REGULARIZATION_LOSSES, W_out)
            regularizer = tf.contrib.layers.l2_regularizer(l2_reg_scale)
            reg_variables = tf.get_collection(
                tf.GraphKeys.REGULARIZATION_LOSSES)
            reg_term = tf.contrib.layers.apply_regularization(
                regularizer, reg_variables)
            loss = train_loss + reg_term
        else:
            loss = train_loss

        # Optimizer
        if FLAGS.optimizer == 'adam':
            optimizer = tf.train.AdamOptimizer(
                learning_rate=learning_rate).minimize(loss)
        elif FLAGS.optimizer == 'nadam':
            optimizer = tf.contrib.opt.NadamOptimizer(
                learning_rate=learning_rate).minimize(loss)
        elif FLAGS.optimizer == 'momentum':
            global_train_step = tf.Variable(0,
                                            trainable=False,
                                            dtype=tf.int32,
                                            name="global_step")
            # Passing global_step to minimize() will increment it at each step.
            optimizer = (tf.train.MomentumOptimizer(
                learning_rate, 0.95).minimize(loss,
                                              global_step=global_train_step))
        else:
            raise ValueError('Unrecognized optimizer ' + FLAGS.optimizer)

    if FLAGS.optimizer != 'momentum':
        global_train_step = tf.Variable(0,
                                        trainable=False,
                                        dtype=tf.int32,
                                        name="global_step")

    ####################################################################################################################
    # Validation block
    with tf.name_scope("Validation_Block") as scope:
        valid_dataset = tf.constant(valid_examples,
                                    dtype=tf.int32,
                                    name="validation_data_size")
        valid_embeddings = tf.nn.embedding_lookup(normalized_embeddings,
                                                  valid_dataset)
        cosine_similarity = tf.matmul(valid_embeddings,
                                      normalized_embeddings,
                                      transpose_b=True)

    ####################################################################################################################
    # Summaries
    with tf.name_scope("Summaries") as scope:
        tf.summary.histogram("input_embeddings", W_in)
        tf.summary.histogram("input_embeddings_normalized",
                             normalized_embeddings)
        tf.summary.histogram("output_embeddings", W_out)
        tf.summary.scalar("nce_loss", loss)

        analogy_score_tensor = tf.Variable(0,
                                           trainable=False,
                                           dtype=tf.int32,
                                           name="analogy_score")
        tf.summary.scalar("analogy_score", analogy_score_tensor)

    ####################################################################################################################
    # Misc.
    restore_completed = False
    init = tf.global_variables_initializer()  # variables initializer
    summary_op = tf.summary.merge_all()  # merge summaries into one operation

    ####################################################################################################################
    # Training
    with tf.Session(config=config) as sess:

        # Add TensorBoard components
        writer = tf.summary.FileWriter(log_dir)  # create summary writer
        writer.add_graph(sess.graph)
        gvars = [
            gvar for gvar in tf.global_variables()
            if 'analogy_score' not in gvar.name
        ]
        saver = tf.train.Saver(gvars, max_to_keep=5)  # create checkpoint saver
        config = projector.ProjectorConfig()  # create projector config
        embedding = config.embeddings.add()  # add embeddings visualizer
        embedding.tensor_name = W_in.name
        embedding.metadata_path = vocab_metada_file  # link metadata
        projector.visualize_embeddings(
            writer, config)  # add writer and config to projector

        # Set up variables
        if restore_variables:  # restore variables from disk
            restore_file = tf.train.latest_checkpoint(log_dir)
            assert restore_file is not None, "No restore file found in folder " + log_dir
            assert os.path.exists(restore_file + ".index"), \
                "Trying to restore Tensorflow session from non-existing file: " + restore_file + ".index"
            init.run()
            saver.restore(sess, restore_file)
            print("\tVariables restored from file", ckpt_saver_file,
                  "in TensorFlow ")

        else:  # save the computational graph to file and initialize variables

            graph_saver = tf.train.Saver(allow_empty=True)
            init.run()
            graph_saver.save(sess,
                             ckpt_saver_file_init,
                             global_step=0,
                             write_meta_graph=True)
            tf.add_to_collection(tf.GraphKeys.SAVEABLE_OBJECTS,
                                 saveable_iterator)
            print("\tVariables initialized in TensorFlow")

        # Compute the necessary number of steps for this epoch as well as how often to print the avg loss
        num_steps = int(math.ceil(dataset_size / mini_batch_size))
        step_print_loss = int(math.ceil(num_steps / freq_print_loss))
        print('\tPrinting loss every ', step_print_loss, 'steps, i.e.',
              freq_print_loss, 'times per epoch')

        ################################################################################################################
        # Epoch loop
        epoch = 0
        global_step = 0
        while epoch < int(num_epochs):
            print('\n\tStarting epoch ', epoch)
            sess.run(iterator.initializer)  # initialize iterator

            # If restoring a previous training session, set the right training epoch
            if restore_variables and not restore_completed:
                epoch = int(
                    math.floor(global_train_step.eval() /
                               (dataset_size / mini_batch_size)))
                global_step = global_train_step.eval()
                print('Starting from epoch', epoch)

            ############################################################################################################
            # Loop over steps (mini batches) inside of epoch
            step = 0
            avg_loss = 0
            while True:

                try:

                    # Print average loss every x steps
                    if step_print_loss > 0 and step % int(
                            step_print_loss) == 0:  # update step with logging

                        # If restoring a previous training session, set the right training epoch
                        if restore_variables and not restore_completed:
                            restore_completed = True

                        # Write global step
                        if FLAGS.optimizer != 'momentum':
                            global_train_step.assign(global_step).eval()

                        # Perform an update
                        # print('\tStarting local step {:>6}'.format(step))  # un-comment for debugging
                        [_, loss_val, train_loss_val, global_step] = sess.run(
                            [optimizer, loss, train_loss, global_train_step],
                            options=options,
                            run_metadata=metadata)
                        assert not np.isnan(
                            loss_val), "Loss at step " + str(step) + " is nan"
                        assert not np.isinf(
                            loss_val), "Loss at step " + str(step) + " is inf"
                        avg_loss += loss_val

                        if step > 0:
                            avg_loss /= step_print_loss
                        '''
                        analogy_score = i2v_eval.evaluate_analogies(W_in.eval(), reverse_dictionary, analogies,
                                                                    analogy_types, analogy_evaluation_file,
                                                                    session=sess, print=i2v_eval.nop)
                        total_analogy_score = sum([a[0] for a in analogy_score])
                        analogy_score_tensor.assign(total_analogy_score).eval()  # for tf.summary
                        '''
                        [summary, W_in_val] = sess.run([summary_op, W_in])

                        if FLAGS.savebest is not None:
                            # NOTE: total_analogy_score is produced by the
                            # analogy evaluation commented out above; re-enable
                            # that block before using FLAGS.savebest, otherwise
                            # this raises NameError.
                            scorelist = [
                                int(s.split('-')[1])
                                for s in os.listdir(FLAGS.savebest)
                            ]
                            if not scorelist or total_analogy_score > max(scorelist):
                                i2v_utils.safe_pickle(
                                    W_in_val, FLAGS.savebest + '/' + 'score-' +
                                    str(total_analogy_score) + '-w.p')

                        # Display average loss
                        '''
                        print('{} Avg. loss at epoch {:>6,d}, step {:>12,d} of {:>12,d}, global step {:>15} : {:>12.3f}, analogies: {})'.format(
                            str(datetime.now()), epoch, step, num_steps, global_step, avg_loss, str(analogy_score)))
                        '''
                        print(
                            '{} Avg. loss at epoch {:>6,d}, step {:>12,d} of {:>12,d}, global step {:>15} : {:>12.3f})'
                            .format(str(datetime.now()), epoch, step,
                                    num_steps, global_step, avg_loss))
                        avg_loss = 0

                        # Pickle intermediate embeddings
                        i2v_utils.safe_pickle(W_in_val, embeddings_pickle)

                        # Write to TensorBoard
                        saver.save(sess,
                                   ckpt_saver_file,
                                   global_step=global_step,
                                   write_meta_graph=False)
                        writer.add_summary(summary, global_step=global_step)

                        if FLAGS.profile:
                            fetched_timeline = timeline.Timeline(metadata.step_stats)
                            chrome_trace = fetched_timeline.generate_chrome_trace_format()
                            with open('timeline_step_%d.json' % step, 'w') as f:
                                f.write(chrome_trace)

                        if step > 0 and FLAGS.extreme:
                            sys.exit(22)

                    else:  # ordinary update step
                        [_, loss_val] = sess.run([optimizer, loss])
                        avg_loss += loss_val

                    # Compute and print nearest neighbors every x steps
                    if step_print_neighbors > 0 and step % int(
                            step_print_neighbors) == 0:
                        print_neighbors(op=cosine_similarity,
                                        examples=valid_examples,
                                        top_k=6,
                                        reverse_dictionary=reverse_dictionary)

                    # Update loop index (steps in epoch)
                    step += 1
                    global_step += 1

                except tf.errors.OutOfRangeError:

                    # We reached the end of the epoch
                    print('\n\t Writing embeddings to file ',
                          embeddings_pickle)
                    i2v_utils.safe_pickle([W_in.eval()],
                                          embeddings_pickle)  # WEIRD!
                    epoch += 1  # update loop index (epochs)
                    break  # from this inner loop

        ################################################################################################################
        # End of training:
        # Print the nearest neighbors at the end of the run
        if step_print_neighbors == -1:
            print_neighbors(op=cosine_similarity,
                            examples=valid_examples,
                            top_k=6,
                            reverse_dictionary=reverse_dictionary)

        # Save state of training and close the TensorBoard summary writer
        save_path = saver.save(sess, ckpt_saver_file_final, global_step)
        writer.add_summary(summary, global_step)
        writer.close()

        return W_in.eval()
Beispiel #25
0
def train(conf):
    if conf.dataset_name not in NUM_DATASET_MAP:
        num_dataset, num_classes = dataset.make_tfrecord(
            conf.dataset_name, conf.dataset_dir, conf.train_fraction,
            conf.num_channel, conf.num_dataset_parallel)
        if num_dataset is None:
            metadata = json.load(
                open(os.path.join(conf.dataset_dir, "metadata")))
            NUM_DATASET_MAP[conf.dataset_name] = [
                metadata["num_train"], metadata["num_validation"],
                metadata["num_classes"], conf.num_channel
            ]
        else:
            NUM_DATASET_MAP[conf.dataset_name] = [
                num_dataset * conf.train_fraction,
                num_dataset * (1 - conf.train_fraction), num_classes,
                conf.num_channel
            ]
    num_channel = NUM_DATASET_MAP[conf.dataset_name][3]
    num_classes = NUM_DATASET_MAP[conf.dataset_name][2]

    is_training = tf.placeholder(tf.bool, shape=(), name="is_training")

    if conf.model_name[:6] == "nasnet":
        model_f = model_factory.get_network_fn(conf.model_name,
                                               num_classes,
                                               weight_decay=conf.weight_decay,
                                               is_training=True)
    else:
        model_f = model_factory.get_network_fn(conf.model_name,
                                               num_classes,
                                               weight_decay=conf.weight_decay,
                                               is_training=is_training)

    model_image_size = conf.model_image_size or model_f.default_image_size

    def pre_process(example_proto, training):
        features = {
            "image/encoded": tf.FixedLenFeature((),
                                                tf.string,
                                                default_value=""),
            "image/class/label": tf.FixedLenFeature((),
                                                    tf.int64,
                                                    default_value=0),
            'image/height': tf.FixedLenFeature((), tf.int64, default_value=0),
            'image/width': tf.FixedLenFeature((), tf.int64, default_value=0)
        }

        parsed_features = tf.parse_single_example(example_proto, features)
        if conf.preprocessing_name:
            image_preprocessing_fn = preprocessing_factory.get_preprocessing(
                conf.preprocessing_name, is_training=training)
            image = tf.image.decode_image(parsed_features["image/encoded"],
                                          num_channel)
            image = tf.clip_by_value(
                image_preprocessing_fn(image, model_image_size,
                                       model_image_size), -1, 1.0)
        else:
            image = tf.clip_by_value(
                tf.image.per_image_standardization(
                    tf.image.resize_images(
                        tf.image.decode_jpeg(parsed_features["image/encoded"],
                                             num_channel),
                        [model_image_size, model_image_size])), -1., 1.0)

        if len(parsed_features["image/class/label"].get_shape()) == 0:
            label = tf.one_hot(parsed_features["image/class/label"],
                               num_classes)
        else:
            label = parsed_features["image/class/label"]

        return image, label

    def train_dataset_map(example_proto):
        return pre_process(example_proto, True)

    def test_dataset_map(example_proto):
        return pre_process(example_proto, False)

    def get_model():
        model_name = conf.model_name
        inputs = tf.placeholder(
            tf.float32,
            shape=[None, model_image_size, model_image_size, num_channel],
            name="inputs")

        labels = tf.placeholder(tf.float32,
                                shape=[None, num_classes],
                                name="labels")
        global_step = tf.Variable(0, trainable=False)
        learning_rate = optimizer.configure_learning_rate(
            NUM_DATASET_MAP[conf.dataset_name][0], global_step, conf)
        # learning_rate = tf.placeholder(tf.float32, shape=(), name="learning_rate")
        conf.num_channel = num_channel
        conf.num_classes = num_classes
        end_points = None  # only set by slim models below; avoids NameError at return
        if model_name in ["deconv", "ed", "deconv_conv"]:
            logits, gen_x, gen_x_ = model_f(inputs, model_conf=conf)
            class_loss_op = tf.reduce_mean(
                tf.nn.softmax_cross_entropy_with_logits(labels=labels,
                                                        logits=logits))
            gen_loss_op = tf.log(
                tf.reduce_mean(
                    tf.nn.softmax_cross_entropy_with_logits(labels=gen_x_,
                                                            logits=gen_x)))
            loss_op = tf.add(class_loss_op, gen_loss_op)

            ops = [class_loss_op, loss_op, gen_loss_op]
            ops_key = ["class_loss_op", "loss_op", "gen_loss_op"]
        else:
            if model_name == "conv":
                logits = model_f(inputs, model_conf=conf)
            else:
                logits, end_points = model_f(inputs)
            if model_name == "resnet":
                logits = tf.reshape(logits, [-1, num_classes])
            loss_op = tf.reduce_mean(
                tf.nn.softmax_cross_entropy_with_logits(labels=labels,
                                                        logits=logits))
            ops = [loss_op]
            ops_key = ["loss_op"]
        if conf.use_regularizer:
            weights = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES)
            regularizer = 0
            for weight in weights:
                regularizer += tf.nn.l2_loss(weight)
            regularizer *= conf.weight_decay
            loss_op += regularizer
        tf.summary.scalar('loss', loss_op)
        opt = optimizer.configure_optimizer(learning_rate, conf)
        train_op = opt.minimize(loss_op, global_step=global_step)
        accuracy_op = tf.reduce_mean(
            tf.cast(tf.equal(tf.argmax(logits, 1), tf.argmax(labels, 1)),
                    tf.float32))
        ops.append(tf.argmax(logits, 1))
        ops_key.append("predict_idx")
        tf.summary.scalar('accuracy', accuracy_op)
        merged = tf.summary.merge_all()

        return inputs, labels, train_op, accuracy_op, merged, ops, ops_key, logits, end_points

    if not os.path.exists(conf.dataset_dir):
        conf.dataset_dir = os.path.join("/home/data", conf.dataset_name)

    train_filenames = glob.glob(
        os.path.join(conf.dataset_dir,
                     conf.dataset_name + ("_%s*tfrecord" % conf.train_name)))
    test_filenames = glob.glob(
        os.path.join(conf.dataset_dir,
                     conf.dataset_name + ("_%s*tfrecord" % conf.test_name)))

    inputs, labels, train_op, accuracy_op, merged, ops, ops_key, logits, end_points = get_model()

    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)
    summary_dir = os.path.join(conf.log_dir, "summary")
    train_writer = tf.summary.FileWriter(summary_dir + '/train', sess.graph)
    test_writer = tf.summary.FileWriter(summary_dir + '/test')
    sess.run(tf.global_variables_initializer())
    saver = tf.train.Saver()
    if conf.restore_model_path and len(
            glob.glob(conf.restore_model_path + ".data-00000-of-00001")) > 0:
        print("restore!!")
        saver.restore(sess, conf.restore_model_path)

    train_iterator = tf.data.TFRecordDataset(train_filenames).map(
        train_dataset_map, conf.num_dataset_parallel).shuffle(
            buffer_size=conf.shuffle_buffer).batch(
                conf.batch_size).make_initializable_iterator()
    train_next = train_iterator.get_next()
    test_iterator = tf.data.TFRecordDataset(test_filenames).map(
        test_dataset_map, conf.num_dataset_parallel).batch(
            conf.batch_size).make_initializable_iterator()
    test_next = test_iterator.get_next()

    num_train = NUM_DATASET_MAP[conf.dataset_name][0] // conf.batch_size
    num_test = NUM_DATASET_MAP[conf.dataset_name][1] // conf.batch_size
    if conf.vis_epoch is not None:
        config = projector.ProjectorConfig()
        vis_dir = os.path.join(conf.log_dir, "embedding")
        total_dataset = None
        total_labels = None
        total_activations = None
    heatmap_imgs = {}
    bb_imgs = {}
    for epoch in range(conf.epoch):
        train_step = 0
        if conf.vis_epoch is not None and total_dataset is not None:
            total_dataset = None
            total_labels = None
            total_activations = None
        if conf.train:
            sess.run(train_iterator.initializer)
            total_train_accuracy = .0
            inner_train_step = 0
            while True:
                try:
                    batch_xs, batch_ys = sess.run(train_next)
                    results = sess.run([
                        train_op,
                        merged,
                        accuracy_op,
                    ] + ops,
                                       feed_dict={
                                           inputs: batch_xs,
                                           labels: batch_ys,
                                           is_training: True
                                       })
                    total_train_accuracy += results[2]
                    now = datetime.now().strftime('%Y/%m/%d %H:%M:%S')
                    if train_step % conf.summary_interval == 0:
                        ops_results = " ".join(
                            list(
                                map(lambda x: str(x),
                                    list(zip(ops_key, results[3:])))))
                        print(
                            ("[%s TRAIN %d epoch, %d / %d step] accuracy: %f" %
                             (now, epoch, train_step, num_train, results[2])) +
                            ops_results)
                        train_writer.add_summary(
                            results[1], train_step + epoch * num_train)
                    train_step += 1
                    inner_train_step += 1
                except tf.errors.OutOfRangeError:
                    break
            if inner_train_step > 0:
                print("Avg Train Accuracy : %f" %
                      (float(total_train_accuracy) / inner_train_step))
            if epoch % conf.num_save_interval == 0:
                saver.save(sess, conf.log_dir + "/model_epoch_%d.ckpt" % epoch)
        if conf.eval:
            total_accuracy = 0
            test_step = 0
            sess.run(test_iterator.initializer)

            while True:
                try:
                    test_xs, test_ys = sess.run(test_next)
                    results = sess.run([merged, accuracy_op, logits] + ops,
                                       feed_dict={
                                           inputs: test_xs,
                                           labels: test_ys,
                                           is_training: False
                                       })
                    total_accuracy += results[1]
                    now = datetime.now().strftime('%Y/%m/%d %H:%M:%S')
                    ops_results = " ".join(
                        list(
                            map(lambda x: str(x),
                                list(zip(ops_key, results[3:])))))
                    print(("[%s TEST %d epoch, %d /%d step] accuracy: %f" %
                           (now, epoch, test_step, num_test, results[1])) +
                          ops_results + "labels", test_ys.argmax(1))
                    test_writer.add_summary(
                        results[0],
                        test_step + (train_step + epoch * num_train))
                    test_step += 1
                    if conf.vis_epoch is not None and epoch % conf.vis_epoch == 0:
                        if conf.num_vis_steps >= test_step:

                            if conf.use_predict_of_test_for_embed_vis:
                                predict_y = np.zeros(
                                    (conf.batch_size, num_classes))
                                predict_y[np.arange(conf.batch_size),
                                          results[-1]] = 1
                                tmp_labels = predict_y
                            else:
                                tmp_labels = test_ys

                            if total_dataset is None:
                                total_dataset = test_xs
                                total_labels = tmp_labels

                                total_activations = results[2]
                            else:
                                total_dataset = np.append(test_xs,
                                                          total_dataset,
                                                          axis=0)
                                total_labels = np.append(tmp_labels,
                                                         total_labels,
                                                         axis=0)
                                total_activations = np.append(
                                    results[2], total_activations, axis=0)

                            ### Create CAM image
                            if end_points and conf.num_cam:
                                grad_cam_plus_plus = GradCamPlusPlus(
                                    end_points[
                                        model_f.default_logit_layer_name],
                                    end_points[
                                        model_f.default_last_conv_layer_name],
                                    inputs, is_training)
                                cam_imgs, class_indices = grad_cam_plus_plus.create_cam_imgs(
                                    sess, test_xs, results[2])
                                for i in range(conf.num_cam):
                                    box_img = np.copy(test_xs[i])
                                    # for j in range(GradCamPlusPlus.TOP3):
                                    ### Overlay heatmap
                                    heatmap = grad_cam_plus_plus.convert_cam_2_heatmap(
                                        cam_imgs[i][0])
                                    overlay_img = grad_cam_plus_plus.overlay_heatmap(
                                        test_xs[i], heatmap)
                                    if test_ys[i].argmax() == results[2][i].argmax():
                                        key = "label_%d" % test_ys[i].argmax()
                                    else:
                                        key = "fail_label_%d_pred_%d" % (
                                            test_ys[i].argmax(),
                                            results[2][i].argmax())
                                    if key not in heatmap_imgs:
                                        heatmap_imgs[key] = []
                                        bb_imgs[key] = []
                                    if len(test_xs[i].shape) != 3 or test_xs[i].shape[2] != 3:
                                        test = cv2.cvtColor(
                                            test_xs[i],
                                            cv2.COLOR_GRAY2BGR)[..., ::-1]
                                    else:
                                        test = test_xs[i]
                                    heatmap_imgs[key].append(
                                        overlay_img[..., ::-1])
                                    heatmap_imgs[key].append(test)

                                    ### Boxing
                                    box_img = grad_cam_plus_plus.draw_rectangle(
                                        box_img, cam_imgs[i][0], [255, 0, 0])
                                    bb_imgs[key].append(box_img)

                except tf.errors.OutOfRangeError:
                    break
            if conf.vis_epoch is not None and epoch % conf.vis_epoch == 0:
                for key in heatmap_imgs:
                    write_summary(test_writer,
                                  "heatmap_epoch_%d_%s" % (epoch, key),
                                  heatmap_imgs[key], sess)
                    write_summary(test_writer, "bb_epoch_%d_%s" % (epoch, key),
                                  bb_imgs[key], sess)
                heatmap_imgs = {}
                bb_imgs = {}
            if test_step > 0:
                print("Avg Accuracy : %f" %
                      (float(total_accuracy) / test_step))
                if conf.vis_epoch is not None and epoch % conf.vis_epoch == 0:
                    # vis_dir = os.path.join(conf.log_dir, "embed_vis_%d" % epoch)
                    visualizer.add_embedding(
                        config,
                        sess=sess,
                        embedding_list=[total_activations],
                        embedding_path=vis_dir,
                        image_size=model_image_size,
                        channel=num_channel,
                        labels=total_labels,
                        prefix="epoch" + str(epoch))
        if not conf.train:
            break

    ### Write summary
    # write_summary(test_writer, summary_names, result_imgs, sess)

    if conf.vis_epoch is not None and conf.eval and total_dataset is not None:
        visualizer.write_embedding(config,
                                   sess,
                                   total_dataset,
                                   embedding_path=vis_dir,
                                   image_size=model_image_size,
                                   channel=num_channel,
                                   labels=total_labels)

    sess.close()
Beispiel #26
0
    def save_tensorboard(self, dirpath=None):

        if not self.trained:
            raise Exception('Train `Word2Vec` first.')

        if dirpath is None:
            raise ValueError('`dirpath` is required.')

        os.makedirs(dirpath, exist_ok=True)

        weights = self.embed.wv.vectors
        idx2words = self.embed.wv.index2word

        vocab_size = weights.shape[0]
        embedding_dim = weights.shape[1]

        with open(os.path.join(dirpath, "metadata.tsv"), 'w') as f:
            f.writelines("\n".join(idx2words))

        tf.reset_default_graph()

        W = tf.Variable(
            tf.constant(0., shape=[vocab_size, embedding_dim]),
            trainable=False,
            name="W",
        )
        embedding_placeholder = tf.placeholder(
            tf.float32,
            [vocab_size, embedding_dim],
        )
        embedding_init = W.assign(embedding_placeholder)

        writer = tf.summary.FileWriter(
            dirpath,
            graph=tf.get_default_graph(),
        )
        saver = tf.train.Saver()

        # tf.contrib.tensorboard.plugins.projector.projector_config.proto
        config = projector.ProjectorConfig()

        embedding = config.embeddings.add()
        embedding.tensor_name = W.name
        embedding.metadata_path = "metadata.tsv"  # resolved relative to the log dir
        # Saves a configuration file that TensorBoard will read during startup.
        projector.visualize_embeddings(writer, config)

        with tf.Session() as sess:
            sess.run(
                embedding_init,
                feed_dict={embedding_placeholder: weights},
            )
            save_path = saver.save(
                sess,
                os.path.join(
                    dirpath,
                    "tf-model.cpkt",
                ),
            )
        
        print(f"'Projector Saved: '{save_path}'")

        return save_path
Beispiel #27
0
def build_graph():

    vocabulary_size = FLAGS.vocabulary_size or int(
        redis_client.zcard(FLAGS.redis_key_vocabulary))
    valid_examples = np.random.choice(100, FLAGS.valid_size, replace=False)

    graph = tf.Graph()

    with graph.as_default():
        with tf.name_scope('Inputs'):
            # input data
            train_inputs = tf.placeholder(tf.int32, shape=[FLAGS.batch_size])
            train_labels = tf.placeholder(tf.int32,
                                          shape=[FLAGS.batch_size, 1])
            valid_dataset = tf.constant(valid_examples)

        with tf.device('/cpu:0'):
            with tf.name_scope('embeddings'):
                # Look up embeddings for inputs.
                embeddings = tf.Variable(
                    tf.random_uniform([vocabulary_size, FLAGS.embedding_size],
                                      -1.0, 1.0))

                embeded = tf.nn.embedding_lookup(embeddings, train_inputs)

            with tf.name_scope('weight'):
                # Construct the variables for the NCE loss
                nce_weights = tf.Variable(
                    tf.truncated_normal(
                        [vocabulary_size, FLAGS.embedding_size],
                        stddev=1.0 / np.sqrt(FLAGS.embedding_size)))
            with tf.name_scope('biases'):
                nce_biases = tf.Variable(tf.zeros([vocabulary_size]))

        # Compute the average NCE loss for the batch.
        # tf.nce_loss automatically draws a new sample of the negative labels each
        # time we evaluate the loss.
        # TODO num_sampled?
        loss = tf.reduce_mean(
            tf.nn.nce_loss(weights=nce_weights,
                           biases=nce_biases,
                           labels=train_labels,
                           inputs=embeded,
                           num_sampled=64,
                           num_classes=vocabulary_size))
        # Add the loss value as a scalar to summary.
        tf.summary.scalar('loss', loss)
        with tf.name_scope('optimizer'):
            optimizer = tf.train.GradientDescentOptimizer(1.0).minimize(loss)

        # Compute the cosine similarity between minibatch examples and all embeddings.
        norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings), 1, keep_dims=True))
        normalized_embeddings = embeddings / norm
        valid_embeddings = tf.nn.embedding_lookup(normalized_embeddings,
                                                  valid_dataset)
        # Cosine similarity:
        # (valid_size, embedding_size) * (vocabulary_size, embedding_size)^T
        similarity = tf.matmul(valid_embeddings,
                               normalized_embeddings,
                               transpose_b=True)

        # Merge all summaries.
        merged = tf.summary.merge_all()

        # Add variable initializer.
        init = tf.global_variables_initializer()

        # Create a saver.
        saver = tf.train.Saver()

    with tf.Session(graph=graph) as session:
        writer = tf.summary.FileWriter(FLAGS.log_dir, session.graph)

        init.run()
        print('Initialized')

        target_index = 0
        average_loss = 0
        for step in range(FLAGS.num_steps):
            batch_train, labels_train, target_index = generate_train_batch(
                target_index, FLAGS.window_width, FLAGS.batch_size,
                FLAGS.num_skips, FLAGS.redis_key_index)
            feed_dict = {train_inputs: batch_train, train_labels: labels_train}
            # Define metadata variable.
            run_metadata = tf.RunMetadata()

            _, loss_val, summary = session.run([optimizer, loss, merged],
                                               feed_dict=feed_dict,
                                               run_metadata=run_metadata)

            average_loss += loss_val
            # Add returned summaries to writer in each step.
            writer.add_summary(summary, step)

            # Add metadata to visualize the graph for the last run.
            if step == (FLAGS.num_steps - 1):
                writer.add_run_metadata(run_metadata, 'step%d' % step)

            if step % 2000 == 0:
                if step > 0:
                    average_loss /= 2000
                print('Average loss at step ', step, ': ', average_loss)
                average_loss = 0

            # Inspect the most similar words
            if step % 10000 == 0:
                sim = similarity.eval()

                for i in range(FLAGS.valid_size):
                    valid_word = redis_client.zrevrange(
                        FLAGS.redis_key_vocabulary, valid_examples[i],
                        valid_examples[i])[0]
                    top_k = 8  # 8 nearest neighbors

                    nearest = (-sim[i, :]).argsort()[1:top_k + 1]

                    log_str = 'Nearest {}:'.format(valid_word.decode('utf8'))
                    for k in range(top_k):
                        close_word = redis_client.zrevrange(
                            FLAGS.redis_key_vocabulary, nearest[k],
                            nearest[k])[0]
                        log_str = '{} {},'.format(log_str,
                                                  close_word.decode('utf8'))
                    print(log_str)

        final_embeddings = normalized_embeddings.eval()

        # Write corresponding labels for the embeddings.
        with open(FLAGS.log_dir + '/metadata.tsv', 'w') as f:
            for i in range(vocabulary_size):
                f.write(
                    redis_client.zrevrange(FLAGS.redis_key_vocabulary, i, i)
                    [0].decode('utf8') + '\n')
        # Save the model for checkpoints.
        saver.save(session, os.path.join(FLAGS.log_dir, 'model.ckpt'))
        # Create a configuration for visualizing embeddings with the labels in TensorBoard.
        config = projector.ProjectorConfig()
        embedding_conf = config.embeddings.add()
        embedding_conf.tensor_name = embeddings.name
        embedding_conf.metadata_path = os.path.join('metadata.tsv')
        projector.visualize_embeddings(writer, config)

        plot_samples(final_embeddings, vocabulary_size)
Beispiel #28
0
    def set_model(self, model):
        self.model = model
        if K.backend() == 'tensorflow':
            self.sess = K.get_session()
        if self.histogram_freq and self.merged is None:
            for layer in self.model.layers:

                for weight in layer.weights:
                    mapped_weight_name = weight.name.replace(':', '_')
                    tf.summary.histogram(mapped_weight_name, weight)
                    if self.write_grads:
                        grads = model.optimizer.get_gradients(
                            model.total_loss, weight)

                        def is_indexed_slices(grad):
                            return type(grad).__name__ == 'IndexedSlices'

                        grads = [
                            grad.values if is_indexed_slices(grad) else grad
                            for grad in grads
                        ]
                        tf.summary.histogram(
                            '{}_grad'.format(mapped_weight_name), grads)
                    if self.write_images:
                        w_img = tf.squeeze(weight)
                        shape = K.int_shape(w_img)
                        if len(shape) == 2:  # dense layer kernel case
                            if shape[0] > shape[1]:
                                w_img = tf.transpose(w_img)
                                shape = K.int_shape(w_img)
                            w_img = tf.reshape(w_img,
                                               [1, shape[0], shape[1], 1])
                        elif len(shape) == 3:  # convnet case
                            if K.image_data_format() == 'channels_last':
                                # switch to channels_first to display
                                # every kernel as a separate image
                                w_img = tf.transpose(w_img, perm=[2, 0, 1])
                                shape = K.int_shape(w_img)
                            w_img = tf.reshape(
                                w_img, [shape[0], shape[1], shape[2], 1])
                        elif len(shape) == 1:  # bias case
                            w_img = tf.reshape(w_img, [1, shape[0], 1, 1])
                        else:
                            # not possible to handle 3D convnets etc.
                            continue

                        shape = K.int_shape(w_img)
                        assert len(shape) == 4 and shape[-1] in [1, 3, 4]
                        tf.summary.image(mapped_weight_name, w_img)

                if hasattr(layer, 'output'):
                    if isinstance(layer.output, list):
                        for i, output in enumerate(layer.output):
                            tf.summary.histogram(
                                '{}_out_{}'.format(layer.name, i), output)
                    else:
                        tf.summary.histogram('{}_out'.format(layer.name),
                                             layer.output)
        self.merged = tf.summary.merge_all()

        if self.write_graph:
            self.writer = tf.summary.FileWriter(self.log_dir, self.sess.graph)
        else:
            self.writer = tf.summary.FileWriter(self.log_dir)

        if self.embeddings_freq and self.embeddings_data is not None:
            self.embeddings_data = standardize_input_data(
                self.embeddings_data, model.input_names)

            embeddings_layer_names = self.embeddings_layer_names

            if not embeddings_layer_names:
                embeddings_layer_names = [
                    layer.name for layer in self.model.layers
                    if type(layer).__name__ == 'Embedding'
                ]
            self.assign_embeddings = []
            embeddings_vars = {}

            self.batch_id = batch_id = tf.placeholder(tf.int32)
            self.step = step = tf.placeholder(tf.int32)

            for layer in self.model.layers:
                if layer.name in embeddings_layer_names:
                    embedding_input = self.model.get_layer(layer.name).output
                    embedding_size = np.prod(embedding_input.shape[1:])
                    embedding_input = tf.reshape(embedding_input,
                                                 (step, int(embedding_size)))
                    shape = (self.embeddings_data[0].shape[0],
                             int(embedding_size))
                    embedding = tf.Variable(tf.zeros(shape),
                                            name=layer.name + '_embedding')
                    embeddings_vars[layer.name] = embedding
                    batch = tf.assign(embedding[batch_id:batch_id + step],
                                      embedding_input)
                    self.assign_embeddings.append(batch)

            self.saver = tf.train.Saver(list(embeddings_vars.values()))

            embeddings_metadata = {}

            if not isinstance(self.embeddings_metadata, str):
                embeddings_metadata = self.embeddings_metadata
            else:
                embeddings_metadata = {
                    layer_name: self.embeddings_metadata
                    for layer_name in embeddings_vars.keys()
                }

            config = projector.ProjectorConfig()

            for layer_name, tensor in embeddings_vars.items():
                embedding = config.embeddings.add()
                embedding.tensor_name = tensor.name

                if layer_name in embeddings_metadata:
                    embedding.metadata_path = embeddings_metadata[layer_name]

            projector.visualize_embeddings(self.writer, config)
Beispiel #29
0
# Generate the metadata file
if tf.gfile.Exists(DIR + 'projector/projector/metadata.tsv'):
    tf.gfile.Remove(DIR + 'projector/projector/metadata.tsv')
with open(DIR + 'projector/projector/metadata.tsv', 'w') as f:
    labels = sess.run(tf.argmax(mnist.test.labels[:], 1))
    for i in range(image_num):
        f.write(str(labels[i]) + '\n')

# Merge all summaries
merged = tf.summary.merge_all()

projector_writer = tf.summary.FileWriter(DIR + 'projector/projector/',
                                         sess.graph)
saver = tf.train.Saver()
config = projector.ProjectorConfig()
embed = config.embeddings.add()
embed.metadata_path = DIR + 'projector/projector/metadata.tsv'
embed.sprite.image_path = DIR + 'projector/data/mnist_10k_sprite.png'
embed.sprite.single_image_dim.extend([28, 28])
projector.visualize_embeddings(projector_writer, config)

for i in range(max_steps):
    # 100 samples per batch
    batch_xs, batch_ys = mnist.train.next_batch(100)
    run_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)
    run_metadata = tf.RunMetadata()
    summary, _ = sess.run([merged, train_step],
                          feed_dict={
                              x: batch_xs,
                              y: batch_ys
                          },
                          options=run_options,
                          run_metadata=run_metadata)
    projector_writer.add_run_metadata(run_metadata, 'step%03d' % i)
    projector_writer.add_summary(summary, i)
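# The projector reads embedding values from a checkpoint, so the variables
# must be saved once training finishes. A minimal follow-up sketch; the
# checkpoint filename here is an assumption, not part of the original snippet:
saver.save(sess, DIR + 'projector/projector/a_model.ckpt', global_step=max_steps)
projector_writer.close()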
Beispiel #30
0
import os

import tensorflow as tf
from six.moves import xrange
from tensorflow.contrib.tensorboard.plugins import projector


def train(FLAGS,
          graph=None,
          init=None,
          optimizer=None,
          merged=None,
          loss=None,
          similarity=None,
          reverse_dictionary=None,
          normalized_embeddings=None,
          saver=None,
          embeddings=None,
          vocabulary_size=None,
          valid_size=None,
          valid_examples=None,
          data=None,
          train_inputs=None,
          train_labels=None):
    # We pick a random validation set to sample nearest neighbors. Here we limit
    # the validation samples to the words that have a low numeric ID, which by
    # construction are also the most frequent. These variables are used only for
    # displaying model accuracy; they don't affect the calculation.

    global i, final_embeddings
    num_steps = 100001
    with tf.Session(graph=graph) as session:
        # Open a writer to write summaries.
        writer = tf.summary.FileWriter(FLAGS.log_dir, session.graph)

        # We must initialize all variables before we use them.
        init.run()
        print('Initialized')

        average_loss = 0
        for step in xrange(num_steps):
            batch_inputs, batch_labels = generate_batch(
                batch_size, num_skips, skip_window, data)
            feed_dict = {
                train_inputs: batch_inputs,
                train_labels: batch_labels
            }

            # Define metadata variable.
            run_metadata = tf.RunMetadata()

            # We perform one update step by evaluating the optimizer op and
            # include it in the list of values returned by session.run().
            # Also evaluate the merged op to get all summaries in the returned
            # "summary" variable. Feed the metadata variable to the session to
            # visualize the graph in TensorBoard.
            _, summary, loss_val = session.run([optimizer, merged, loss],
                                               feed_dict=feed_dict,
                                               run_metadata=run_metadata)
            average_loss += loss_val

            # Add returned summaries to writer in each step.
            writer.add_summary(summary, step)
            # Add metadata to visualize the graph for the last run.
            if step == (num_steps - 1):
                writer.add_run_metadata(run_metadata, 'step%d' % step)

            if step % 2000 == 0:
                if step > 0:
                    average_loss /= 2000
                # The average loss is an estimate of the loss over the last 2000 batches.
                print('Average loss at step ', step, ': ', average_loss)
                average_loss = 0

            # Note that this is expensive (~20% slowdown if computed every 500 steps)
            if step % 10000 == 0:
                sim = similarity.eval()
                for i in xrange(valid_size):
                    valid_word = reverse_dictionary[valid_examples[i]]
                    top_k = 8  # number of nearest neighbors
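                    # argsort on the negated row sorts by descending similarity;
                    # index 0 is the word itself, so it is skipped.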
                    nearest = (-sim[i, :]).argsort()[1:top_k + 1]
                    log_str = 'Nearest to %s:' % valid_word
                    for k in xrange(top_k):
                        close_word = reverse_dictionary[nearest[k]]
                        log_str = '%s %s,' % (log_str, close_word)
                    print(log_str)
        final_embeddings = normalized_embeddings.eval()

        # Write corresponding labels for the embeddings.
        with open(FLAGS.log_dir + '/metadata.tsv', 'w') as f:
            for i in xrange(vocabulary_size):
                f.write(reverse_dictionary[i] + '\n')

        # Save the model for checkpoints.
        saver.save(session, os.path.join(FLAGS.log_dir, 'model.ckpt'))

        # Create a configuration for visualizing embeddings with the labels in TensorBoard.
        config = projector.ProjectorConfig()
        embedding_conf = config.embeddings.add()
        embedding_conf.tensor_name = embeddings.name
        embedding_conf.metadata_path = os.path.join(FLAGS.log_dir,
                                                    'metadata.tsv')
        projector.visualize_embeddings(writer, config)
    writer.close()
    return final_embeddings
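
The training loop above calls a `generate_batch` helper that is not shown. Below is a minimal sketch of the standard skip-gram batch generator from the TensorFlow word2vec tutorial, which produces the (batch_inputs, batch_labels) shapes the feed_dict expects; the module-level `data_index` cursor is an assumption carried over from that tutorial:

import collections
import random

import numpy as np

data_index = 0

def generate_batch(batch_size, num_skips, skip_window, data):
    """Yield one skip-gram batch: center words and sampled context labels."""
    global data_index
    assert batch_size % num_skips == 0
    assert num_skips <= 2 * skip_window
    batch = np.ndarray(shape=(batch_size,), dtype=np.int32)
    labels = np.ndarray(shape=(batch_size, 1), dtype=np.int32)
    span = 2 * skip_window + 1  # [ skip_window ... target ... skip_window ]
    buffer = collections.deque(maxlen=span)
    if data_index + span > len(data):
        data_index = 0
    buffer.extend(data[data_index:data_index + span])
    data_index += span
    for i in range(batch_size // num_skips):
        context_words = [w for w in range(span) if w != skip_window]
        words_to_use = random.sample(context_words, num_skips)
        for j, context_word in enumerate(words_to_use):
            batch[i * num_skips + j] = buffer[skip_window]        # center word
            labels[i * num_skips + j, 0] = buffer[context_word]   # one context word
        if data_index == len(data):
            buffer.extend(data[0:span])
            data_index = span
        else:
            buffer.append(data[data_index])
            data_index += 1
    # Back up a little so words at the end of a batch are not skipped.
    data_index = (data_index + len(data) - span) % len(data)
    return batch, labels

Once training has written the checkpoint and metadata, the embeddings can be inspected in the Projector tab by pointing TensorBoard at the log directory (tensorboard --logdir <FLAGS.log_dir>).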