Exemple #1
0
def train():
    with tf.Graph().as_default():
        global_step = tf.Variable(0,trainable=False)

        images,labels = cifar10.distorted_inputs()

        logits = cifar10.inference(images)

        loss = cifar10.loss(logits,labels)

        train_op = cifar10.train(loss,global_step)

        saver = tf.train.Saver(tf.all_variables())

        summary_op = tf.summary.merge_all()

        init = tf.global_variables_initializer()

        sess = tf.Session(config=tf.ConfigProto(
            log_device_placement = FLAGS.log_device_placement))
        sess.run(init)

        #
        tf.train.start_queue_runners(sess=sess)
        summary_writer = tf.summary.FileWriter(FLAGS.train_dir,graph_def=sess.graph_def)

        for step in range(FLAGS.max_steps):
            start_time = time.time()
            _,loss_value = sess.run([train_op,loss])
            duration = time.time() - start_time

            assert not np.isnan(loss_value), 'Model diverged with loss = NaN'

            if step % 10 ==0:
                num_examples_per_step = FLAGS.batch_size
                examples_per_sec = num_examples_per_step / duration
                sec_per_batch = float(duration)

                format_str = ('%s: step %d, loss = %.2f (%.1f examples/sec; %.3f '
                              'sec/batch)')
                print(format_str % (datetime.now(), step, loss_value,
                                    examples_per_sec, sec_per_batch))

            if step % 100 == 0:
                summary_str = sess.run(summary_op)
                summary_writer.add_summary(summary_str, step)


            if step % 1000 == 0 or (step + 1) == FLAGS.max_steps:
                checkpoint_path = os.path.join(FLAGS.train_dir, 'model.ckpt')
                saver.save(sess, checkpoint_path, global_step=step)
                use_mini_batch=True)

# 构建 KMeans 的图
training_graph = kmeans.training_graph()

if len(training_graph) > 6:
    (all_scores, cluster_idx, scores, cluster_centers_initialized,
     cluster_centers_var, init_op, train_op) = training_graph
else:
    (all_scores, cluster_idx, scores, cluster_centers_initialized,
     cluster_centers_var, init_op, train_op) = training_graph

cluster_idx = cluster_idx[0]
avg_distance = tf.reduce_mean(scores)

init_vars = tf.global_variables_initializer()

sess = tf.Session()

sess.run(init_vars, feed_dict={X: full_data_x})
sess.run(init_op, feed_dict={X: full_data_x})

# training
for i in range(1, num_steps + 1):
    _, d, idx = sess.run([train_op, avg_distance, cluster_idx],
                         feed_dict={X: full_data_x})
    if i % 10 == 0 or i == 1:
        print("Step %i, Avg Distance: %f" % (i, d))

# 为质心分配标签
# 使用每次训练的标签,汇总每个质心的所有标签总数
Exemple #3
0
def word2vec_basic(log_dir):
    # 创建tensorboard的可视化目录
    if not os.path.exists(log_dir):
        os.makedirs(log_dir)

    # 第一步,下载数据
    url = 'http://mattmahoney.net/dc/'

    def maybe_download(filename, expected_bytes, sha256=None):
        local_filename = os.path.join(gettempdir(), filename)
        if not os.path.exists(local_filename):
            local_filename, _ = urllib.request.urlretrieve(
                url + filename, local_filename)
        statinfo = os.stat(local_filename)

        if sha256 and _hash_file(local_filename) != sha256:
            raise Exception('Failed to verify ' + local_filename +
                            ' due to hash '
                            'mismatch. Can you get to it with a browser?')

        if statinfo.st_size == expected_bytes:
            print("found and verified", filename)
        else:
            print(statinfo.st_size)
            raise Exception('Failed to verify ' + local_filename +
                            '. Can you get to it with a browser?')
        return local_filename

    filename = maybe_download(
        'text8.zip',
        31344016,
        sha256=
        'a6640522afe85d1963ad56c05b0ede0a0c000dddc9671758a6cc09b7a38e5232')

    # 数据转为List<String>
    def read_data(filename):
        with zipfile.ZipFile(filename) as f:
            data = tf.compat.as_str(f.read(f.namelist()[0])).split()
        return data

    vocabulary = read_data(filename)
    print('data_size', len(vocabulary))

    # 第二步,建词典并且把罕见词替换成UNK
    vocabulary_size = 50000

    def build_dataset(words, n_words):

        count = [['UNK', -1]]
        count.extend(collections.Counter(words).most_common(n_words - 1))
        dictionary = {word: index for index, (word, _) in enumerate(count)}
        data = []
        unk_count = 0
        for word in words:
            index = dictionary.get(word, 0)
            if index == 0:  # dictionary['UNK']
                unk_count += 1
            data.append(index)
        count[0][1] = unk_count
        reversed_dictionary = dict(zip(dictionary.values(), dictionary.keys()))
        return data, count, dictionary, reversed_dictionary

    # data: 词表中的所有的词的id
    # count: 单词和出现次数的map
    # dictionary: 单词-->index 的映射
    # reverse_dictionary:index -->单词
    data, count, dictionary, reversed_dictionary = build_dataset(
        vocabulary, vocabulary_size)
    del vocabulary
    print('Most common words (+UNK)', count[:5])
    print('Sample data', data[:10],
          [reversed_dictionary[i] for i in data[:10]])

    # 针对skip-gram模型生成batch数据
    def generate_batch(batch_size, num_skips, skip_window):
        global data_index
        assert batch_size % num_skips == 0
        assert num_skips <= 2 * skip_window
        batch = np.ndarray(shape=(batch_size), dtype=np.int32)
        labels = np.ndarray(shape=(batch_size, 1), dtype=np.int32)
        # skip的范围
        span = 2 * skip_window + 1
        buffer = collections.deque(maxlen=span)
        if data_index + span > len(data):
            data_index = 0
        buffer.extend(data[data_index:data_index + span])  # 向后取一个窗口内的结果
        data_index += span
        for i in range(batch_size // num_skips):
            context_words = [w for w in range(span) if w != skip_window]
            words_to_use = random.sample(context_words, num_skips)
            for j, context_words in enumerate(words_to_use):
                batch[i * num_skips + j] = buffer[skip_window]
                labels[i * num_skips + j, 0] = buffer[context_words]
            if data_index == len(data):
                buffer.extend(data[0:span])
                data_index = span
            else:
                buffer.append(data[data_index])
                data_index += 1
        # Backtrack a little bit to avoid skipping words in the end of a batch
        data_index = (data_index - span) % len(data)
        return batch, labels

    batch, labels = generate_batch(batch_size=8, num_skips=2, skip_window=1)
    for i in range(8):
        print(batch[i], reversed_dictionary[batch[i]], '->', labels[i, 0],
              reversed_dictionary[labels[i, 0]])

    # 建立并且训练模型

    batch_size = 128
    embedding_size = 128  # 词向量维度
    skip_window = 1  # 考虑左右几个单词
    num_skips = 2  # 复用输入生成标签的次数
    num_sampled = 64  # 负样本数量

    # 采样一个样本的近邻作为随机验证机,将验证集样本限制为 较低id的单词,是比较高频的构造词汇
    # 这三个变量用作显示模型准确率,不影响计算。
    valid_size = 16  # 用于评估相似性的随机单词集合
    valid_window = 100  #
    valid_examples = np.random.choice(valid_window, valid_size, replace=False)

    graph = tf.Graph()

    with graph.as_default():

        # 输入数据
        with tf.name_scope('input'):
            train_inputs = tf.placeholder(tf.int32, shape=[batch_size])
            train_labels = tf.placeholder(tf.int32, shape=[batch_size, 1])
            valid_dataset = tf.constant(valid_examples, dtype=tf.int32)

        # 操作op和变量variables 固定在CPU上。
        with tf.device('/cpu:0'):
            with tf.name_scope('embeddings'):
                embeddings = tf.Variable(
                    tf.random_uniform([vocabulary_size, embedding_size], -1.0,
                                      1.0))
                embed = tf.nn.embedding_lookup(embeddings, train_inputs)

            # 构造NCE损失的变量
            with tf.name_scope('weights'):
                nce_weights = tf.Variable(
                    tf.truncated_normal([vocabulary_size, embedding_size],
                                        stddev=1.0 /
                                        math.sqrt(embedding_size)))

            with tf.name_scope('biases'):
                nce_biases = tf.Variable(tf.zeros([vocabulary_size]))

        # 计算该批次的平均nce损失,当评估损失的时候,自动绘制一个新的负样本。
        with tf.name_scope('loss'):
            loss = tf.reduce_mean(
                tf.nn.nce_loss(weights=nce_weights,
                               biases=nce_biases,
                               labels=train_labels,
                               inputs=embed,
                               num_sampled=num_sampled,
                               num_classes=vocabulary_size))
        # 汇总损失
        tf.summary.scalar('loss', loss)

        # 构造SGD
        with tf.name_scope('opytimizer'):
            optimizer = tf.train.GradientDescentOptimizer(1.0).minimize(loss)
        # 计算小批次样本和所有样本之间的余弦相似度
        norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings), 1, keep_dims=True))
        normalized_embeddings = embeddings / norm
        valid_embeddings = tf.nn.embedding_lookup(normalized_embeddings,
                                                  valid_dataset)
        similarity = tf.matmul(valid_embeddings,
                               normalized_embeddings,
                               transpose_b=True)

        # merge all summary
        merged = tf.summary.merge_all()

        init = tf.global_variables_initializer()

        saver = tf.train.Saver()

    # 开始训练
    num_steps = 1000001

    with tf.compat.v1.Session(graph=graph) as session:
        # 写入摘要
        writer = tf.summary.FileWriter(log_dir, session.graph)

        init.run()
        print('inited..')
        average_loss = 0
        for step in range(num_steps):
            batch_inputs, batch_labels = generate_batch(
                batch_size, num_skips, skip_window)
            feed_dict = {
                train_inputs: batch_inputs,
                train_labels: batch_labels
            }
            # 定义元变量
            run_metadata = tf.RunMetadata()

            _, summary, loss_val = session.run([optimizer, merged, loss],
                                               feed_dict=feed_dict,
                                               run_metadata=run_metadata)
            average_loss += loss_val

            writer.add_summary(summary, step)

            if step == (num_steps - 1):
                writer.add_run_metadata(run_metadata, 'step%d' % step)

            if step % 2000 == 0:
                if step > 0:
                    average_loss /= 2000
                    # 平均损失是对最近的2000个批次样本的估计。
                print('Average loss at step ', step, ': ', average_loss)
                average_loss = 0

            if step % 10000 == 0:
                sim = similarity.eval()
                for i in range(valid_size):
                    valid_word = reversed_dictionary[valid_examples[i]]
                    top_k = 8
                    nearest = (-sim[i, :]).argsort()[1:top_k + 1]
                    log_str = 'Nearest to %s:' % valid_word

                    print(
                        log_str, ', '.join([
                            reversed_dictionary[nearest[k]]
                            for k in range(top_k)
                        ]))
        final_embeddings = normalized_embeddings.eval()

        # 写下embedding的相应标签
        with open(log_dir + '/metadata.tsv', 'w') as f:
            for i in range(vocabulary_size):
                f.write(reversed_dictionary[i] + '\n')

        # 保存checkpoint
        saver.save(session, os.path.join(log_dir, 'model.ckpt'))

        # 配置Tensorboard
        config = projector.ProjectorConfig()
        embedding_conf = config.embeddings.add()
        embedding_conf.tensor_name = embeddings.name
        embedding_conf.metadata_path = os.path.join(log_dir, 'metadata.tsv')
        projector.visualize_embeddings(writer, config)
    writer.close()

    # Step 6: Visualize the embeddings.

    # pylint: disable=missing-docstring
    # Function to draw visualization of distance between embeddings.
    def plot_with_labels(low_dim_embs, labels, filename):
        assert low_dim_embs.shape[0] >= len(
            labels), 'More labels than embeddings'
        plt.figure(figsize=(18, 18))  # in inches
        for i, label in enumerate(labels):
            x, y = low_dim_embs[i, :]
            plt.scatter(x, y)
            plt.annotate(label,
                         xy=(x, y),
                         xytext=(5, 2),
                         textcoords='offset points',
                         ha='right',
                         va='bottom')

        plt.savefig(filename)

    try:
        # pylint: disable=g-import-not-at-top
        from sklearn.manifold import TSNE
        import matplotlib.pyplot as plt

        tsne = TSNE(perplexity=30,
                    n_components=2,
                    init='pca',
                    n_iter=5000,
                    method='exact')
        plot_only = 500
        low_dim_embs = tsne.fit_transform(final_embeddings[:plot_only, :])
        labels = [reversed_dictionary[i] for i in xrange(plot_only)]
        plot_with_labels(low_dim_embs, labels,
                         os.path.join(gettempdir(), 'tsne.png'))

    except ImportError as ex:
        print(
            'Please install sklearn, matplotlib, and scipy to show embeddings.'
        )
        print(ex)
sess = tf.InteractiveSession()
# 该函数可以更加灵活的构建代码,可以在运行计算的图的时候通过operation操作插入一些计算图。

x = tf.placeholder("float", shape=[None, 784])
y_ = tf.placeholder("float", shape=[None, 10])  # 占位符

W = tf.Variable(tf.zeros([784, 10]))
b = tf.Variable(tf.zeros([10]))  # 变量,跟占位符一样作为额外的输入量
sess.run(tf.initialize_all_variables())
y = tf.nn.softmax(tf.matmul(x, W) + b)  # 使用softmax计算每个分类的概率

cross_entropy = -tf.reduce_sum(y_ * tf.log(y))  # 交叉熵

train_step = tf.train.GradientDescentOptimizer(0.01).minimize(
    cross_entropy)  # 训练使用最小梯度下降,且最小化交叉熵loss
init = tf.global_variables_initializer()
for i in range(1000):
    batch = mnist.train.next_batch(50)  # load  mini-batchsize dataset
    train_step.run(feed_dict={x: batch[0], y_: batch[1]})
print("训练结束..")
"""
这段表达特别好:tf.argmax 是一个非常有用的函数,它能给出某个tensor对象在某一维上的其数据最大值所在的索引值。
由于标签向量是由0,1组成,因此最大值1所在的索引位置就是类别标签,比如tf.argmax(y,1)返回的是模型对于任一输入x预测到的标签值,
而 tf.argmax(y_,1) 代表正确的标签,我们可以用 tf.equal 来检测我们的预测是否真实标签匹配(索引位置一样表示匹配)。
"""
correct_prediction = tf.equal(tf.argmax(y, 1), tf.argmax(y_, 1))
accuarcy = tf.reduce_mean(tf.cast(correct_prediction, "float"))

print(accuarcy.eval(feed_dict={
    x: mnist.test.images,
    y_: mnist.test.labels
Exemple #5
0
def run_training():
  """Train MNIST for a number of steps."""
  # Get the sets of images and labels for training, validation, and
  # test on MNIST.
  # 加载训练数据,(data_set.train 图像和标签 data_sets.validation 图像和标签,迭代验证 )
  data_sets = input_data.read_data_sets(FLAGS.input_data_dir, FLAGS.fake_data)

  # Tell TensorFlow that the model will be built into the default Graph.
  with tf.Graph().as_default():
    # Generate placeholders for the images and labels.
    images_placeholder, labels_placeholder = placeholder_inputs(
        FLAGS.batch_size)

    # Build a Graph that computes predictions from the inference model.
    logits = mnist.inference(images_placeholder,
                             FLAGS.hidden1,
                             FLAGS.hidden2)

    # Add to the Graph the Ops for loss calculation.
    loss = mnist.loss(logits, labels_placeholder)

    # Add to the Graph the Ops that calculate and apply gradients.
    train_op = mnist.training(loss, FLAGS.learning_rate)

    # Add the Op to compare the logits to the labels during evaluation.
    eval_correct = mnist.evaluation(logits, labels_placeholder)

    # Build the summary Tensor based on the TF collection of Summaries.
    summary = tf.summary.merge_all()

    # Add the variable initializer Op.
    init = tf.global_variables_initializer()

    # Create a saver for writing training checkpoints.
    saver = tf.train.Saver()

    # Create a session for running Ops on the Graph.
    sess = tf.Session()

    # Instantiate a SummaryWriter to output summaries and the Graph.
    summary_writer = tf.summary.FileWriter(FLAGS.log_dir, sess.graph)

    # And then after everything is built:

    # Run the Op to initialize the variables.
    sess.run(init)

    # Start the training loop.
    for step in xrange(FLAGS.max_steps):
      start_time = time.time()

      # Fill a feed dictionary with the actual set of images and labels
      # for this particular training step.
      feed_dict = fill_feed_dict(data_sets.train,
                                 images_placeholder,
                                 labels_placeholder)

      # Run one step of the model.  The return values are the activations
      # from the `train_op` (which is discarded) and the `loss` Op.  To
      # inspect the values of your Ops or variables, you may include them
      # in the list passed to sess.run() and the value tensors will be
      # returned in the tuple from the call.
      _, loss_value = sess.run([train_op, loss],
                               feed_dict=feed_dict)

      duration = time.time() - start_time

      # Write the summaries and print an overview fairly often.
      if step % 100 == 0:
        # Print status to stdout.
        print('Step %d: loss = %.2f (%.3f sec)' % (step, loss_value, duration))
        # Update the events file.
        summary_str = sess.run(summary, feed_dict=feed_dict)
        summary_writer.add_summary(summary_str, step)
        summary_writer.flush()

      # Save a checkpoint and evaluate the model periodically.
      if (step + 1) % 1000 == 0 or (step + 1) == FLAGS.max_steps:
        checkpoint_file = os.path.join(FLAGS.log_dir, 'model.ckpt')
        saver.save(sess, checkpoint_file, global_step=step)
        # Evaluate against the training set.
        print('Training Data Eval:')
        do_eval(sess,
                eval_correct,
                images_placeholder,
                labels_placeholder,
                data_sets.train)
        # Evaluate against the validation set.
        print('Validation Data Eval:')
        do_eval(sess,
                eval_correct,
                images_placeholder,
                labels_placeholder,
                data_sets.validation)
        # Evaluate against the test set.
        print('Test Data Eval:')
        do_eval(sess,
                eval_correct,
                images_placeholder,
                labels_placeholder,
                data_sets.test)