Example #1
0
 def test_embedding_layer(self):
     """Shape and variable checks for models.embedding_layer (TF1 graph mode).

     Builds the layer on a 2x3 batch of token ids and asserts that:
       * the output shape is [batch, time, embed_dim] = [2, 3, 100]
       * exactly one trainable variable exists, named "W_embed:0"
       * the embedding matrix has shape [V, embed_dim] = [512, 100]
     No session is run, so only static graph properties are verified.
     """
     with tf.Graph().as_default():
         tf.set_random_seed(10)
         # NOTE(review): id 512 equals V; if valid ids are 0..V-1 this entry
         # would be out of range at lookup time — confirm embedding_layer's
         # contract (the test passes because the graph is never executed).
         ids_ = tf.constant([[0, 127, 512],
                             [63, 191,  0]], dtype=tf.int32)
         xs_ = models.embedding_layer(ids_, V=512,
                                      embed_dim=100,
                                      init_scale=1.0)
         # Static shape of the lookup: one embed_dim vector per input id.
         self.assertEqual(xs_.get_shape().as_list(), [2, 3, 100])
         # The layer must register exactly one trainable variable.
         var_names = [v.name for v in tf.trainable_variables()]
         self.assertEqual(var_names, ["W_embed:0"])
         # Embedding matrix is [vocabulary size, embedding dimension].
         self.assertEqual(tf.trainable_variables("W_embed")[0].get_shape().as_list(),
                          [512, 100])
 def test_embedding_layer(self):
     """Exercise models.embedding_layer: output shape and trainable variable."""
     with tf.Graph().as_default():
         tf.set_random_seed(10)
         token_ids = tf.constant(
             [[0, 127, 512], [63, 191, 0]], dtype=tf.int32)
         embedded = models.embedding_layer(
             token_ids, V=512, embed_dim=100, init_scale=1.0)
         # One embed_dim-sized vector per input id: [batch, time, embed_dim].
         self.assertEqual(embedded.get_shape().as_list(), [2, 3, 100])
         # Exactly one trainable variable should have been created.
         trainable_names = [var.name for var in tf.trainable_variables()]
         self.assertEqual(trainable_names, ["W_embed:0"])
         # The embedding matrix itself is [V, embed_dim].
         embedding_matrix = tf.trainable_variables("W_embed")[0]
         self.assertEqual(embedding_matrix.get_shape().as_list(), [512, 100])
Example #3
0
def embedding_parag(input_paragraphs):
    """Embed each paragraph's sentences and zero-pad to a fixed block.

    :param input_paragraphs: list of paragraph strings
    :return: (preprocessed, sentence_len) — preprocessed is a list of
             100 x 1024 nested lists (one per paragraph: sentence embeddings
             followed by zero rows); sentence_len is the sentence count of
             each paragraph, in the same order
    """
    preprocessed, sentence_len = list(), list()

    with tf.device("/gpu:0"):
        with tf.Graph().as_default():
            sentences, embeddings = embedding_layer()
            with tf.Session() as sess:
                sess.run(tf.global_variables_initializer())
                for paragraph in input_paragraphs:
                    paragraph = sent_tokenize(paragraph)
                    length = len(paragraph)
                    sentence_len.append(length)
                    # Run the embedding graph on the paragraph's sentences;
                    # sentence_rep is presumably (length, 1024) — the 1024
                    # padding width below relies on that. TODO confirm.
                    sentence_rep = sess.run(embeddings, feed_dict={sentences: paragraph})
                    # Build 100 *distinct* zero rows. The previous form,
                    # [[0] * 1024] * 100, aliased one shared list 100 times,
                    # so mutating any padding row would mutate them all.
                    # (The __main__ block elsewhere in this file already
                    # uses this comprehension form.)
                    pad = [[0] * 1024 for _ in range(100)]
                    # NOTE(review): a paragraph with more than 100 sentences
                    # grows this row past 100 entries — confirm inputs are
                    # capped upstream.
                    pad[:length] = sentence_rep.tolist()
                    preprocessed.append(pad)

    return preprocessed, sentence_len
Example #4
0
    _essays = np.array(_df['essay'])
    _essays = [sent_tokenize(_essay) for _essay in _essays]
    _scores = [
        get_nom_score(_row['essay_set'], _row['score'])
        for _, _row in _df.iterrows()
    ]
    del [[_df_essays, _df_scores, _df]]
    gc.collect()
    for e, s in zip(_essays, _scores):
        yield e, s


if __name__ == '__main__':
    with tf.device("/gpu:0"):
        with tf.Graph().as_default():
            sentences, embeddings = embedding_layer()
            with tf.Session() as sess:
                sess.run(tf.global_variables_initializer())

                print("Train Data Preprocessing")
                now_time = -time.time()
                for file_count, (essay,
                                 score) in enumerate(get_train_data(paths[0])):
                    df = pd.DataFrame()
                    sentence_rep = sess.run(embeddings,
                                            feed_dict={sentences: essay})
                    pad = [[0] * 1024 for _ in range(100)]
                    pad[:len(essay)] = sentence_rep.tolist()
                    df = df.append(pad)
                    pad = [0] * 1024
                    pad[:2] = [len(essay), score]