def test_embedding_layer(self):
    """Check embedding_layer output shape and the trainable variable it creates.

    Builds the graph only (no session run): asserts the looked-up tensor is
    (batch=2, seq=3, embed_dim=100), that exactly one trainable variable named
    "W_embed" exists, and that it has shape (V=512, embed_dim=100).
    """
    with tf.Graph().as_default():
        tf.set_random_seed(10)
        # Ids must lie in [0, V); the original fixture used 512, which is
        # out of range for a 512-row table (latent out-of-bounds lookup).
        ids_ = tf.constant([[0, 127, 511], [63, 191, 0]], dtype=tf.int32)
        xs_ = models.embedding_layer(ids_, V=512, embed_dim=100,
                                     init_scale=1.0)
        self.assertEqual(xs_.get_shape().as_list(), [2, 3, 100])

        # The layer should register exactly one trainable variable.
        var_names = [v.name for v in tf.trainable_variables()]
        self.assertEqual(var_names, ["W_embed:0"])
        self.assertEqual(
            tf.trainable_variables("W_embed")[0].get_shape().as_list(),
            [512, 100])
def embedding_parag(input_paragraphs):
    """Embed each paragraph sentence-by-sentence, zero-padded to 100 rows.

    :param input_paragraphs: list of paragraph strings
    :return: tuple (preprocessed, sentence_len) where preprocessed is a list
        of 100 x 1024 matrices (one per paragraph, rows past the paragraph's
        sentence count are all-zero padding) and sentence_len is the list of
        per-paragraph sentence counts.
    """
    preprocessed, sentence_len = [], []
    with tf.device("/gpu:0"):
        with tf.Graph().as_default():
            sentences, embeddings = embedding_layer()
            with tf.Session() as sess:
                sess.run(tf.global_variables_initializer())
                for paragraph in input_paragraphs:
                    sents = sent_tokenize(paragraph)
                    length = len(sents)
                    sentence_len.append(length)
                    sentence_rep = sess.run(
                        embeddings, feed_dict={sentences: sents})
                    # Build 100 INDEPENDENT zero rows. The original
                    # `[[0] * 1024] * 100` aliased one shared row object 100
                    # times, so mutating any padding row would corrupt all of
                    # them. (The sibling script code already uses this
                    # comprehension form.)
                    pad = [[0] * 1024 for _ in range(100)]
                    # NOTE(review): if a paragraph has >100 sentences this
                    # slice-assign grows the matrix past 100 rows rather than
                    # truncating — confirm intended behavior with callers.
                    pad[:length] = sentence_rep.tolist()
                    preprocessed.append(pad)
    return preprocessed, sentence_len
_essays = np.array(_df['essay']) _essays = [sent_tokenize(_essay) for _essay in _essays] _scores = [ get_nom_score(_row['essay_set'], _row['score']) for _, _row in _df.iterrows() ] del [[_df_essays, _df_scores, _df]] gc.collect() for e, s in zip(_essays, _scores): yield e, s if __name__ == '__main__': with tf.device("/gpu:0"): with tf.Graph().as_default(): sentences, embeddings = embedding_layer() with tf.Session() as sess: sess.run(tf.global_variables_initializer()) print("Train Data Preprocessing") now_time = -time.time() for file_count, (essay, score) in enumerate(get_train_data(paths[0])): df = pd.DataFrame() sentence_rep = sess.run(embeddings, feed_dict={sentences: essay}) pad = [[0] * 1024 for _ in range(100)] pad[:len(essay)] = sentence_rep.tolist() df = df.append(pad) pad = [0] * 1024 pad[:2] = [len(essay), score]