def pre_type(string, max_document_length):
    x_raw = [data_helpers.clean_str(data_helpers.jieba_line(string))]
    # Get embedding vector x_test
    sentences, max_document_length = data_helpers.padding_sentences(
        x_raw, '<PADDING>', padding_sentence_length=max_document_length)
    x_test = np.array(
        word2vec_helpers.embedding_sentences(
            sentences, file_to_load=trained_word2vec_model_file))
    # print("x_test.shape = {}".format(x_test.shape))
    pred = sess.run(predictions, {input_x: x_test, dropout_keep_prob: 1.0})[0]
    if pred == 0:
        label = 'neg'
    else:
        label = 'pos'
    return label
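# Hypothetical usage sketch for pre_type. It assumes the caller has already
# restored the trained graph into `sess` and looked up the `predictions`,
# `input_x` and `dropout_keep_prob` tensors (as done in the evaluation snippets
# below); the example sentence and the padded length of 190 are illustrative only.
if __name__ == '__main__':
    sample_text = u'这部电影非常好看'  # "this movie is very good"
    print(pre_type(sample_text, max_document_length=190))  # prints 'pos' or 'neg'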
def save_data_vector(contents_dir, labels_dir, out_dir):
    x_text, y = load_files_labels(contents_dir, labels_dir, one_hot=True)
    # Get embedding vector: pad every sentence to the maximum length of 190,
    # using the '<PADDING>' token as filler
    sentences, max_document_length = data_helpers.padding_sentences(
        x_text, '<PADDING>', padding_sentence_length=190)
    embedding_dim = 128
    x = np.array(
        word2vec_helpers.embedding_sentences(
            sentences,
            embedding_size=embedding_dim,
            file_to_save=os.path.join(out_dir, 'trained_word2vec.model')))
    # x.shape = (10000, 190, 128) -> (number of samples, words per sample, vector size per word)
    print("x.shape = {}".format(x.shape))
    # y.shape = (10000, 2) -> one-hot encoded sample labels
    print("y.shape = {}".format(y.shape))
    np.save(os.path.join(out_dir, 'data_vector.npy'), x)
    np.save(os.path.join(out_dir, 'labels.npy'), y)
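# A minimal sketch of the padding contract that save_data_vector relies on.
# data_helpers.padding_sentences is this repo's own helper and its real code is
# not shown here; the sketch only illustrates the assumed behavior: pad (or
# truncate) every tokenized sentence to one fixed length and return that length.
def padding_sentences_sketch(tokenized_sentences, padding_token,
                             padding_sentence_length=None):
    max_len = (padding_sentence_length
               if padding_sentence_length is not None
               else max(len(s) for s in tokenized_sentences))
    padded = []
    for sentence in tokenized_sentences:
        sentence = sentence[:max_len]  # cut long sentences
        padded.append(sentence + [padding_token] * (max_len - len(sentence)))  # fill short ones
    return padded, max_len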
def data_preprocess():
    print("Loading data...")
    x, y = data_helpers.load_data_and_labels(FLAGS.positive_data_file,
                                             FLAGS.negative_data_file)
    sentences, max_document_length = data_helpers.padding_sentences(x)
    if not os.path.exists(W2VMODEL):
        # Train a word2vec model on the padded sentences and save it
        x = np.array(
            word2vec_helper.embedding_sentences(
                sentences,
                embedding_size=FLAGS.embedding_dim,
                file_to_save=W2VMODEL))
    else:
        print('w2v model found...')
        x = np.array(
            word2vec_helper.embedding_sentences(
                sentences,
                embedding_size=FLAGS.embedding_dim,
                file_to_load=W2VMODEL))
    y = np.array(y)
    del sentences
    return x, y
params = data_helpers.loadDict(training_params_file)
num_labels = int(params['num_labels'])
max_document_length = int(params['max_document_length'])

# Load data
if FLAGS.eval_train:
    x_raw, y_test = data_helpers.load_data_and_labels(
        FLAGS.input_text_file, FLAGS.input_label_file, num_labels)
else:
    x_raw = ["a masterpiece four years in the making", "everything is off."]
    y_test = [1, 0]

# Get embedding vector x_test
sentences, max_document_length = data_helpers.padding_sentences(
    x_raw, '<PADDING>', padding_sentence_length=max_document_length)
x_test = np.array(
    word2vec_helpers.embedding_sentences(
        sentences, file_to_load=trained_word2vec_model_file))
print("x_test.shape = {}".format(x_test.shape))

# Evaluation
# ==================================================
print("\nEvaluating...\n")
checkpoint_file = tf.train.latest_checkpoint(FLAGS.checkpoint_dir)
graph = tf.Graph()
with graph.as_default():
    session_conf = tf.ConfigProto(
        allow_soft_placement=FLAGS.allow_soft_placement,
        log_device_placement=FLAGS.log_device_placement)
    sess = tf.Session(config=session_conf)
timestamp = str(int(time.time()))
out_dir = os.path.abspath(os.path.join(os.path.curdir, "runs", timestamp))
print("Writing to {}\n".format(out_dir))
if not os.path.exists(out_dir):
    os.makedirs(out_dir)

# Data preprocess
# =======================================================

# Load data
print("Loading data...")
x_text, y = data_helpers.load_positive_negative_data_files(
    FLAGS.positive_data_file, FLAGS.negative_data_file)

# Get embedding vector
sentences, max_document_length = data_helpers.padding_sentences(x_text, '<PADDING>')
x = np.array(
    word2vec_helpers.embedding_sentences(
        sentences,
        embedding_size=FLAGS.embedding_dim,
        file_to_save=os.path.join(out_dir, 'trained_word2vec.model')))
print("x.shape = {}".format(x.shape))
print("y.shape = {}".format(y.shape))

# Save params
training_params_file = os.path.join(out_dir, 'training_params.pickle')
params = {'num_labels': FLAGS.num_labels,
          'max_document_length': max_document_length}
data_helpers.saveDict(params, training_params_file)

# Shuffle data randomly
np.random.seed(10)
shuffle_indices = np.random.permutation(np.arange(len(y)))
x_shuffled = x[shuffle_indices]
y_shuffled = y[shuffle_indices]
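# A minimal sketch, under assumptions, of what word2vec_helpers.embedding_sentences
# does with file_to_save / file_to_load: train or reload a gensim Word2Vec model and
# map every token of every padded sentence to its vector (zeros for unknown tokens).
# The repo's real helper may differ; note that gensim < 4.0 uses `size` while newer
# versions use `vector_size`.
from gensim.models import Word2Vec
import numpy as np

def embedding_sentences_sketch(sentences, embedding_size=128,
                               file_to_save=None, file_to_load=None):
    if file_to_load:
        model = Word2Vec.load(file_to_load)
    else:
        model = Word2Vec(sentences, size=embedding_size, min_count=1)
        if file_to_save:
            model.save(file_to_save)
    all_vectors = []
    for sentence in sentences:
        all_vectors.append([
            model.wv[word] if word in model.wv else np.zeros(model.vector_size)
            for word in sentence
        ])
    return all_vectors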
_w2v_path = os.path.join(os.path.curdir, "runs", FLAGS.wordembedding_name)
print("Writing to {}\n".format(out_dir))
if not os.path.exists(out_dir):
    os.makedirs(out_dir)

# Data preprocess
# =======================================================

# Load data
print("Loading data...")
x_text, y = data_helpers.load_positive_negative_data_files(FLAGS)

# Get embedding vector
sentences, max_document_length = data_helpers.padding_sentences(
    x_text, '<PADDING>',
    word_segment=FLAGS.word_segment,
    padding_sentence_length=FLAGS.max_document_len)
if not os.path.exists(_w2v_path):
    _, w2vModel = word2vec_helpers.embedding_sentences(
        sentences=sentences,
        embedding_size=FLAGS.embedding_dim,
        file_to_save=_w2v_path)
else:
    _, w2vModel = word2vec_helpers.embedding_sentences(
        sentences=None,
        embedding_size=FLAGS.embedding_dim,
        file_to_load=_w2v_path)
FLAGS.embedding_dim = w2vModel.vector_size
print('wordembedding.dim = {}'.format(FLAGS.embedding_dim))
print('wordembedding.length = {}'.format(len(w2vModel.wv.vocab)))
def validate_method(x_raw, y_test, max_document_length):
    # Get embedding vector x_test
    sentences, max_document_length = data_helpers.padding_sentences(
        x_raw, '<PADDING>', padding_sentence_length=max_document_length)
    x_test = np.array(
        word2vec_helpers.embedding_sentences(
            sentences, file_to_load=trained_word2vec_model_file))
    print("x_test.shape = {}".format(x_test.shape))

    # Evaluation
    # ==================================================
    print("\nEvaluating...\n")
    checkpoint_file = tf.train.latest_checkpoint(FLAGS.checkpoint_dir)
    graph = tf.Graph()
    with graph.as_default():
        session_conf = tf.ConfigProto(
            allow_soft_placement=FLAGS.allow_soft_placement,
            log_device_placement=FLAGS.log_device_placement)
        sess = tf.Session(config=session_conf)
        with sess.as_default():
            # Load the saved meta graph and restore variables
            saver = tf.train.import_meta_graph("{}.meta".format(checkpoint_file))
            saver.restore(sess, checkpoint_file)

            # Get the placeholders from the graph by name
            input_x = graph.get_operation_by_name("input_x").outputs[0]
            # input_y = graph.get_operation_by_name("input_y").outputs[0]
            dropout_keep_prob = graph.get_operation_by_name(
                "dropout_keep_prob").outputs[0]

            # Tensors we want to evaluate
            predictions = graph.get_operation_by_name(
                "output/predictions").outputs[0]

            # Generate batches for one epoch
            batches = data_helpers.batch_iter(
                list(x_test), FLAGS.batch_size, 1, shuffle=False)

            # Collect the predictions here
            all_predictions = []
            for x_test_batch in batches:
                batch_predictions = sess.run(
                    predictions,
                    {input_x: x_test_batch, dropout_keep_prob: 1.0})
                all_predictions = np.concatenate(
                    [all_predictions, batch_predictions])

    # Print accuracy if y_test is defined
    if y_test is not None:
        correct_predictions = float(sum(all_predictions == y_test))
        print("Total number of test examples: {}".format(len(y_test)))
        print("Accuracy: {:g}".format(correct_predictions / float(len(y_test))))

    # Save the evaluation to a csv
    predictions_human_readable = np.column_stack(
        (np.array([text.encode('utf-8') for text in x_raw]), all_predictions))
    out_path = os.path.join(FLAGS.checkpoint_dir, "..", "prediction.csv")
    print("Saving evaluation to {0}".format(out_path))
    with open(out_path, 'a+') as f:
        csv.writer(f).writerows(predictions_human_readable)
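# A minimal sketch of the batch_iter generator used above, under the assumption
# that it behaves like the common data_helpers.batch_iter found in TensorFlow
# text-CNN examples: yield successive slices of the data for a given number of
# epochs, optionally reshuffling each epoch. The repo's actual helper may differ.
def batch_iter_sketch(data, batch_size, num_epochs, shuffle=True):
    data = np.array(data)
    data_size = len(data)
    num_batches_per_epoch = int((data_size - 1) / batch_size) + 1
    for epoch in range(num_epochs):
        if shuffle:
            shuffled = data[np.random.permutation(np.arange(data_size))]
        else:
            shuffled = data
        for batch_num in range(num_batches_per_epoch):
            start = batch_num * batch_size
            end = min((batch_num + 1) * batch_size, data_size)
            yield shuffled[start:end]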
def sample(args):
    timestamp = str(int(time.time()))
    out_dir = os.path.abspath(os.path.join(os.path.curdir, "runs", timestamp))
    print("Writing to {}\n".format(out_dir))
    if not os.path.exists(out_dir):
        os.makedirs(out_dir)

    print('Loading data')
    #x_text, y = data_helpers.load_positive_negative_data_files1()
    # Get embedding vector
    #sentences, max_document_length = data_helpers.padding_sentences(x_text, '<PADDING>')
    #x = np.array(word2vec_helpers.embedding_sentences(sentences, embedding_size=FLAGS.embedding_dim,
    #                                                  file_to_save=os.path.join(out_dir, 'trained_word2vec.model')))

    checkpoint_file = tf.train.latest_checkpoint(args.checkpoint_dir)
    graph = tf.Graph()
    with graph.as_default():
        sess = tf.Session()
        with sess.as_default():
            # Load the saved meta graph and restore variables
            saver = tf.train.import_meta_graph("{}.meta".format(checkpoint_file))
            saver.restore(sess, checkpoint_file)

            # Get the placeholders from the graph by name
            input_x = graph.get_operation_by_name("input_x").outputs[0]
            # input_y = graph.get_operation_by_name("input_y").outputs[0]
            dropout_keep_prob = graph.get_operation_by_name(
                "dropout_keep_prob").outputs[0]

            # Tensors we want to evaluate
            predictions = graph.get_operation_by_name(
                "output/predictions").outputs[0]

            # Classes: 体育 (sports), 娱乐 (entertainment), 彩票 (lottery), 房产 (real estate)
            textlist = [
                u'谁 足 球 踢 得 好 ?',
                u'彩 票 中 奖 几 乎 是 不 可 能 的',
                u'上 海 的 房 价 始 终 居 高 不 下',
                u'关 晓 彤 主 演 新 版 倚 天 屠 龙 记 让 人 笑 掉 大 牙 ',
                u'杜 兰 特 是 勇 士 的 篮 球 运 动 员 ',
                u'娱 乐 圈 吸 毒 是 常 有 的 事',
                u'上 海 一 彩 民 中了 二 等 奖',
                u' 万 达 集 团 再 次 中 标 关键 地 段 的 房 产 开 发 权 ',
                u'很 多 观 众 每 晚 准 时 看 体 育 新 闻 ',
                u'草 莓 音 乐 节 即 将 开 始',
                u'中 国 福 利 彩 票 是 否 有 黑 幕 不 得 而 知 ',
                u'房 地 产 行 业 永 远 不 会 倒'
            ]
            for text in textlist:
                # Classify one sentence at a time
                batch = [text]
                print(batch)
                sentences_padded, max_document_length = data_helpers.padding_sentences(
                    batch, '<PADDING>')
                raw_x = np.array(
                    word2vec_helpers.embedding_sentences(
                        sentences_padded,
                        embedding_size=FLAGS.embedding_dim,
                        file_to_load='C:/Users/I343039/PycharmProjects/nlp-multiclass-text-tf/runs/1508811868/trained_word2vec.model'))
                predicted_result = sess.run(
                    predictions, {input_x: raw_x, dropout_keep_prob: 1.0})
                if predicted_result[0] == 0:
                    print(text + ": 体育")   # sports
                elif predicted_result[0] == 1:
                    print(text + ": 娱乐")   # entertainment
                elif predicted_result[0] == 2:
                    print(text + ": 彩票")   # lottery
                elif predicted_result[0] == 3:
                    print(text + ": 房产")   # real estate
def preprocess():
    # Data Preparation
    # ==================================================
    global out_dir
    timestamp = str(int(time.time()))
    out_dir = os.path.abspath(os.path.join(os.path.curdir, "runs", timestamp))
    print("Writing to {}\n".format(out_dir))
    if not os.path.exists(out_dir):
        os.makedirs(out_dir)

    # Load data
    print("Loading data...")
    x_text, y, pos_len, neg_len = data_helpers.noisy_load_data_and_labels(
        FLAGS.positive_data_file, FLAGS.negative_data_file)
    data_size = len(x_text)

    """
    # Build vocabulary
    max_document_length = max([len(x.split(" ")) for x in x_text])
    vocab_processor = learn.preprocessing.VocabularyProcessor(max_document_length)
    x = np.array(list(vocab_processor.fit_transform(x_text)))
    """

    # Get embedding vector
    sentences, max_document_length = data_helpers.padding_sentences(
        x_text, '<PADDING>')
    x = np.array(
        word2vec_helpers.embedding_sentences(
            sentences,
            embedding_size=FLAGS.embedding_dim,
            file_to_save=os.path.join(out_dir, 'trained_word2vec.model')))
    #x = tf.cast(x, tf.float32)
    #vectors = word2vec_helpers.embedding_sentences([['first', 'sentence'], ['second', 'sentence']], embedding_size=4, min_count=1)
    print(x[0].shape)
    #y = np.reshape(y, (-1, 1))
    print("x.shape = {}".format(x.shape))
    print("y.shape = {}".format(y.shape))

    # Add Gaussian noise, scaled separately for each class
    data_sigma = data_scale()
    global gradient_sigma
    gradient_sigma = gradient_scale(data_size)
    pos_noise = np.random.normal(0, data_sigma, [pos_len, x.shape[1], x.shape[2]])
    neg_noise = np.random.normal(0, data_sigma, [neg_len, x.shape[1], x.shape[2]])
    noise = np.concatenate([pos_noise, neg_noise], 0)
    x = x + noise

    # Save params
    """
    training_params_file = os.path.join(out_dir, 'training_params.pickle')
    params = {'num_labels': FLAGS.num_labels, 'max_document_length': max_document_length}
    data_helpers.saveDict(params, training_params_file)
    """

    # Randomly shuffle data
    np.random.seed(10)
    shuffle_indices = np.random.permutation(np.arange(len(y)))
    print(shuffle_indices)
    #x_shuffle_indices = [[index] for index in shuffle_indices]
    print("the shape of x: {}".format(x.shape[0]))
    print("indices shape: {}".format(shuffle_indices))
    """
    x_shuffled = tf.gather_nd(x, x_shuffle_indices, name=None)
    """
    x_shuffled = x[shuffle_indices]
    #x_shuffled = x[x_shuffle_indices]
    y_shuffled = y[shuffle_indices]

    # Split train/test set
    # TODO: This is very crude, should use cross-validation
    dev_sample_index = -1 * int(FLAGS.dev_sample_percentage * float(len(y)))
    x_train, x_dev = x_shuffled[:dev_sample_index], x_shuffled[dev_sample_index:]
    y_train, y_dev = y_shuffled[:dev_sample_index], y_shuffled[dev_sample_index:]
    print("shape of x: {}".format(x_train.shape))
    print("shape of y: {}".format(y_train.shape))
    del x, y, x_shuffled, y_shuffled

    """
    print("Vocabulary Size: {:d}".format(len(vocab_processor.vocabulary_)))
    print("Train/Dev split: {:d}/{:d}".format(len(y_train), len(y_dev)))
    """
    #return x_train, y_train, vocab_processor, x_dev, y_dev
    return x_train, y_train, x_dev, y_dev
print("Using training params file : {}".format(training_params_file)) # Load params params = data_helpers.loadDict(training_params_file) num_labels = int(params['num_labels']) max_document_length = int(params['max_document_length']) # Load data if FLAGS.eval_train: x_raw, y_test = data_helpers.load_data_and_labels(FLAGS.input_text_file, FLAGS.input_label_file, num_labels) else: x_raw = ["a masterpiece four years in the making", "everything is off."] y_test = [1, 0] # Get Embedding vector x_test sentences, max_document_length = data_helpers.padding_sentences(x_raw, '<PADDING>', padding_sentence_length = max_document_length) x_test = np.array(word2vec_helpers.embedding_sentences(sentences, file_to_load = trained_word2vec_model_file)) print("x_test.shape = {}".format(x_test.shape)) # Evaluation # ================================================== print("\nEvaluating...\n") checkpoint_file = tf.train.latest_checkpoint(FLAGS.checkpoint_dir) graph = tf.Graph() with graph.as_default(): session_conf = tf.ConfigProto( allow_soft_placement=FLAGS.allow_soft_placement, log_device_placement=FLAGS.log_device_placement) sess = tf.Session(config=session_conf) with sess.as_default():
# Data preprocess
# =======================================================

# Load data
print("Loading data...")
x_text, y = data_helpers.load_positive_negative_data_files(
    FLAGS.cooking_data_file, FLAGS.music_data_file, FLAGS.video_data_file)
# Test set
x_test, y_test = data_helpers.load_positive_negative_data_files(
    FLAGS.cooking_test, FLAGS.music_test, FLAGS.video_test)
print('=============', len(x_test), len(x_test[0]))

# Get embedding vector
sentences, max_document_length = data_helpers.padding_sentences(
    x_text, '<PADDING>')
x = np.array(
    word2vec_helpers.embedding_sentences(
        sentences,
        embedding_size=FLAGS.embedding_dim,
        file_to_save=os.path.join(out_dir, 'trained_word2vec.model')))
# Test set
sentences_test, max_document_length_test = data_helpers.padding_sentences(
    x_test, '<PADDING>', padding_sentence_length=18)
print(len(sentences_test), max_document_length_test)
x1 = np.array(
    word2vec_helpers.embedding_sentences(
        sentences_test,
        embedding_size=FLAGS.embedding_dim,
        file_to_save=os.path.join(
out_dir = data_helpers.mkdir_if_not_exist("./runs")

# Load data
print("Loading data...")
max_sentence_len = FLAGS.max_sentence_len
x_text, y = data_helpers.load_data_and_labels(FLAGS.caijing_data_file,
                                              FLAGS.caipiao_data_file,
                                              FLAGS.fangchan_data_file,
                                              FLAGS.gupiao_data_file,
                                              FLAGS.jiaju_data_file,
                                              FLAGS.jiaoyu_data_file,
                                              FLAGS.shishang_data_file,
                                              FLAGS.shizheng_data_file,
                                              FLAGS.tiyu_data_file,
                                              FLAGS.yule_data_file)
sentences = data_helpers.padding_sentences(x_text, FLAGS.padding_token, max_sentence_len)
print("len(x_text)", len(x_text))
print("len(y)", len(y))

# Build vocabulary
voc = None
vocsize = None
if os.path.exists('./runs/vocab'):
    # When the session is restored, just reload the existing vocabulary
    voc, vocsize = data_helpers.read_vocabulary('./runs/vocab')
else:
    voc, vocsize = data_helpers.build_vocabulary(sentences, './runs/vocab')
x = np.array(data_helpers.sentence2matrix(sentences, max_sentence_len, voc))
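# A hedged sketch of the vocabulary-index path used above. build_vocabulary and
# sentence2matrix are this repo's own helpers and are not reproduced here; the
# sketch only shows the assumed contract: map every token to an integer id (with
# a fallback id for unknown words), so the model can learn its own embedding
# table instead of consuming precomputed word2vec vectors as in other snippets.
def sentence2matrix_sketch(sentences, max_sentence_len, voc, unknown_id=0):
    matrix = np.zeros((len(sentences), max_sentence_len), dtype=np.int32)
    for row, sentence in enumerate(sentences):
        for col, token in enumerate(sentence[:max_sentence_len]):
            matrix[row, col] = voc.get(token, unknown_id)  # assumes voc maps token -> id
    return matrix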
# Data preprocess
# =======================================================

# Load data
print("Loading data...")
# x_text is a two-dimensional list: the outer list has one element per sentence,
# and each element is the list of words that make up that sentence, e.g.:
# ['全国', '少年儿童', '游泳', '锦标赛', '开幕', '新华社', '广州', '月', '日电', '记者', '何惠飞', '年', '喜乐', '杯', '全国', '少年儿童', '游泳', '锦标赛', '昨天', '在', '游泳', '之', '乡', '广东省', '东莞市', '开幕', '参加', '这次', '比赛', '的', '有', '个', '省', '自治区', '直辖市', '的', '名', '男女', '选手', '比赛', '分为', '岁', '组和岁', '以下', '组', '参赛者', '都', '是', '近几年', '涌现', '的', '优秀', '小', '选手', '不少', '是', '本', '年龄组', '的', '全国纪录', '创造者', '这次', '比赛', '是', '对', '我国', '参加', '下', '两届', '奥运会', '游泳赛', '后备力量', '的', '一次', '检阅', '国家体委', '将', '通过', '这次', '比赛', '选拔', '优秀', '选手', '组队参加', '今年', '月', '在', '印度尼西亚', '举行', '的', '亚太区', '年龄组', '游泳', '比赛', '比赛', '将', '于', '日', '结束', '完']
x_text, y = data_helpers.load_data_files(
    FLAGS.sports_file, FLAGS.amusement_file, FLAGS.home_file,
    FLAGS.estate_file, FLAGS.education_file, FLAGS.fashion_file,
    FLAGS.politics_file, FLAGS.game_file, FLAGS.technology_file,
    FLAGS.finance_file)

# Get embedding vector
sentences, max_document_length = data_helpers.padding_sentences(
    x_text, 'PADDING', FLAGS.max_seq_length)
# sentences is now a 2D list of word lists in which every sentence has the same
# length, because each one is padded with 'PADDING' up to the maximum length.

# Convert the returned list into an array
x_embedding = word2vec_helpers.embedding_sentences(
    sentences,
    embedding_size=FLAGS.embedding_dim,
    ext_emb_path=FLAGS.word_embedding_file)
x = np.array(x_embedding)
# The three dimensions of x are: total number of sentences, number of words per
# sentence (the length of the longest sentence), and the word-vector dimension.
print("x.shape =", x.shape)
print("y.shape =", y.shape)

# Save params
training_params_file = 'train/training_params.pickle'
params = {
    'num_labels': FLAGS.num_labels,
timestamp = str(int(time.time()))
out_dir = os.path.abspath(os.path.join(os.path.curdir, "runs", timestamp))
print("Writing to {}\n".format(out_dir))
if not os.path.exists(out_dir):
    os.makedirs(out_dir)

# Data preprocess
# =======================================================

# Load data
print("Loading data...")
x_text, y = data_helpers.load_positive_negative_data_files(
    FLAGS.positive_data_file, FLAGS.negative_data_file)

# Get embedding vector
sentences, max_document_length = data_helpers.padding_sentences(
    x_text, '<PADDING>')
print('max_document_length:' + str(max_document_length))
x = np.array(
    word2vec_helpers.embedding_sentences(
        sentences,
        embedding_size=FLAGS.embedding_dim,
        file_to_save=os.path.join(out_dir, 'trained_word2vec.model')))
print("x.shape = {}".format(x.shape))
print("y.shape = {}".format(y.shape))
# The original result was

# Save params
training_params_file = os.path.join(out_dir, 'training_params.pickle')
params = {
    'num_labels': FLAGS.num_labels,
    'max_document_length': max_document_length
flags.DEFINE_float("beta1", 0.5, "Momentum term of adam [0.5]") FLAGS = flags.FLAGS print "training params:::" print flags.FLAGS.__flags # gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.333) run_config = tf.ConfigProto() run_config.gpu_options.allow_growth = True # Output directory for models and summaries out_dir = data_helpers.mkdir_if_not_exist("./runs") # Load true_sentences and build vocab true_sentences = data_helpers.read_and_clean_file(FLAGS.true_data_file) padding_true_sentences = data_helpers.padding_sentences(true_sentences, FLAGS.padding_token, FLAGS.max_sentences_length) # Question: should we build voc just use true sentences or use all chinese word dic? # Here we use true sentences voc,voc_size = data_helpers.build_vocabulary(padding_true_sentences,'./runs/vocab') true_data = np.array(data_helpers.sentence2matrix(true_sentences,FLAGS.max_sentences_length,voc)) # Randomly shuffle data np.random.seed(10) shuffle_indices = np.random.permutation(np.arange(len(true_sentences))) true_data_shuffled = true_data[shuffle_indices] #fake_factors = fake_factor_dist.sample((FLAGS.batch_size, FLAGS.max_sentences_length,FLAGS.embedding_dim)) global_graph = tf.Graph()
# Data preprocess
# =======================================================

# Load data
print("Loading data...")
# No word segmentation
x_text, y = data_helpers.load_positive_negative_data_files(
    FLAGS.positive_data_file,
    FLAGS.negative_data_file,
    cut=False,
    stop_words_list_file=None,
)
# Word-segmentation version:
#x_text, y = data_helpers.load_positive_negative_data_files(FLAGS.positive_data_file, FLAGS.negative_data_file,
#                                                           cut=True, stop_words_list_file=FLAGS.stop_word_file)
#print(x_text)

# Get embedding vector
sentences = data_helpers.padding_sentences(x_text, '<PADDING>',
                                           padding_sentence_length=20)
x = np.array(
    word2vec_helpers.embedding_sentences(
        sentences,
        embedding_size=FLAGS.embedding_dim,
        file_to_save=os.path.join(out_dir, 'trained_word2vec.model')))
print("x.shape = {}".format(x.shape))
print("y.shape = {}".format(y.shape))

# Save params
training_params_file = os.path.join(out_dir, 'training_params.pickle')
params = {'num_labels': FLAGS.num_labels, 'max_document_length': 20}
data_helpers.saveDict(params, training_params_file)
"Log placement of ops on devices") FLAGS = tf.flags.FLAGS #FLAGS._parse_flags() print("\nParameters:") for attr, value in sorted(FLAGS.__flags.items()): print("{}={}".format(attr.upper(), value.value)) print("") # CHANGE THIS: Load data. Load your own data here if FLAGS.eval_train: x_raw, y_test = data_helpers.load_testfile_and_labels( FLAGS.input_test_file, FLAGS.input_label_file, FLAGS.num_labels) # Get Embedding vector x_test sentences = data_helpers.padding_sentences(x_raw, FLAGS.padding_token, FLAGS.max_document_length) print "sentences length : %d" % len(sentences) voc, _ = data_helpers.read_vocabulary('./runs/vocab') print "voc length : %d" % len(voc) x_test = np.array( data_helpers.sentence2matrix(sentences, FLAGS.max_document_length, voc)) print("x_test.shape = {}".format(x_test.shape)) print("\nEvaluating...\n") # Evaluation # ================================================== checkpoint_file = tf.train.latest_checkpoint(FLAGS.checkpoint_dir) #checkpoint_file = "./runs/model/classify_text.ckpt-1000" graph = tf.Graph() with graph.as_default():