def pre_type(string, max_document_length):
    x_raw = [data_helpers.clean_str(data_helpers.jieba_line(string))]
    # Get embedding vector x_test
    sentences, max_document_length = data_helpers.padding_sentences(
        x_raw, '<PADDING>', padding_sentence_length=max_document_length)
    x_test = np.array(
        word2vec_helpers.embedding_sentences(
            sentences, file_to_load=trained_word2vec_model_file))
    # print("x_test.shape = {}".format(x_test.shape))
    pred = sess.run(predictions, {input_x: x_test, dropout_keep_prob: 1.0})[0]
    if pred == 0:
        label = 'neg'
    else:
        label = 'pos'
    return label
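# Hypothetical usage sketch for pre_type. It assumes the caller has already
# restored the trained graph into `sess` and looked up the `predictions`,
# `input_x` and `dropout_keep_prob` tensors (as done in the evaluation snippets
# below); the example sentence and the padded length of 190 are illustrative only.
if __name__ == '__main__':
    sample_text = u'这部电影非常好看'  # "this movie is very good"
    print(pre_type(sample_text, max_document_length=190))  # prints 'pos' or 'neg'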
def save_data_vector(contents_dir, labels_dir, out_dir):
    x_text, y = load_files_labels(contents_dir, labels_dir, one_hot=True)
    # Get embedding vector: pad every sentence to the maximum length of 190,
    # using the '<PADDING>' token as filler
    sentences, max_document_length = data_helpers.padding_sentences(
        x_text, '<PADDING>', padding_sentence_length=190)
    embedding_dim = 128
    x = np.array(
        word2vec_helpers.embedding_sentences(
            sentences,
            embedding_size=embedding_dim,
            file_to_save=os.path.join(out_dir, 'trained_word2vec.model')))
    # x.shape = (10000, 190, 128) -> (number of samples, words per sample, vector size per word)
    print("x.shape = {}".format(x.shape))
    # y.shape = (10000, 2) -> one-hot encoded sample labels
    print("y.shape = {}".format(y.shape))
    np.save(os.path.join(out_dir, 'data_vector.npy'), x)
    np.save(os.path.join(out_dir, 'labels.npy'), y)
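# A minimal sketch of the padding contract that save_data_vector relies on.
# data_helpers.padding_sentences is this repo's own helper and its real code is
# not shown here; the sketch only illustrates the assumed behavior: pad (or
# truncate) every tokenized sentence to one fixed length and return that length.
def padding_sentences_sketch(tokenized_sentences, padding_token,
                             padding_sentence_length=None):
    max_len = (padding_sentence_length
               if padding_sentence_length is not None
               else max(len(s) for s in tokenized_sentences))
    padded = []
    for sentence in tokenized_sentences:
        sentence = sentence[:max_len]  # cut long sentences
        padded.append(sentence + [padding_token] * (max_len - len(sentence)))  # fill short ones
    return padded, max_len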
def data_preprocess():
    print("Loading data...")
    x, y = data_helpers.load_data_and_labels(FLAGS.positive_data_file,
                                             FLAGS.negative_data_file)
    sentences, max_document_length = data_helpers.padding_sentences(x)
    if not os.path.exists(W2VMODEL):
        # Train a word2vec model on the padded sentences and save it
        x = np.array(
            word2vec_helper.embedding_sentences(
                sentences,
                embedding_size=FLAGS.embedding_dim,
                file_to_save=W2VMODEL))
    else:
        print('w2v model found...')
        x = np.array(
            word2vec_helper.embedding_sentences(
                sentences,
                embedding_size=FLAGS.embedding_dim,
                file_to_load=W2VMODEL))
    y = np.array(y)
    del sentences
    return x, y
params = data_helpers.loadDict(training_params_file)
num_labels = int(params['num_labels'])
max_document_length = int(params['max_document_length'])

# Load data
if FLAGS.eval_train:
    x_raw, y_test = data_helpers.load_data_and_labels(
        FLAGS.input_text_file, FLAGS.input_label_file, num_labels)
else:
    x_raw = ["a masterpiece four years in the making", "everything is off."]
    y_test = [1, 0]

# Get embedding vector x_test
sentences, max_document_length = data_helpers.padding_sentences(
    x_raw, '<PADDING>', padding_sentence_length=max_document_length)
x_test = np.array(
    word2vec_helpers.embedding_sentences(
        sentences, file_to_load=trained_word2vec_model_file))
print("x_test.shape = {}".format(x_test.shape))

# Evaluation
# ==================================================
print("\nEvaluating...\n")
checkpoint_file = tf.train.latest_checkpoint(FLAGS.checkpoint_dir)
graph = tf.Graph()
with graph.as_default():
    session_conf = tf.ConfigProto(
        allow_soft_placement=FLAGS.allow_soft_placement,
        log_device_placement=FLAGS.log_device_placement)
    sess = tf.Session(config=session_conf)
timestamp = str(int(time.time()))
out_dir = os.path.abspath(os.path.join(os.path.curdir, "runs", timestamp))
print("Writing to {}\n".format(out_dir))
if not os.path.exists(out_dir):
    os.makedirs(out_dir)

# Data preprocess
# =======================================================

# Load data
print("Loading data...")
x_text, y = data_helpers.load_positive_negative_data_files(
    FLAGS.positive_data_file, FLAGS.negative_data_file)

# Get embedding vector
sentences, max_document_length = data_helpers.padding_sentences(x_text, '<PADDING>')
x = np.array(
    word2vec_helpers.embedding_sentences(
        sentences,
        embedding_size=FLAGS.embedding_dim,
        file_to_save=os.path.join(out_dir, 'trained_word2vec.model')))
print("x.shape = {}".format(x.shape))
print("y.shape = {}".format(y.shape))

# Save params
training_params_file = os.path.join(out_dir, 'training_params.pickle')
params = {'num_labels': FLAGS.num_labels,
          'max_document_length': max_document_length}
data_helpers.saveDict(params, training_params_file)

# Shuffle data randomly
np.random.seed(10)
shuffle_indices = np.random.permutation(np.arange(len(y)))
x_shuffled = x[shuffle_indices]
y_shuffled = y[shuffle_indices]
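# A minimal sketch, under assumptions, of what word2vec_helpers.embedding_sentences
# does with file_to_save / file_to_load: train or reload a gensim Word2Vec model and
# map every token of every padded sentence to its vector (zeros for unknown tokens).
# The repo's real helper may differ; note that gensim < 4.0 uses `size` while newer
# versions use `vector_size`.
from gensim.models import Word2Vec
import numpy as np

def embedding_sentences_sketch(sentences, embedding_size=128,
                               file_to_save=None, file_to_load=None):
    if file_to_load:
        model = Word2Vec.load(file_to_load)
    else:
        model = Word2Vec(sentences, size=embedding_size, min_count=1)
        if file_to_save:
            model.save(file_to_save)
    all_vectors = []
    for sentence in sentences:
        all_vectors.append([
            model.wv[word] if word in model.wv else np.zeros(model.vector_size)
            for word in sentence
        ])
    return all_vectors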
_w2v_path = os.path.join(os.path.curdir, "runs", FLAGS.wordembedding_name)
print("Writing to {}\n".format(out_dir))
if not os.path.exists(out_dir):
    os.makedirs(out_dir)

# Data preprocess
# =======================================================

# Load data
print("Loading data...")
x_text, y = data_helpers.load_positive_negative_data_files(FLAGS)

# Get embedding vector
sentences, max_document_length = data_helpers.padding_sentences(
    x_text, '<PADDING>',
    word_segment=FLAGS.word_segment,
    padding_sentence_length=FLAGS.max_document_len)
if not os.path.exists(_w2v_path):
    _, w2vModel = word2vec_helpers.embedding_sentences(
        sentences=sentences,
        embedding_size=FLAGS.embedding_dim,
        file_to_save=_w2v_path)
else:
    _, w2vModel = word2vec_helpers.embedding_sentences(
        sentences=None,
        embedding_size=FLAGS.embedding_dim,
        file_to_load=_w2v_path)
FLAGS.embedding_dim = w2vModel.vector_size
print('wordembedding.dim = {}'.format(FLAGS.embedding_dim))
print('wordembedding.length = {}'.format(len(w2vModel.wv.vocab)))
def validate_method(x_raw, y_test, max_document_length):
    # Get embedding vector x_test
    sentences, max_document_length = data_helpers.padding_sentences(
        x_raw, '<PADDING>', padding_sentence_length=max_document_length)
    x_test = np.array(
        word2vec_helpers.embedding_sentences(
            sentences, file_to_load=trained_word2vec_model_file))
    print("x_test.shape = {}".format(x_test.shape))

    # Evaluation
    # ==================================================
    print("\nEvaluating...\n")
    checkpoint_file = tf.train.latest_checkpoint(FLAGS.checkpoint_dir)
    graph = tf.Graph()
    with graph.as_default():
        session_conf = tf.ConfigProto(
            allow_soft_placement=FLAGS.allow_soft_placement,
            log_device_placement=FLAGS.log_device_placement)
        sess = tf.Session(config=session_conf)
        with sess.as_default():
            # Load the saved meta graph and restore variables
            saver = tf.train.import_meta_graph("{}.meta".format(checkpoint_file))
            saver.restore(sess, checkpoint_file)

            # Get the placeholders from the graph by name
            input_x = graph.get_operation_by_name("input_x").outputs[0]
            # input_y = graph.get_operation_by_name("input_y").outputs[0]
            dropout_keep_prob = graph.get_operation_by_name(
                "dropout_keep_prob").outputs[0]

            # Tensors we want to evaluate
            predictions = graph.get_operation_by_name(
                "output/predictions").outputs[0]

            # Generate batches for one epoch
            batches = data_helpers.batch_iter(
                list(x_test), FLAGS.batch_size, 1, shuffle=False)

            # Collect the predictions here
            all_predictions = []
            for x_test_batch in batches:
                batch_predictions = sess.run(
                    predictions,
                    {input_x: x_test_batch, dropout_keep_prob: 1.0})
                all_predictions = np.concatenate(
                    [all_predictions, batch_predictions])

    # Print accuracy if y_test is defined
    if y_test is not None:
        correct_predictions = float(sum(all_predictions == y_test))
        print("Total number of test examples: {}".format(len(y_test)))
        print("Accuracy: {:g}".format(correct_predictions / float(len(y_test))))

    # Save the evaluation to a csv
    predictions_human_readable = np.column_stack(
        (np.array([text.encode('utf-8') for text in x_raw]), all_predictions))
    out_path = os.path.join(FLAGS.checkpoint_dir, "..", "prediction.csv")
    print("Saving evaluation to {0}".format(out_path))
    with open(out_path, 'a+') as f:
        csv.writer(f).writerows(predictions_human_readable)
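# A minimal sketch of the batch_iter generator used above, under the assumption
# that it behaves like the common data_helpers.batch_iter found in TensorFlow
# text-CNN examples: yield successive slices of the data for a given number of
# epochs, optionally reshuffling each epoch. The repo's actual helper may differ.
def batch_iter_sketch(data, batch_size, num_epochs, shuffle=True):
    data = np.array(data)
    data_size = len(data)
    num_batches_per_epoch = int((data_size - 1) / batch_size) + 1
    for epoch in range(num_epochs):
        if shuffle:
            shuffled = data[np.random.permutation(np.arange(data_size))]
        else:
            shuffled = data
        for batch_num in range(num_batches_per_epoch):
            start = batch_num * batch_size
            end = min((batch_num + 1) * batch_size, data_size)
            yield shuffled[start:end]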
def sample(args):
    timestamp = str(int(time.time()))
    out_dir = os.path.abspath(os.path.join(os.path.curdir, "runs", timestamp))
    print("Writing to {}\n".format(out_dir))
    if not os.path.exists(out_dir):
        os.makedirs(out_dir)

    print('Loading data')
    #x_text, y = data_helpers.load_positive_negative_data_files1()
    # Get embedding vector
    #sentences, max_document_length = data_helpers.padding_sentences(x_text, '<PADDING>')
    #x = np.array(word2vec_helpers.embedding_sentences(sentences, embedding_size=FLAGS.embedding_dim,
    #                                                  file_to_save=os.path.join(out_dir, 'trained_word2vec.model')))

    checkpoint_file = tf.train.latest_checkpoint(args.checkpoint_dir)
    graph = tf.Graph()
    with graph.as_default():
        sess = tf.Session()
        with sess.as_default():
            # Load the saved meta graph and restore variables
            saver = tf.train.import_meta_graph("{}.meta".format(checkpoint_file))
            saver.restore(sess, checkpoint_file)

            # Get the placeholders from the graph by name
            input_x = graph.get_operation_by_name("input_x").outputs[0]
            # input_y = graph.get_operation_by_name("input_y").outputs[0]
            dropout_keep_prob = graph.get_operation_by_name(
                "dropout_keep_prob").outputs[0]

            # Tensors we want to evaluate
            predictions = graph.get_operation_by_name(
                "output/predictions").outputs[0]

            # Classes: 体育 (sports), 娱乐 (entertainment), 彩票 (lottery), 房产 (real estate)
            textlist = [
                u'谁 足 球 踢 得 好 ?',
                u'彩 票 中 奖 几 乎 是 不 可 能 的',
                u'上 海 的 房 价 始 终 居 高 不 下',
                u'关 晓 彤 主 演 新 版 倚 天 屠 龙 记 让 人 笑 掉 大 牙 ',
                u'杜 兰 特 是 勇 士 的 篮 球 运 动 员 ',
                u'娱 乐 圈 吸 毒 是 常 有 的 事',
                u'上 海 一 彩 民 中了 二 等 奖',
                u' 万 达 集 团 再 次 中 标 关键 地 段 的 房 产 开 发 权 ',
                u'很 多 观 众 每 晚 准 时 看 体 育 新 闻 ',
                u'草 莓 音 乐 节 即 将 开 始',
                u'中 国 福 利 彩 票 是 否 有 黑 幕 不 得 而 知 ',
                u'房 地 产 行 业 永 远 不 会 倒'
            ]
            for text in textlist:
                # Classify one sentence at a time
                batch = [text]
                print(batch)
                sentences_padded, max_document_length = data_helpers.padding_sentences(
                    batch, '<PADDING>')
                raw_x = np.array(
                    word2vec_helpers.embedding_sentences(
                        sentences_padded,
                        embedding_size=FLAGS.embedding_dim,
                        file_to_load='C:/Users/I343039/PycharmProjects/nlp-multiclass-text-tf/runs/1508811868/trained_word2vec.model'))
                predicted_result = sess.run(
                    predictions, {input_x: raw_x, dropout_keep_prob: 1.0})
                if predicted_result[0] == 0:
                    print(text + ": 体育")   # sports
                elif predicted_result[0] == 1:
                    print(text + ": 娱乐")   # entertainment
                elif predicted_result[0] == 2:
                    print(text + ": 彩票")   # lottery
                elif predicted_result[0] == 3:
                    print(text + ": 房产")   # real estate
def preprocess():
    # Data Preparation
    # ==================================================
    global out_dir
    timestamp = str(int(time.time()))
    out_dir = os.path.abspath(os.path.join(os.path.curdir, "runs", timestamp))
    print("Writing to {}\n".format(out_dir))
    if not os.path.exists(out_dir):
        os.makedirs(out_dir)

    # Load data
    print("Loading data...")
    x_text, y, pos_len, neg_len = data_helpers.noisy_load_data_and_labels(
        FLAGS.positive_data_file, FLAGS.negative_data_file)
    data_size = len(x_text)

    """
    # Build vocabulary
    max_document_length = max([len(x.split(" ")) for x in x_text])
    vocab_processor = learn.preprocessing.VocabularyProcessor(max_document_length)
    x = np.array(list(vocab_processor.fit_transform(x_text)))
    """

    # Get embedding vector
    sentences, max_document_length = data_helpers.padding_sentences(
        x_text, '<PADDING>')
    x = np.array(
        word2vec_helpers.embedding_sentences(
            sentences,
            embedding_size=FLAGS.embedding_dim,
            file_to_save=os.path.join(out_dir, 'trained_word2vec.model')))
    #x = tf.cast(x, tf.float32)
    #vectors = word2vec_helpers.embedding_sentences([['first', 'sentence'], ['second', 'sentence']], embedding_size=4, min_count=1)
    print(x[0].shape)
    #y = np.reshape(y, (-1, 1))
    print("x.shape = {}".format(x.shape))
    print("y.shape = {}".format(y.shape))

    # Add Gaussian noise, scaled separately for each class
    data_sigma = data_scale()
    global gradient_sigma
    gradient_sigma = gradient_scale(data_size)
    pos_noise = np.random.normal(0, data_sigma, [pos_len, x.shape[1], x.shape[2]])
    neg_noise = np.random.normal(0, data_sigma, [neg_len, x.shape[1], x.shape[2]])
    noise = np.concatenate([pos_noise, neg_noise], 0)
    x = x + noise

    # Save params
    """
    training_params_file = os.path.join(out_dir, 'training_params.pickle')
    params = {'num_labels': FLAGS.num_labels, 'max_document_length': max_document_length}
    data_helpers.saveDict(params, training_params_file)
    """

    # Randomly shuffle data
    np.random.seed(10)
    shuffle_indices = np.random.permutation(np.arange(len(y)))
    print(shuffle_indices)
    #x_shuffle_indices = [[index] for index in shuffle_indices]
    print("the shape of x: {}".format(x.shape[0]))
    print("indices shape: {}".format(shuffle_indices))
    """
    x_shuffled = tf.gather_nd(x, x_shuffle_indices, name=None)
    """
    x_shuffled = x[shuffle_indices]
    #x_shuffled = x[x_shuffle_indices]
    y_shuffled = y[shuffle_indices]

    # Split train/test set
    # TODO: This is very crude, should use cross-validation
    dev_sample_index = -1 * int(FLAGS.dev_sample_percentage * float(len(y)))
    x_train, x_dev = x_shuffled[:dev_sample_index], x_shuffled[dev_sample_index:]
    y_train, y_dev = y_shuffled[:dev_sample_index], y_shuffled[dev_sample_index:]
    print("shape of x: {}".format(x_train.shape))
    print("shape of y: {}".format(y_train.shape))
    del x, y, x_shuffled, y_shuffled

    """
    print("Vocabulary Size: {:d}".format(len(vocab_processor.vocabulary_)))
    print("Train/Dev split: {:d}/{:d}".format(len(y_train), len(y_dev)))
    """
    #return x_train, y_train, vocab_processor, x_dev, y_dev
    return x_train, y_train, x_dev, y_dev
print("Using training params file : {}".format(training_params_file)) # Load params params = data_helpers.loadDict(training_params_file) num_labels = int(params['num_labels']) max_document_length = int(params['max_document_length']) # Load data if FLAGS.eval_train: x_raw, y_test = data_helpers.load_data_and_labels(FLAGS.input_text_file, FLAGS.input_label_file, num_labels) else: x_raw = ["a masterpiece four years in the making", "everything is off."] y_test = [1, 0] # Get Embedding vector x_test sentences, max_document_length = data_helpers.padding_sentences(x_raw, '<PADDING>', padding_sentence_length = max_document_length) x_test = np.array(word2vec_helpers.embedding_sentences(sentences, file_to_load = trained_word2vec_model_file)) print("x_test.shape = {}".format(x_test.shape)) # Evaluation # ================================================== print("\nEvaluating...\n") checkpoint_file = tf.train.latest_checkpoint(FLAGS.checkpoint_dir) graph = tf.Graph() with graph.as_default(): session_conf = tf.ConfigProto( allow_soft_placement=FLAGS.allow_soft_placement, log_device_placement=FLAGS.log_device_placement) sess = tf.Session(config=session_conf) with sess.as_default():
# Data preprocess
# =======================================================

# Load data
print("Loading data...")
x_text, y = data_helpers.load_positive_negative_data_files(
    FLAGS.cooking_data_file, FLAGS.music_data_file, FLAGS.video_data_file)
# Test set
x_test, y_test = data_helpers.load_positive_negative_data_files(
    FLAGS.cooking_test, FLAGS.music_test, FLAGS.video_test)
print('=============', len(x_test), len(x_test[0]))

# Get embedding vector
sentences, max_document_length = data_helpers.padding_sentences(
    x_text, '<PADDING>')
x = np.array(
    word2vec_helpers.embedding_sentences(
        sentences,
        embedding_size=FLAGS.embedding_dim,
        file_to_save=os.path.join(out_dir, 'trained_word2vec.model')))
# Test set
sentences_test, max_document_length_test = data_helpers.padding_sentences(
    x_test, '<PADDING>', padding_sentence_length=18)
print(len(sentences_test), max_document_length_test)
x1 = np.array(
    word2vec_helpers.embedding_sentences(
        sentences_test,
        embedding_size=FLAGS.embedding_dim,
        file_to_save=os.path.join(
out_dir = data_helpers.mkdir_if_not_exist("./runs")

# Load data
print("Loading data...")
max_sentence_len = FLAGS.max_sentence_len
x_text, y = data_helpers.load_data_and_labels(FLAGS.caijing_data_file,
                                              FLAGS.caipiao_data_file,
                                              FLAGS.fangchan_data_file,
                                              FLAGS.gupiao_data_file,
                                              FLAGS.jiaju_data_file,
                                              FLAGS.jiaoyu_data_file,
                                              FLAGS.shishang_data_file,
                                              FLAGS.shizheng_data_file,
                                              FLAGS.tiyu_data_file,
                                              FLAGS.yule_data_file)
sentences = data_helpers.padding_sentences(x_text, FLAGS.padding_token, max_sentence_len)
print("len(x_text)", len(x_text))
print("len(y)", len(y))

# Build vocabulary
voc = None
vocsize = None
if os.path.exists('./runs/vocab'):
    # When the session is restored, just reload the existing vocabulary
    voc, vocsize = data_helpers.read_vocabulary('./runs/vocab')
else:
    voc, vocsize = data_helpers.build_vocabulary(sentences, './runs/vocab')
x = np.array(data_helpers.sentence2matrix(sentences, max_sentence_len, voc))
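# A hedged sketch of the vocabulary-index path used above. build_vocabulary and
# sentence2matrix are this repo's own helpers and are not reproduced here; the
# sketch only shows the assumed contract: map every token to an integer id (with
# a fallback id for unknown words), so the model can learn its own embedding
# table instead of consuming precomputed word2vec vectors as in other snippets.
def sentence2matrix_sketch(sentences, max_sentence_len, voc, unknown_id=0):
    matrix = np.zeros((len(sentences), max_sentence_len), dtype=np.int32)
    for row, sentence in enumerate(sentences):
        for col, token in enumerate(sentence[:max_sentence_len]):
            matrix[row, col] = voc.get(token, unknown_id)  # assumes voc maps token -> id
    return matrix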
# Data preprocess
# =======================================================

# Load data
print("Loading data...")
# x_text is a two-dimensional list: the outer list has one element per sentence,
# and each element is the list of words that make up that sentence, e.g.:
# ['全国', '少年儿童', '游泳', '锦标赛', '开幕', '新华社', '广州', '月', '日电', '记者', '何惠飞', '年', '喜乐', '杯', '全国', '少年儿童', '游泳', '锦标赛', '昨天', '在', '游泳', '之', '乡', '广东省', '东莞市', '开幕', '参加', '这次', '比赛', '的', '有', '个', '省', '自治区', '直辖市', '的', '名', '男女', '选手', '比赛', '分为', '岁', '组和岁', '以下', '组', '参赛者', '都', '是', '近几年', '涌现', '的', '优秀', '小', '选手', '不少', '是', '本', '年龄组', '的', '全国纪录', '创造者', '这次', '比赛', '是', '对', '我国', '参加', '下', '两届', '奥运会', '游泳赛', '后备力量', '的', '一次', '检阅', '国家体委', '将', '通过', '这次', '比赛', '选拔', '优秀', '选手', '组队参加', '今年', '月', '在', '印度尼西亚', '举行', '的', '亚太区', '年龄组', '游泳', '比赛', '比赛', '将', '于', '日', '结束', '完']
x_text, y = data_helpers.load_data_files(
    FLAGS.sports_file, FLAGS.amusement_file, FLAGS.home_file,
    FLAGS.estate_file, FLAGS.education_file, FLAGS.fashion_file,
    FLAGS.politics_file, FLAGS.game_file, FLAGS.technology_file,
    FLAGS.finance_file)

# Get embedding vector
sentences, max_document_length = data_helpers.padding_sentences(
    x_text, 'PADDING', FLAGS.max_seq_length)
# sentences is now a 2D list of word lists in which every sentence has the same
# length, because each one is padded with 'PADDING' up to the maximum length.

# Convert the returned list into an array
x_embedding = word2vec_helpers.embedding_sentences(
    sentences,
    embedding_size=FLAGS.embedding_dim,
    ext_emb_path=FLAGS.word_embedding_file)
x = np.array(x_embedding)
# The three dimensions of x are: total number of sentences, number of words per
# sentence (the length of the longest sentence), and the word-vector dimension.
print("x.shape =", x.shape)
print("y.shape =", y.shape)

# Save params
training_params_file = 'train/training_params.pickle'
params = {
    'num_labels': FLAGS.num_labels,
timestamp = str(int(time.time()))
out_dir = os.path.abspath(os.path.join(os.path.curdir, "runs", timestamp))
print("Writing to {}\n".format(out_dir))
if not os.path.exists(out_dir):
    os.makedirs(out_dir)

# Data preprocess
# =======================================================

# Load data
print("Loading data...")
x_text, y = data_helpers.load_positive_negative_data_files(
    FLAGS.positive_data_file, FLAGS.negative_data_file)

# Get embedding vector
sentences, max_document_length = data_helpers.padding_sentences(
    x_text, '<PADDING>')
print('max_document_length:' + str(max_document_length))
x = np.array(
    word2vec_helpers.embedding_sentences(
        sentences,
        embedding_size=FLAGS.embedding_dim,
        file_to_save=os.path.join(out_dir, 'trained_word2vec.model')))
print("x.shape = {}".format(x.shape))
print("y.shape = {}".format(y.shape))
# The original result was

# Save params
training_params_file = os.path.join(out_dir, 'training_params.pickle')
params = {
    'num_labels': FLAGS.num_labels,
    'max_document_length': max_document_length
flags.DEFINE_float("beta1", 0.5, "Momentum term of adam [0.5]") FLAGS = flags.FLAGS print "training params:::" print flags.FLAGS.__flags # gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.333) run_config = tf.ConfigProto() run_config.gpu_options.allow_growth = True # Output directory for models and summaries out_dir = data_helpers.mkdir_if_not_exist("./runs") # Load true_sentences and build vocab true_sentences = data_helpers.read_and_clean_file(FLAGS.true_data_file) padding_true_sentences = data_helpers.padding_sentences(true_sentences, FLAGS.padding_token, FLAGS.max_sentences_length) # Question: should we build voc just use true sentences or use all chinese word dic? # Here we use true sentences voc,voc_size = data_helpers.build_vocabulary(padding_true_sentences,'./runs/vocab') true_data = np.array(data_helpers.sentence2matrix(true_sentences,FLAGS.max_sentences_length,voc)) # Randomly shuffle data np.random.seed(10) shuffle_indices = np.random.permutation(np.arange(len(true_sentences))) true_data_shuffled = true_data[shuffle_indices] #fake_factors = fake_factor_dist.sample((FLAGS.batch_size, FLAGS.max_sentences_length,FLAGS.embedding_dim)) global_graph = tf.Graph()
# Data preprocess
# =======================================================

# Load data
print("Loading data...")
# No word segmentation
x_text, y = data_helpers.load_positive_negative_data_files(
    FLAGS.positive_data_file,
    FLAGS.negative_data_file,
    cut=False,
    stop_words_list_file=None,
)
# Word-segmentation version:
#x_text, y = data_helpers.load_positive_negative_data_files(FLAGS.positive_data_file, FLAGS.negative_data_file,
#                                                           cut=True, stop_words_list_file=FLAGS.stop_word_file)
#print(x_text)

# Get embedding vector
sentences = data_helpers.padding_sentences(x_text, '<PADDING>',
                                           padding_sentence_length=20)
x = np.array(
    word2vec_helpers.embedding_sentences(
        sentences,
        embedding_size=FLAGS.embedding_dim,
        file_to_save=os.path.join(out_dir, 'trained_word2vec.model')))
print("x.shape = {}".format(x.shape))
print("y.shape = {}".format(y.shape))

# Save params
training_params_file = os.path.join(out_dir, 'training_params.pickle')
params = {'num_labels': FLAGS.num_labels, 'max_document_length': 20}
data_helpers.saveDict(params, training_params_file)
"Log placement of ops on devices") FLAGS = tf.flags.FLAGS #FLAGS._parse_flags() print("\nParameters:") for attr, value in sorted(FLAGS.__flags.items()): print("{}={}".format(attr.upper(), value.value)) print("") # CHANGE THIS: Load data. Load your own data here if FLAGS.eval_train: x_raw, y_test = data_helpers.load_testfile_and_labels( FLAGS.input_test_file, FLAGS.input_label_file, FLAGS.num_labels) # Get Embedding vector x_test sentences = data_helpers.padding_sentences(x_raw, FLAGS.padding_token, FLAGS.max_document_length) print "sentences length : %d" % len(sentences) voc, _ = data_helpers.read_vocabulary('./runs/vocab') print "voc length : %d" % len(voc) x_test = np.array( data_helpers.sentence2matrix(sentences, FLAGS.max_document_length, voc)) print("x_test.shape = {}".format(x_test.shape)) print("\nEvaluating...\n") # Evaluation # ================================================== checkpoint_file = tf.train.latest_checkpoint(FLAGS.checkpoint_dir) #checkpoint_file = "./runs/model/classify_text.ckpt-1000" graph = tf.Graph() with graph.as_default():