def predict(self, url):
    # Restore the training-time parameters so inference uses the same
    # padding length and label count as the saved model.
    params = data_helper.loadDict(self.training_params_file)
    num_labels = int(params['num_labels'])
    max_document_length = int(params['max_document_length'])

    # Pad the single input URL and embed it with the trained word2vec model.
    x_raw = [url]
    sentences, max_document_length = data_helper.padding_sentences(
        x_raw, '<PADDING>', padding_sentence_length=max_document_length)
    x_test = np.array(
        word2vec_helpers.embedding_sentences(
            sentences, file_to_load=self.trained_word2vec_model_file))

    # Run inference with dropout disabled (keep probability 1.0).
    result = self.sess.run(self.predictions,
                           {self.input_x: x_test,
                            self.dropout_keep_prob: 1.0})
    result = 'good' if result else 'bad'
    print("Request examples: {}, inference result: {}".format(url, result))
    return result
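# A minimal usage sketch for the method above (assumption: predict lives on a
# predictor class, here called UrlPredictor, whose constructor loads the
# checkpoint, word2vec model, and training params -- the class name and
# constructor arguments are illustrative, not taken from the repo).
predictor = UrlPredictor(
    checkpoint_dir='./runs/checkpoints',
    trained_word2vec_model_file='./runs/trained_word2vec.model',
    training_params_file='./runs/training_params.pickle')
label = predictor.predict('http://example.com/index.php?id=1')
print(label)  # expected to print 'good' or 'bad'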
# Randomly sample a subset (3000 examples) of the original data.
new_x_text = []
new_y = []
for i in range(3000):
    # random.randint is inclusive on both ends, so the upper bound must be
    # len(x_text) - 1 to avoid an IndexError.
    rand_idx = random.randint(0, len(x_text) - 1)
    new_x_text.append(x_text[rand_idx])
    new_y.append(y[rand_idx])
print("new_x_text length: %d" % len(new_x_text))
print("new_y length: %d" % len(new_y))

# Embedding vectors
print("Padding sentences...")
sentences, max_document_length = data_helper.padding_sentences(new_x_text, '<PADDING>')
print("embedding_sentences...")
all_vectors = word2vec_helpers.embedding_sentences(
    sentences,
    embedding_size=FLAGS.embedding_dim,
    file_to_save=os.path.join(out_dir, 'trained_word2vec.model'))
print("all_vectors length %d * %d * %d" %
      (len(all_vectors), len(all_vectors[0]), len(all_vectors[0][0])))

# Converting the full list of vectors at once can exhaust memory on large
# corpora. TODO: transform large vectors into a sparse matrix.
x = np.asarray(all_vectors)
y = np.asarray(new_y)
print("x.shape = {}".format(x.shape))
print("y.shape = {}".format(y.shape))

# Save params
training_params_file = os.path.join(out_dir, 'training_params.pickle')
params = {'num_labels': FLAGS.num_labels,
          'max_document_length': max_document_length}
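# The params dict above is later written with data_helper.saveDict and read
# back with data_helper.loadDict (see the predict method). A minimal sketch of
# what those helpers are assumed to do -- a plain pickle round trip; the actual
# implementation in data_helper.py may differ:
import pickle

def saveDict(params, path):
    # Serialize the training parameters so evaluation/inference can reuse the
    # same num_labels and max_document_length.
    with open(path, 'wb') as f:
        pickle.dump(params, f)

def loadDict(path):
    with open(path, 'rb') as f:
        return pickle.load(f)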
# Build the vocabulary from the saved vocab file: one token per line, mapped
# to its line index.
vocab_tokens = [
    line.strip()
    for line in codecs.open('./runs/vocab', 'r', 'utf-8').readlines()
]
vocsize = len(vocab_tokens)
vocab = {}
for (i, token) in enumerate(vocab_tokens):
    vocab[token] = i

# Load the training data and labels.
x_text, y = data_helper.load_data_and_labels('./data/train_data/', './runs/vocab')

# Pad the sentences: arguments are the corpus, the padding token, and the
# maximum padded length.
sentences = data_helper.padding_sentences(x_text, FLAGS.padding_token, FLAGS.max_sentence_len)
print("len(x_text)", len(x_text))
print("len(y)", len(y))

# Convert the corpus into a matrix of token ids.
x = np.array(
    data_helper.sentence2matrix(sentences, FLAGS.max_sentence_len, vocab))

# Randomly shuffle data
np.random.seed(10)
shuffle_indices = np.random.permutation(np.arange(len(y)))
x_train = x[shuffle_indices]
y_train = y[shuffle_indices]
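# A minimal sketch of what data_helper.sentence2matrix is assumed to do: map
# each token to its vocabulary id and truncate/pad every sentence to max_len.
# This is an illustrative guess, not the repo's actual implementation (the
# real helper may handle out-of-vocabulary and padding ids differently).
def sentence2matrix(sentences, max_len, vocab, unk_id=0):
    matrix = []
    for sent in sentences:
        ids = [vocab.get(token, unk_id) for token in sent[:max_len]]
        # Pad short sentences; the padding id of 0 is an assumption here.
        ids += [unk_id] * (max_len - len(ids))
        matrix.append(ids)
    return matrix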
# Load params
params = data_helper.loadDict(training_params_file)
num_labels = int(params['num_labels'])
max_document_length = int(params['max_document_length'])

# Load data
if FLAGS.eval_train and FLAGS.single_url is None:
    x_raw, y_test = data_helper.load_data_and_labels(FLAGS.input_text_file)
elif FLAGS.single_url is not None:
    x_raw = [FLAGS.single_url]
    y_test = None
else:
    x_raw = ["a masterpiece four years in the making", "everything is off."]
    y_test = [1, 0]

# Get embedding vector x_test
sentences, max_document_length = data_helper.padding_sentences(
    x_raw, '<PADDING>', padding_sentence_length=max_document_length)
x_test = np.array(
    word2vec_helpers.embedding_sentences(
        sentences, file_to_load=trained_word2vec_model_file))
print("x_test.shape = {}".format(x_test.shape))

# Evaluation
# ==================================================
print("\nEvaluating...\n")
checkpoint_file = tf.train.latest_checkpoint(FLAGS.checkpoint_dir)
graph = tf.Graph()
with graph.as_default():
    session_conf = tf.ConfigProto(
        allow_soft_placement=FLAGS.allow_soft_placement,
        log_device_placement=FLAGS.log_device_placement)
    sess = tf.Session(config=session_conf)
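    # A hedged sketch of how the evaluation typically continues inside this
    # graph block in TF1-style eval scripts: restore the latest checkpoint's
    # meta graph and look up the input/prediction tensors by name. The tensor
    # names ("input_x", "dropout_keep_prob", "output/predictions") are
    # assumptions about how the model graph was defined, not verified against
    # this repo.
    with sess.as_default():
        saver = tf.train.import_meta_graph("{}.meta".format(checkpoint_file))
        saver.restore(sess, checkpoint_file)
        input_x = graph.get_operation_by_name("input_x").outputs[0]
        dropout_keep_prob = graph.get_operation_by_name("dropout_keep_prob").outputs[0]
        predictions = graph.get_operation_by_name("output/predictions").outputs[0]
        all_predictions = sess.run(
            predictions, {input_x: x_test, dropout_keep_prob: 1.0})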
def data_preprocess():
    # Data preprocess
    # =======================================================

    # Load data
    print("Loading data...")
    if not os.path.exists(os.path.join(out_dir, "data_x.npy")):
        x, y = data_helper.load_data_and_labels(FLAGS.data_file)

        # Get embedding vectors (only the first 1000 examples are used here).
        x = x[:1000]
        y = y[:1000]
        sentences, max_document_length = data_helper.padding_sentences(
            x, '<PADDING>', padding_sentence_length=FLAGS.sequence_length)
        print(len(sentences[0]))

        if not os.path.exists(os.path.join(out_dir, "trained_word2vec.model")):
            # Train a new word2vec model and save it alongside the run.
            x = np.array(
                word2vec_helpers.embedding_sentences(
                    sentences,
                    embedding_size=FLAGS.embedding_dim,
                    file_to_save=os.path.join(out_dir, 'trained_word2vec.model')))
        else:
            print('w2v model found...')
            x = np.array(
                word2vec_helpers.embedding_sentences(
                    sentences,
                    embedding_size=FLAGS.embedding_dim,
                    file_to_save=os.path.join(out_dir, 'trained_word2vec.model'),
                    file_to_load=os.path.join(out_dir, 'trained_word2vec.model')))
        y = np.array(y)
        # NOTE: with these saves commented out, the cached-data branch below is
        # only taken if the .npy files were produced by an earlier run.
        # np.save(os.path.join(out_dir, "data_x.npy"), x)
        # np.save(os.path.join(out_dir, "data_y.npy"), y)
        del sentences
    else:
        print('data found...')
        x = np.load(os.path.join(out_dir, "data_x.npy"))
        y = np.load(os.path.join(out_dir, "data_y.npy"))

    print("x.shape = {}".format(x.shape))
    print("y.shape = {}".format(y.shape))

    # Save params
    if not os.path.exists(os.path.join(out_dir, "training_params.pickle")):
        training_params_file = os.path.join(out_dir, 'training_params.pickle')
        params = {
            'num_labels': FLAGS.num_labels,
            'max_document_length': max_document_length
        }
        data_helper.saveDict(params, training_params_file)

    # Split into training and testing set, 80/20 ratio.
    x_train, x_test, y_train, y_test = train_test_split(
        x, y, test_size=0.2, random_state=42)
    del x, y
    return x_train, x_test, y_train, y_test
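# A minimal usage sketch for data_preprocess (assumption: a training step that
# consumes the split exists elsewhere in the repo; the train() name used in the
# comment below is illustrative only).
if __name__ == '__main__':
    x_train, x_test, y_train, y_test = data_preprocess()
    print("train examples: {}, test examples: {}".format(len(x_train), len(x_test)))
    # train(x_train, y_train, x_test, y_test)  # hypothetical training entry point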