def batch_iter(x, y, wv_model, batch_size=64):
    """Generate shuffled batches, mapping every word to its word2vec vocabulary index."""
    config = TCNNConfig()
    data_len = len(y)
    num_batch = int((data_len - 1) / batch_size) + 1

    indices = np.random.permutation(np.arange(data_len))
    x_shuffle = x.loc[indices].reset_index(drop=True)
    y_shuffle = y[indices]

    for i in range(num_batch):
        start_id = i * batch_size
        end_id = min((i + 1) * batch_size, data_len)
        # Tokenize each document in the batch.
        x_batch = [doc.split(' ') for doc in x_shuffle['article'][start_id:end_id]]
        # Map words to their word2vec vocabulary indices; out-of-vocabulary words
        # get the extra index len(wv_model.wv.vocab) (the "unknown word" vector).
        for j, tokens in enumerate(x_batch):
            x_batch[j] = [
                wv_model.wv.vocab[word].index if word in wv_model.wv.vocab
                else len(wv_model.wv.vocab)
                for word in tokens
            ]
        x_batch = kr.preprocessing.sequence.pad_sequences(x_batch, config.seq_length)
        yield x_batch, y_shuffle[start_id:end_id]
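# Usage sketch (assumption, not part of the original module): one way a TF 1.x training
# loop could consume batch_iter above. The attribute names model.input_x, model.input_y,
# model.keep_prob, model.loss and model.optim are assumed TextCNN placeholders/ops.
def example_train_epoch(session, model, x_train, y_train, wv_model, batch_size=64):
    for x_batch, y_batch in batch_iter(x_train, y_train, wv_model, batch_size):
        feed_dict = {
            model.input_x: x_batch,
            model.input_y: y_batch,
            model.keep_prob: 0.5,  # dropout keep-probability while training
        }
        # One optimization step; the batch loss is returned for monitoring.
        loss, _ = session.run([model.loss, model.optim], feed_dict=feed_dict)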
def __init__(self):
    self.config = TCNNConfig()
    self.model = TextCNN(self.config)

    self.session = tf.Session()
    self.session.run(tf.global_variables_initializer())
    # Restore the trained weights from the checkpoint at save_path.
    saver = tf.train.Saver()
    saver.restore(sess=self.session, save_path=save_path)
def batch_iter(x, y, batch_size=64, maxtext=39759):
    """Generate shuffled batches of TF-IDF features reduced to seq_length dimensions."""
    config = TCNNConfig()
    data_len = len(y)
    num_batch = int((data_len - 1) / batch_size) + 1

    indices = np.random.permutation(np.arange(data_len))
    # The feature matrix is too large to densify at once, so it is sliced and
    # reduced batch by batch as needed.
    x_shuffle = x.loc[indices].reset_index(drop=True)
    y_shuffle = y[indices]

    # Build the TF-IDF term-document matrix.
    logging.info('generating Tf-Idf matrix...')
    vec = TfidfVectorizer(ngram_range=(1, 2), min_df=3, max_df=0.9,
                          use_idf=1, smooth_idf=1, sublinear_tf=1)
    train_set_term_doc = vec.fit_transform(x_shuffle['word_seg'])
    logging.info('generating done. {}'.format(train_set_term_doc.shape))

    # Truncated SVD is another dimensionality-reduction method; unlike PCA
    # (or t-SNE) it can be applied directly to a sparse matrix.
    svd = TruncatedSVD(config.seq_length, algorithm='arpack')

    for i in range(num_batch):
        start_id = i * batch_size
        end_id = min((i + 1) * batch_size, data_len)
        x_batch = train_set_term_doc[start_id:end_id]
        logging.info('x_batch extract: {}, {}~{}'.format(x_batch.shape, start_id, end_id))
        x_batch = svd.fit_transform(x_batch)
        # Keep float features: pad_sequences defaults to dtype='int32'.
        x_batch = kr.preprocessing.sequence.pad_sequences(x_batch, config.seq_length,
                                                          dtype='float32')
        logging.info('TruncatedSVD fit_transform done. {}'.format(x_batch.shape))
        # Reshape each row to (seq_length, 1) so the CNN sees a single feature channel.
        x_vec = np.empty(shape=(end_id - start_id, config.seq_length, 1))
        for j in range(len(x_batch)):
            x_vec[j] = x_batch[j].reshape(-1, 1)
        yield x_vec, y_shuffle[start_id:end_id]
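# Sketch (assumption, not from the original code): fitting TruncatedSVD once on the whole
# sparse TF-IDF matrix and only transforming each slice keeps all batches in the same
# reduced feature space, whereas the generator above refits the SVD on every batch.
def reduce_tfidf_in_batches(term_doc, seq_length, batch_size=64):
    svd = TruncatedSVD(n_components=seq_length, algorithm='arpack')
    svd.fit(term_doc)  # fit once on the full sparse term-document matrix
    for start in range(0, term_doc.shape[0], batch_size):
        # Each yielded block has shape (batch, seq_length) in the shared basis.
        yield svd.transform(term_doc[start:start + batch_size])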
def batch_iter(x, y, batch_size=64):
    config = TCNNConfig()
    seq_length = config.seq_length
    data_len = len(x)
    num_batch = int((data_len - 1) / batch_size) + 1
    for i in range(num_batch):
        start_id = i * batch_size
        end_id = min((i + 1) * batch_size, data_len)
        x_batch = [str(doc).split(' ') for doc in x['word_seg'][start_id:end_id]]
        x_batch_pad = kr.preprocessing.sequence.pad_sequences(x_batch, maxlen=seq_length)
        yield x_batch_pad, y[start_id:end_id]
def generateDataMatrix():
    """Convert the training corpus to a padded matrix of word2vec indices and cache it."""
    train_set = pd.read_csv(train_set_file)['word_seg']
    logging.info(len(train_set))
    logging.info('Loading word2vec model')
    wv_model = get_wordvec_model()
    config = TCNNConfig()

    x_split = np.array([doc.split(' ') for doc in train_set])
    for i, tokens in enumerate(x_split):
        if i % 100 == 0:
            logging.info('{}...'.format(i))
        # Keep only words present in the word2vec vocabulary, mapped to their indices.
        x_split[i] = [wv_model.wv.vocab[word].index for word in tokens
                      if word in wv_model.wv.vocab]
    logging.info('word2id done.')

    x_split = kr.preprocessing.sequence.pad_sequences(x_split, config.seq_length)
    logging.info('pad done.')
    np.save("cache/data_matrix.npy", x_split)
    logging.info('save done.')
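# Usage sketch (assumption, not part of the original module): reload the cached matrix
# written by generateDataMatrix, mirroring the np.save call above.
def load_data_matrix(path="cache/data_matrix.npy"):
    # Returns an array of shape (num_documents, config.seq_length) of word indices.
    return np.load(path)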
def batch_iter(x, y, batch_size=64):
    """Generate batches in which every padded word id is replaced by its word2vec embedding."""
    config = TCNNConfig()
    seq_length = config.seq_length
    data_len = len(x)
    num_batch = int((data_len - 1) / batch_size) + 1
    for i in range(num_batch):
        start_id = i * batch_size
        end_id = min((i + 1) * batch_size, data_len)
        x_batch = [str(doc).split(' ') for doc in x['word_seg'][start_id:end_id]]
        x_batch_pad = kr.preprocessing.sequence.pad_sequences(x_batch, maxlen=seq_length)

        # Look up the embedding vector for every (padded) word id in the batch.
        x_batch_vec = np.zeros(shape=(len(x_batch_pad), seq_length, config.embedding_dim))
        for j in range(x_batch_vec.shape[0]):
            words = x_batch_pad[j]
            tmp = [wv_model[int(w)] for w in words]
            # Keep float embeddings: pad_sequences defaults to dtype='int32'.
            tmp = kr.preprocessing.sequence.pad_sequences([tmp], maxlen=seq_length,
                                                          dtype='float32')[0]
            x_batch_vec[j] = tmp
        yield x_batch_vec, y[start_id:end_id]
    logging.info(metrics.classification_report(y_test_cls, y_pred_cls))

    # Confusion matrix
    logging.info("Confusion Matrix...")
    cm = metrics.confusion_matrix(y_test_cls, y_pred_cls)
    logging.info(cm)

    time_dif = get_time_dif(start_time)
    logging.info("Time usage: {}".format(time_dif))


if __name__ == '__main__':
    logging.basicConfig(format="%(asctime)s : %(levelname)s : %(message)s",
                        level=logging.INFO)  # show logs at INFO level and above

    if len(sys.argv) != 2 or sys.argv[1] not in ['train', 'test']:
        raise ValueError("""usage: python run_cnn.py [train / test]""")

    logging.info('Configuring CNN model...')
    config = TCNNConfig()
    # if not os.path.exists(vocab_dir):  # rebuild the vocabulary if it does not exist
    #     build_vocab(train_dir, vocab_dir, config.vocab_size)
    # categories, cat_to_id = read_category()
    # words, word_to_id = read_vocab(vocab_dir)
    # config.context_size = len(words)
    model = TextCNN(config)

    if sys.argv[1] == 'train':
        train2(restore=False)
    else:
        test()