Example #1
def batch_iter(x, y, wv_model, batch_size=64):
    '''Generate shuffled batches of (padded word-id sequences, labels).'''
    config = TCNNConfig()
    data_len = len(y)
    num_batch = int((data_len - 1) / batch_size) + 1
    indices = np.random.permutation(np.arange(data_len))
    x_shuffle = None
    try:
        # x is expected to be a pandas DataFrame; reorder it by the shuffled indices
        x_shuffle = x.loc[indices].reset_index(drop=True)
    except (AttributeError, KeyError):
        pass
    y_shuffle = y[indices]
    for i in range(num_batch):
        start_id = i * batch_size
        end_id = min((i + 1) * batch_size, data_len)
        # yield x_shuffle[start_id:end_id], y_shuffle[start_id:end_id]
        # split each article into tokens
        x_shuffle_split = [
            doc.split(' ') for doc in x_shuffle['article'][start_id:end_id]
        ]
        for j, tokens in enumerate(x_shuffle_split):
            # map each word to its word2vec index; words missing from the
            # vocabulary get the extra index len(wv_model.wv.vocab)
            x_shuffle_split[j] = [
                wv_model.wv.vocab[word].index
                if word in wv_model.wv.vocab else len(wv_model.wv.vocab)
                for word in tokens
            ]
        x_shuffle_split = kr.preprocessing.sequence.pad_sequences(
            x_shuffle_split, config.seq_length)
        yield x_shuffle_split, y_shuffle[start_id:end_id]
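
A minimal sketch of how this generator might be consumed in a TensorFlow 1.x training loop. The variables x_train, y_train and wv_model are assumed to have been loaded elsewhere, and the model attribute names (input_x, input_y, keep_prob, optim, loss) are assumptions about the TextCNN class, not taken from this example:

model = TextCNN(TCNNConfig())
with tf.Session() as session:
    session.run(tf.global_variables_initializer())
    for x_batch, y_batch in batch_iter(x_train, y_train, wv_model, batch_size=64):
        feed_dict = {
            model.input_x: x_batch,   # padded word-id sequences
            model.input_y: y_batch,   # labels for this batch
            model.keep_prob: 0.5,     # dropout keep probability during training
        }
        _, loss_val = session.run([model.optim, model.loss], feed_dict=feed_dict)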
Example #2
    def __init__(self):
        self.config = TCNNConfig()
        self.model = TextCNN(self.config)

        self.session = tf.Session()
        self.session.run(tf.global_variables_initializer())
        # restore the trained weights; save_path is a module-level checkpoint path defined elsewhere
        saver = tf.train.Saver()
        saver.restore(sess=self.session, save_path=save_path)
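
The constructor above assumes a module-level save_path pointing at an existing checkpoint written by tf.train.Saver().save(...). A minimal usage sketch, where the class name CnnModel and the checkpoint path are assumptions, not taken from this example:

save_path = 'checkpoints/textcnn/best_validation'  # hypothetical checkpoint prefix

predictor = CnnModel()  # builds the graph, opens a session, and restores the weights
# the restored session can then be used for inference, e.g.:
# predictor.session.run(predictor.model.y_pred_cls, feed_dict={...})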
Example #3
def batch_iter(x, y, batch_size=64, maxtext=39759):
    '''Generate batches of data.'''
    config = TCNNConfig()
    data_len = len(y)
    num_batch = int((data_len-1) / batch_size) + 1
    indices = np.random.permutation(np.arange(data_len))
    # the word vectors are too large to load at once, so each batch is built only when it is needed
    x_shuffle = None
    try:
        # x is expected to be a pandas DataFrame; reorder it by the shuffled indices
        x_shuffle = x.loc[indices].reset_index(drop=True)
    except (AttributeError, KeyError):
        pass
    y_shuffle = y[indices]

    # build the TF-IDF feature matrix
    logging.info('generating Tf-Idf matrix...')
    vec = TfidfVectorizer(ngram_range=(1, 2), min_df=3, max_df=0.9, use_idf=1, smooth_idf=1, sublinear_tf=1)
    train_set_term_doc = vec.fit_transform(x_shuffle['word_seg'])
    logging.info('generating done. {}'.format(train_set_term_doc.shape))
    # t-SNE / PCA dimensionality reduction (left disabled)
    # tsne = TSNE(perplexity=30, n_components=config.seq_length, method='exact', init='pca', n_iter=5000)
    # train_set_term_doc = tsne.fit_transform(train_set_term_doc)
    # logging.info('PCA:{}'.format(train_set_term_doc.shape))

    # truncated SVD (truncated singular value decomposition) is another dimensionality-reduction
    # method; unlike PCA it can be applied directly to sparse matrices
    svd = TruncatedSVD(config.seq_length, algorithm='arpack')

    for i in range(num_batch):
        start_id = i*batch_size
        end_id = min((i+1) * batch_size, data_len)
        # x_batch = [ [i] for i in train_set_term_doc[start_id : end_id]]
        x_batch = train_set_term_doc[start_id : end_id]
        logging.info('x_batch extract:{}, {}~{}'.format(x_batch.shape, start_id, end_id))
        # x_batch = kr.preprocessing.sequence.pad_sequences(x_batch, config.vocab_size)
        # reduce the batch to seq_length dimensions with truncated SVD
        x_batch = svd.fit_transform(x_batch)
        # dtype='float32' preserves the SVD output; the default int32 dtype would truncate it
        x_batch = kr.preprocessing.sequence.pad_sequences(x_batch, config.seq_length, dtype='float32')
        logging.info('TruncatedSVD fit_transform done. {}'.format(x_batch.shape))
        # reshape to (batch, seq_length, 1) so each feature becomes a length-1 "embedding"
        x_vec = np.empty(shape=(end_id-start_id, config.seq_length, 1))
        for j in range(len(x_batch)):
            x_vec[j] = x_batch[j].reshape(-1, 1)
        # if end_id == data_len:
        #     x_vec = None
        # else:
        #     # x_batch = tsne.fit_transform(x_batch)
        #     x_batch = svd.fit_transform(x_batch)
        #     logging.info('TruncatedSVD fit_transform done.')
        #     x_vec = np.empty(shape=(config.batch_size, config.seq_length, 1))
        #     for j in range(len(x_batch)):
        #         x_vec[j] = np.array([[i] for i in x_batch[j]])
        # fit the data (disabled PCA variant)
        # K = config.seq_length  # target number of dimensions
        # pca_model = pca.PCA(n_components=K).fit(x_batch)  # fit the data; n_components sets the target dimensionality
        # x_batch = pca_model.transform(x_batch)  # transform performs the actual dimensionality reduction
        # x_batch = pca.PCA(n_components=K).fit_transform(x_batch)
        yield x_vec, y_shuffle[start_id : end_id]
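
The comment above notes that truncated SVD, unlike PCA, can be applied directly to a sparse matrix such as the TF-IDF output. A small self-contained sketch of that point, independent of this project (all data here is randomly generated):

from scipy import sparse
from sklearn.decomposition import PCA, TruncatedSVD

X = sparse.random(100, 5000, density=0.01, format='csr', random_state=0)

svd = TruncatedSVD(n_components=50, algorithm='arpack')
X_reduced = svd.fit_transform(X)   # works directly on the sparse matrix
print(X_reduced.shape)             # (100, 50)

# PCA(n_components=50).fit_transform(X) raises a TypeError instead,
# because PCA centers the data and therefore needs a dense array.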
Example #4
def batch_iter(x, y, batch_size=64):
    '''Yield padded batches in order (this variant does not shuffle).'''
    config = TCNNConfig()
    seq_length = config.seq_length
    data_len = len(x)
    num_batch = int((data_len-1) / batch_size) + 1

    for i in range(num_batch):
        start_id = i*batch_size
        end_id = min((i+1) * batch_size, data_len)
        x_batch = x['word_seg'][start_id:end_id]
        x_batch = [str(doc).split(' ') for doc in x_batch]
        x_batch_pad = kr.preprocessing.sequence.pad_sequences(x_batch, maxlen=seq_length)
        yield x_batch_pad, y[start_id:end_id]
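
pad_sequences accepts these raw tokens only because, judging from the int(w) casts in the later examples, the word_seg field holds numeric token ids as strings, which numpy can cast to the default int32 dtype. A standalone illustration with invented sample strings (assuming kr is the Keras module imported elsewhere in the script):

docs = ['1044285 7368 856005', '7368 856005']   # invented word_seg-style samples
tokens = [d.split(' ') for d in docs]
padded = kr.preprocessing.sequence.pad_sequences(tokens, maxlen=5)
# padded is now (zeros are prepended, since padding defaults to 'pre')
# [[      0       0 1044285    7368  856005]
#  [      0       0       0    7368  856005]]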
Example #5
def generateDataMatrix():
    '''Convert the training set to a padded word-id matrix and cache it to disk.'''
    train_set = pd.read_csv(train_set_file)['word_seg']
    logging.info(len(train_set))
    logging.info('Loading word2vec model')
    wv_model = get_wordvec_model()
    config = TCNNConfig()
    x_shuffle_split = np.array([i.split(' ') for i in train_set])
    for i, x in enumerate(x_shuffle_split):
        if i % 100 == 0:
            logging.info('{}...'.format(i))
        # map each word to its word2vec index; out-of-vocabulary words are dropped
        x_shuffle_split[i] = [
            wv_model.wv.vocab[word].index for word in x
            if word in wv_model.wv.vocab
        ]
    logging.info('word2id done.')
    x_shuffle_split = kr.preprocessing.sequence.pad_sequences(
        x_shuffle_split, config.seq_length)
    logging.info('pad done.')
    np.save("cache/data_matrix.npy", x_shuffle_split)
    logging.info('save done.')
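
The cached matrix can later be read back instead of repeating the word-to-id conversion; a minimal sketch:

data_matrix = np.load("cache/data_matrix.npy")
print(data_matrix.shape)   # (num_samples, config.seq_length)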
Example #6
def batch_iter(x, y, batch_size=64):
    '''Yield batches of word-vector sequences (relies on a module-level wv_model).'''
    config = TCNNConfig()
    seq_length = config.seq_length
    data_len = len(x)
    num_batch = int((data_len - 1) / batch_size) + 1
    for i in range(num_batch):
        start_id = i * batch_size
        end_id = min((i + 1) * batch_size, data_len)
        x_batch = x['word_seg'][start_id:end_id]
        x_batch = [str(doc).split(' ') for doc in x_batch]
        x_batch_pad = kr.preprocessing.sequence.pad_sequences(
            x_batch, maxlen=seq_length)
        # expand every word id into its word vector: (batch, seq_length, embedding_dim)
        x_batch_vec = np.zeros(shape=(len(x_batch_pad), seq_length,
                                      config.embedding_dim))
        for j in range(x_batch_vec.shape[0]):
            words = x_batch_pad[j]
            # wv_model is a module-level word2vec model defined elsewhere in the script
            tmp = [wv_model[int(w)] for w in words]
            # tmp = [wv_model[int(w)] for w in words if w in wv_model]
            # dtype='float32' preserves the embedding values; the default int32 would truncate them
            tmp = kr.preprocessing.sequence.pad_sequences(
                [tmp], maxlen=seq_length, dtype='float32')[0]
            x_batch_vec[j] = tmp
        yield x_batch_vec, y[start_id:end_id]
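
This variant relies on a module-level wv_model. Judging from the wv_model.wv.vocab accesses in the other examples, it is presumably a gensim word2vec model; a sketch of loading it, with the file path invented:

from gensim.models import Word2Vec

wv_model = Word2Vec.load('cache/word2vec.model')   # hypothetical path
print(wv_model.wv.vector_size)                     # should match config.embedding_dim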
Example #7
    logging.info(metrics.classification_report(y_test_cls, y_pred_cls))

    # confusion matrix
    logging.info("Confusion Matrix...")
    cm = metrics.confusion_matrix(y_test_cls, y_pred_cls)
    logging.info(cm)

    time_dif = get_time_dif(start_time)
    logging.info("Time usage:", time_dif)


if __name__ == '__main__':
    logging.basicConfig(format="%(asctime)s : %(levelname)s : %(message)s",
                        level=logging.INFO)  # show logs at INFO level and above

    if len(sys.argv) != 2 or sys.argv[1] not in ['train', 'test']:
        raise ValueError("""usage: python run_cnn.py [train / test]""")

    logging.info('Configuring CNN model...')
    config = TCNNConfig()
    # if not os.path.exists(vocab_dir):  # rebuild the vocabulary if it does not exist
    #     build_vocab(train_dir, vocab_dir, config.vocab_size)
    # categories, cat_to_id = read_category()
    # words, word_to_id = read_vocab(vocab_dir)
    # config.context_size = len(words)
    model = TextCNN(config)
    if sys.argv[1] == 'train':
        train2(restore=False)
    else:
        test()
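
For reference, the confusion matrix logged in the test step of Example #7 is sklearn's standard one, with rows indexed by true class and columns by predicted class. A toy illustration with invented labels:

from sklearn import metrics

y_true = [0, 1, 1, 2, 2]
y_pred = [0, 1, 2, 2, 2]
print(metrics.confusion_matrix(y_true, y_pred))
# [[1 0 0]
#  [0 1 1]
#  [0 0 2]]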