# --- Yelp-2014: load, sentence-split, tokenize, embed, and index the corpus ---
X_test, y_test = load_data(in_path + 'yelp-2014-seg-20-20.test.ss')

# Each review is stored as sentences joined by the ' <sssss> ' marker:
# first split each document into sentences, then tokenize each sentence.
X_train = [doc.split(' <sssss> ') for doc in X_train]
X_test = [doc.split(' <sssss> ') for doc in X_test]
X_train = [[split_words(s) for s in doc] for doc in X_train]
X_test = [[split_words(s) for s in doc] for doc in X_test]

# Train word2vec on the training corpus only, then derive the lookup tables.
W2V_corpus = W2V_corpus_iter(X_train)
w2vModel = train_W2V(W2V_corpus, in_path + 'w2vModel')
# Build word2idx and embedMatrix from the trained model.
word2idx, embedMatrix = build_word2idx_embedMatrix(w2vModel)

# Convert documents to padded index tensors and labels to one-hot vectors.
X_train_idx = make_X_train_idx(X_train, word2idx, MAX_SENT_NUM, MAX_SENT_LEN)
X_test_idx = make_X_train_idx(X_test, word2idx, MAX_SENT_NUM, MAX_SENT_LEN)
y_train_oneHot = make_y_train_oneHot(y_train, is_cate_dict=True)
y_test_oneHot = make_y_train_oneHot(y_test, is_cate_dict=True)
print(len(X_train_idx), len(X_test_idx), len(y_train_oneHot), len(y_test_oneHot))

# Bundle everything the downstream model needs into a single dict.
yelp_2014_data = {
    'X_train_idx': X_train_idx,
    'X_test_idx': X_test_idx,
    'y_train_oneHot': y_train_oneHot,
    'y_test_oneHot': y_test_oneHot,
    'embedMatrix': embedMatrix,
}
print('——————————————load data——————————————')
(X_train, y_train), (X_test, y_test) = imdb.load_data()

# Merge the canonical train/test splits; a stratified split is made below.
X_all = list(X_train) + list(X_test)
y_all = list(y_train) + list(y_test)
print(len(X_all), len(y_all))

# Keras encodes reviews as word indices; invert its vocabulary to get text back.
imdb_word2idx = imdb.get_word_index()
imdb_idx2word = {idx: word for word, idx in imdb_word2idx.items()}
# Indices are shifted by 3 (presumably the standard keras pad/start/oov offset
# — TODO confirm); unknown ids map to '?' and the leading token is dropped.
X_all = [[imdb_idx2word.get(i - 3, '?') for i in review][1:] for review in X_all]

# Train word2vec on the recovered text and derive the lookup tables.
w2vModel = train_W2V(X_all, in_path + 'w2vModel')
# Build word2idx and embedMatrix from the trained model.
word2idx, embedMatrix = build_word2idx_embedMatrix(w2vModel)

X_all_idx = make_X_train_idx(X_all, word2idx, MAX_SEQ_LEN)
# NOTE: X_all/y_all must be np.array, otherwise train_test_split raises.
y_all_idx = np.array(y_all)
X_tra_idx, X_val_idx, y_tra_idx, y_val_idx = train_test_split(
    X_all_idx, y_all_idx, test_size=0.2, random_state=0, stratify=y_all_idx)
y_tra_oneHot = make_y_train_oneHot(y_tra_idx)
y_val_oneHot = make_y_train_oneHot(y_val_idx)

print('——————————————模型的训练和预测——————————————')
start = time()
model = textCNN_train_test(embedMatrix)
# NOTE(original author): validation here is very slow for unknown reasons,
# while the keras equivalent is fast.
model.train([X_tra_idx, y_tra_oneHot])
y_pred_idx = model.test([X_val_idx, y_val_oneHot])
# --- Aspect-level sentiment: load contexts (c), targets (t), and labels ---
X_tra_c, X_tra_t, y_tra = load_data(in_path + 'train.raw')
X_test_c, X_test_t, y_test = load_data(in_path + 'test.raw')
print(len(X_tra_c), len(X_tra_t), len(y_tra))
print(len(X_test_c), len(X_test_t), len(y_test))

# Build (or reload from cache) the GloVe-based vocabulary and embedding matrix.
if os.path.exists(in_path + 'embedMatrix.pkl'):
    # Cached artifacts exist — reload instead of re-reading the GloVe file.
    # Use context managers so the file handles are closed deterministically.
    with open(in_path + 'embedMatrix.pkl', 'rb') as f:
        embedMatrix = pickle.load(f)
    with open(in_path + 'word2idx.pkl', 'rb') as f:
        word2idx = pickle.load(f)
else:
    # Restrict the (very large) GloVe vocabulary to words that actually occur.
    # Renamed from `all` so the builtin all() is not shadowed.
    all_sents = X_tra_c + X_tra_t + X_test_c + X_test_t
    word_set = {w for sent in all_sents for w in sent}
    w2vModel = load_W2V(in_path + 'glove.42B.300d.txt', word_set=word_set)
    word2idx, embedMatrix = build_word2idx_embedMatrix(w2vModel)
    # Cache for subsequent runs (same paths the reload branch reads).
    with open(in_path + 'embedMatrix.pkl', 'wb') as f:
        pickle.dump(embedMatrix, f)
    with open(in_path + 'word2idx.pkl', 'wb') as f:
        pickle.dump(word2idx, f)

# Convert contexts and targets to padded index sequences, labels to one-hot.
X_tra_c_idx = make_X_train_idx(X_tra_c, word2idx, MAX_SEQ_LEN)
X_tra_t_idx = make_X_train_idx(X_tra_t, word2idx, MAX_SEQ_LEN)
X_test_c_idx = make_X_train_idx(X_test_c, word2idx, MAX_SEQ_LEN)
X_test_t_idx = make_X_train_idx(X_test_t, word2idx, MAX_SEQ_LEN)
y_tra_oneHot = make_y_train_oneHot(y_tra)
y_test_oneHot = make_y_train_oneHot(y_test)

print('——————————————train model——————————————')
model = IAN_train_test(embedMatrix)
model.train([X_tra_c_idx, X_tra_t_idx, y_tra_oneHot])
y_pred = model.test([X_test_c_idx, X_test_t_idx, y_test_oneHot])
print(y_pred)