def test():
    model = CNN(num_used_hists, num_words, num_word_embedding_dims)
    model.create_model()
    model.get_model_summary()

    print('loading pre-trained model...')
    model.model.load_weights(r'%s/3dcnn_word_80_neg_1_epoch_14_val_loss_0.26.model'
                             % config['3DCNN']['path_model_folder'])

    print('loading doc embedding...')
    mp_doc_embedding = my_utils.read_pkl(config['DEFAULT']['path_all_news_doc_embedding'])

    test_user_ids = list(mp_test_hist.keys())
    test_news_ids = get_test_news_ids()
    user_ids, news_ids = [], []

    print('start predicting...')
    user_in = []
    article_in = []
    for user_id in tqdm(test_user_ids):
        clicked_news_ids = set(mp_test_hist[user_id])
        un_clicked_news_ids = list(test_news_ids - clicked_news_ids)
        user_train_hist = mp_train_hist[user_id]

        # Build the user representation from up to num_used_hists previously clicked articles.
        user_embedding = []
        if len(user_train_hist) > num_used_hists:
            for news_id in user_train_hist[:num_used_hists]:
                user_embedding.append(mp_doc_embedding[news_id])
        else:
            for news_id in user_train_hist[:-1]:
                user_embedding.append(mp_doc_embedding[news_id])
            # Pad short histories with zero matrices.
            num_paddings = num_used_hists - len(user_embedding)
            for i in range(num_paddings):
                user_embedding.append(np.zeros((num_words, num_word_embedding_dims)))

        # Sampled negatives: test-period articles the user did not click.
        for news_id in random.sample(un_clicked_news_ids, int(config['DEFAULT']['num_test_negatives'])):
            user_in.append(user_embedding)
            article_in.append(mp_doc_embedding[news_id])
            user_ids.append(user_id)
            news_ids.append(news_id)
        # Positives: articles the user actually clicked in the test period.
        for news_id in clicked_news_ids:
            user_in.append(user_embedding)
            article_in.append(mp_doc_embedding[news_id])
            user_ids.append(user_id)
            news_ids.append(news_id)

    user_in = np.array(user_in)
    article_in = np.array(article_in)
    # Insert the singleton channel axis expected by the CNN input layers.
    user_in = np.resize(user_in, (user_in.shape[0], 1) + user_in.shape[1:])
    article_in = np.resize(article_in, (article_in.shape[0], 1) + article_in.shape[1:])

    out = model.model.predict([user_in, article_in], batch_size=sample_batch_size, verbose=1)
    save_prediction(user_ids, news_ids, out)
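
# The np.resize calls in test() (and in train()/batch_test() below) insert a singleton
# channel axis in front of the history/word/embedding axes. The expected shapes checked
# here are inferred from those calls rather than stated anywhere in this module, so treat
# this helper as an illustrative sketch only.
def check_input_shapes_sketch(user_in, article_in):
    """Hypothetical sanity check of the tensors fed to model.model.predict()."""
    assert user_in.shape[1:] == (1, num_used_hists, num_words, num_word_embedding_dims)
    assert article_in.shape[1:] == (1, num_words, num_word_embedding_dims)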
def train():
    model = CNN(num_used_hists, num_words, num_word_embedding_dims)
    model.create_model()
    model.get_model_summary()

    print('loading doc embedding...')
    mp_doc_embedding = my_utils.read_pkl(config['DEFAULT']['path_all_news_doc_embedding'])

    print('constructing input data...')
    train_news_ids = get_train_news_ids()
    user_in = []
    article_in = []
    truth = []
    for user_id, clicked_news_ids in tqdm(mp_train_hist.items()):
        user_embedding = []
        if len(clicked_news_ids) > num_used_hists:
            # The first num_used_hists clicks form the user history ...
            for news_id in clicked_news_ids[:num_used_hists]:
                user_embedding.append(mp_doc_embedding[news_id])
            # ... and every later click becomes a positive sample,
            # each paired with num_train_negatives sampled negatives.
            for news_id in clicked_news_ids[num_used_hists:]:
                article_in.append(mp_doc_embedding[news_id])
                user_in.append(user_embedding)
                truth.append(1)
                for i in range(num_train_negatives):
                    article_in.append(mp_doc_embedding[get_negative_news_id(train_news_ids, user_id)])
                    user_in.append(user_embedding)
                    truth.append(0)
        else:
            # All but the last click form the (zero-padded) history;
            # the last click is the single positive sample.
            for news_id in clicked_news_ids[:-1]:
                user_embedding.append(mp_doc_embedding[news_id])
            num_paddings = num_used_hists - len(user_embedding)
            for i in range(num_paddings):
                user_embedding.append(np.zeros((num_words, num_word_embedding_dims)))
            article_in.append(mp_doc_embedding[clicked_news_ids[-1]])
            user_in.append(user_embedding)
            truth.append(1)
            for i in range(num_train_negatives):
                article_in.append(mp_doc_embedding[get_negative_news_id(train_news_ids, user_id)])
                user_in.append(user_embedding)
                truth.append(0)

    print('reshaping input data...')
    user_in = np.array(user_in)
    article_in = np.array(article_in)
    # Insert the singleton channel axis expected by the CNN input layers.
    user_in = np.resize(user_in, (user_in.shape[0], 1) + user_in.shape[1:])
    article_in = np.resize(article_in, (article_in.shape[0], 1) + article_in.shape[1:])

    print('start training...')
    model.fit_model([user_in, article_in], np.array(truth), sample_batch_size, num_epochs)
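
# train() samples negatives via get_negative_news_id(), which is defined elsewhere in this
# module. The sketch below only illustrates the behavior train() relies on -- drawing a
# random train-set article that the given user never clicked; its body is an assumption
# for illustration, not the actual implementation.
def get_negative_news_id_sketch(train_news_ids, user_id):
    """Illustrative sketch of the assumed negative-sampling contract."""
    clicked = set(mp_train_hist[user_id])
    while True:
        news_id = random.choice(list(train_news_ids))
        if news_id not in clicked:
            return news_id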
def batch_test():
    model = CNN(num_used_hists, num_words, num_word_embedding_dims)
    model.create_model()
    model.get_model_summary()

    print('loading pre-trained model...')
    model.model.load_weights(r'%s/3dcnn_word_80_neg_1_epoch_07_val_loss_0.27.model'
                             % config['3DCNN']['path_model_folder'])

    print('loading doc embedding...')
    mp_doc_embedding = my_utils.read_pkl(config['DEFAULT']['path_all_news_doc_embedding'])

    test_user_ids = list(mp_test_hist.keys())
    test_news_ids = get_test_news_ids()
    user_ids, news_ids, outs = [], [], []

    print('start predicting...')
    for user_batch_id in tqdm(range(0, len(test_user_ids), test_user_batch_size)):
        user_in = []
        article_in = []
        for user_id in test_user_ids[user_batch_id : user_batch_id + test_user_batch_size]:
            clicked_news_ids = set(mp_test_hist[user_id])
            un_clicked_news_ids = list(test_news_ids - clicked_news_ids)
            user_train_hist = mp_train_hist[user_id]

            # Build the user representation from up to num_used_hists previously clicked articles.
            user_embedding = []
            if len(user_train_hist) > num_used_hists:
                for news_id in user_train_hist[:num_used_hists]:
                    user_embedding.append(mp_doc_embedding[news_id])
            else:
                for news_id in user_train_hist[:-1]:
                    user_embedding.append(mp_doc_embedding[news_id])
                num_paddings = num_used_hists - len(user_embedding)
                for i in range(num_paddings):
                    user_embedding.append(np.zeros((num_words, num_word_embedding_dims)))

            # Sampled negatives followed by the user's clicked (positive) articles.
            for news_id in random.sample(un_clicked_news_ids, int(config['DEFAULT']['num_test_negatives'])):
                user_in.append(user_embedding)
                article_in.append(mp_doc_embedding[news_id])
                user_ids.append(user_id)
                news_ids.append(news_id)
            for news_id in clicked_news_ids:
                user_in.append(user_embedding)
                article_in.append(mp_doc_embedding[news_id])
                user_ids.append(user_id)
                news_ids.append(news_id)

        # The number of records must be an integer multiple of the batch size,
        # otherwise prediction crashes: https://stackoverflow.com/a/59971264/2468587
        # Pad the batch with randomly sampled articles until it divides evenly.
        while len(user_in) % sample_batch_size != 0:
            for user_id in test_user_ids[user_batch_id : user_batch_id + test_user_batch_size]:
                clicked_news_ids = set(mp_test_hist[user_id])
                un_clicked_news_ids = list(test_news_ids - clicked_news_ids)
                user_train_hist = mp_train_hist[user_id]

                user_embedding = []
                if len(user_train_hist) > num_used_hists:
                    for news_id in user_train_hist[:num_used_hists]:
                        user_embedding.append(mp_doc_embedding[news_id])
                else:
                    for news_id in user_train_hist[:-1]:
                        user_embedding.append(mp_doc_embedding[news_id])
                    num_paddings = num_used_hists - len(user_embedding)
                    for i in range(num_paddings):
                        user_embedding.append(np.zeros((num_words, num_word_embedding_dims)))

                news_id = random.sample(list(test_news_ids), 1)[0]
                user_in.append(user_embedding)
                article_in.append(mp_doc_embedding[news_id])
                user_ids.append(user_id)
                news_ids.append(news_id)
                if len(user_in) % sample_batch_size == 0:
                    break

        print('#records=%d, batchsize=%d' % (len(user_in), sample_batch_size))
        user_in = np.array(user_in)
        article_in = np.array(article_in)
        # Insert the singleton channel axis expected by the CNN input layers.
        user_in = np.resize(user_in, (user_in.shape[0], 1) + user_in.shape[1:])
        article_in = np.resize(article_in, (article_in.shape[0], 1) + article_in.shape[1:])

        out = model.model.predict([user_in, article_in], batch_size=sample_batch_size, verbose=1)
        outs.extend(list(out))

    save_prediction(user_ids, news_ids, outs)
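
# save_prediction() is defined elsewhere in this module; the sketch below only illustrates
# the interface that test() and batch_test() rely on: one score per (user_id, news_id)
# pair, in order. The output path and TSV layout are assumptions, not the actual implementation.
def save_prediction_sketch(user_ids, news_ids, outs, path='prediction_sketch.tsv'):
    """Hypothetical helper: write one 'user_id <tab> news_id <tab> score' line per prediction."""
    with open(path, 'w') as f:
        for user_id, news_id, score in zip(user_ids, news_ids, outs):
            f.write('%s\t%s\t%f\n' % (user_id, news_id, float(np.ravel(score)[0])))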