def test_extract_mind(size, tmp):
    """Test file download and extraction for the demo and small datasets.

    Downloads the train and validation archives for ``size`` into ``tmp``,
    extracts them, and compares the byte size of every extracted file with a
    hard-coded expected value. Train files are checked first, then validation
    files.

    Args:
        size: Dataset variant passed to ``download_mind``; expected sizes are
            recorded here only for ``"demo"`` and ``"small"``.
        tmp: Destination directory passed to ``download_mind``.

    Raises:
        AssertionError: If ``size`` has no recorded expectations, or an
            extracted file does not have the expected size.
    """
    # Expected sizes in bytes, per dataset variant:
    # (file name, size in the train folder, size in the validation folder).
    expected_sizes = {
        "demo": (
            ("behaviors.tsv", 14707247, 4434762),
            ("entity_embedding.vec", 16077470, 11591565),
            ("news.tsv", 23120370, 15624320),
            ("relation_embedding.vec", 1044588, 1044588),
        ),
        "small": (
            ("behaviors.tsv", 92019716, 42838544),
            ("entity_embedding.vec", 25811015, 21960998),
            ("news.tsv", 41202121, 33519092),
            ("relation_embedding.vec", 1044588, 1044588),
        ),
    }

    train_zip, valid_zip = download_mind(size, dest_path=tmp)
    train_path, valid_path = extract_mind(train_zip, valid_zip)

    if size not in expected_sizes:
        # Explicit raise instead of ``assert False``: it carries a message and
        # is not stripped when Python runs with -O.
        raise AssertionError(
            "No expected file sizes recorded for size {!r}".format(size)
        )

    files = expected_sizes[size]
    # Keep the original order: every train file first, then every valid file.
    checks = [(train_path, name, train_bytes) for name, train_bytes, _ in files]
    checks += [(valid_path, name, valid_bytes) for name, _, valid_bytes in files]

    for folder, name, expected in checks:
        path = os.path.join(folder, name)
        actual = os.path.getsize(path)
        assert actual == expected, "{}: expected {} bytes, found {}".format(
            path, expected, actual
        )
def test_extract_mind(tmp):
    """Download and extract the small dataset, then verify extracted file sizes.

    Args:
        tmp: Destination directory passed to ``download_mind``.
    """

    def byte_size(folder, filename):
        """Return the size in bytes of ``filename`` located in ``folder``."""
        return os.path.getsize(os.path.join(folder, filename))

    train_archive, valid_archive = download_mind(size="small", dest_path=tmp)
    train_path, valid_path = extract_mind(train_archive, valid_archive)

    # Train folder.
    assert byte_size(train_path, "behaviors.tsv") == 92047111
    assert byte_size(train_path, "entity_embedding.vec") == 25811015
    assert byte_size(train_path, "news.tsv") == 45895926
    assert byte_size(train_path, "relation_embedding.vec") == 1044588

    # Validation folder.
    assert byte_size(valid_path, "behaviors.tsv") == 42975799
    assert byte_size(valid_path, "entity_embedding.vec") == 21960998
    assert byte_size(valid_path, "news.tsv") == 37410117
    assert byte_size(valid_path, "relation_embedding.vec") == 1044588
def test_extract_mind(tmp):
    """Download and extract the large dataset, then verify extracted file sizes.

    Args:
        tmp: Destination directory passed to ``download_mind``.
    """
    train_archive, valid_archive = download_mind(size="large", dest_path=tmp)
    train_path, valid_path = extract_mind(train_archive, valid_archive)

    # (folder, file name, expected size in bytes), checked in this order.
    expectations = (
        (train_path, "behaviors.tsv", 1373844151),
        (train_path, "entity_embedding.vec", 40305151),
        (train_path, "news.tsv", 84881998),
        (train_path, "relation_embedding.vec", 1044588),
        (valid_path, "behaviors.tsv", 230662527),
        (valid_path, "entity_embedding.vec", 31958202),
        (valid_path, "news.tsv", 59055351),
        (valid_path, "relation_embedding.vec", 1044588),
    )
    for folder, filename, expected_bytes in expectations:
        assert os.path.getsize(os.path.join(folder, filename)) == expected_bytes
def test_extract_mind(tmp):
    """Download and extract the small dataset, then verify extracted file sizes.

    Args:
        tmp: Destination directory passed to ``download_mind``.
    """
    train_archive, valid_archive = download_mind(size="small", dest_path=tmp)
    train_path, valid_path = extract_mind(train_archive, valid_archive)

    # Expected size in bytes of each file, listed per extracted folder.
    train_expected = (
        ("behaviors.tsv", 92019716),
        ("entity_embedding.vec", 25811015),
        ("news.tsv", 41202121),
        ("relation_embedding.vec", 1044588),
    )
    valid_expected = (
        ("behaviors.tsv", 42838544),
        ("entity_embedding.vec", 21960998),
        ("news.tsv", 33519092),
        ("relation_embedding.vec", 1044588),
    )

    # Train files are verified before validation files.
    for folder, expected_files in (
        (train_path, train_expected),
        (valid_path, valid_expected),
    ):
        for filename, expected_bytes in expected_files:
            file_info = os.stat(os.path.join(folder, filename))
            assert file_info.st_size == expected_bytes
MIND_SIZE = "small" # DKN parameters epochs = 10 history_size = 50 batch_size = 100 # Paths data_path = os.path.join(tmpdir, "mind-dkn") train_file = os.path.join(data_path, "train_mind.txt") valid_file = os.path.join(data_path, "valid_mind.txt") user_history_file = os.path.join(data_path, "user_history.txt") infer_embedding_file = os.path.join(data_path, "infer_embedding.txt") train_zip, valid_zip = download_mind(size=MIND_SIZE, dest_path=data_path) train_path, valid_path = extract_mind(train_zip, valid_zip) train_session, train_history = read_clickhistory(train_path, "behaviors.tsv") valid_session, valid_history = read_clickhistory(valid_path, "behaviors.tsv") get_train_input(train_session, train_file) get_valid_input(valid_session, valid_file) get_user_history(train_history, valid_history, user_history_file) train_news = os.path.join(train_path, "news.tsv") valid_news = os.path.join(valid_path, "news.tsv") news_words, news_entities = get_words_and_entities(train_news, valid_news) train_entities = os.path.join(train_path, "entity_embedding.vec") valid_entities = os.path.join(valid_path, "entity_embedding.vec") news_feature_file, word_embeddings_file, entity_embeddings_file = generate_embeddings( data_path,
# File and folder locations, all built under data_path (data_path itself is
# defined outside this excerpt).
user_history_file = os.path.join(data_path, "user_history.txt")
infer_embedding_file = os.path.join(data_path, "infer_embedding.txt")
news_feature_file = os.path.join(data_path, "doc_feature.txt")
word_embeddings_file = os.path.join(data_path, "word_embeddings_5w_100.npy")
entity_embeddings_file = os.path.join(data_path, "entity_embeddings_5w_100.npy")
train_path = os.path.join(data_path, "train")
valid_path = os.path.join(data_path, "valid")
test_path = os.path.join(data_path, "test")

# Download and extract the large dataset only when the train folder does not
# exist yet; the extracted locations then replace the paths built above.
if not os.path.exists(train_path):
    train_zip, valid_zip, test_zip = download_mind(size='large', dest_path=data_path)
    train_path, valid_path, test_path = extract_mind(train_zip, valid_zip, test_zip, root_folder=data_path)

# Parse the behaviors files into train/valid/test input files, skipped when
# train_file already exists (train_file, valid_file and test_file are defined
# outside this excerpt).
# NOTE(review): the original indentation was lost; all statements below are
# assumed to belong to this `if` -- confirm against the original script.
if not os.path.exists(train_file):
    train_session, train_history = read_clickhistory(train_path, "behaviors.tsv")
    get_train_input(train_session, train_file)
    valid_session, valid_history = read_clickhistory(valid_path, "behaviors.tsv")
    get_valid_input(valid_session, valid_file)
    test_session, test_history = read_test_clickhistory(
        test_path, "behaviors.tsv")
    get_test_input(test_session, test_file)