Exemple #1
0
def test_extract_mind(size, tmp):
    """ Test file download and extraction for demo and small datasets

    Args:
        size: Dataset variant to check; expected file sizes are recorded
            here for "demo" and "small" only.
        tmp: Destination directory handed to ``download_mind``.

    Raises:
        AssertionError: If ``size`` has no recorded expectations, or if an
            extracted file does not have the expected size in bytes.
    """
    # Expected size in bytes of every extracted file, per dataset variant and
    # split, listed in the order in which the files are checked.
    expected_sizes = {
        "demo": {
            "train": (
                ("behaviors.tsv", 14707247),
                ("entity_embedding.vec", 16077470),
                ("news.tsv", 23120370),
                ("relation_embedding.vec", 1044588),
            ),
            "valid": (
                ("behaviors.tsv", 4434762),
                ("entity_embedding.vec", 11591565),
                ("news.tsv", 15624320),
                ("relation_embedding.vec", 1044588),
            ),
        },
        "small": {
            "train": (
                ("behaviors.tsv", 92019716),
                ("entity_embedding.vec", 25811015),
                ("news.tsv", 41202121),
                ("relation_embedding.vec", 1044588),
            ),
            "valid": (
                ("behaviors.tsv", 42838544),
                ("entity_embedding.vec", 21960998),
                ("news.tsv", 33519092),
                ("relation_embedding.vec", 1044588),
            ),
        },
    }

    # Reject unsupported variants before doing any download work.
    assert size in expected_sizes, "unsupported MIND size: %r" % (size,)

    train_zip, valid_zip = download_mind(size, dest_path=tmp)
    train_path, valid_path = extract_mind(train_zip, valid_zip)

    for split, folder in (("train", train_path), ("valid", valid_path)):
        for filename, expected_bytes in expected_sizes[size][split]:
            actual_bytes = os.stat(os.path.join(folder, filename)).st_size
            assert actual_bytes == expected_bytes, (
                "%s/%s: expected %d bytes, got %d"
                % (split, filename, expected_bytes, actual_bytes)
            )
        
Exemple #2
0
def test_extract_mind(tmp):
    """Download and extract the "small" dataset into ``tmp`` and check the
    size in bytes of every extracted train and valid file."""
    train_archive, valid_archive = download_mind(size="small", dest_path=tmp)
    train_dir, valid_dir = extract_mind(train_archive, valid_archive)

    # (folder, file name, expected size in bytes), in checking order.
    checks = (
        (train_dir, "behaviors.tsv", 92047111),
        (train_dir, "entity_embedding.vec", 25811015),
        (train_dir, "news.tsv", 45895926),
        (train_dir, "relation_embedding.vec", 1044588),
        (valid_dir, "behaviors.tsv", 42975799),
        (valid_dir, "entity_embedding.vec", 21960998),
        (valid_dir, "news.tsv", 37410117),
        (valid_dir, "relation_embedding.vec", 1044588),
    )
    for folder, filename, expected_bytes in checks:
        assert os.stat(os.path.join(folder, filename)).st_size == expected_bytes
Exemple #3
0
def test_extract_mind(tmp):
    """Download and extract the "large" dataset into ``tmp`` and check the
    size in bytes of every extracted train and valid file."""
    train_archive, valid_archive = download_mind(size="large", dest_path=tmp)
    train_dir, valid_dir = extract_mind(train_archive, valid_archive)

    # Expected sizes in bytes, grouped by the folder the files live in.
    train_expected = (
        ("behaviors.tsv", 1373844151),
        ("entity_embedding.vec", 40305151),
        ("news.tsv", 84881998),
        ("relation_embedding.vec", 1044588),
    )
    valid_expected = (
        ("behaviors.tsv", 230662527),
        ("entity_embedding.vec", 31958202),
        ("news.tsv", 59055351),
        ("relation_embedding.vec", 1044588),
    )

    # Train files are verified first, then valid files.
    for folder, expectations in ((train_dir, train_expected),
                                 (valid_dir, valid_expected)):
        for filename, expected_bytes in expectations:
            file_stat = os.stat(os.path.join(folder, filename))
            assert file_stat.st_size == expected_bytes
Exemple #4
0
def test_extract_mind(tmp):
    """Download and extract the "small" dataset into ``tmp`` and check the
    size in bytes of every extracted train and valid file."""
    train_archive, valid_archive = download_mind(size="small", dest_path=tmp)
    train_dir, valid_dir = extract_mind(train_archive, valid_archive)

    def size_of(folder, filename):
        # Size in bytes of `filename` inside `folder`.
        return os.stat(os.path.join(folder, filename)).st_size

    # Train split
    assert size_of(train_dir, "behaviors.tsv") == 92019716
    assert size_of(train_dir, "entity_embedding.vec") == 25811015
    assert size_of(train_dir, "news.tsv") == 41202121
    assert size_of(train_dir, "relation_embedding.vec") == 1044588

    # Valid split
    assert size_of(valid_dir, "behaviors.tsv") == 42838544
    assert size_of(valid_dir, "entity_embedding.vec") == 21960998
    assert size_of(valid_dir, "news.tsv") == 33519092
    assert size_of(valid_dir, "relation_embedding.vec") == 1044588
# Dataset variant passed to download_mind below.
MIND_SIZE = "small"

# DKN parameters
# NOTE(review): none of these three values is read within this excerpt;
# presumably they feed the model setup further down — confirm.
epochs = 10
history_size = 50
batch_size = 100

# Paths
# Everything is placed under <tmpdir>/mind-dkn; `tmpdir` is not defined in
# this excerpt.
data_path = os.path.join(tmpdir, "mind-dkn")
train_file = os.path.join(data_path, "train_mind.txt")
valid_file = os.path.join(data_path, "valid_mind.txt")
user_history_file = os.path.join(data_path, "user_history.txt")
infer_embedding_file = os.path.join(data_path, "infer_embedding.txt")

# Fetch the train/valid archives into data_path, then extract them.
# NOTE(review): assumes download_mind returns the two archive paths and
# extract_mind returns the two extracted folders — confirm against their
# definitions.
train_zip, valid_zip = download_mind(size=MIND_SIZE, dest_path=data_path)
train_path, valid_path = extract_mind(train_zip, valid_zip)

# Read behaviors.tsv from each split, then pass the results to the helpers
# together with the train, valid and user-history file paths defined above.
# NOTE(review): presumably each helper writes its output to the given path —
# confirm.
train_session, train_history = read_clickhistory(train_path, "behaviors.tsv")
valid_session, valid_history = read_clickhistory(valid_path, "behaviors.tsv")
get_train_input(train_session, train_file)
get_valid_input(valid_session, valid_file)
get_user_history(train_history, valid_history, user_history_file)

# Collect words and entities from the news.tsv file of both splits.
train_news = os.path.join(train_path, "news.tsv")
valid_news = os.path.join(valid_path, "news.tsv")
news_words, news_entities = get_words_and_entities(train_news, valid_news)

# Entity embedding files of both splits.
train_entities = os.path.join(train_path, "entity_embedding.vec")
valid_entities = os.path.join(valid_path, "entity_embedding.vec")
news_feature_file, word_embeddings_file, entity_embeddings_file = generate_embeddings(
    data_path,
Exemple #6
0
# Paths of the data files used by this script, all located under data_path
# (data_path itself is not defined in this excerpt).
user_history_file = os.path.join(data_path, "user_history.txt")
infer_embedding_file = os.path.join(data_path, "infer_embedding.txt")
news_feature_file = os.path.join(data_path, "doc_feature.txt")
word_embeddings_file = os.path.join(data_path, "word_embeddings_5w_100.npy")
entity_embeddings_file = os.path.join(data_path,
                                      "entity_embeddings_5w_100.npy")

# Default locations of the train/valid/test folders under data_path.
train_path = os.path.join(data_path, "train")
valid_path = os.path.join(data_path, "valid")
test_path = os.path.join(data_path, "test")
# Download and extract the "large" dataset only when the train folder does
# not exist yet; the paths above are then replaced by what extract_mind
# returns.
if not os.path.exists(train_path):
    train_zip, valid_zip, test_zip = download_mind(size='large',
                                                   dest_path=data_path)
    train_path, valid_path, test_path = extract_mind(train_zip,
                                                     valid_zip,
                                                     test_zip,
                                                     root_folder=data_path)
# Parse behaviors.tsv of every split and build the train/valid/test inputs,
# but only when train_file does not exist yet. train_file, valid_file and
# test_file are not defined in this excerpt.
# NOTE(review): only train_file is checked, so the valid/test inputs are not
# regenerated when they alone are missing — confirm this is intended.
if not os.path.exists(train_file):
    train_session, train_history = read_clickhistory(train_path,
                                                     "behaviors.tsv")
    get_train_input(train_session, train_file)

    valid_session, valid_history = read_clickhistory(valid_path,
                                                     "behaviors.tsv")
    get_valid_input(valid_session, valid_file)

    # The test split is read with its own reader, read_test_clickhistory.
    test_session, test_history = read_test_clickhistory(
        test_path, "behaviors.tsv")
    get_test_input(test_session, test_file)