Esempio n. 1
0
def download_mind(size="small", dest_path=None):
    """Download the MIND dataset.

    Args:
        size (str): Dataset size. One of ["small", "large", "demo"].
        dest_path (str): Download path. If None, the dataset is downloaded
            to a temporary path.

    Returns:
        str, str: Paths to the train and validation sets. When
        ``size == "large"`` a third path, the test set, is also returned
        (only the large variant ships a separate test split).

    Raises:
        ValueError: If ``size`` is not one of the supported options.
    """
    size_options = ["small", "large", "demo"]
    if size not in size_options:
        raise ValueError(
            f"Wrong size option, available options are {size_options}")
    # URL_MIND[size] is a sequence of split URLs: (train, valid) for
    # "small"/"demo", (train, valid, test) for "large" -- download each
    # into the same working directory.
    urls = URL_MIND[size]
    with download_path(dest_path) as path:
        paths = tuple(
            maybe_download(url=url, work_directory=path) for url in urls
        )
    return paths
Esempio n. 2
0
def test_maybe_download():
    """Verify maybe_download fetches a file and enforces expected_bytes."""
    url = "https://raw.githubusercontent.com/Microsoft/Recommenders/main/LICENSE"
    local_name = "license.txt"

    # Nothing should be left over from a previous run.
    assert not os.path.exists(local_name)

    # Correct expected size: the file is downloaded and kept on disk.
    downloaded = maybe_download(url, "license.txt", expected_bytes=1162)
    assert os.path.exists(downloaded)
    os.remove(downloaded)

    # Wrong expected size: the helper must raise IOError.
    with pytest.raises(IOError):
        maybe_download(url, "license.txt", expected_bytes=0)
Esempio n. 3
0
def test_maybe_download():
    """Verify maybe_download fetches a file and enforces expected_bytes."""
    file_url = "https://raw.githubusercontent.com/Microsoft/Recommenders/master/LICENSE"
    filepath = "license.txt"
    # The target must not already exist before the download runs.
    assert not os.path.exists(filepath)
    # Correct expected size: the file is downloaded and kept on disk.
    filepath = maybe_download(file_url, "license.txt", expected_bytes=1162)
    assert os.path.exists(filepath)
    os.remove(filepath)
    # A mismatched expected size must make maybe_download raise IOError.
    with pytest.raises(IOError):
        filepath = maybe_download(file_url, "license.txt", expected_bytes=0)
def _download_reviews(name, dest_path):
    """Download an Amazon reviews data file.

    Args:
        name (str): Name of the category data file on the SNAP server;
            ".gz" is appended to it to form the download URL.
        dest_path (str): File path for the downloaded file; the download
            is placed next to it, with a ".gz" suffix on the basename.
    """
    url = (
        "http://snap.stanford.edu/data/amazon/productGraph/categoryFiles/"
        f"{name}.gz"
    )

    # Download into dest_path's directory, keeping the compressed extension.
    dirs, file = os.path.split(dest_path)
    maybe_download(url, file + ".gz", work_directory=dirs)
Esempio n. 5
0
def download_movielens(size, dest_path):
    """Downloads MovieLens datafile.

    Args:
        size (str): Size of the data to load. One of ("100k", "1m", "10m", "20m").
        dest_path (str): File path for the downloaded file

    Raises:
        ValueError: If ``size`` is not a key of ``DATA_FORMAT``.
    """
    # Validate before doing any I/O.
    if size not in DATA_FORMAT:
        raise ValueError(ERROR_MOVIE_LENS_SIZE)

    url = f"http://files.grouplens.org/datasets/movielens/ml-{size}.zip"
    dirs, file = os.path.split(dest_path)
    maybe_download(url, file, work_directory=dirs)
Esempio n. 6
0
def download_movielens(size, dest_path):
    """Download a MovieLens data file.

    Args:
        size (str): Size of the data to load. One of ("100k", "1m", "10m", "20m").
        dest_path (str): File path for the downloaded file
    """
    # Reject unsupported dataset sizes up front.
    if size not in DATA_FORMAT:
        raise ValueError(ERROR_MOVIE_LENS_SIZE)

    download_url = "".join(
        ["http://files.grouplens.org/datasets/movielens/ml-", size, ".zip"]
    )
    target_dir, target_file = os.path.split(dest_path)
    maybe_download(download_url, target_file, work_directory=target_dir)
Esempio n. 7
0
def download_deeprec_resources(azure_container_url, data_path, remote_resource_name):
    """Download a zipped resource from an Azure container and extract it.

    Args:
        azure_container_url (str): URL of Azure container.
        data_path (str): Path to download the resources.
        remote_resource_name (str): Name of the (zip) resource.
    """
    os.makedirs(data_path, exist_ok=True)
    remote_path = azure_container_url + remote_resource_name
    maybe_download(remote_path, remote_resource_name, data_path)

    local_zip = os.path.join(data_path, remote_resource_name)
    # Context manager guarantees the archive handle is closed even if
    # extraction fails (the original leaked it on error).
    with zipfile.ZipFile(local_zip, "r") as zip_ref:
        zip_ref.extractall(data_path)
    # The extracted contents remain; the archive itself is no longer needed.
    os.remove(local_zip)
Esempio n. 8
0
def download_criteo(size="sample", work_directory="."):
    """Download the Criteo dataset as a compressed file.

    Args:
        size (str): Size of criteo dataset. It can be "full" or "sample".
        work_directory (str): Working directory.

    Returns:
        str: Path of the downloaded file.

    """
    # CRITEO_URL maps each size option to its download URL.
    return maybe_download(CRITEO_URL[size], work_directory=work_directory)
Esempio n. 9
0
def download_criteo(size="sample", work_directory="."):
    """Download criteo dataset as a compressed file.

    Args:
        size (str): Size of criteo dataset. It can be "full" or "sample".
        work_directory (str): Working directory.

    Returns:
        str: Path of the downloaded file.

    """
    # CRITEO_URL maps each size option to its download URL; an unknown
    # size surfaces here as a KeyError.
    url = CRITEO_URL[size]
    return maybe_download(url, work_directory=work_directory)
Esempio n. 10
0
def _download_and_extract_globe(dest_path):
    """Download the GloVe 6B embeddings zip and extract it under dest_path.

    Returns:
        str: Directory containing the extracted embedding files.
    """
    # NOTE(review): "globe" in the function name looks like a typo for
    # "glove", but renaming would break callers.
    zip_path = maybe_download(
        url="http://nlp.stanford.edu/data/glove.6B.zip",
        work_directory=dest_path,
    )
    glove_dir = os.path.join(dest_path, "glove")
    # Keep the zip file around (clean_zip_file=False).
    unzip_file(zip_path, glove_dir, clean_zip_file=False)
    return glove_dir
Esempio n. 11
0
# Flat-file inputs consumed by the DKN data pipeline, all rooted at
# `data_path` (defined earlier in this script/notebook).
train_file = os.path.join(data_path, "train_mind.txt")
valid_file = os.path.join(data_path, "valid_mind.txt")
test_file = os.path.join(data_path, "test_mind.txt")
user_history_file = os.path.join(data_path, "user_history.txt")
infer_embedding_file = os.path.join(data_path, "infer_embedding.txt")
news_feature_file = os.path.join(data_path, "doc_feature.txt")
# Pretrained embedding matrices; the "5w_100" suffix presumably means a
# 50k vocabulary with 100-dim vectors -- TODO confirm.
word_embeddings_file = os.path.join(data_path, "word_embeddings_5w_100.npy")
entity_embeddings_file = os.path.join(data_path,
                                      "entity_embeddings_5w_100.npy")

# Directories holding the train/valid/test splits.
train_path = os.path.join(data_path, "train")
valid_path = os.path.join(data_path, "valid")
test_path = os.path.join(data_path, "test")

# Fetch the DKN hyper-parameter YAML for MIND-small (skipped if cached).
yaml_file = maybe_download(
    url=
    "https://recodatasets.blob.core.windows.net/deeprec/deeprec/dkn/dkn_MINDsmall.yaml",
    work_directory=data_path)
# Merge YAML defaults with run-specific overrides; `epochs`, `history_size`
# and `batch_size` are defined earlier in the script.
hparams = prepare_hparams(yaml_file,
                          news_feature_file=news_feature_file,
                          user_history_file=user_history_file,
                          wordEmb_file=word_embeddings_file,
                          entityEmb_file=entity_embeddings_file,
                          epochs=epochs,
                          history_size=history_size,
                          batch_size=batch_size)

# Save checkpoints, log every 5000 steps, write model files under "para".
hparams.save_model = True
hparams.show_step = 5000
hparams.MODEL_DIR = 'para'

# Build the DKN model with its text-based data iterator.
model = DKN(hparams, DKNTextIterator)
Esempio n. 12
0
print("System version: {}".format(sys.version))
print("Pyspark version: {}".format(pyspark.__version__))

#%%

DATA_URL = "http://files.grouplens.org/datasets/movielens/ml-100k/u.data"
DATA_PATH = "ml-100k.data"

# Column names assigned to the tab-separated u.data file on import.
# NOTE(review): the original source had the masked placeholder "******"
# for COL_USER; "UserId" restores a meaningful name consistent with
# "MovieId"/"Rating" -- confirm against downstream consumers.
COL_USER = "UserId"
COL_ITEM = "MovieId"
COL_RATING = "Rating"
COL_PREDICTION = "Rating"  # predictions reuse the rating column name
COL_TIMESTAMP = "Timestamp"

filepath = maybe_download(DATA_URL, DATA_PATH)

# u.data is tab-separated with no header: user, item, rating, timestamp.
data = pd.read_csv(filepath,
                   sep="\t",
                   names=[COL_USER, COL_ITEM, COL_RATING, COL_TIMESTAMP])

# Head of pandas lets you look at some initial instances
print(data.head())
# Describe shows data statistics of each column
print(data.describe())

# Print the number of users, items and ratings for the chosen dataset
print("Total number of ratings are\t{}".format(data.shape[0]),
      "Total number of users are\t{}".format(data[COL_USER].nunique()),
      "Total number of items are\t{}".format(data[COL_ITEM].nunique()),
      sep="\n")