def download_mind(size="small", dest_path=None):
    """Download the MIND dataset.

    Args:
        size (str): Dataset size. One of ["small", "large", "demo"].
        dest_path (str): Download path. If None, the dataset is downloaded
            to a temporal path.

    Returns:
        str, str: Paths to the train and validation sets. For size
        "large", a third element with the test set path is also returned.

    Raises:
        ValueError: If ``size`` is not one of the supported options.
    """
    size_options = ["small", "large", "demo"]
    if size not in size_options:
        raise ValueError(
            f"Wrong size option, available options are {size_options}")
    if size == 'large':
        # The large dataset ships a separate test split.
        url_train, url_valid, url_test = URL_MIND[size]
        with download_path(dest_path) as path:
            train_path = maybe_download(url=url_train, work_directory=path)
            valid_path = maybe_download(url=url_valid, work_directory=path)
            test_path = maybe_download(url=url_test, work_directory=path)
            return train_path, valid_path, test_path
    else:
        # "small" and "demo" only provide train and validation splits.
        url_train, url_valid = URL_MIND[size]
        with download_path(dest_path) as path:
            train_path = maybe_download(url=url_train, work_directory=path)
            valid_path = maybe_download(url=url_valid, work_directory=path)
            return train_path, valid_path
def test_maybe_download():
    """Verify maybe_download fetches a file and enforces the expected size."""
    file_url = "https://raw.githubusercontent.com/Microsoft/Recommenders/main/LICENSE"
    filepath = "license.txt"
    # The target file must not already exist before downloading.
    assert not os.path.exists(filepath)
    filepath = maybe_download(file_url, "license.txt", expected_bytes=1162)
    assert os.path.exists(filepath)
    os.remove(filepath)
    # A mismatched expected size must raise IOError.
    with pytest.raises(IOError):
        filepath = maybe_download(file_url, "license.txt", expected_bytes=0)
def test_maybe_download():
    """Check both the success path and the size-mismatch path of maybe_download."""
    file_url = "https://raw.githubusercontent.com/Microsoft/Recommenders/master/LICENSE"
    filepath = "license.txt"
    # Precondition: no stale copy on disk.
    assert not os.path.exists(filepath)

    # Correct expected size: file is downloaded and kept.
    filepath = maybe_download(file_url, "license.txt", expected_bytes=1162)
    assert os.path.exists(filepath)
    os.remove(filepath)

    # Wrong expected size: IOError is raised.
    with pytest.raises(IOError):
        filepath = maybe_download(file_url, "license.txt", expected_bytes=0)
def _download_reviews(name, dest_path):
    """Downloads an Amazon reviews datafile.

    Args:
        name (str): Name of the category file to download (without the
            ``.gz`` extension), appended to the SNAP category-files URL.
        dest_path (str): File path for the downloaded file.
    """
    url = (
        "http://snap.stanford.edu/data/amazon/productGraph/categoryFiles/"
        + name
        + ".gz"
    )
    dirs, file = os.path.split(dest_path)
    maybe_download(url, file + ".gz", work_directory=dirs)
def download_movielens(size, dest_path):
    """Downloads MovieLens datafile.

    Args:
        size (str): Size of the data to load. One of ("100k", "1m", "10m", "20m").
        dest_path (str): File path for the downloaded file

    Raises:
        ValueError: If ``size`` is not a recognized dataset size.
    """
    if size not in DATA_FORMAT:
        raise ValueError(ERROR_MOVIE_LENS_SIZE)

    url = "http://files.grouplens.org/datasets/movielens/ml-" + size + ".zip"
    directory, filename = os.path.split(dest_path)
    maybe_download(url, filename, work_directory=directory)
def download_movielens(size, dest_path):
    """Downloads MovieLens datafile.

    Args:
        size (str): Size of the data to load. One of ("100k", "1m", "10m", "20m").
        dest_path (str): File path for the downloaded file

    Raises:
        ValueError: For an unsupported dataset size.
    """
    # Validate the requested size before touching the network.
    if size not in DATA_FORMAT:
        raise ValueError(ERROR_MOVIE_LENS_SIZE)

    download_url = "http://files.grouplens.org/datasets/movielens/ml-" + size + ".zip"
    work_dir, target_name = os.path.split(dest_path)
    maybe_download(download_url, target_name, work_directory=work_dir)
def download_deeprec_resources(azure_container_url, data_path, remote_resource_name):
    """Download resources.

    Downloads a zipped resource from an Azure container, extracts it into
    ``data_path``, and deletes the downloaded archive.

    Args:
        azure_container_url (str): URL of Azure container.
        data_path (str): Path to download the resources.
        remote_resource_name (str): Name of the resource.
    """
    os.makedirs(data_path, exist_ok=True)
    remote_path = azure_container_url + remote_resource_name
    maybe_download(remote_path, remote_resource_name, data_path)
    zip_path = os.path.join(data_path, remote_resource_name)
    # Use a context manager so the archive handle is closed even if
    # extraction raises (the original leaked the handle on failure).
    with zipfile.ZipFile(zip_path, "r") as zip_ref:
        zip_ref.extractall(data_path)
    # The extracted contents remain; remove the now-redundant archive.
    os.remove(zip_path)
def download_criteo(size="sample", work_directory="."):
    """Download criteo dataset as a compressed file.

    Args:
        size (str): Size of criteo dataset. It can be "full" or "sample".
        work_directory (str): Working directory.

    Returns:
        str: Path of the downloaded file.
    """
    return maybe_download(CRITEO_URL[size], work_directory=work_directory)
def download_criteo(size="sample", work_directory="."):
    """Download criteo dataset as a compressed file.

    Args:
        size (str): Size of criteo dataset. It can be "full" or "sample".
        work_directory (str): Working directory.

    Returns:
        str: Path of the downloaded file.
    """
    # Look up the download URL for the requested dataset size.
    dataset_url = CRITEO_URL[size]
    downloaded_path = maybe_download(dataset_url, work_directory=work_directory)
    return downloaded_path
def _download_and_extract_globe(dest_path):
    """Download the GloVe 6B embeddings archive and extract it.

    Args:
        dest_path (str): Directory where the archive is downloaded and
            under which a ``glove`` subdirectory is created.

    Returns:
        str: Path to the extracted ``glove`` directory.
    """
    glove_url = "http://nlp.stanford.edu/data/glove.6B.zip"
    archive_path = maybe_download(url=glove_url, work_directory=dest_path)
    glove_path = os.path.join(dest_path, "glove")
    # Keep the zip on disk after extraction (clean_zip_file=False).
    unzip_file(archive_path, glove_path, clean_zip_file=False)
    return glove_path
# Lay out the expected file locations for the DKN/MIND experiment under
# data_path. NOTE(review): these files are assumed to be produced by an
# earlier preprocessing step — confirm against the surrounding pipeline.
train_file = os.path.join(data_path, "train_mind.txt")
valid_file = os.path.join(data_path, "valid_mind.txt")
test_file = os.path.join(data_path, "test_mind.txt")
user_history_file = os.path.join(data_path, "user_history.txt")
infer_embedding_file = os.path.join(data_path, "infer_embedding.txt")
news_feature_file = os.path.join(data_path, "doc_feature.txt")
word_embeddings_file = os.path.join(data_path, "word_embeddings_5w_100.npy")
entity_embeddings_file = os.path.join(data_path, "entity_embeddings_5w_100.npy")
train_path = os.path.join(data_path, "train")
valid_path = os.path.join(data_path, "valid")
test_path = os.path.join(data_path, "test")

# Fetch the DKN hyper-parameter YAML for MIND-small if not already present.
yaml_file = maybe_download(
    url=
    "https://recodatasets.blob.core.windows.net/deeprec/deeprec/dkn/dkn_MINDsmall.yaml",
    work_directory=data_path)

# Build hyper-parameters from the YAML, overriding file locations and the
# training knobs (epochs, history_size, batch_size) supplied above.
hparams = prepare_hparams(yaml_file,
                          news_feature_file=news_feature_file,
                          user_history_file=user_history_file,
                          wordEmb_file=word_embeddings_file,
                          entityEmb_file=entity_embeddings_file,
                          epochs=epochs,
                          history_size=history_size,
                          batch_size=batch_size)
# Enable checkpointing, report progress every 5000 steps, and write model
# artifacts to the 'para' directory.
hparams.save_model = True
hparams.show_step = 5000
hparams.MODEL_DIR = 'para'
model = DKN(hparams, DKNTextIterator)
print("System version: {}".format(sys.version))
print("Pyspark version: {}".format(pyspark.__version__))

#%%
# MovieLens 100k ratings file (tab-separated: user, item, rating, timestamp).
DATA_URL = "http://files.grouplens.org/datasets/movielens/ml-100k/u.data"
DATA_PATH = "ml-100k.data"

# Set the column names that will be imported.
# NOTE(review): the original value here was a redaction artifact ("******");
# "UserId" matches the MovieId/Rating/Timestamp naming pattern — confirm.
COL_USER = "UserId"
COL_ITEM = "MovieId"
COL_RATING = "Rating"
# NOTE(review): the prediction column reuses the rating column name; verify
# downstream code expects them to collide.
COL_PREDICTION = "Rating"
COL_TIMESTAMP = "Timestamp"

filepath = maybe_download(DATA_URL, DATA_PATH)
data = pd.read_csv(
    filepath, sep="\t", names=[COL_USER, COL_ITEM, COL_RATING, COL_TIMESTAMP]
)

# Head of pandas lets you look at some initial instances
print(data.head())

# Describe shows data statistics of each column
print(data.describe())

# Print the number of users, items and ratings for the chosen dataset
print(
    "Total number of ratings are\t{}".format(data.shape[0]),
    "Total number of users are\t{}".format(data[COL_USER].nunique()),
    "Total number of items are\t{}".format(data[COL_ITEM].nunique()),
    sep="\n",
)