def _download(self, path):
    """
    Downloads train triplets, MSD Challenge Kaggle data
    (http://millionsongdataset.com/challenge/) and a list of matching errors
    http://millionsongdataset.com/blog/12-2-12-fixing-matching-errors/
    :param path: path to save
    :return: None
    """
    self.logger.info('Getting Million Song Dataset...')
    self.logger.info('Downloading Echo Nest Taste Subprofile train data...')
    base_url = 'http://millionsongdataset.com/sites/default/files/challenge/'
    # NOTE(review): the archive is saved under self.data_folder, but the
    # rename below reads from *path* — presumably download_dataset extracts
    # next to *path*; confirm against the helper's behavior.
    download_dataset(
        base_url + 'train_triplets.txt.zip',
        join(self.data_folder, 'train.zip')
    )
    # NOTE(review): renames the extracted 'train' subdirectory onto *path*
    # itself — assumes the project `rename` helper supports moving a child
    # over its parent location; TODO confirm.
    rename(join(path, 'train'), path)
    self.logger.info('Downloading evaluation data for MSD Challenge...')
    download_dataset(
        base_url + 'EvalDataYear1MSDWebsite.zip',
        join(path, 'eval.zip')
    )
    # Normalize the extracted folder name to a stable 'evaluation' path.
    rename(
        join(path, 'EvalDataYear1MSDWebsite'),
        join(path, 'evaluation')
    )
    self.logger.info('Downloading list of matching errors...')
    # Plain text file, fetched directly rather than as an archive.
    url = 'http://millionsongdataset.com/sites/default/files/tasteprofile/sid_mismatches.txt'
    download_url(url, join(path, 'sid_mismatches.txt'))
def _download(self, path):
    """
    Fetch the Netflix Prize archive, move it to *path* and unpack the
    inner training-set tarball.

    :param path: destination directory for the dataset
    :return: None
    """
    self.logger.info('Downloading Netflix Prize dataset...')
    url = 'https://archive.org/download/nf_prize_dataset.tar/nf_prize_dataset.tar.gz'
    download_dataset(url, join(self.data_folder, 'netflix.tar.gz'))
    # The archive extracts into a 'download' directory; relocate it to *path*.
    rename(join(self.data_folder, 'download'), path)
    # Unpack the nested training-set tarball and discard the tarball itself.
    inner_tar = join(path, 'training_set.tar')
    extract(inner_tar)
    rm_if_exists(inner_tar)
def _download(self, path):
    """
    Fetch the Jester joke-rating datasets (1, 3 and 4) from Berkeley
    and lay them out under *path*/<dataset-number>/.

    :param path: destination directory for the datasets
    :return: None
    """
    self.logger.info('Downloading Jester dataset...')
    mkdir(path)
    base_url = 'http://eigentaste.berkeley.edu/dataset/'

    # --- Dataset 1: joke texts plus three ratings archives -----------------
    self.logger.info('Dataset 1...')
    part1 = join(path, '1')
    mkdir(part1)
    download_dataset(base_url + 'jester_dataset_1_joke_texts.zip',
                     join(part1, 'joke_texts.zip'), manage_folder=False)
    for idx in (1, 2, 3):
        download_dataset(base_url + f'jester_dataset_1_{idx}.zip',
                         join(part1, f'ratings_{idx}.zip'), manage_folder=False)
        # Normalize the extracted spreadsheet name.
        rename(join(part1, f'jester-data-{idx}.xls'),
               join(part1, f'data_{idx}.xls'))

    # --- Datasets 3 and 4: one joke-set archive plus one ratings archive ---
    for idx in (3, 4):
        self.logger.info(f'Dataset {idx}...')
        part = join(path, str(idx))
        mkdir(part)
        download_dataset(base_url + f'Dataset{idx}JokeSet.zip',
                         join(part, 'joke_texts.zip'), manage_folder=False)
        rename(join(part, f'Dataset{idx}JokeSet.xlsx'), join(part, 'jokes.xlsx'))
        download_dataset(base_url + f'JesterDataset{idx}.zip',
                         join(part, 'ratings.zip'), manage_folder=False)
        # The upstream spreadsheet names differ per dataset; map both to data.xlsx.
        extracted = ('FINAL jester 2006-15.xls' if idx == 3 else
                     '[final] April 2015 to Nov 30 2019 - Transformed Jester Data - .xlsx')
        rename(join(part, extracted), join(part, 'data.xlsx'))
def _download(self, path):
    """
    Fetch the Epinions ratings and trust files from trustlet.org into *path*.

    :param path: destination directory for the dataset
    :return: None
    """
    self.logger.info('Downloading Epinions dataset...')
    mkdir(path)
    base_url = 'http://www.trustlet.org/datasets/downloaded_epinions/'
    # Both files keep their upstream names; neither needs folder management.
    for filename in ('ratings_data.txt.bz2', 'trust_data.txt.bz2'):
        download_dataset(base_url + filename, join(path, filename),
                         manage_folder=False)
def _download(self, path, dataset):
    """
    Download data from https://grouplens.org/datasets/movielens/
    Available options: ml-20m, ml-latest-small, ml-latest and other,
    can be checked on ml site.
    :param path: where to save
    :param dataset: dataset version
    :return: None
    """
    self.logger.info('Downloading %s from grouplens...', dataset)
    archive = dataset + '.zip'
    url = f'http://files.grouplens.org/datasets/movielens/{archive}'
    download_dataset(url, path + '.zip')
    if dataset == 'ml-10m':
        # The 10M archive extracts as 'ml-10M100K'; normalize to *path*.
        rename(join(self.data_folder, 'ml-10M100K'), path)
        for filename in ('movies.dat', 'ratings.dat', 'tags.dat'):
            self.replace_separator(join(path, filename), '::', '\t')
    elif dataset == 'ml-1m':
        # movies.dat in ml-1m is Latin-1 encoded; the others are default.
        self.replace_separator(join(path, 'movies.dat'), '::', '\t', 'ISO-8859-1')
        for filename in ('ratings.dat', 'users.dat'):
            self.replace_separator(join(path, filename), '::', '\t')
def _download(self, path):
    """
    Fetch the Last.fm 360K listening dataset and move it into *path*.

    :param path: destination directory for the dataset
    :return: None
    """
    self.logger.info('Downloading Last.fm 360k dataset...')
    archive_url = 'http://mtg.upf.edu/static/datasets/last.fm/lastfm-dataset-360K.tar.gz'
    download_dataset(archive_url, join(self.data_folder, 'lastfm.tar.gz'))
    # Normalize the extracted folder name to *path*.
    rename(join(self.data_folder, 'lastfm-dataset-360K'), path)
def _download(self, path):
    """
    Fetch the LibimSeTi dating-agency dataset and move it into *path*.

    :param path: destination directory for the dataset
    :return: None
    """
    self.logger.info('Downloading Dating Agency Dataset...')
    archive_url = 'http://www.occamslab.com/petricek/data/libimseti-complete.zip'
    download_dataset(archive_url, join(self.data_folder, 'dating.zip'))
    # The archive extracts as 'libimseti'; normalize to *path*.
    rename(join(self.data_folder, 'libimseti'), path)
def _download(self, path):
    """
    Fetch the YooChoose clickstream archive into the data folder.

    :param path: unused here — presumably download_dataset handles
        extraction/placement; verify against the helper. Kept for a
        uniform downloader signature.
    :return: None
    """
    self.logger.info('Downloading YooChoose Dataset...')
    archive_url = 'https://s3-eu-west-1.amazonaws.com/yc-rdata/yoochoose-data.7z'
    download_dataset(archive_url, join(self.data_folder, 'yoochoose.7z'))
def _download(self, path):
    """
    Fetch the Rekko Challenge 2019 archive and move it into *path*.

    :param path: destination directory for the dataset
    :return: None
    """
    # Use the instance logger for consistency with the other downloaders;
    # the previous module-level logging.info bypassed per-instance
    # logger configuration.
    self.logger.info('Downloading rekko challenge dataset...')
    archive = 'rekko_challenge_rekko_challenge_2019.zip'
    url = f'https://boosters.pro/api/ch/files/pub/{archive}'
    download_dataset(url, join(self.data_folder, 'rekko.zip'))
    # The archive extracts as 'rekko'; normalize to *path*.
    rename(join(self.data_folder, 'rekko'), path)
def _download(self, path):
    """
    Fetch the Book-Crossing CSV dump into the data folder.

    :param path: unused here — presumably download_dataset handles
        extraction/placement; verify against the helper. Kept for a
        uniform downloader signature.
    :return: None
    """
    self.logger.info('Downloading Book-Crossing Dataset...')
    dump_url = 'http://www2.informatik.uni-freiburg.de/~cziegler/BX/BX-CSV-Dump.zip'
    download_dataset(dump_url, join(self.data_folder, 'bookx.zip'))