Code Example #1
    def _download(self, path):
        """
        Downloads train triplets, MSD Challenge Kaggle data
        (http://millionsongdataset.com/challenge/)
        and a list of matching errors
        http://millionsongdataset.com/blog/12-2-12-fixing-matching-errors/

        :param path: path to save
        :return: None
        """
        self.logger.info('Getting Million Song Dataset...')
        self.logger.info('Downloading Echo Nest Taste Subprofile train data...')
        base_url = 'http://millionsongdataset.com/sites/default/files/challenge/'

        download_dataset(
            base_url + 'train_triplets.txt.zip',
            join(self.data_folder, 'train.zip')
        )
        rename(join(path, 'train'), path)

        self.logger.info('Downloading evaluation data for MSD Challenge...')
        download_dataset(
            base_url + 'EvalDataYear1MSDWebsite.zip',
            join(path, 'eval.zip')
        )
        rename(
            join(path, 'EvalDataYear1MSDWebsite'),
            join(path, 'evaluation')
        )

        self.logger.info('Downloading list of matching errors...')
        url = 'http://millionsongdataset.com/sites/default/files/tasteprofile/sid_mismatches.txt'
        download_url(url, join(path, 'sid_mismatches.txt'))
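
All of these excerpts are methods of dataset classes in rs_datasets; they assume a logger (self.logger), a data folder (self.data_folder), os.path.join, and a few small helpers (download_dataset, download_url, rename) whose implementations are not shown on this page. A minimal sketch of what those helpers might look like, with signatures inferred from the calls above (assumptions, not the library's actual code):

import os
import zipfile
from urllib.request import urlretrieve


def download_url(url, filepath):
    """Fetch a single file and store it at ``filepath``."""
    urlretrieve(url, filepath)


def download_dataset(url, archive_path, manage_folder=True):
    """Download an archive and unpack it into the folder it was saved to."""
    urlretrieve(url, archive_path)
    if zipfile.is_zipfile(archive_path):
        with zipfile.ZipFile(archive_path) as zf:
            zf.extractall(os.path.dirname(archive_path))
        os.remove(archive_path)
    # manage_folder=False in the examples suggests the real helper can also
    # skip wrapping the extracted files in a dedicated folder; that logic
    # (and non-zip formats such as .tar.gz) is omitted in this sketch.


def rename(src, dst):
    """Rename ``src`` to ``dst``; the real helper likely handles more
    cases, e.g. an already existing destination."""
    os.rename(src, dst)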
Code Example #2
File: netflix.py  Project: sts-sadr/rs_datasets
    def _download(self, path):
        self.logger.info('Downloading Netflix Prize dataset...')
        url = 'https://archive.org/download/nf_prize_dataset.tar/nf_prize_dataset.tar.gz'
        download_dataset(url, join(self.data_folder, 'netflix.tar.gz'))
        rename(join(self.data_folder, 'download'), path)
        # the ratings ship as a nested tar inside the main archive;
        # unpack it and remove the inner archive afterwards
        archive = join(path, 'training_set.tar')
        extract(archive)
        rm_if_exists(archive)
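
The Netflix example also calls extract and rm_if_exists. Rough, assumed stand-ins using only the standard library (the actual rs_datasets helpers may differ):

import os
import tarfile


def extract(archive):
    """Unpack a tar archive into the directory it lives in."""
    with tarfile.open(archive) as tar:
        tar.extractall(os.path.dirname(archive))


def rm_if_exists(filepath):
    """Delete ``filepath`` if it is present; do nothing otherwise."""
    if os.path.exists(filepath):
        os.remove(filepath)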
Code Example #3
    def _download(self, path):
        self.logger.info('Downloading Jester dataset...')
        mkdir(path)
        base_url = 'http://eigentaste.berkeley.edu/dataset/'

        self.logger.info('Dataset 1...')
        d1 = join(path, '1')
        mkdir(d1)
        download_dataset(base_url + 'jester_dataset_1_joke_texts.zip',
                         join(d1, 'joke_texts.zip'),
                         manage_folder=False)
        for i in [1, 2, 3]:
            download_dataset(base_url + f'jester_dataset_1_{i}.zip',
                             join(d1, f'ratings_{i}.zip'),
                             manage_folder=False)
            rename(join(d1, f'jester-data-{i}.xls'), join(d1, f'data_{i}.xls'))

        for i in [3, 4]:
            self.logger.info(f'Dataset {i}...')
            d = join(path, str(i))
            mkdir(d)
            download_dataset(base_url + f'Dataset{i}JokeSet.zip',
                             join(d, 'joke_texts.zip'),
                             manage_folder=False)
            rename(join(d, f'Dataset{i}JokeSet.xlsx'), join(d, 'jokes.xlsx'))
            download_dataset(base_url + f'JesterDataset{i}.zip',
                             join(d, 'ratings.zip'),
                             manage_folder=False)
            # the file names inside these archives are irregular, so map
            # them to a predictable name
            if i == 3:
                name = 'FINAL jester 2006-15.xls'
            else:
                name = '[final] April 2015 to Nov 30 2019 - Transformed Jester Data - .xlsx'
            rename(join(d, name), join(d, 'data.xlsx'))
Code Example #4
File: epinions.py  Project: sts-sadr/rs_datasets
    def _download(self, path):
        self.logger.info('Downloading Epinions dataset...')
        mkdir(path)
        base_url = 'http://www.trustlet.org/datasets/downloaded_epinions/'

        filepath = join(path, 'ratings_data.txt.bz2')
        download_dataset(base_url + 'ratings_data.txt.bz2',
                         filepath,
                         manage_folder=False)

        filepath = join(path, 'trust_data.txt.bz2')
        download_dataset(base_url + 'trust_data.txt.bz2',
                         filepath,
                         manage_folder=False)
Code Example #5
File: movielens.py  Project: sts-sadr/rs_datasets
    def _download(self, path, dataset):
        """
        Download data from https://grouplens.org/datasets/movielens/
        Available options: ml-20m, ml-latest-small, ml-latest and other,
        can be checked on ml site.

        :param path: where to save
        :param dataset: dataset version
        :return: None
        """
        self.logger.info('Downloading %s from grouplens...', dataset)
        archive = dataset + '.zip'
        url = f'http://files.grouplens.org/datasets/movielens/{archive}'
        download_dataset(url, path + '.zip')
        if dataset == 'ml-10m':
            rename(join(self.data_folder, 'ml-10M100K'), path)
            self.replace_separator(join(path, 'movies.dat'), '::', '\t')
            self.replace_separator(join(path, 'ratings.dat'), '::', '\t')
            self.replace_separator(join(path, 'tags.dat'), '::', '\t')
        elif dataset == 'ml-1m':
            self.replace_separator(join(path, 'movies.dat'), '::', '\t', 'ISO-8859-1')
            self.replace_separator(join(path, 'ratings.dat'), '::', '\t')
            self.replace_separator(join(path, 'users.dat'), '::', '\t')
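
replace_separator is a method of the MovieLens loader that is not shown in this excerpt; judging by the calls above, it rewrites a .dat file, swapping the '::' separator for a tab. A plausible stand-alone sketch (signature and default encoding are assumptions):

def replace_separator(filepath, old_sep, new_sep, encoding='utf-8'):
    """Rewrite ``filepath`` in place, replacing ``old_sep`` with ``new_sep``."""
    with open(filepath, encoding=encoding) as fp:
        text = fp.read()
    with open(filepath, 'w', encoding=encoding) as fp:
        fp.write(text.replace(old_sep, new_sep))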
Code Example #6
    def _download(self, path):
        self.logger.info('Downloading Last.fm 360k dataset...')
        url = 'http://mtg.upf.edu/static/datasets/last.fm/lastfm-dataset-360K.tar.gz'
        download_dataset(url, join(self.data_folder, 'lastfm.tar.gz'))
        rename(join(self.data_folder, 'lastfm-dataset-360K'), path)
Code Example #7
    def _download(self, path):
        self.logger.info('Downloading Dating Agency Dataset...')
        url = 'http://www.occamslab.com/petricek/data/libimseti-complete.zip'
        download_dataset(url, join(self.data_folder, 'dating.zip'))
        rename(join(self.data_folder, 'libimseti'), path)
Code Example #8
File: yoochoose.py  Project: inpefess/rs_datasets
    def _download(self, path):
        self.logger.info('Downloading YooChoose Dataset...')
        url = 'https://s3-eu-west-1.amazonaws.com/yc-rdata/yoochoose-data.7z'
        download_dataset(url, join(self.data_folder, 'yoochoose.7z'))
Code Example #9
    def _download(self, path):
        logging.info('Downloading rekko challenge dataset...')
        archive = 'rekko_challenge_rekko_challenge_2019.zip'
        url = f'https://boosters.pro/api/ch/files/pub/{archive}'
        download_dataset(url, join(self.data_folder, 'rekko.zip'))
        rename(join(self.data_folder, 'rekko'), path)
Code Example #10
File: book_crossing.py  Project: sts-sadr/rs_datasets
    def _download(self, path):
        self.logger.info('Downloading Book-Crossing Dataset...')
        url = 'http://www2.informatik.uni-freiburg.de/~cziegler/BX/BX-CSV-Dump.zip'
        download_dataset(url, join(self.data_folder, 'bookx.zip'))
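
For reference, a self-contained equivalent of example #10 that uses only the standard library instead of the project's helpers (target path and unpacking behaviour are assumptions, not rs_datasets' actual layout):

import zipfile
from pathlib import Path
from urllib.request import urlretrieve

url = 'http://www2.informatik.uni-freiburg.de/~cziegler/BX/BX-CSV-Dump.zip'
target = Path('data') / 'bookx'
target.mkdir(parents=True, exist_ok=True)

archive = target / 'bookx.zip'
urlretrieve(url, str(archive))       # fetch the BX CSV dump
with zipfile.ZipFile(archive) as zf:
    zf.extractall(target)            # the BX-*.csv files land next to it
archive.unlink()                     # remove the archive afterwards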