def download_from_remote(remote, save_dir, force_overwrite=False):
    """Download a remote dataset into path
    Fetch a dataset pointed to by remote's url, save it into path using
    remote's filename, and ensure its integrity based on the MD5 checksum
    of the downloaded file.

    Adapted from scikit-learn's sklearn.datasets.base._fetch_remote.

    Args:
        remote (RemoteFileMetadata): Named tuple containing remote dataset
            meta information: url, filename and checksum
        save_dir (str): Directory to save the file to. Usually `data_home`
        force_overwrite (bool): If True, overwrite existing file with the
            downloaded file. If False, does not overwrite, but checks that
            the checksum is consistent.

    Returns:
        file_path (str): Full path of the created file.

    """
    if remote.destination_dir is None:
        download_dir = save_dir
    else:
        download_dir = os.path.join(save_dir, remote.destination_dir)

    if not os.path.exists(download_dir):
        os.makedirs(download_dir)

    download_path = os.path.join(download_dir, remote.filename)

    if not os.path.exists(download_path) or force_overwrite:
        # if the file doesn't exist or overwriting was requested, download it
        with DownloadProgressBar(
            unit='B', unit_scale=True, unit_divisor=1024, miniters=1
        ) as t:
            try:
                urllib.request.urlretrieve(
                    remote.url,
                    filename=download_path,
                    reporthook=t.update_to,
                    data=None,
                )
            except Exception as e:
                error_msg = """
                    mirdata failed to download the dataset!
                    Please try again in a few minutes.
                    If this error persists, please raise an issue at
                    https://github.com/mir-dataset-loaders/mirdata,
                    and tag it with 'broken-link'.
                """
                print(error_msg)
                raise e

    checksum = md5(download_path)
    if remote.checksum != checksum:
        raise IOError(
            '{} has an MD5 checksum ({}) '
            'differing from expected ({}), '
            'file may be corrupted.'.format(download_path, checksum, remote.checksum)
        )
    return download_path
def download_from_remote(remote, save_dir, force_overwrite=False):
    """Download a remote dataset into path
    Fetch a dataset pointed to by remote's url, save it into path using
    remote's filename, and ensure its integrity based on the MD5 checksum
    of the downloaded file.

    Adapted from scikit-learn's sklearn.datasets.base._fetch_remote.

    Parameters
    ----------
    remote: RemoteFileMetadata
        Named tuple containing remote dataset meta information:
        url, filename and checksum
    save_dir: string
        Directory to save the file to. Usually `data_home`
    force_overwrite: bool
        If True, overwrite existing file with the downloaded file.
        If False, does not overwrite, but checks that the checksum is consistent.

    Returns
    -------
    file_path: string
        Full path of the created file.

    """
    if remote.destination_dir is None:
        download_dir = save_dir
    else:
        download_dir = os.path.join(save_dir, remote.destination_dir)

    if not os.path.exists(download_dir):
        os.makedirs(download_dir)

    download_path = os.path.join(download_dir, remote.filename)

    if not os.path.exists(download_path) or force_overwrite:
        # if the file doesn't exist or overwriting was requested, download it
        with DownloadProgressBar(
            unit='B', unit_scale=True, miniters=1, desc=remote.url.split('/')[-1]
        ) as t:
            try:
                _download_large_file(remote.url, download_path, t.update_to)
            except HTTPError:
                error_msg = """
                    mirdata failed to download the dataset!
                    Please try again in a few minutes.
                    If this error persists, please raise an issue at
                    https://github.com/mir-dataset-loaders/mirdata,
                    and tag it with 'broken-link'.
                """
                raise HTTPError(error_msg)

    checksum = md5(download_path)
    if remote.checksum != checksum:
        raise IOError(
            '{} has an MD5 checksum ({}) '
            'differing from expected ({}), '
            'file may be corrupted.'.format(download_path, checksum, remote.checksum)
        )
    return download_path
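# Hypothetical usage sketch for download_from_remote. The URL, checksum, paths,
# and field values below are placeholders, and the RemoteFileMetadata fields
# (filename, url, checksum, destination_dir) are inferred from how `remote`
# is used in the function above.
EXAMPLE_REMOTE = RemoteFileMetadata(
    filename='example_annotations.zip',
    url='https://example.com/example_annotations.zip',
    checksum='d41d8cd98f00b204e9800998ecf8427e',
    destination_dir='annotations',
)

# Saves to /path/to/data_home/annotations/example_annotations.zip,
# then verifies the file's MD5 against EXAMPLE_REMOTE.checksum.
local_path = download_from_remote(
    EXAMPLE_REMOTE, '/path/to/data_home', force_overwrite=False
)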
def test_md5(mocker):
    audio_file = b"audio1234"
    expected_checksum = "6dc00d1bac757abe4ea83308dde68aab"

    mocker.patch(
        "%s.open" % builtin_module_name,
        new=mocker.mock_open(read_data=audio_file),
    )

    md5_checksum = utils.md5("test_file_path")
    assert expected_checksum == md5_checksum
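# A minimal sketch of the md5 helper exercised by the test above -- an assumption
# inferred from the mocked `open` call and the expected hex digest; the real
# utils.md5 may differ in details (e.g. chunk size).
import hashlib


def md5(file_path):
    """Return the MD5 hex digest of the file at `file_path`."""
    hash_md5 = hashlib.md5()
    with open(file_path, "rb") as fhandle:
        # read in chunks so large audio files don't have to fit in memory
        for chunk in iter(lambda: fhandle.read(4096), b""):
            hash_md5.update(chunk)
    return hash_md5.hexdigest()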
def make_ikala_index(ikala_data_path):
    lyrics_dir = os.path.join(ikala_data_path, 'Lyrics')
    lyrics_files = glob.glob(os.path.join(lyrics_dir, '*.lab'))
    track_ids = sorted([os.path.basename(f).split('.')[0] for f in lyrics_files])

    # top-level key: metadata
    metadata_checksum = md5(os.path.join(ikala_data_path, 'id_mapping.txt'))
    index_metadata = {
        "metadata": {"id_mapping": ("id_mapping.txt", metadata_checksum)}
    }

    # top-level key: tracks
    index_tracks = {}
    for track_id in track_ids:
        audio_checksum = md5(
            os.path.join(ikala_data_path, "Wavfile/{}.wav".format(track_id))
        )
        pitch_checksum = md5(
            os.path.join(ikala_data_path, "PitchLabel/{}.pv".format(track_id))
        )
        lyrics_checksum = md5(
            os.path.join(ikala_data_path, "Lyrics/{}.lab".format(track_id))
        )

        index_tracks[track_id] = {
            "audio": ("Wavfile/{}.wav".format(track_id), audio_checksum),
            "pitch": ("PitchLabel/{}.pv".format(track_id), pitch_checksum),
            "lyrics": ("Lyrics/{}.lab".format(track_id), lyrics_checksum),
        }

    # top-level key: version
    ikala_index = {"version": None}

    # combine all in dataset index
    ikala_index.update(index_metadata)
    ikala_index.update({"tracks": index_tracks})

    with open(IKALA_INDEX_PATH, 'w') as fhandle:
        json.dump(ikala_index, fhandle, indent=2)
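# Illustrative shape of the index this script writes to IKALA_INDEX_PATH, sketched
# from the code above as an assumption: the track id and checksum strings are
# placeholders, and json.dump serializes the tuples above as JSON arrays.
example_ikala_index = {
    "version": None,
    "metadata": {
        "id_mapping": ["id_mapping.txt", "<md5 of id_mapping.txt>"],
    },
    "tracks": {
        "10161_chorus": {
            "audio": ["Wavfile/10161_chorus.wav", "<md5 of wav>"],
            "pitch": ["PitchLabel/10161_chorus.pv", "<md5 of pv>"],
            "lyrics": ["Lyrics/10161_chorus.lab", "<md5 of lab>"],
        },
    },
}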
def make_gtzan_genre_index(data_path):
    index = {}
    for track_key, path in iter_paths(data_path):
        abspath = os.path.join(data_path, path)
        if not os.path.exists(abspath):
            print("Missing file: {}".format(abspath))
            continue
        checksum = md5(abspath)
        audio_path = os.path.join("gtzan_genre/genres", path)
        index[track_key] = {"audio": [audio_path, checksum]}

    with open(GTZAN_GENRE_INDEX_PATH, "w") as f:
        json.dump(index, f, indent=2)
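# iter_paths is referenced above but not shown. Below is a minimal sketch of what
# it plausibly does, assuming data_path points at the GTZAN "genres" directory
# with the usual layout <genre>/<genre>.<number>.wav; this is an assumption,
# not the original helper.
def iter_paths(data_path):
    """Yield (track_key, relative_path) for every .wav file under data_path."""
    for genre in sorted(os.listdir(data_path)):
        genre_dir = os.path.join(data_path, genre)
        if not os.path.isdir(genre_dir):
            continue
        for fname in sorted(os.listdir(genre_dir)):
            if not fname.endswith(".wav"):
                continue
            # e.g. "blues/blues.00000.wav" -> track key "blues.00000"
            track_key = fname.replace(".wav", "")
            yield track_key, os.path.join(genre, fname)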
def update_index(all_indexes):
    """Update dataset indexes to the new format.

    Parameters
    ----------
    all_indexes (list): list of all current dataset indexes

    """
    for index_name in tqdm(all_indexes):
        module = index_name.replace('_index.json', '')

        # load old index
        old_index = mirdata.Dataset(module)._index

        # avoid modifying the index again when running multiple times
        if 'tracks' in old_index.keys():
            old_index = old_index['tracks']

        data_home = mirdata.Dataset(module).data_home

        # get metadata checksums
        metadata_files = get_metadata_paths(module)
        metadata_checksums = None
        if metadata_files is not None:
            metadata_checksums = {
                key: [
                    metadata_files[key],
                    md5(os.path.join(data_home, metadata_files[key])),
                ]
                for key in metadata_files.keys()
            }

        # get version of dataset
        version = get_dataset_version(module)

        # Some datasets have a single metadata file, some have multiple.
        # The computation of the checksum should be customized in the make_index
        # of each dataset. This is a patch to convert previous indexes to the
        # new format.
        new_index = {'version': version, 'tracks': old_index}
        if metadata_files is not None:
            new_index['metadata'] = metadata_checksums

        with open(os.path.join(INDEXES_PATH, index_name), 'w') as fhandle:
            json.dump(new_index, fhandle, indent=2)
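# Hypothetical invocation: collect every *_index.json under INDEXES_PATH and
# rewrite it in the new {"version", "tracks", "metadata"} layout. The glob
# pattern and the __main__ driver are assumptions about how the script is run,
# not part of update_index itself.
if __name__ == '__main__':
    all_indexes = sorted(
        os.path.basename(p)
        for p in glob.glob(os.path.join(INDEXES_PATH, '*_index.json'))
    )
    update_index(all_indexes)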