def test_initialize():
    d = initialize("orchset")
    assert isinstance(d, core.Dataset)
    assert d.name == "orchset"

    with pytest.raises(ValueError):
        initialize("asdfasdfasdfa")
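# For context, a minimal sketch of the mirdata workflow these tests exercise.
# The dataset name and API calls are real mirdata API; running this actually
# downloads data, so it is illustrative rather than part of the test suite:
import mirdata

orchset = mirdata.initialize("orchset")
orchset.download()              # fetch audio and annotations to data_home
orchset.validate()              # check local files against the index checksums
track = orchset.choice_track()  # pick a random Track instance
print(track.track_id)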
def test_dataset_errors():
    with pytest.raises(ValueError):
        mirdata.initialize("not_a_dataset")

    d = mirdata.initialize("orchset")
    d._track_class = None
    with pytest.raises(AttributeError):
        d.track("asdf")
    with pytest.raises(AttributeError):
        d.multitrack("asdf")
    with pytest.raises(AttributeError):
        d.load_tracks()
    with pytest.raises(AttributeError):
        d.load_multitracks()
    with pytest.raises(AttributeError):
        d.choice_track()
    with pytest.raises(AttributeError):
        d.choice_multitrack()

    d = mirdata.initialize("acousticbrainz_genre")
    with pytest.raises(FileNotFoundError):
        d._index

    d = mirdata.initialize("phenicx_anechoic")
    with pytest.raises(ValueError):
        d._multitrack("a")
def test_dataset():
    dataset = mirdata.initialize("guitarset")
    assert isinstance(dataset, core.Dataset)

    dataset = mirdata.initialize("rwc_jazz")
    assert isinstance(dataset, core.Dataset)

    dataset = mirdata.initialize("ikala")
    assert isinstance(dataset, core.Dataset)

    print(dataset)  # test that repr doesn't fail
def dataset(test_dataset):
    if test_dataset == "":
        return None
    elif test_dataset not in mirdata.DATASETS:
        raise ValueError("{} is not a dataset in mirdata".format(test_dataset))
    data_home = os.path.join("tests/resources/mir_datasets_full", test_dataset)
    return mirdata.initialize(test_dataset, data_home)
def main():
    DATASETS = ["rwc_classical"]
    ALL_INDEXES = ["rwc_classical_index.json"]

    # Download metadata from all datasets for computing metadata checksums
    for module in DATASETS:
        if module not in ["dali", "beatles", "groove_midi"]:
            dataset = mirdata.initialize(module)
            if dataset.remotes is not None:
                dataset.download(
                    partial_download=[
                        "metadata" if "metadata" in dataset.remotes else key
                        for key in dataset.remotes
                        if key != "audio"
                        and "training" not in key
                        and "testing" not in key
                    ]
                )

    # Update indexes to the new format
    print("Updating indexes...\n")
    update_index(ALL_INDEXES)

    # Check that the new indexes are shaped as expected
    print("Quick check on datasets...\n")
    test_index(DATASETS)
    test_track_load(DATASETS)
def update_index(all_indexes):
    """Update indexes to the new format.

    Parameters
    ----------
    all_indexes (list): list of all current dataset indexes

    """
    for index_name in tqdm(all_indexes):
        module = index_name.replace("_index.json", "")

        # load the old index
        old_index = mirdata.initialize(module)._index
        # avoid modifying the index when running multiple times
        if "tracks" in old_index.keys():
            old_index = old_index["tracks"]

        data_home = mirdata.initialize(module).data_home

        # compute the metadata checksums
        metadata_files = get_metadata_paths(module)
        metadata_checksums = None
        if metadata_files is not None:
            metadata_checksums = {
                key: [
                    metadata_files[key],
                    md5(os.path.join(data_home, metadata_files[key])),
                ]
                for key in metadata_files.keys()
            }

        # get the version of the dataset
        version = get_dataset_version(module)

        # Some datasets have a single metadata file, some have multiple.
        # The checksum computation should be customized in the make_index
        # script of each dataset; this is a patch to convert previous
        # indexes to the new format.
        new_index = {"version": version, "tracks": old_index}
        if metadata_files is not None:
            new_index["metadata"] = metadata_checksums

        with open(os.path.join(INDEXES_PATH, index_name), "w") as fhandle:
            json.dump(new_index, fhandle, indent=2)
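# The md5 helper used above is not shown in this snippet. A minimal sketch of
# what it is assumed to do (mirdata.validate.md5 behaves this way): hash a
# file's bytes in chunks and return the hex digest.
import hashlib

def md5(file_path):
    hash_md5 = hashlib.md5()
    with open(file_path, "rb") as fhandle:
        for chunk in iter(lambda: fhandle.read(4096), b""):
            hash_md5.update(chunk)
    return hash_md5.hexdigest()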
def test_multitracks(httpserver):
    data_home_dir = "tests/resources/mir_datasets"

    for dataset_name in DATASETS:
        module = importlib.import_module("mirdata.datasets.{}".format(dataset_name))

        if dataset_name not in REMOTE_DATASETS:
            dataset = module.Dataset()
        else:
            # TODO fix these tests
            continue
            # remote_index = create_remote_index(httpserver, dataset_name)
            # dataset = module.Dataset(remote_index=remote_index)

        # TODO this is currently an opt-in test. Make it an opt-out test
        # once #265 is addressed
        if dataset_name in CUSTOM_TEST_MTRACKS:
            mtrack_id = CUSTOM_TEST_MTRACKS[dataset_name]
        else:
            # there are no multitracks
            continue

        try:
            mtrack_default = dataset.MultiTrack(mtrack_id)
        except:
            assert False, "{}: {}".format(dataset_name, sys.exc_info()[0])

        # test with data home specified
        data_home = os.path.join(data_home_dir, dataset_name)
        dataset_specific = mirdata.initialize(dataset_name, data_home=data_home)
        try:
            mtrack_test = dataset_specific.MultiTrack(mtrack_id, data_home=data_home)
        except:
            assert False, "{}: {}".format(dataset_name, sys.exc_info()[0])

        assert isinstance(
            mtrack_test, core.MultiTrack
        ), "{}.MultiTrack must be an instance of type core.MultiTrack".format(
            dataset_name
        )

        assert hasattr(
            mtrack_test, "to_jams"
        ), "{}.MultiTrack must have a to_jams method".format(dataset_name)

        # validate the JAMS schema
        try:
            jam = mtrack_test.to_jams()
        except:
            assert False, "{}: {}".format(dataset_name, sys.exc_info()[0])

        assert jam.validate(), "Jams validation failed for {}.MultiTrack({})".format(
            dataset_name, mtrack_id
        )

        if dataset_name in REMOTE_DATASETS:
            clean_remote_dataset(dataset_name)
def test_dataset_errors():
    with pytest.raises(ValueError):
        mirdata.initialize("not_a_dataset")

    d = mirdata.initialize("orchset")
    d._track_class = None
    with pytest.raises(NotImplementedError):
        d.track("asdf")
    with pytest.raises(NotImplementedError):
        d.load_tracks()
    with pytest.raises(NotImplementedError):
        d.choice_track()

    d = mirdata.initialize("acousticbrainz_genre")
    with pytest.raises(FileNotFoundError):
        d._index
def test_track_load(dataset_names):
    """Test that all loaders work and indexes are fine (run locally).

    Parameters
    ----------
    dataset_names (list): list of dataset names

    """
    for module in dataset_names:
        dataset = mirdata.initialize(module)
        dataset.load_tracks()
def test_index(dataset_names):
    """Test if updated indexes are as expected.

    Parameters
    ----------
    dataset_names (list): list of dataset names

    """
    mandatory_keys = ["version"]
    for module in dataset_names:
        index = mirdata.initialize(module)._index
        assert type(index["tracks"]) == dict
        assert set(mandatory_keys) <= set([*index.keys()])
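# For reference, a sketch of the index shape these checks expect; the field
# values below are illustrative, not taken from a real dataset index:
#
# {
#     "version": "1.0",
#     "tracks": {
#         "some_track_id": {"audio": ["audio/some_track.wav", "<md5 checksum>"]}
#     },
#     "metadata": {...}  # optional, present when the dataset ships metadata files
# }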
def main(args):
    data_home = "tests/resources/mir_datasets/{}".format(args.dataset)
    print(data_home)
    dataset = mirdata.initialize(args.dataset, data_home=data_home)

    if args.dataset in TEST_TRACKIDS.keys():
        track_id = TEST_TRACKIDS[args.dataset]
    else:
        print("No test track found for {}. ".format(args.dataset))
        print("Please add a test track to the dictionary at the top of this script.")
        return

    track = dataset.track(track_id)
    data = get_attributes_and_properties(track)

    print('"""{} Track class'.format(args.dataset))
    print("")
    print("Args:")
    print("    track_id (str): track id of the track")
    print("")

    if len(data["attributes"]) > 0:
        print("Attributes:")
        for attr in data["attributes"]:
            if attr == "track_id":
                print(
                    "    {} ({}): track id".format(
                        attr, type(getattr(track, attr)).__name__
                    )
                )
            else:
                print(
                    "    {} ({}): TODO".format(
                        attr, type(getattr(track, attr)).__name__
                    )
                )
        print("")

    if len(data["cached_properties"]) > 0:
        print("Cached Properties:")
        for attr in data["cached_properties"]:
            print(
                "    {} ({}): TODO".format(attr, type(getattr(track, attr)).__name__)
            )
        print("")

    print('"""')
import json
import os

import mirdata
import muda
from tqdm import tqdm

giantsteps_key = mirdata.initialize('giantsteps_key', data_home='/scratch/qx244/data/gskey')
gs_data = giantsteps_key.load_tracks()

with open('/scratch/qx244/data/gskey/good_files.json', 'r') as fp:
    good_files = json.load(fp)

pitch_shifter = muda.deformers.LinearPitchShift(n_samples=12, lower=-5, upper=6)

for idx in tqdm(good_files.keys()):
    track = gs_data[idx]
    track_jams_path = os.path.join('/scratch/qx244/data/gskey/jams/', track.title + '.jams')

    # check if already augmented:
    if os.path.isfile(
            '/scratch/qx244/data/gskey/augmentation/{}.11.jams'.format(track.title)):
        continue

    j_orig = muda.load_jam_audio(track_jams_path, track.audio_path)

    for i, jam_out in enumerate(pitch_shifter.transform(j_orig)):
        # (assumed) save each pitch-shifted jam/audio pair to the
        # augmentation folder checked above
        muda.save(
            '/scratch/qx244/data/gskey/augmentation/{}.{}.ogg'.format(track.title, i),
            '/scratch/qx244/data/gskey/augmentation/{}.{}.jams'.format(track.title, i),
            jam_out,
        )
def split_dataset_in_chunks(self):
    # Create the output folder if it does not exist
    if not os.path.exists(self.chunks_path):
        os.mkdir(self.chunks_path)

    # Initialize the Saraga Carnatic dataset and get the list of the multitrack audio subset
    saraga_carnatic = mirdata.initialize('saraga_carnatic', data_home=self.dataset_path)
    track_ids = saraga_carnatic.track_ids
    saraga_data = saraga_carnatic.load_tracks()
    concerts_to_ignore = ['Akkarai', 'Sundar']
    multitrack_list = self.get_multitrack_ids(track_ids, saraga_data, concerts_to_ignore)

    split_count = 0
    for track_id in tqdm(multitrack_list):
        # Get the track to format
        track = saraga_data[track_id]

        # Get the tonic rounded to 4 decimals
        tonic = round(self.get_tonic(track), 4)

        # Get the tanpura audio from the synthesized tanpura dataset
        tanpura_filename = os.path.join(self.tanpura_dataset_path,
                                        'tanpura_' + str(tonic) + '.wav')
        audio_tanpura = estd.MonoLoader(filename=tanpura_filename)()
        # Get the voice
        audio_vocal = estd.MonoLoader(filename=track.audio_vocal_path)()
        # Get the violin
        audio_violin = estd.MonoLoader(filename=track.audio_violin_path)()
        # Get the mridangam right channel
        audio_mridangam_right = estd.MonoLoader(filename=track.audio_mridangam_right_path)()
        # Get the mridangam left channel
        audio_mridangam_left = estd.MonoLoader(filename=track.audio_mridangam_left_path)()

        # Split each stem into chunks of the tanpura length
        split_mridangam_left = self.split_into_chunks(audio_mridangam_left, len(audio_tanpura))
        split_mridangam_right = self.split_into_chunks(audio_mridangam_right, len(audio_tanpura))
        split_violin = self.split_into_chunks(audio_violin, len(audio_tanpura))
        split_vocal = self.split_into_chunks(audio_vocal, len(audio_tanpura))
        split_tanpura = [audio_tanpura] * len(split_vocal)

        number_of_chunks = 0
        for split_id, (tanpura, vocal, violin, mri_right, mri_left) in enumerate(
                zip(split_tanpura, split_vocal, split_violin,
                    split_mridangam_right, split_mridangam_left)):
            write(filename=os.path.join(self.chunks_path,
                                        str(split_id + split_count) + '_tanpura.wav'),
                  rate=44100, data=np.array(tanpura))
            write(filename=os.path.join(self.chunks_path,
                                        str(split_id + split_count) + '_vocal.wav'),
                  rate=44100, data=np.array(vocal))
            write(filename=os.path.join(self.chunks_path,
                                        str(split_id + split_count) + '_violin.wav'),
                  rate=44100, data=np.array(violin))
            write(filename=os.path.join(self.chunks_path,
                                        str(split_id + split_count) + '_mridangam_right.wav'),
                  rate=44100, data=np.array(mri_right))
            write(filename=os.path.join(self.chunks_path,
                                        str(split_id + split_count) + '_mridangam_left.wav'),
                  rate=44100, data=np.array(mri_left))
            number_of_chunks = split_id

        split_count = split_count + number_of_chunks
items.pop("v_num", None) return items def configure_optimizers(self): optimizer = torch.optim.Adam(self.parameters(), lr=4e-2, weight_decay=0.0001) scheduler = torch.optim.lr_scheduler.StepLR( optimizer, step_size=10, gamma=0.1) # reduce the learning after 10 epochs by a factor of 10 return [optimizer], [scheduler] #### Init the Mridangam stroke dataset data_home = '/Volumes/Macintosh HD 2/Documents/git/mirdata/tests/resources/mir_datasets_full/mridangam_stroke' mridangam = mirdata.initialize("mridangam_stroke") #,data_home=data_home download = False if download: mridangam.download() random_seed = 0 pl.utilities.seed.seed_everything(seed=random_seed) #### Pytorch dataset loaders train_dataset = MridangamDataset(mirdataset=mridangam, subset=0, random_seed=random_seed) train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=128, num_workers=24,