Example #1
def make_classicalDB_index(data_path):
    audio_dir = os.path.join(data_path, 'audio')
    key_dir = os.path.join(data_path, 'keys')
    spectrum_dir = os.path.join(data_path, 'spectrums')
    HPCP_dir = os.path.join(data_path, 'HPCPs')
    mb_dir = os.path.join(data_path, 'musicbrainz_metadata')

    classicalDB_index = {}
    for track_id, key_file in enumerate(sorted(os.listdir(key_dir))):
        if '.txt' in key_file:
            codec = '.wav'
            audio_path = os.path.join(audio_dir,
                                      os.path.splitext(key_file)[0] + codec)
            spectrum_path = os.path.join(
                spectrum_dir,
                os.path.splitext(key_file)[0] + '.json')
            HPCP_path = os.path.join(HPCP_dir,
                                     os.path.splitext(key_file)[0] + '.json')
            mb_path = os.path.join(mb_dir,
                                   os.path.splitext(key_file)[0] + '.json')
            key_path = os.path.join(key_dir, key_file)

            classicalDB_index[track_id] = {
                'audio': (audio_path.replace(data_path + '/',
                                             ''), md5(audio_path)),
                'key': (key_path.replace(data_path + '/', ''), md5(key_path)),
                'spectrum': (spectrum_path.replace(data_path + '/',
                                                   ''), md5(spectrum_path)),
                'mb': (mb_path.replace(data_path + '/', ''), md5(mb_path)),
                'HPCP': (HPCP_path.replace(data_path + '/',
                                           ''), md5(HPCP_path))
            }
    with open(classicalDB_INDEX_PATH, 'w') as fhandle:
        json.dump(classicalDB_index, fhandle, indent=2)
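All of these index scripts call a small md5 helper defined elsewhere in each module (alongside constants such as classicalDB_INDEX_PATH). A minimal sketch of such a helper, assuming it reads the file in binary chunks and returns the hex digest:

import hashlib


def md5(file_path):
    """Return the MD5 hex digest of a file, read in 4 KiB chunks."""
    hash_md5 = hashlib.md5()
    with open(file_path, "rb") as fhandle:
        for chunk in iter(lambda: fhandle.read(4096), b""):
            hash_md5.update(chunk)
    return hash_md5.hexdigest()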
Example #2
def make_beatport_key_index(data_path):
    meta_dir = os.path.join(data_path, 'meta')
    audio_dir = os.path.join(data_path, 'audio')
    key_dir = os.path.join(data_path, 'keys')
    beatport_key_index = {
        'version': '1.0.0',
        'tracks': {},
        'metadata': None,
    }
    for track_id, ann_dir in enumerate(sorted(os.listdir(key_dir))):
        if '.txt' in ann_dir:
            codec = '.mp3'
            audio_path = os.path.join(audio_dir,
                                      ann_dir.replace('.txt', codec))
            chord_path = os.path.join(key_dir, ann_dir)
            meta_path = os.path.join(meta_dir,
                                     ann_dir.replace('.txt', '.json'))
            if not os.path.exists(meta_path):
                meta = (None, None)
            else:
                meta = (meta_path.replace(data_path + '/', ''), md5(meta_path))

            beatport_key_index['tracks'][track_id] = {
                'audio': (audio_path.replace(data_path + '/',
                                             ''), md5(audio_path)),
                'meta': meta,
                'key': (chord_path.replace(data_path + '/',
                                           ''), md5(chord_path)),
            }
    with open(beatport_key_INDEX_PATH, 'w') as fhandle:
        json.dump(beatport_key_index, fhandle, indent=2)
Example #3
def make_queen_index(data_path):
    annotations_dir = os.path.join(data_path, 'Queen', 'annotations')
    cds_dir = os.path.join(annotations_dir, 'all', 'Queen')
    audio_dir = os.path.join(data_path, 'Queen', 'audio')
    cds = sorted(os.listdir(cds_dir))
    track_ids = []
    queen_tracks = {}
    totfiles = []
    track_id = 0
    for c in cds:

        for t in sorted(os.listdir(os.path.join(cds_dir, c))):
            if 'ttl' in t:
                totfiles.append(t)

                if 'CD' in t:
                    # cast to int so the `track_id += 1` below keeps working
                    track_id = int('10{}{}'.format(
                        os.path.basename(c).split('_')[0][-1],
                        os.path.basename(t).split('_')[2][:2],
                    ))
                track_ids.append(track_id)

                # checksum
                audio_checksum = md5(
                    os.path.join(audio_dir, c, '{}.flac'.format(t[:-4])))
                audio_path = '{}/{}'.format(
                    'audio', os.path.join(c, '{}.flac'.format(t[:-4])))

                annot_checksum, annot_rels = [], []

                for annot_type in QUEEN_ANNOTATION_SCHEMA:
                    cds_dir = os.path.join(annotations_dir, annot_type,
                                           'Queen')
                    annot_path = os.path.join(cds_dir, c)

                    annot_file = '{}.lab'.format(t[:-4])

                    if os.path.exists(os.path.join(annot_path, annot_file)):
                        annot_checksum.append(
                            md5(os.path.join(annot_path, annot_file)))
                        annot_rels.append(
                            os.path.join('annotations', annot_type, 'Queen', c,
                                         annot_file))
                    else:
                        annot_checksum.append(None)
                        annot_rels.append(None)

                queen_tracks[track_id] = {
                    'audio': (audio_path, audio_checksum),
                    'chords': (annot_rels[0], annot_checksum[0]),
                    'keys': (annot_rels[1], annot_checksum[1]),
                    'sections': (annot_rels[2], annot_checksum[2]),
                }
                track_id += 1
    queen_index = {'version': "1.0", 'tracks': queen_tracks, 'metadata': None}
    with open(QUEEN_INDEX_PATH, 'w') as fhandle:
        json.dump(queen_index, fhandle, indent=2)
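The positional unpacking above (annot_rels[0] as 'chords', [1] as 'keys', [2] as 'sections') only works if QUEEN_ANNOTATION_SCHEMA iterates in a fixed order. A plausible definition, with hypothetical Isophonics-style directory names:

# hypothetical values; the order must match the chords/keys/sections mapping above
QUEEN_ANNOTATION_SCHEMA = ['chordlab', 'keylab', 'seglab']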
Example #4
def make_otmm_makam_index(dataset_data_path):

    otmm_index = {'version': 'dlfm2016', 'tracks': {}, 'metadata': []}

    for makam in os.listdir(os.path.join(dataset_data_path, 'data')):
        if '.' not in makam:
            for track in os.listdir(
                    os.path.join(dataset_data_path, 'data', makam)):
                if '.json' in track:
                    # Declare track attributes
                    index = track.split('.json')[0]
                    pitch_path = index + '.pitch'

                    otmm_index['tracks'][index] = {
                        "metadata": [
                            os.path.join(
                                'MTG-otmm_makam_recognition_dataset-f14c0d0',
                                'data',
                                makam,
                                track,
                            ),
                            md5(
                                os.path.join(dataset_data_path, 'data', makam,
                                             track)),
                        ],
                        "pitch": [
                            os.path.join(
                                'MTG-otmm_makam_recognition_dataset-f14c0d0',
                                'data',
                                makam,
                                pitch_path,
                            ),
                            md5(
                                os.path.join(dataset_data_path, 'data', makam,
                                             pitch_path)),
                        ],
                    }
    otmm_index['metadata'] = [
        os.path.join('MTG-otmm_makam_recognition_dataset-f14c0d0',
                     'annotations.json'),
        md5(os.path.join(dataset_data_path, 'annotations.json')),
    ]

    with open(OTMM_MAKAM_INDEX_PATH, 'w') as fhandle:
        json.dump(otmm_index, fhandle, indent=2)
Example #5
def test_md5(mocker):
    audio_file = b"audio1234"

    expected_checksum = "6dc00d1bac757abe4ea83308dde68aab"

    mocker.patch("builtins.open", new=mocker.mock_open(read_data=audio_file))

    md5_checksum = validate.md5("test_file_path")
    assert expected_checksum == md5_checksum
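The test relies on the mocker fixture from pytest-mock: patching builtins.open makes validate.md5 hash the in-memory bytes instead of reading a real file. A standalone sketch using only the standard library, assuming validate.md5 opens its argument in binary mode:

import hashlib
from unittest import mock

from mirdata import validate  # assumed import; the snippet above omits it

with mock.patch("builtins.open", mock.mock_open(read_data=b"audio1234")):
    checksum = validate.md5("test_file_path")

# the expected value is simply the digest of the mocked bytes
assert checksum == hashlib.md5(b"audio1234").hexdigest()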
Example #6
def make_tonas_index(dataset_data_path):

    tonas_index = {"version": "1.0", "tracks": {}}

    for style in os.listdir(dataset_data_path):
        if "." not in style:
            for track in os.listdir(os.path.join(dataset_data_path, style)):
                if ".wav" in track:
                    # Declare track attributes
                    index = track.replace(".wav", "")
                    f0_path = index + ".f0.Corrected"
                    notes_path = index + ".notes.Corrected"

                    tonas_index["tracks"][index] = {
                        "audio": [
                            os.path.join(style, track),
                            md5(os.path.join(dataset_data_path, style, track)),
                        ],
                        "f0": [
                            os.path.join(style, f0_path),
                            md5(os.path.join(dataset_data_path, style,
                                             f0_path)),
                        ],
                        "notes": [
                            os.path.join(style, notes_path),
                            md5(
                                os.path.join(dataset_data_path, style,
                                             notes_path)),
                        ],
                    }
    tonas_index["metadata"] = {
        "TONAS-Metadata": [
            "TONAS-Metadata.txt",
            md5(os.path.join(dataset_data_path, "TONAS-Metadata.txt")),
        ]
    }

    with open(TONAS_INDEX_PATH, "w") as fhandle:
        json.dump(tonas_index, fhandle, indent=2)
Example #7
def make_dataset_index(dataset_data_path):
    annotation_dir = os.path.join(dataset_data_path, "annotation")
    annotation_files = glob.glob(os.path.join(annotation_dir, "*.lab"))
    track_ids = sorted(
        [os.path.basename(f).split(".")[0] for f in annotation_files])

    # top-key level metadata
    metadata_checksum = md5(os.path.join(dataset_data_path, "id_mapping.txt"))
    index_metadata = {
        "metadata": {
            "id_mapping": ("id_mapping.txt", metadata_checksum)
        }
    }

    # top-key level tracks
    index_tracks = {}
    for track_id in track_ids:
        audio_checksum = md5(
            os.path.join(dataset_data_path, "Wavfile/{}.wav".format(track_id)))
        annotation_checksum = md5(
            os.path.join(dataset_data_path,
                         "annotation/{}.lab".format(track_id)))

        index_tracks[track_id] = {
            "audio": ("Wavfile/{}.wav".format(track_id), audio_checksum),
            "annotation":
            ("annotation/{}.lab".format(track_id), annotation_checksum),
        }

    # top-key level version
    dataset_index = {"version": None}

    # combine all in dataset index
    dataset_index.update(index_metadata)
    dataset_index.update({"tracks": index_tracks})

    with open(DATASET_INDEX_PATH, "w") as fhandle:
        json.dump(dataset_index, fhandle, indent=2)
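For a hypothetical track id "0001", the resulting index file would have roughly this shape (json.dump serializes the tuples as JSON arrays; checksums abbreviated):

{
  "version": null,
  "metadata": {
    "id_mapping": ["id_mapping.txt", "<md5 checksum>"]
  },
  "tracks": {
    "0001": {
      "audio": ["Wavfile/0001.wav", "<md5 checksum>"],
      "annotation": ["annotation/0001.lab", "<md5 checksum>"]
    }
  }
}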
Example #8
def update_index(all_indexes):
    """Function to update indexes to new format.
    Parameters
    ----------
    all_indexes (list): list of all current dataset indexes


    """

    for index_name in tqdm(all_indexes):
        module = index_name.replace("_index.json", "")

        # load old index
        old_index = mirdata.initialize(module)._index

        # avoid modifying when running multiple times
        if "tracks" in old_index.keys():
            old_index = old_index["tracks"]

        data_home = mirdata.initialize(module).data_home

        # get metadata checksum
        metadata_files = get_metadata_paths(module)
        metadata_checksums = None

        if metadata_files is not None:
            metadata_checksums = {
                key: [
                    metadata_files[key],
                    md5(os.path.join(data_home, metadata_files[key])),
                ]
                for key in metadata_files.keys()
            }

        # get version of dataset
        version = get_dataset_version(module)

        # Some datasets have a single metadata file, some have multiple.
        # The computation of the checksum should be customized in the make_index
        # of each dataset. This is a patch to convert previous indexes to the new format.
        new_index = {"version": version, "tracks": old_index}

        if metadata_files is not None:
            new_index["metadata"] = metadata_checksums

        with open(os.path.join(INDEXES_PATH, index_name), "w") as fhandle:
            json.dump(new_index, fhandle, indent=2)
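A hedged usage sketch, assuming INDEXES_PATH points at the directory of existing index files and that their names follow the <module>_index.json convention:

# hypothetical invocation: update every shipped index in place
all_indexes = sorted(
    f for f in os.listdir(INDEXES_PATH) if f.endswith("_index.json"))
update_index(all_indexes)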
Example #9
def make_acousticbrainz_genre_index(data_path):
    index = 0
    datasets = ['tagtraum', 'allmusic', 'lastfm', 'discogs']
    dataset_types = ['validation', 'train']
    f = open(acousticbrainz_genre_INDEX_PATH, 'w')
    f.write('{\n')
    for dataset, dataset_type in itertools.product(datasets, dataset_types):
        tsv_file = open(
            os.path.join(
                data_path, "acousticbrainz-mediaeval-" + dataset + "-" +
                dataset_type + ".tsv"))
        read_tsv = csv.reader(tsv_file, delimiter="\t")
        next(read_tsv, None)
        read_tsv_list = list(read_tsv)
        for line, row in enumerate(read_tsv_list):
            mbid = ""
            track_id = dataset + '#' + dataset_type
            for i, r in enumerate(row):
                track_id = track_id + '#' + r
                if i == 0:
                    mbid = r
            ann_path = os.path.join(data_path,
                                    "acousticbrainz-mediaeval-" + dataset_type,
                                    mbid[:2], mbid + ".json")
            f.write('  \"%s\": {\n' % (track_id, ))
            f.write('    \"data\": [\n')
            f.write('      \"%s\",\n' %
                    (ann_path.replace(data_path + '/', ''), ))
            f.write('      \"%s\"\n' % md5(ann_path))
            f.write('    ]\n')
            is_the_last = dataset == datasets[
                -1] and dataset_type == dataset_types[-1] and line == len(
                    read_tsv_list) - 1
            if not is_the_last:
                f.write('  },\n')
            else:
                f.write('  }\n')
            index += 1

    f.write('}')
    f.close()
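Hand-writing the JSON with f.write keeps memory usage flat for this very large index, at the cost of fragile quoting and bracket bookkeeping. A sketch of the same per-row logic accumulated into a dict and serialized with json.dump instead (identical field layout assumed):

index_dict = {}
for dataset, dataset_type in itertools.product(datasets, dataset_types):
    with open(os.path.join(
            data_path, "acousticbrainz-mediaeval-" + dataset + "-" +
            dataset_type + ".tsv")) as tsv_file:
        read_tsv = csv.reader(tsv_file, delimiter="\t")
        next(read_tsv, None)  # skip the header row
        for row in read_tsv:
            mbid = row[0]
            track_id = "#".join([dataset, dataset_type] + row)
            ann_path = os.path.join(
                data_path, "acousticbrainz-mediaeval-" + dataset_type,
                mbid[:2], mbid + ".json")
            index_dict[track_id] = {
                "data": [ann_path.replace(data_path + "/", ""), md5(ann_path)]
            }
with open(acousticbrainz_genre_INDEX_PATH, "w") as fhandle:
    json.dump(index_dict, fhandle, indent=2)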
Example #10
def make_saraga_hindustani_index(dataset_data_path):

    saraga_index = {
        'version': 1.5,
        'tracks': {},
    }
    idx = 0
    dataset_data_path_prev = dataset_data_path.split('saraga1.5_hindustani')[0]
    for concert in os.listdir(dataset_data_path):
        if '.' not in concert:
            for song in os.listdir(os.path.join(dataset_data_path, concert)):
                if '.' not in song:

                    # Declare track attributes
                    index = str(idx) + '_' + song.replace(' ', '_')
                    print(index)
                    audio = (None, None)
                    ctonic = (None, None)
                    pitch = (None, None)
                    tempo = (None, None)
                    sama = (None, None)
                    sections = (None, None)
                    phrases = (None, None)
                    metadata = (None, None)

                    for file in os.listdir(
                            os.path.join(dataset_data_path, concert, song)):
                        if '.mp3' in file:
                            audio_path = os.path.join('saraga1.5_hindustani/',
                                                      concert, song, file)
                            audio_checksum = md5(
                                os.path.join(dataset_data_path_prev,
                                             audio_path))
                            audio = (audio_path, audio_checksum)
                        if 'ctonic' in file:
                            ctonic_path = os.path.join('saraga1.5_hindustani/',
                                                       concert, song, file)
                            ctonic_checksum = md5(
                                os.path.join(dataset_data_path_prev,
                                             ctonic_path))
                            ctonic = (ctonic_path, ctonic_checksum)
                        if 'pitch.' in file:
                            pitch_path = os.path.join('saraga1.5_hindustani/',
                                                      concert, song, file)
                            pitch_checksum = md5(
                                os.path.join(dataset_data_path_prev,
                                             pitch_path))
                            pitch = (pitch_path, pitch_checksum)
                        if 'tempo-manual' in file:
                            tempo_path = os.path.join('saraga1.5_hindustani/',
                                                      concert, song, file)
                            tempo_checksum = md5(
                                os.path.join(dataset_data_path_prev,
                                             tempo_path))
                            tempo = (tempo_path, tempo_checksum)
                        if 'sama-manual' in file:
                            sama_path = os.path.join('saraga1.5_hindustani/',
                                                     concert, song, file)
                            sama_checksum = md5(
                                os.path.join(dataset_data_path_prev,
                                             sama_path))
                            sama = (sama_path, sama_checksum)
                        if 'sections-manual-p' in file:
                            sections_path = os.path.join(
                                'saraga1.5_hindustani/', concert, song, file)
                            sections_checksum = md5(
                                os.path.join(dataset_data_path_prev,
                                             sections_path))
                            sections = (sections_path, sections_checksum)
                        if 'mphrase' in file:
                            phrases_path = os.path.join(
                                'saraga1.5_hindustani/', concert, song, file)
                            phrases_checksum = md5(
                                os.path.join(dataset_data_path_prev,
                                             phrases_path))
                            phrases = (phrases_path, phrases_checksum)
                        if '.json' in file:
                            metadata_path = os.path.join(
                                'saraga1.5_hindustani/', concert, song, file)
                            metadata_checksum = md5(
                                os.path.join(dataset_data_path_prev,
                                             metadata_path))
                            metadata = (metadata_path, metadata_checksum)

                    # build the track entry once, after all files are scanned
                    saraga_index['tracks'][index] = {
                        'audio': audio,
                        'ctonic': ctonic,
                        'pitch': pitch,
                        'tempo': tempo,
                        'sama': sama,
                        'sections': sections,
                        'phrases': phrases,
                        'metadata': metadata
                    }

                    idx = idx + 1

    with open(SARAGA_HINDUSTANI_INDEX_PATH, 'w') as fhandle:
        json.dump(saraga_index, fhandle, indent=2)
Example #11
def make_dataset_index(data_path):
    audio_dir = os.path.join(data_path, "audio_wav_22050_mono")

    index = {"version": "1.2.3", "tracks": {}, "multitracks": {}}

    # define pieces directly from data directory
    pieces = sorted(
        list(
            set([
                "_".join(filename.split("/")[-1].split("_")[:4])
                for filename in glob.glob(os.path.join(audio_dir, "*.wav"))
            ])))

    for ip, piece in enumerate(pieces):

        index["multitracks"][piece] = {}

        ## add mixture audios

        # STM
        audio_mix_dir = os.path.join(data_path, "audio_wav_22050_mono",
                                     "{}_Stereo_STM.wav".format(piece))
        audio_checksum = md5(audio_mix_dir)
        index["multitracks"][piece]["audio_stm"] = (
            "audio_wav_22050_mono/{}_Stereo_STM.wav".format(piece),
            audio_checksum,
        )

        # STR
        audio_mix_dir = os.path.join(data_path, "audio_wav_22050_mono",
                                     "{}_Stereo_STR.wav".format(piece))
        audio_checksum = md5(audio_mix_dir)
        index["multitracks"][piece]["audio_str"] = (
            "audio_wav_22050_mono/{}_Stereo_STR.wav".format(piece),
            audio_checksum,
        )

        # STL
        audio_mix_dir = os.path.join(data_path, "audio_wav_22050_mono",
                                     "{}_Stereo_STL.wav".format(piece))
        audio_checksum = md5(audio_mix_dir)
        index["multitracks"][piece]["audio_stl"] = (
            "audio_wav_22050_mono/{}_Stereo_STL.wav".format(piece),
            audio_checksum,
        )

        # STRev
        audio_mix_dir = os.path.join(data_path, "audio_wav_22050_mono",
                                     "{}_StereoReverb_STM.wav".format(piece))
        audio_checksum = md5(audio_mix_dir)
        index["multitracks"][piece]["audio_rev"] = (
            "audio_wav_22050_mono/{}_StereoReverb_STM.wav".format(piece),
            audio_checksum,
        )

        # beats
        index["multitracks"][piece]["beat"] = (None, None)

        # piano
        index["multitracks"][piece]["audio_spl"] = (None, None)
        index["multitracks"][piece]["audio_spr"] = (None, None)

        ## add each track inside the multitrack

        audio_files = sorted(
            glob.glob(os.path.join(audio_dir, "{}*.wav".format(piece))))

        singers = [
            singer.split("_")[-2] for singer in audio_files
            if not "Stereo" in singer
        ]

        # second step to remove piano from singers
        singers = [singer for singer in singers if "Piano" not in singer]

        # mics = [singer.split('_')[-1].split('.')[0] for singer in audio_files if not 'Stereo' in singer]
        # assert len(singers) == len(mics), "number of mics does not match number of singers for {}".format(piece)
        # set_singers = set(singers)

        index["multitracks"][piece]["tracks"] = []

        for sidx, singer in enumerate(sorted(singers)):

            track_name = "{}_{}".format(piece, singer)

            # define fields as None
            index["tracks"][track_name] = {
                "audio_dyn": (None, None),
                "audio_hsm": (None, None),
                "audio_lrx": (None, None),
                "f0_crepe_dyn": (None, None),
                "f0_crepe_hsm": (None, None),
                "f0_crepe_lrx": (None, None),
                "f0_pyin_dyn": (None, None),
                "f0_pyin_hsm": (None, None),
                "f0_pyin_lrx": (None, None),
                "f0_manual_lrx": (None, None),
                "score": (None, None),
            }

            index["multitracks"][piece]["tracks"].append(track_name)

            mics = [
                mic.split("_")[-1].split(".")[0] for mic in glob.glob(
                    os.path.join(audio_dir, "{}_{}*.wav".format(
                        piece, singer))) if mic not in ["SPL", "SPR"]
            ]

            ### add all fields for each track

            for mic in mics:

                ## add audio
                audio_stem_dir = os.path.join(
                    data_path,
                    "audio_wav_22050_mono",
                    "{}_{}_{}.wav".format(piece, singer, mic),
                )
                audio_checksum = md5(audio_stem_dir)

                index["tracks"][track_name]["audio_{}".format(mic.lower())] = (
                    "audio_wav_22050_mono/{}_{}_{}.wav".format(
                        piece, singer, mic),
                    audio_checksum,
                )

                ## add crepe f0s
                crepe_dir = os.path.join(
                    data_path,
                    "annotations_csv_F0_CREPE",
                    "{}_{}_{}.csv".format(piece, singer, mic),
                )
                crepe_checksum = md5(crepe_dir)

                index["tracks"][track_name]["f0_crepe_{}".format(
                    mic.lower())] = (
                        "annotations_csv_F0_CREPE/{}_{}_{}.csv".format(
                            piece, singer, mic),
                        crepe_checksum,
                    )

                ## add pyin f0s
                pyin_dir = os.path.join(
                    data_path,
                    "annotations_csv_F0_PYIN",
                    "{}_{}_{}.csv".format(piece, singer, mic),
                )
                pyin_checksum = md5(pyin_dir)

                index["tracks"][track_name]["f0_pyin_{}".format(
                    mic.lower())] = (
                        "annotations_csv_F0_PYIN/{}_{}_{}.csv".format(
                            piece, singer, mic),
                        pyin_checksum,
                    )

                ## add score when it exists

                # some have no associated score
                if not any(x in piece for x in NO_SCORE):
                    score_dir = os.path.join(
                        data_path,
                        "annotations_csv_scorerepresentation",
                        "{}_Stereo_STM_{}.csv".format(piece, singer[0]),
                    )
                    score_checksum = md5(score_dir)

                    index["tracks"][track_name]["score"] = (
                        "annotations_csv_scorerepresentation/{}_Stereo_STM_{}.csv"
                        .format(piece, singer[0]),
                        score_checksum,
                    )

            ## add beats for the full songs when available

            if not any(x in piece for x in NO_SCORE):
                ## add beats
                beats_dir = os.path.join(data_path, "annotations_csv_beat",
                                         "{}_Stereo_STM.csv".format(piece))
                beats_checksum = md5(beats_dir)

                index["multitracks"][piece]["beat"] = (
                    "annotations_csv_beat/{}_Stereo_STM.csv".format(piece),
                    beats_checksum,
                )

        ## check if piano track exists and add it to the mtrack if so

        audio_pianoL_dir = os.path.join(data_path, "audio_wav_22050_mono",
                                        "{}_Piano_SPL.wav".format(piece))
        if os.path.exists(audio_pianoL_dir):
            # add piano SPL
            audio_checksum = md5(audio_pianoL_dir)
            index["multitracks"][piece]["audio_spl"] = (
                "audio_wav_22050_mono/{}_Piano_SPL.wav".format(piece),
                audio_checksum,
            )

            # add piano SPR
            audio_checksum = md5(audio_pianoL_dir.replace("SPL", "SPR"))
            index["multitracks"][piece]["audio_spr"] = (
                "audio_wav_22050_mono/{}_Piano_SPR.wav".format(piece),
                audio_checksum,
            )

        # tracks should not be repeated
        index["multitracks"][piece]["tracks"] = sorted(
            list(set(index["multitracks"][piece]["tracks"])))

    ## add the manual annotations to their corresponding tracks
    manual_files = sorted(
        glob.glob(os.path.join(data_path, "annotations_csv_F0_manual",
                               "*.csv")))
    for mf in manual_files:
        track_name = "_".join(os.path.basename(mf).split("_")[:-1])

        manual_checksum = md5(mf)

        index["tracks"][track_name]["f0_manual_lrx"] = (
            "annotations_csv_F0_manual/{}".format(os.path.basename(mf)),
            manual_checksum,
        )

    with open(DATASET_INDEX_PATH, "w") as fhandle:
        json.dump(index, fhandle, indent=2)
Example #12
def download_from_remote(remote, save_dir, force_overwrite):
    """Download a remote dataset into path
    Fetch a dataset pointed by remote's url, save into path using remote's
    filename and ensure its integrity based on the MD5 Checksum of the
    downloaded file.

    Adapted from scikit-learn's sklearn.datasets.base._fetch_remote.

    Args:
        remote (RemoteFileMetadata): Named tuple containing remote dataset
            meta information: url, filename and checksum
        save_dir (str): Directory to save the file to. Usually `data_home`
        force_overwrite  (bool):
            If True, overwrite existing file with the downloaded file.
            If False, does not overwrite, but checks that checksum is consistent.

    Returns:
        str: Full path of the created file.

    """
    if remote.destination_dir is None:
        download_dir = save_dir
    else:
        download_dir = os.path.join(save_dir, remote.destination_dir)

    if not os.path.exists(download_dir):
        os.makedirs(download_dir)

    download_path = os.path.join(download_dir, remote.filename)

    if not os.path.exists(download_path) or force_overwrite:
        # if we got here, we want to overwrite any existing file
        if os.path.exists(download_path):
            os.remove(download_path)

        # If file doesn't exist or we want to overwrite, download it
        with DownloadProgressBar(
            unit="B", unit_scale=True, unit_divisor=1024, miniters=1
        ) as t:
            try:
                urllib.request.urlretrieve(
                    remote.url,
                    filename=download_path,
                    reporthook=t.update_to,
                    data=None,
                )
            except Exception as exc:
                error_msg = """
                            mirdata failed to download the dataset from {}!
                            Please try again in a few minutes.
                            If this error persists, please raise an issue at
                            https://github.com/mir-dataset-loaders/mirdata,
                            and tag it with 'broken-link'.
                            """.format(
                    remote.url
                )
                logging.error(error_msg)
                raise exc
    else:
        logging.info(
            "{} already exists and will not be downloaded. ".format(download_path)
            + "Rerun with force_overwrite=True to delete this file and force the download."
        )

    checksum = md5(download_path)
    if remote.checksum != checksum:
        raise IOError(
            "{} has an MD5 checksum ({}) "
            "differing from expected ({}), "
            "file may be corrupted.".format(download_path, checksum, remote.checksum)
        )
    return download_path
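A hedged usage sketch; RemoteFileMetadata is assumed to be a namedtuple carrying the fields the function reads (url, filename, checksum, destination_dir), and the URL and checksum below are placeholders rather than real values:

from collections import namedtuple

RemoteFileMetadata = namedtuple(
    "RemoteFileMetadata", ["filename", "url", "checksum", "destination_dir"])

remote = RemoteFileMetadata(
    filename="example.zip",
    url="https://example.com/example.zip",  # placeholder URL
    checksum="d41d8cd98f00b204e9800998ecf8427e",  # placeholder MD5
    destination_dir=None,
)
local_path = download_from_remote(remote, "/tmp/data", force_overwrite=False)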
Example #13
def make_dataset_index(data_path):

    pieces = ['beethoven', 'bruckner', 'mahler', 'mozart']
    families = {
        'doublebass': 'strings',
        'cello': 'strings',
        'clarinet': 'woodwinds',
        'viola': 'strings',
        'violin': 'strings',
        'oboe': 'woodwinds',
        'flute': 'woodwinds',
        'trumpet': 'brass',
        'bassoon': 'woodwinds',
        'horn': 'brass',
    }
    totalinstruments = [20, 39, 30, 10]
    ninstruments = [10, 10, 10, 8]
    index = {'version': 1}

    index['tracks'] = {}
    index['multitracks'] = {}

    for ip, piece in enumerate(pieces):
        index['multitracks'][piece] = {}

        audio_files = sorted(
            glob.glob(os.path.join(data_path, 'audio', piece, '*.wav')))
        instruments = [
            os.path.basename(audio_path).split('.')[0].rstrip(string.digits)
            for audio_path in audio_files
        ]
        set_instruments = list(set(instruments))

        assert (len(instruments) == totalinstruments[ip]
                ), 'audio files for some instruments are missing'
        assert (len(set_instruments) == ninstruments[ip]
                ), 'some instruments are missing from the dataset'

        index['multitracks'][piece]['tracks'] = []
        for instrument in set_instruments:
            assert (
                instrument in families.keys()
            ), "instrument {} is not in the list of dataset instruments".format(
                instrument)
            index['tracks'][piece + '-' + instrument] = {}
            index['multitracks'][piece]['tracks'].append(piece + '-' +
                                                         instrument)

            #### add audios
            instrument_audio_files = sorted(
                glob.glob(
                    os.path.join(data_path, 'audio', piece,
                                 instrument + '*.wav')))
            assert (len(instrument_audio_files) >
                    0), 'no audio has been found for {}'.format(instrument)

            for i, audio_file in enumerate(instrument_audio_files):
                audio_checksum = md5(
                    os.path.join(data_path, 'audio', piece,
                                 os.path.basename(audio_file)))
                source = os.path.basename(audio_file).replace('.wav', '')

                index['tracks'][piece + '-' +
                                instrument]['audio_' + source] = (
                                    'audio/{}/{}'.format(
                                        piece, os.path.basename(audio_file)),
                                    audio_checksum,
                                )

            #### add scores
            assert os.path.exists(
                os.path.join(data_path, 'annotations', piece,
                             '{}.txt'.format(instrument))
            ), 'cannot find score file {}'.format(
                os.path.join(data_path, 'annotations', piece,
                             '{}.txt'.format(instrument)))
            assert os.path.exists(
                os.path.join(data_path, 'annotations', piece,
                             '{}_o.txt'.format(instrument))
            ), 'cannot find score file {}'.format(
                os.path.join(data_path, 'annotations', piece,
                             '{}_o.txt'.format(instrument)))

            score_checksum = md5(
                os.path.join(data_path, 'annotations', piece,
                             '{}.txt'.format(instrument)))
            score_original_checksum = md5(
                os.path.join(data_path, 'annotations', piece,
                             '{}_o.txt'.format(instrument)))

            index['tracks'][piece + '-' + instrument]['notes'] = (
                'annotations/{}/{}.txt'.format(piece, instrument),
                score_checksum,
            )
            index['tracks'][piece + '-' + instrument]['notes_original'] = (
                'annotations/{}/{}_o.txt'.format(piece, instrument),
                score_original_checksum,
            )

    with open(DATASET_INDEX_PATH, 'w') as fhandle:
        json.dump(index, fhandle, indent=2)
Example #14
def make_jingju_acappella_index(dataset_data_path):

    jingju_index = {"version": 7.0, "tracks": {}, "metadata": {}}

    # Building the index while parsing the audio path
    for folder in os.listdir(dataset_data_path):
        if "wav" in folder:
            for folder_ in os.listdir(os.path.join(dataset_data_path, folder)):
                if "." not in folder_:
                    for song in os.listdir(
                            os.path.join(dataset_data_path, folder, folder_)):
                        if ".DS" not in song:

                            index = song.replace(".wav",
                                                 "").replace(".WAV", "")
                            jingju_index["tracks"][index] = {
                                "audio": (None, None),
                                "phoneme": (None, None),
                                "phrase_char": (None, None),
                                "phrase": (None, None),
                                "syllable": (None, None),
                            }
                            jingju_index["tracks"][index]["audio"] = (
                                os.path.join(folder, folder_, song),
                                md5(
                                    os.path.join(dataset_data_path, folder,
                                                 folder_, song)),
                            )

    # Parsing annotations and textgrid
    for folder in os.listdir(dataset_data_path):
        if "annotation_txt" in folder:
            for folder_ in os.listdir(os.path.join(dataset_data_path, folder)):
                if "." not in folder_:
                    for file in os.listdir(
                            os.path.join(dataset_data_path, folder, folder_)):

                        if "phoneme" in file:
                            index = file.replace("_phoneme.txt", "")
                            jingju_index["tracks"][index]["phoneme"] = (
                                os.path.join(folder, folder_, file),
                                md5(
                                    os.path.join(dataset_data_path, folder,
                                                 folder_, file)),
                            )
                        if "phrase_char" in file:
                            index = file.replace("_phrase_char.txt", "")
                            jingju_index["tracks"][index]["phrase_char"] = (
                                os.path.join(folder, folder_, file),
                                md5(
                                    os.path.join(dataset_data_path, folder,
                                                 folder_, file)),
                            )
                        if "phrase.txt" in file:
                            index = file.replace("_phrase.txt", "")
                            jingju_index["tracks"][index]["phrase"] = (
                                os.path.join(folder, folder_, file),
                                md5(
                                    os.path.join(dataset_data_path, folder,
                                                 folder_, file)),
                            )
                        if "syllable" in file:
                            index = file.replace("_syllable.txt", "")
                            jingju_index["tracks"][index]["syllable"] = (
                                os.path.join(folder, folder_, file),
                                md5(
                                    os.path.join(dataset_data_path, folder,
                                                 folder_, file)),
                            )

    # Parsing metadata
    for file in os.listdir(dataset_data_path):
        if "catalogue" in file:
            if "dan" in file:
                jingju_index["metadata"]["dan_metadata"] = (
                    file,
                    md5(os.path.join(dataset_data_path, file)),
                )
            else:
                jingju_index["metadata"]["laosheng_metadata"] = (
                    file,
                    md5(os.path.join(dataset_data_path, file)),
                )

    with open(JINJGU_ACAPPELLA_INDEX_PATH, "w") as fhandle:
        json.dump(jingju_index, fhandle, indent=2)
Example #15
def make_index(data_path):
    _index = {}
    with open(os.path.join(data_path, "billboard-2.0-index.csv")) as csv_file:
        for row in csv.reader(csv_file):
            _index[row[0]] = row[1:]

    annotations_dir = os.path.join(data_path, "McGill-Billboard")
    audio_dir = os.path.join(data_path, "audio")
    anns = sorted(os.listdir(annotations_dir))

    track_ids = []
    index = {
        "version": "2.0",
        "tracks": {},
        "metadata": None,
    }

    txtfiles = []
    for a in anns:
        for t in os.listdir(os.path.join(annotations_dir, a)):
            if t == "salami_chords.txt":
                fp = os.path.join(annotations_dir, a, t)
                track_id = "{}".format(os.path.basename(a.lstrip("0")))

                if track_id in _index.keys():
                    txtfiles.append(t)
                    track_ids.append(track_id)

                    release_date = _index[track_id][0]
                    track_name = _index[track_id][3]
                    artist = _index[track_id][4]

                    _release_date = "{}s".format(
                        round(int(release_date.split("-")[0]), -1))

                    audio_path = os.path.join(audio_dir, _release_date, artist,
                                              track_name, "audio.flac")
                    audio_checksum = None
                    if os.path.exists(audio_path):
                        audio_checksum = md5(audio_path)
                    else:
                        audio_path = None

                    annot_rel = os.path.join("annotation", a, t)
                    audio_rel = os.path.join("audio", _release_date, artist,
                                             track_name, "audio.flac")
                    annot_checksum = md5(fp)

                    full_fp = os.path.join(annotations_dir, a, "full.lab")
                    majmin7 = os.path.join(annotations_dir, a, "majmin7.lab")
                    majmin7inv = os.path.join(annotations_dir, a,
                                              "majmin7inv.lab")
                    majmin = os.path.join(annotations_dir, a, "majmin.lab")
                    majmininv = os.path.join(annotations_dir, a,
                                             "majmininv.lab")

                    bothchroma = os.path.join(annotations_dir, a,
                                              "bothchroma.csv")
                    tuning = os.path.join(annotations_dir, a, "tuning.csv")

                    index["tracks"][track_id] = {
                        "audio": (audio_rel, audio_checksum),
                        "salami": (annot_rel, annot_checksum),
                        "bothchroma": (
                            os.path.join("McGill-Billboard", a,
                                         "bothchroma.csv"),
                            md5(bothchroma),
                        ),
                        "tuning": (
                            os.path.join("McGill-Billboard", a, "tuning.csv"),
                            md5(tuning),
                        ),
                        "lab_full": (
                            os.path.join("McGill-Billboard", a, "full.lab"),
                            md5(full_fp),
                        ),
                        "lab_majmin7": (
                            os.path.join("McGill-Billboard", a, "majmin7.lab"),
                            md5(majmin7),
                        ),
                        "lab_majmin7inv": (
                            os.path.join("McGill-Billboard", a,
                                         "majmin7inv.lab"),
                            md5(majmin7inv),
                        ),
                        "lab_majmin": (
                            os.path.join("McGill-Billboard", a, "majmin.lab"),
                            md5(majmin),
                        ),
                        "lab_majmininv": (
                            os.path.join("McGill-Billboard", a,
                                         "majmininv.lab"),
                            md5(majmininv),
                        ),
                    }

    with open(INDEX_PATH, "w") as fhandle:
        json.dump(index, fhandle, indent=2)
Example #16
def make_saraga_carnatic_index(dataset_data_path):

    saraga_index = {
        'version': 1.5,
        'tracks': {},
    }
    idx = 0
    dataset_data_path_prev = dataset_data_path.split('saraga1.5_carnatic/')[0]
    for concert in os.listdir(dataset_data_path):
        if '.' not in concert:
            for song in os.listdir(os.path.join(dataset_data_path, concert)):
                if '.' not in song:
                    # Declare track attributes
                    index = str(idx) + '_' + song.replace(' ', '_')
                    print(index)
                    audio = (None, None)
                    audio_ghatam = (None, None)
                    audio_mridangam_left = (None, None)
                    audio_mridangam_right = (None, None)
                    audio_violin = (None, None)
                    audio_vocal = (None, None)
                    audio_vocal_s = (None, None)
                    ctonic = (None, None)
                    pitch = (None, None)
                    pitch_v = (None, None)
                    tempo = (None, None)
                    sama = (None, None)
                    sections = (None, None)
                    phrases = (None, None)
                    metadata = (None, None)

                    for file in os.listdir(
                            os.path.join(dataset_data_path, concert, song)):
                        if '.mp3' in file:
                            if 'multitrack' in file:
                                if 'ghatam' in file:
                                    audio_ghatam_path = os.path.join(
                                        'saraga1.5_carnatic', concert, song,
                                        file)
                                    audio_ghatam_checksum = md5(
                                        os.path.join(dataset_data_path_prev,
                                                     audio_ghatam_path))
                                    audio_ghatam = (audio_ghatam_path,
                                                    audio_ghatam_checksum)
                                if 'mridangam-left' in file:
                                    audio_mridangam_left_path = os.path.join(
                                        'saraga1.5_carnatic', concert, song,
                                        file)
                                    audio_mridangam_left_checksum = md5(
                                        os.path.join(
                                            dataset_data_path_prev,
                                            audio_mridangam_left_path))
                                    audio_mridangam_left = (
                                        audio_mridangam_left_path,
                                        audio_mridangam_left_checksum)
                                if 'mridangam-right' in file:
                                    mridangam_right_path = os.path.join(
                                        'saraga1.5_carnatic', concert, song,
                                        file)
                                    mridangam_right_checksum = md5(
                                        os.path.join(dataset_data_path_prev,
                                                     mridangam_right_path))
                                    audio_mridangam_right = (
                                        mridangam_right_path,
                                        mridangam_right_checksum)
                                if 'violin' in file:
                                    audio_violin_path = os.path.join(
                                        'saraga1.5_carnatic', concert, song,
                                        file)
                                    audio_violin_checksum = md5(
                                        os.path.join(dataset_data_path_prev,
                                                     audio_violin_path))
                                    audio_violin = (audio_violin_path,
                                                    audio_violin_checksum)
                                if 'vocal-s' in file:
                                    audio_vocal_s_path = os.path.join(
                                        'saraga1.5_carnatic', concert, song,
                                        file)
                                    audio_vocal_s_checksum = md5(
                                        os.path.join(dataset_data_path_prev,
                                                     audio_vocal_s_path))
                                    audio_vocal_s = (audio_vocal_s_path,
                                                     audio_vocal_s_checksum)
                                if 'vocal.' in file:
                                    audio_vocal_path = os.path.join(
                                        'saraga1.5_carnatic', concert, song,
                                        file)
                                    audio_vocal_checksum = md5(
                                        os.path.join(dataset_data_path_prev,
                                                     audio_vocal_path))
                                    audio_vocal = (audio_vocal_path,
                                                   audio_vocal_checksum)

                            else:
                                audio_path = os.path.join(
                                    'saraga1.5_carnatic', concert, song, file)
                                audio_checksum = md5(
                                    os.path.join(dataset_data_path_prev,
                                                 audio_path))
                                audio = (audio_path, audio_checksum)

                        if 'ctonic.' in file:
                            ctonic_path = os.path.join('saraga1.5_carnatic',
                                                       concert, song, file)
                            ctonic_checksum = md5(
                                os.path.join(dataset_data_path_prev,
                                             ctonic_path))
                            ctonic = (ctonic_path, ctonic_checksum)
                        if 'pitch.' in file:
                            pitch_path = os.path.join('saraga1.5_carnatic',
                                                      concert, song, file)
                            pitch_checksum = md5(
                                os.path.join(dataset_data_path_prev,
                                             pitch_path))
                            pitch = (pitch_path, pitch_checksum)
                        if 'pitch-vocal' in file:
                            pitch_v_path = os.path.join(
                                'saraga1.5_carnatic', concert, song, file)
                            pitch_v_checksum = md5(
                                os.path.join(dataset_data_path_prev,
                                             pitch_v_path))
                            pitch_v = (pitch_v_path, pitch_v_checksum)
                        if 'tempo-manual' in file:
                            tempo_path = os.path.join('saraga1.5_carnatic',
                                                      concert, song, file)
                            tempo_checksum = md5(
                                os.path.join(dataset_data_path_prev,
                                             tempo_path))
                            tempo = (tempo_path, tempo_checksum)
                        if 'sama-manual' in file:
                            sama_path = os.path.join('saraga1.5_carnatic',
                                                     concert, song, file)
                            sama_checksum = md5(
                                os.path.join(dataset_data_path_prev,
                                             sama_path))
                            sama = (sama_path, sama_checksum)
                        if 'sections-manual-p.txt' in file:
                            sections_path = os.path.join(
                                'saraga1.5_carnatic', concert, song, file)
                            sections_checksum = md5(
                                os.path.join(dataset_data_path_prev,
                                             sections_path))
                            sections = (sections_path, sections_checksum)
                        if 'mphrase' in file:
                            phrases_path = os.path.join(
                                'saraga1.5_carnatic', concert, song, file)
                            phrases_checksum = md5(
                                os.path.join(dataset_data_path_prev,
                                             phrases_path))
                            phrases = (phrases_path, phrases_checksum)
                        if '.json' in file:
                            metadata_path = os.path.join(
                                'saraga1.5_carnatic', concert, song, file)
                            metadata_checksum = md5(
                                os.path.join(dataset_data_path_prev,
                                             metadata_path))
                            metadata = (metadata_path, metadata_checksum)

                    # build the track entry once, after all files are scanned
                    saraga_index['tracks'][index] = {
                        'audio-mix': audio,
                        'audio-ghatam': audio_ghatam,
                        'audio-mridangam-left': audio_mridangam_left,
                        'audio-mridangam-right': audio_mridangam_right,
                        'audio-violin': audio_violin,
                        'audio-vocal-s': audio_vocal_s,
                        'audio-vocal': audio_vocal,
                        'ctonic': ctonic,
                        'pitch': pitch,
                        'pitch-vocal': pitch_v,
                        'tempo': tempo,
                        'sama': sama,
                        'sections': sections,
                        'phrases': phrases,
                        'metadata': metadata,
                    }

                    idx = idx + 1

    with open(SARAGA_CARNATIC_INDEX_PATH, 'w') as fhandle:
        json.dump(saraga_index, fhandle, indent=2)