def rekordbox_file_to_analysis_file(dataset_path):
    """
    Read the information from rekordbox_rhythm.xml present in dataset_path and convert it into
    analysis_rhythm_rekordbox.json, stored in the same folder and compatible with our evaluation
    framework.
    """
    rekordbox_file = xml.etree.ElementTree.parse(os.path.join(dataset_path, 'rekordbox_rhythm.xml')).getroot()
    metadata_file = load_from_json(os.path.join(dataset_path, 'metadata.json'))
    out_file_path = os.path.join(dataset_path, 'analysis_rhythm_rekordbox.json')

    analysis = dict()
    with click.progressbar(metadata_file.keys(), label="Converting...") as metadata_keys:
        for key in metadata_keys:
            entry = find_corresponding_rekordbox_entry(metadata_file[key], rekordbox_file)
            if entry is not False:
                tempo_entry = entry.find('TEMPO')
                if tempo_entry is not None:
                    bpm_raw = float(tempo_entry.attrib['Bpm'])
                else:
                    bpm_raw = 0.0
                analysis[key] = {"RekBox": {
                        "bpm": bpm_raw,
                    }
                }
    save_to_json(out_file_path, analysis, verbose=True)
def rekordbox_file_to_analysis_file(dataset_path):
    """
    Read the information from rekordbox_rhythm.xml present in dataset_path and convert it into
    analysis_rhythm_rekordbox.json, stored in the same folder and compatible with our evaluation
    framework.
    """
    rekordbox_file = xml.etree.ElementTree.parse(
        os.path.join(dataset_path, 'rekordbox_rhythm.xml')).getroot()
    metadata_file = load_from_json(os.path.join(dataset_path, 'metadata.json'))
    out_file_path = os.path.join(dataset_path,
                                 'analysis_rhythm_rekordbox.json')

    analysis = dict()
    with click.progressbar(metadata_file.keys(),
                           label="Converting...") as metadata_keys:
        for key in metadata_keys:
            entry = find_corresponding_rekordbox_entry(metadata_file[key],
                                                       rekordbox_file)
            if entry is not False:
                tempo_entry = entry.find('TEMPO')
                if tempo_entry is not None:
                    bpm_raw = float(tempo_entry.attrib['Bpm'])
                else:
                    bpm_raw = 0.0
                analysis[key] = {
                    "RekBox": {
                        "bpm": bpm_raw,
                    }
                }
    save_to_json(out_file_path, analysis, verbose=True)
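The helper find_corresponding_rekordbox_entry is not included in these examples. Below is a minimal sketch of what it might look like, assuming the rekordbox collection stores one TRACK element per file and that the metadata entry carries a file path whose stem can be matched against the TRACK's Name or Location attribute; both the matching strategy and the metadata key are assumptions, not the project's actual code.

import os


def find_corresponding_rekordbox_entry(sound_metadata, rekordbox_root):
    # Hypothetical matcher: compare the sound's file stem against every TRACK
    # element in the rekordbox XML collection. The 'wav_sound_path' key and the
    # Location/Name attributes used here are assumptions.
    stem = os.path.splitext(os.path.basename(sound_metadata.get('wav_sound_path', '')))[0]
    for track in rekordbox_root.iter('TRACK'):
        location = track.attrib.get('Location', '')
        name = os.path.splitext(track.attrib.get('Name', ''))[0]
        if stem and (stem in location or stem == name):
            return track
    return False  # call sites check 'entry is not False'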
def create_mixcraft_loops_dataset(dataset_path):
    """
    Analyze the audio and metadata folders from the mixcraft_loops dataset path and create a metadata.json file with all the gathered
    information. If downsampled wav files are not present, this command will also create them.
    The dataset_path provided needs to contain an original_metadata.csv file with metadata from the Mixcraft library
    as well as the original audio as present in the 'loops' folder of the application.
    """
    converted_audio_path = os.path.join(dataset_path, 'audio', 'wav')
    create_directories([dataset_path, converted_audio_path])
    csv_data = [row for row in csv.reader(open(os.path.join(dataset_path, 'original_metadata.csv')), delimiter=';')]
    csv_data = csv_data[1:]  # Remove header

    metadata = dict()
    with click.progressbar(csv_data,
                           label="Gathering metadata and convertig to wav (if needed)...") as original_metadata:
        for csv_row in original_metadata:

            sound_metadata = parse_audio_file_metatada(csv_row)
            if sound_metadata:
                sound_metadata['original_sound_path'] = os.path.join(dataset_path, 'audio', 'original',
                                                                     sound_metadata['style'],
                                                                     sound_metadata['name'] + '.ogg')
                wav_sound_path = "audio/wav/%i.wav" % sound_metadata['id']
                sound_metadata['wav_sound_path'] = wav_sound_path
                if not os.path.exists(os.path.join(dataset_path, sound_metadata['wav_sound_path'])):
                    out_filename = os.path.join(converted_audio_path, "%i.wav" % sound_metadata['id'])
                    convert_to_wav(sound_metadata['original_sound_path'], out_filename,
                                   samplerate=44100, nbits=16, nchannels=1)
                metadata[sound_metadata['id']] = sound_metadata

    save_to_json(os.path.join(dataset_path, 'metadata.json'), metadata)
    click.echo('Created dataset with %i sounds' % len(metadata))
    click.echo('Saved in %s' % dataset_path)
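convert_to_wav is called with a source path, a destination path and the target sample rate, bit depth and channel count. A plausible sketch of such a helper, assuming ffmpeg is available on the PATH; the actual project may use a different conversion backend.

import subprocess


def convert_to_wav(in_path, out_path, samplerate=44100, nbits=16, nchannels=1):
    # Hypothetical implementation: shell out to ffmpeg to resample and downmix.
    # Only 16-bit output is handled here (pcm_s16le); other bit depths would
    # need a different codec name.
    if nbits != 16:
        raise NotImplementedError('This sketch only handles 16-bit output')
    subprocess.check_call([
        'ffmpeg', '-y', '-loglevel', 'error', '-i', in_path,
        '-ar', str(samplerate), '-ac', str(nchannels),
        '-acodec', 'pcm_s16le', out_path,
    ])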
def create_giantsteps_dataset(dataset_path):
    """
    Analyze the audio and key annotations from the giantsteps dataset path and create a metadata.json file with all
    the gathered information. If downsampled wav files are not present, this command will also create them.
    Before running this script, get the annotations and data from the dataset repository:
        1) clone https://github.com/GiantSteps/giantsteps-key-dataset
        2) cd into the cloned directory and run ./audio_dl.sh
    """

    original_audio_path = os.path.join(dataset_path, 'audio', 'original')
    converted_audio_path = os.path.join(dataset_path, 'audio', 'wav')
    metadata_files_path = os.path.join(dataset_path, 'annotations', 'key')

    if not os.path.exists(original_audio_path):
        click.echo("Moving downloaded audio files to new original path")
        shutil.move(os.path.join(dataset_path, 'audio'),
                    os.path.join(dataset_path, 'original'))
        shutil.move(os.path.join(dataset_path, 'original'),
                    original_audio_path)

    if not os.path.exists(converted_audio_path):
        click.echo(
            "Audio files in the dataset have not been converted to wav, we'll convert them now"
        )
        create_directories([converted_audio_path])
        convert_audio_files_to_wav(original_audio_path, converted_audio_path,
                                   44100, 16, 1, '')
    else:
        click.echo(
            "Converted audio files already exist, no need to convert them")

    metadata = dict()
    with click.progressbar(
        [fname for fname in os.listdir(original_audio_path)],
            label="Gathering metadata...") as dir_filenames:
        for filename in dir_filenames:
            if filename.endswith('mp3'):
                sound_metadata = parse_audio_file_metatada(
                    os.path.join(metadata_files_path,
                                 "%s" % filename).replace('.mp3', '.key'))
                if sound_metadata:
                    sound_metadata[
                        'original_sound_path'] = 'audio/original/%s' % filename
                    filename, extension = os.path.splitext(filename)
                    wav_sound_path = "audio/wav/%s.wav" % filename
                    sound_metadata['wav_sound_path'] = wav_sound_path
                    if os.path.exists(
                            os.path.join(dataset_path,
                                         sound_metadata['wav_sound_path'])):
                        metadata[sound_metadata['id']] = sound_metadata

    save_to_json(os.path.join(dataset_path, 'metadata.json'), metadata)
    click.echo('Created dataset with %i sounds' % len(metadata))
    click.echo('Saved in %s' % dataset_path)
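In this example parse_audio_file_metatada receives the path of a .key annotation file from the giantsteps-key-dataset. Below is a hedged sketch of what reading such an annotation could involve, assuming each .key file contains a single key label (e.g. 'd minor') and that the sound id is derived from the file name; the project's real parser may extract additional fields.

import os


def parse_key_annotation(key_file_path):
    # Hypothetical reader for a giantsteps-key-dataset annotation file.
    # The 'id' and 'annotations' keys mirror how the surrounding examples
    # index metadata; they are assumptions, not the project's actual schema.
    if not os.path.exists(key_file_path):
        return None
    with open(key_file_path) as key_file:
        key_label = key_file.read().strip()
    if not key_label:
        return None
    return {
        'id': os.path.splitext(os.path.basename(key_file_path))[0],
        'annotations': {'key': key_label},
    }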
Example #5
def create_apple_loops_dataset(dataset_path):
    """
    Analyze the audio and metadata folders from the apple_loops dataset path and create a metadata.json file with all
    the gathered information. If downsampled wav files are not present, this command will also create them.
    The dataset_path that is given needs to already contain a directory structure with metadata files extracted
    from .caf files (we used https://github.com/jhorology/apple-loops-meta-reader and added a file_path property to
    each extracted file).
    """
    metadata_files_path = os.path.join(dataset_path, 'metadata')
    converted_audio_path = os.path.join(dataset_path, 'audio', 'wav')
    create_directories([dataset_path, converted_audio_path])

    metadata = dict()
    with click.progressbar(
        [fname for fname in os.listdir(metadata_files_path)],
            label="Gathering metadata and convertig to wav (if needed)..."
    ) as dir_filenames:
        for filename in dir_filenames:
            if filename.startswith('.'):
                continue
            sound_metadata = parse_audio_file_metatada(
                os.path.join(metadata_files_path, filename))
            if sound_metadata:
                sound_metadata['original_sound_path'] = sound_metadata[
                    'file_path']
                filesize = os.path.getsize(
                    sound_metadata['original_sound_path'])
                if filesize < 1024 * 5:
                    # Original file is smaller than 5 KB; ignore it, as it probably contains only MIDI data
                    continue
                filename, extension = os.path.splitext(filename)
                wav_sound_path = "audio/wav/%s.wav" % filename
                sound_metadata['wav_sound_path'] = wav_sound_path
                if not os.path.exists(
                        os.path.join(dataset_path,
                                     sound_metadata['wav_sound_path'])):
                    out_filename = os.path.join(converted_audio_path,
                                                "%s.wav" % filename)
                    convert_to_wav(sound_metadata['original_sound_path'],
                                   out_filename,
                                   samplerate=44100,
                                   nbits=16,
                                   nchannels=1)
                metadata[sound_metadata['id']] = sound_metadata

    save_to_json(os.path.join(dataset_path, 'metadata.json'), metadata)
    click.echo('Created dataset with %i sounds' % len(metadata))
    click.echo('Saved in %s' % dataset_path)
def create_mixcraft_loops_dataset(dataset_path):
    """
    Analyze the audio and metadata folders from the mixcraft_loops dataset path and create a metadata.json file with all the gathered
    information. If downsampled wav files are not present, this command will also create them.
    The dataset_path provided needs to contain an original_metadata.csv file with metadata from the Mixcraft library
    as well as the original audio as present in the 'loops' folder of the application.
    """
    converted_audio_path = os.path.join(dataset_path, 'audio', 'wav')
    create_directories([dataset_path, converted_audio_path])
    csv_data = [
        row for row in csv.reader(open(
            os.path.join(dataset_path, 'original_metadata.csv')),
                                  delimiter=';')
    ]
    csv_data = csv_data[1:]  # Remove header

    metadata = dict()
    with click.progressbar(
            csv_data,
            label="Gathering metadata and convertig to wav (if needed)..."
    ) as original_metadata:
        for csv_row in original_metadata:

            sound_metadata = parse_audio_file_metatada(csv_row)
            if sound_metadata:
                sound_metadata['original_sound_path'] = os.path.join(
                    dataset_path, 'audio', 'original', sound_metadata['style'],
                    sound_metadata['name'] + '.ogg')
                wav_sound_path = "audio/wav/%i.wav" % sound_metadata['id']
                sound_metadata['wav_sound_path'] = wav_sound_path
                if not os.path.exists(
                        os.path.join(dataset_path,
                                     sound_metadata['wav_sound_path'])):
                    out_filename = os.path.join(
                        converted_audio_path, "%i.wav" % sound_metadata['id'])
                    convert_to_wav(sound_metadata['original_sound_path'],
                                   out_filename,
                                   samplerate=44100,
                                   nbits=16,
                                   nchannels=1)
                metadata[sound_metadata['id']] = sound_metadata

    save_to_json(os.path.join(dataset_path, 'metadata.json'), metadata)
    click.echo('Created dataset with %i sounds' % len(metadata))
    click.echo('Saved in %s' % dataset_path)
def create_looperman_dataset(dataset_path):
    """
    Analyze the audio and metadata folders from the looperman dataset path and create a metadata.json file with all the gathered
    information. If downsampled wav files are not present, this command will also create them.
    The dataset_path that is given needs to already contain a directory structure with original files and metadata
    files from looperman.
    """
    metadata_files_path = os.path.join(dataset_path, 'metadata')
    original_audio_path = os.path.join(dataset_path, 'audio', 'original')
    converted_audio_path = os.path.join(dataset_path, 'audio', 'wav')

    if not os.path.exists(converted_audio_path):
        click.echo(
            "Audio files in the dataset have not been converted to wav, we'll convert them now"
        )
        create_directories([dataset_path, converted_audio_path])
        convert_audio_files_to_wav(original_audio_path, converted_audio_path,
                                   44100, 16, 1, '')
    else:
        click.echo(
            "Converted audio files already exist, no need to convert them")

    metadata = dict()
    with click.progressbar(
        [fname for fname in os.listdir(original_audio_path)],
            label="Gathering metadata...") as dir_filenames:
        for filename in dir_filenames:
            sound_metadata = parse_audio_file_metatada(
                os.path.join(metadata_files_path, "%s.txt" % filename))
            if sound_metadata:
                sound_metadata[
                    'original_sound_path'] = 'audio/original/%s' % filename
                filename, extension = os.path.splitext(filename)
                wav_sound_path = "audio/wav/%s.wav" % filename
                sound_metadata['wav_sound_path'] = wav_sound_path
                if os.path.exists(
                        os.path.join(dataset_path,
                                     sound_metadata['wav_sound_path'])):
                    metadata[sound_metadata['id']] = sound_metadata

    save_to_json(os.path.join(dataset_path, 'metadata.json'), metadata)
    click.echo('Created dataset with %i sounds' % len(metadata))
    click.echo('Saved in %s' % dataset_path)
Example #8
def create_philarmonia_dataset(dataset_path):
    """
    
    """

    original_audio_path = os.path.join(dataset_path, 'audio', 'original')
    converted_audio_path = os.path.join(dataset_path, 'audio', 'wav')

    if not os.path.exists(converted_audio_path):
        click.echo(
            "Audio files in the dataset have not been converted to wav, we'll convert them now"
        )
        create_directories([converted_audio_path])
        convert_audio_files_to_wav(original_audio_path, converted_audio_path,
                                   44100, 16, 1, '')
    else:
        click.echo(
            "Converted audio files already exist, no need to convert them")

    metadata = dict()
    with click.progressbar(
        [fname for fname in os.listdir(original_audio_path)],
            label="Gathering metadata...") as dir_filenames:
        for filename in dir_filenames:
            if filename.endswith('mp3'):
                sound_metadata = parse_audio_file_metatada(filename)
                if sound_metadata:
                    sound_metadata[
                        'original_sound_path'] = 'audio/original/%s' % filename
                    filename, extension = os.path.splitext(filename)
                    wav_sound_path = "audio/wav/%s.wav" % filename
                    sound_metadata['wav_sound_path'] = wav_sound_path
                    if os.path.exists(
                            os.path.join(dataset_path,
                                         sound_metadata['wav_sound_path'])):
                        metadata[sound_metadata['id']] = sound_metadata

    save_to_json(os.path.join(dataset_path, 'metadata.json'), metadata)
    click.echo('Created dataset with %i sounds' % len(metadata))
    click.echo('Saved in %s' % dataset_path)
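convert_audio_files_to_wav(original, converted, 44100, 16, 1, '') is used in several of these examples. Here is a minimal sketch under the assumption that it walks the source folder and delegates each file to a convert_to_wav helper such as the one sketched earlier; treating the last argument as a filename filter is a guess based on the empty-string value at the call sites.

import os


def convert_audio_files_to_wav(in_dir, out_dir, samplerate, nbits, nchannels, match=''):
    # Hypothetical wrapper: convert every matching audio file in in_dir to a
    # wav of the same base name in out_dir. The 'match' substring filter is an
    # assumption, not part of the original examples.
    for filename in sorted(os.listdir(in_dir)):
        if filename.startswith('.') or match not in filename:
            continue
        base_name, _ = os.path.splitext(filename)
        convert_to_wav(os.path.join(in_dir, filename),
                       os.path.join(out_dir, base_name + '.wav'),
                       samplerate=samplerate, nbits=nbits, nchannels=nchannels)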
def create_dataset(dataset_path):
    """

    """

    converted_audio_path = os.path.join(dataset_path, 'audio', 'wav')
    sounds_metadata = csv.reader(open(os.path.join(dataset_path, 'goodsounds_sounds_metadata.csv')))
    packs_metadata = [item for item in csv.reader(open(os.path.join(dataset_path, 'goodsounds_packs_metadata.csv')))]
    create_directories([converted_audio_path])
    metadata = dict()
    with click.progressbar([row for row in sounds_metadata],
                           label="Gathering metadata...") as csv_rows:
        for csv_row in csv_rows:
            filename = csv_row[13]
            pack_row = get_pack_by_id(csv_row[14], packs_metadata)
            if pack_row is None:
                continue
            pack_name = pack_row[1]
            file_path = os.path.join(dataset_path, 'sound_files', pack_name, 'neumann', filename)
            new_filename = '%s-%s' % (pack_name, filename)
            dest_path = os.path.join(converted_audio_path, new_filename)

            if not os.path.exists(dest_path):
                try:
                    shutil.move(file_path, dest_path)
                except IOError:
                    continue

            sound_metadata = parse_audio_file_metatada(csv_row)
            if sound_metadata:
                wav_sound_path = "audio/wav/%s" % new_filename
                sound_metadata['wav_sound_path'] = wav_sound_path
                if os.path.exists(os.path.join(dataset_path, sound_metadata['wav_sound_path'])):
                    metadata[sound_metadata['id']] = sound_metadata

    save_to_json(os.path.join(dataset_path, 'metadata.json'), metadata)
    click.echo('Created dataset with %i sounds' % len(metadata))
    click.echo('Saved in %s' % dataset_path)
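get_pack_by_id is only used in the function above. Given that csv_row[14] holds the pack id and pack_row[1] the pack name, a straightforward sketch follows; which column of the packs CSV holds the id is an assumption, since the call site does not show it.

def get_pack_by_id(pack_id, packs_metadata):
    # Hypothetical lookup: return the pack CSV row whose id column matches the
    # requested pack id, or None when no pack matches. Placing the id in the
    # first column is an assumption.
    for pack_row in packs_metadata:
        if pack_row and pack_row[0] == pack_id:
            return pack_row
    return None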
def create_apple_loops_dataset(dataset_path):
    """
    Analyze the audio and metadata folders from the apple_loops dataset path and create a metadata.json file with all
    the gathered information. If downsampled wav files are not present, this command will also create them.
    The dataset_path that is given needs to already contain a directory structure with metadata files extracted
    from .caf files (we used https://github.com/jhorology/apple-loops-meta-reader and added a file_path property to
    each extracted file).
    """
    metadata_files_path = os.path.join(dataset_path, 'metadata')
    converted_audio_path = os.path.join(dataset_path, 'audio', 'wav')
    create_directories([dataset_path, converted_audio_path])

    metadata = dict()
    with click.progressbar([fname for fname in os.listdir(metadata_files_path)],
                           label="Gathering metadata and convertig to wav (if needed)...") as dir_filenames:
        for filename in dir_filenames:
            if filename.startswith('.'):
                continue
            sound_metadata = parse_audio_file_metatada(os.path.join(metadata_files_path, filename))
            if sound_metadata:
                sound_metadata['original_sound_path'] = sound_metadata['file_path']
                filesize = os.path.getsize(sound_metadata['original_sound_path'])
                if filesize < 1024 * 5:
                    # Original file is smaller than 5 KB; ignore it, as it probably contains only MIDI data
                    continue
                filename, extension = os.path.splitext(filename)
                wav_sound_path = "audio/wav/%s.wav" % filename
                sound_metadata['wav_sound_path'] = wav_sound_path
                if not os.path.exists(os.path.join(dataset_path, sound_metadata['wav_sound_path'])):
                    out_filename = os.path.join(converted_audio_path, "%s.wav" % filename)
                    convert_to_wav(sound_metadata['original_sound_path'], out_filename,
                                   samplerate=44100, nbits=16, nchannels=1)
                metadata[sound_metadata['id']] = sound_metadata

    save_to_json(os.path.join(dataset_path, 'metadata.json'), metadata)
    click.echo('Created dataset with %i sounds' % len(metadata))
    click.echo('Saved in %s' % dataset_path)
Example #11
def analyze_dataset(dataset_path, algorithms, max_sounds, save_every, incremental, use_celery):
    """
    Analyze the audio files in a dataset with the specified algorithms.
    """

    available_algorithms = list()
    for algorithms_set in [rhythm_algorithms, general_algorithms]:
        for item in dir(algorithms_set):
            if item.startswith('algorithm_'):
                available_algorithms.append([
                    item.replace('algorithm_', ''),
                    algorithms_set.__getattribute__(item)
                ])

    analysis_algorithms_to_run = list()
    algorithm_names = algorithms.split(',')
    for algorithm_name, algorithm_function in available_algorithms:
        if algorithm_name in algorithm_names:
            out_file_path = os.path.join(dataset_path, "analysis_%s.json" % algorithm_name)
            if not incremental:
                if promt_user_to_abort_if_file_exists(out_file_path, throw_exception=False):
                    analysis_algorithms_to_run.append([algorithm_name, algorithm_function])
            else:
                analysis_algorithms_to_run.append([algorithm_name, algorithm_function])
    if not analysis_algorithms_to_run:
        click.echo('No analysis algorithms to run.')

    metadata = load_from_json(os.path.join(dataset_path, "metadata.json"))
    algorithms_run = 0
    for analysis_algorithm_name, analysis_algorithm in analysis_algorithms_to_run:
        start_timestamp = time.time()
        algorithms_run += 1
        click.echo("Analyzing '%s' dataset [%i/%i] - %s" %
                   (dataset_path, algorithms_run, len(analysis_algorithms_to_run), analysis_algorithm_name))
        if max_sounds is None:
            max_sounds = len(metadata)
        else:
            max_sounds = int(max_sounds)

        out_file_path = os.path.join(dataset_path, "analysis_%s.json" % analysis_algorithm_name)
        analysis_results = dict()
        if incremental and os.path.exists(out_file_path):
            # Load existing analysis file (if it exists)
            analysis_results = load_from_json(out_file_path)
            click.echo("Continuing existing analysis (%i sounds already analysed)" % len(analysis_results))

        label = "Analyzing sounds..."
        asynchronous_job_objects = None
        if use_celery:
            label = "Sending sounds to analyze..."
            asynchronous_job_objects = list()
        with click.progressbar(metadata.values()[:max_sounds], label=label) as sounds_metadata:
            for count, sound in enumerate(sounds_metadata):
                if incremental:
                    if unicode(sound['id']) in analysis_results:
                        # If analysis for that sound already exists, continue iteration with the next sound
                        continue

                # If there is no existing analysis, run it
                sound['file_path'] = os.path.join(dataset_path, sound['wav_sound_path'])
                if not use_celery:
                    try:
                        analysis = analysis_algorithm(sound)
                        if analysis:
                            analysis_results[sound['id']] = analysis
                        else:
                            continue
                    except RuntimeError:
                        continue
                    if count % save_every == 0:
                        # Save progress every save_every analysed sounds
                        save_to_json(out_file_path, analysis_results)
                else:
                    sound['file_path'] = os.path.join(dataset_path, sound['wav_sound_path'])
                    task_object = run_analysis_algorithm.delay(analysis_algorithm, sound)
                    asynchronous_job_objects.append((sound['id'], task_object))

        if use_celery and len(asynchronous_job_objects):
            # Enter a loop to check status of jobs and add jobs to metadata as they get done
            finished_asynchronous_job_objects = dict()
            n_total_jobs = len(asynchronous_job_objects)
            n_jobs_done_in_last_save = 0
            while len(asynchronous_job_objects) != 0:
                time.sleep(1)
                i = 0
                jobs_to_delete_from_list = list()  # store position of job_objects in asynchronous_job_objects
                # that will be deleted after each while iteration
                for sound_id, job_object in asynchronous_job_objects:
                    max_objects_to_check = 100
                    if job_object.ready():
                        # Job is finished
                        finished_asynchronous_job_objects[sound_id] = job_object
                        jobs_to_delete_from_list.append(i)
                        try:
                            analysis = job_object.get()
                            if analysis:
                                analysis_results[sound_id] = analysis
                            else:
                                continue
                        except RuntimeError:
                            pass
                    i += 1
                    if i >= max_objects_to_check:
                        # We only check the first 'max_objects_to_check' in each iteration.
                        # Assuming that jobs are being processed approximately in the order they were sent,
                        # we should be fine.
                        break
                for index in sorted(jobs_to_delete_from_list, reverse=True):
                    del asynchronous_job_objects[index]
                print_progress('Analyzing sounds...', len(finished_asynchronous_job_objects), n_total_jobs,
                               start_time=start_timestamp, show_progress_bar=True)
                # Estimate time remaining
                if len(finished_asynchronous_job_objects) - n_jobs_done_in_last_save >= save_every:
                    # Save progress every save_every analysed sounds
                    n_jobs_done_in_last_save += len(finished_asynchronous_job_objects)
                    save_to_json(out_file_path, analysis_results)
            print ''

        # Report number of correctly analyzed sounds
        end_timestamp = time.time()
        click.echo('Analyzed dataset with %i sounds (%i sounds correctly analysed), done in %s'
                   % (len(metadata), len(analysis_results),
                      seconds_to_day_hour_minute_second(end_timestamp - start_timestamp)))
        save_to_json(out_file_path, analysis_results, verbose=True)
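The analysis algorithms themselves are discovered by naming convention: any module-level function whose name starts with algorithm_ in rhythm_algorithms or general_algorithms is picked up, receives one sound's metadata dict (including the resolved file_path) and returns a dict that ends up keyed by the sound id in analysis_<name>.json. A hedged sketch of what such a function could look like; the duration measurement is a stand-in, not one of the project's actual algorithms.

import wave


def algorithm_dummy_duration(sound):
    # Hypothetical analysis function following the 'algorithm_' naming
    # convention used by analyze_dataset. It opens the wav pointed to by
    # sound['file_path'] and returns a dict of computed descriptors.
    wav_file = wave.open(sound['file_path'], 'rb')
    try:
        duration = wav_file.getnframes() / float(wav_file.getframerate())
    finally:
        wav_file.close()
    return {'duration': duration}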
def create_freesound_loops_dataset(dataset_path, max_sounds, group_by_pack):
    """
    Download the sounds listed in FSL4_IDS from Freesound (originals when an OAuth token is available,
    previews otherwise), estimate their BPM from the textual metadata and create a metadata.json file
    with all the gathered information. Wav versions of the downloaded audio are created if missing.
    """

    print title('Creating freesound loops dataset')
    out_file_path = os.path.join(dataset_path, "metadata.json")
    promt_user_to_abort_if_file_exists(out_file_path)
    original_sounds_path = os.path.join(dataset_path, 'audio', 'original')
    create_directories([dataset_path, original_sounds_path])
    metadata = dict()
    fs_client = FreesoundClient()
    try:
        fs_client.set_token(FREESOUND_ACCESS_TOKEN, auth_type="oauth")
        auth_type = 'oauth'
    except:
        fs_client.set_token(FREESOUND_API_KEY)
        auth_type = 'token'

    ids_to_retrieve = FSL4_IDS[:max_sounds]
    page_size = 100
    for i in range(0, len(ids_to_retrieve), page_size):
        current_ids = ids_to_retrieve[i:i + page_size]
        ids_filter = ' OR '.join(['id:%i' % sid for sid in current_ids])
        pager = fs_client.text_search(fields="id,name,tags,description,previews,username,pack,type,license",
                                      filter=ids_filter,
                                      page_size=page_size)

        for j, sound in enumerate(pager):
            print_progress('Downloading and/or processing sound %i' %
                           sound.id, len(metadata) + 1, len(ids_to_retrieve))
            estimated_bpm = estimate_bpm_from_metadata(sound.name, sound.description, sound.tags)
            if estimated_bpm:
                metadata[sound.id] = {
                    'id': sound.id,
                    'name': sound.name,
                    'tags': sound.tags,
                    'annotations': {'bpm': estimated_bpm},
                    'description': sound.description,
                    'preview_url': sound.previews.preview_hq_mp3,
                    'username': sound.username,
                    'pack': sound.pack,
                    'type': sound.type,
                    'license': sound.license,
                }
                original_sound_path = os.path.join(dataset_path, "audio/original/%i.%s" % (sound.id, sound.type))
                if not os.path.exists(original_sound_path) and auth_type == 'oauth':
                    # Retrieve original sound
                    try:
                        uri = URIS.uri(URIS.DOWNLOAD, sound.id)
                        FSRequest.retrieve(uri, fs_client, original_sound_path)
                    except Exception as e:
                        # If original sound could not be retrieved, try with preview
                        try:
                            preview_path = os.path.join(dataset_path, "audio/original/%i.mp3" % sound.id)
                            urllib.urlretrieve(sound.previews.preview_hq_mp3, preview_path)
                            original_sound_path = preview_path
                        except urllib.ContentTooShortError:
                            # Skip this sound (no preview or original could be downloaded)
                            del metadata[sound.id]
                            continue

                metadata[sound.id]['original_sound_path'] = original_sound_path.replace(dataset_path, '')
                metadata[sound.id]['wav_sound_path'] = os.path.join(dataset_path, 'audio', 'wav', '%i.wav' % sound.id)
                save_to_json(out_file_path, metadata)

    save_to_json(out_file_path, metadata)
    print ''

    # Create wav versions of the sounds if these do not already exist
    wav_sounds_path = os.path.join(dataset_path, 'audio', 'wav')
    if not os.path.exists(wav_sounds_path):
        print 'Creating wav versions of downloaded files...'
        create_directories([wav_sounds_path])
        convert_audio_files_to_wav(original_sounds_path, wav_sounds_path, 44100, 16, 1, '')

    print 'Created dataset with %i sounds' % len(metadata)
    print 'Saved in %s' % dataset_path
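estimate_bpm_from_metadata mines the sound's textual metadata for a tempo annotation. Below is a hedged sketch of one way to do it, assuming a simple regular expression over the name, description and tags plus a plausibility range for loop tempos; the project's real heuristic is likely more elaborate.

import re


def estimate_bpm_from_metadata(name, description, tags):
    # Hypothetical heuristic: look for patterns like '120bpm' or 'bpm 120' in
    # the name, description and tags, and keep the first match that falls in a
    # plausible tempo range. Returns None when no usable annotation is found.
    text = ' '.join([name, description] + list(tags)).lower()
    for groups in re.findall(r'(\d{2,3})\s*bpm|bpm\s*(\d{2,3})', text):
        value = int(groups[0] or groups[1])
        if 30 <= value <= 300:
            return value
    return None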
Example #13
def analyze_dataset(dataset_path, algorithms, max_sounds, save_every,
                    incremental, use_celery):
    """
    Analyze the audio files in a dataset with the specified algorithms.
    """

    available_algorithms = list()
    for algorithms_set in [rhythm_algorithms, general_algorithms]:
        for item in dir(algorithms_set):
            if item.startswith('algorithm_'):
                available_algorithms.append([
                    item.replace('algorithm_', ''),
                    algorithms_set.__getattribute__(item)
                ])

    analysis_algorithms_to_run = list()
    algorithm_names = algorithms.split(',')
    for algorithm_name, algorithm_function in available_algorithms:
        if algorithm_name in algorithm_names:
            out_file_path = os.path.join(dataset_path,
                                         "analysis_%s.json" % algorithm_name)
            if not incremental:
                if promt_user_to_abort_if_file_exists(out_file_path,
                                                      throw_exception=False):
                    analysis_algorithms_to_run.append(
                        [algorithm_name, algorithm_function])
            else:
                analysis_algorithms_to_run.append(
                    [algorithm_name, algorithm_function])
    if not analysis_algorithms_to_run:
        click.echo('No analysis algorithms to run.')

    metadata = load_from_json(os.path.join(dataset_path, "metadata.json"))
    algorithms_run = 0
    for analysis_algorithm_name, analysis_algorithm in analysis_algorithms_to_run:
        start_timestamp = time.time()
        algorithms_run += 1
        click.echo("Analyzing '%s' dataset [%i/%i] - %s" %
                   (dataset_path, algorithms_run,
                    len(analysis_algorithms_to_run), analysis_algorithm_name))
        if max_sounds is None:
            max_sounds = len(metadata)
        else:
            max_sounds = int(max_sounds)

        out_file_path = os.path.join(
            dataset_path, "analysis_%s.json" % analysis_algorithm_name)
        analysis_results = dict()
        if incremental and os.path.exists(out_file_path):
            # Load existing analysis file (if it exists)
            analysis_results = load_from_json(out_file_path)
            click.echo(
                "Continuing existing analysis (%i sounds already analysed)" %
                len(analysis_results))

        label = "Analyzing sounds..."
        asynchronous_job_objects = None
        if use_celery:
            label = "Sending sounds to analyze..."
            asynchronous_job_objects = list()
        with click.progressbar(metadata.values()[:max_sounds],
                               label=label) as sounds_metadata:
            for count, sound in enumerate(sounds_metadata):
                if incremental:
                    if unicode(sound['id']) in analysis_results:
                        # If analysis for that sound already exists, continue iteration with the next sound
                        continue

                # If there is no existing analysis, run it
                sound['file_path'] = os.path.join(dataset_path,
                                                  sound['wav_sound_path'])
                if not use_celery:
                    try:
                        analysis = analysis_algorithm(sound)
                        if analysis:
                            analysis_results[sound['id']] = analysis
                        else:
                            continue
                    except RuntimeError:
                        continue
                    if count % save_every == 0:
                        # Save progress every save_every analysed sounds
                        save_to_json(out_file_path, analysis_results)
                else:
                    sound['file_path'] = os.path.join(dataset_path,
                                                      sound['wav_sound_path'])
                    task_object = run_analysis_algorithm.delay(
                        analysis_algorithm, sound)
                    asynchronous_job_objects.append((sound['id'], task_object))

        if use_celery and len(asynchronous_job_objects):
            # Enter a loop to check status of jobs and add jobs to metadata as they get done
            finished_asynchronous_job_objects = dict()
            n_total_jobs = len(asynchronous_job_objects)
            n_jobs_done_in_last_save = 0
            while len(asynchronous_job_objects) != 0:
                time.sleep(1)
                i = 0
                jobs_to_delete_from_list = list()  # store position of job_objects in asynchronous_job_objects
                # that will be deleted after each while iteration
                for sound_id, job_object in asynchronous_job_objects:
                    max_objects_to_check = 100
                    if job_object.ready():
                        # Job is finished
                        finished_asynchronous_job_objects[
                            sound_id] = job_object
                        jobs_to_delete_from_list.append(i)
                        try:
                            analysis = job_object.get()
                            if analysis:
                                analysis_results[sound_id] = analysis
                            else:
                                continue
                        except RuntimeError:
                            pass
                    i += 1
                    if i >= max_objects_to_check:
                        # We only check the first 'max_objects_to_check' in each iteration.
                        # Assuming that jobs are being processed approximately in the order they were sent,
                        # we should be fine.
                        break
                for index in sorted(jobs_to_delete_from_list, reverse=True):
                    del asynchronous_job_objects[index]
                print_progress('Analyzing sounds...',
                               len(finished_asynchronous_job_objects),
                               n_total_jobs,
                               start_time=start_timestamp,
                               show_progress_bar=True)
                # Estimate time remaining
                if len(finished_asynchronous_job_objects) - n_jobs_done_in_last_save >= save_every:
                    # Save progress every save_every analysed sounds
                    n_jobs_done_in_last_save += len(
                        finished_asynchronous_job_objects)
                    save_to_json(out_file_path, analysis_results)
            print ''

        # Report number of correctly analyzed sounds
        end_timestamp = time.time()
        click.echo(
            'Analyzed dataset with %i sounds (%i sounds correctly analysed), done in %s'
            % (len(metadata), len(analysis_results),
               seconds_to_day_hour_minute_second(end_timestamp -
                                                 start_timestamp)))
        save_to_json(out_file_path, analysis_results, verbose=True)
Example #14
def analyze_dataset(dataset_path, algorithms, max_sounds, save_every,
                    incremental, use_celery, force, workers):
    """
    Analyze the audio files in a dataset with the specified algorithms.
    """

    if use_celery and not celery_available:
        print('Celery not found, ignoring the --use_celery option and analyzing locally')
        use_celery = False

    available_algorithms = list()
    for algorithms_set in [
            rhythm_algorithms, general_algorithms, tonal_algorithms,
            pitch_algorithms
    ]:
        for item in dir(algorithms_set):
            if item.startswith('algorithm_'):
                available_algorithms.append([
                    item.replace('algorithm_', ''),
                    algorithms_set.__getattribute__(item)
                ])

    analysis_algorithms_to_run = list()
    algorithm_names = algorithms.split(',')
    for algorithm_name, algorithm_function in available_algorithms:
        if algorithm_name in algorithm_names:
            out_file_path = os.path.join(dataset_path,
                                         "analysis_%s.json" % algorithm_name)
            if not incremental and not force:
                if promt_user_to_abort_if_file_exists(out_file_path,
                                                      throw_exception=False):
                    analysis_algorithms_to_run.append(
                        [algorithm_name, algorithm_function])
            else:
                analysis_algorithms_to_run.append(
                    [algorithm_name, algorithm_function])
    if not analysis_algorithms_to_run:
        click.echo('No analysis algorithms to run. Available algorithms are:')
        for name, _ in available_algorithms:
            click.echo('\t%s' % name)
        return

    if use_celery:
        click.echo(
            'We detected you\'re using celery to run the tasks; here are some useful commands:'
        )
        click.echo(
            '\tsudo rabbitmq-server -detached             (start rabbitmq broker)'
        )
        click.echo(
            '\tsudo rabbitmqctl stop                      (stop rabbitmq broker)'
        )
        click.echo(
            '\tcelery -A tasks worker --concurrency=4     (start Celery workers)'
        )
        click.echo(
            '\tcelery -A tasks purge                      (clear Celery queue)'
        )

    metadata = load_from_json(os.path.join(dataset_path, "metadata.json"))
    algorithms_run = 0
    for analysis_algorithm_name, analysis_algorithm in analysis_algorithms_to_run:
        start_timestamp = time.time()
        algorithms_run += 1
        click.echo("Analyzing '%s' dataset [%i/%i] - %s" %
                   (dataset_path, algorithms_run,
                    len(analysis_algorithms_to_run), analysis_algorithm_name))
        if max_sounds is None:
            max_sounds = len(metadata)
        else:
            max_sounds = int(max_sounds)

        out_file_path = os.path.join(
            dataset_path, "analysis_%s.json" % analysis_algorithm_name)
        analysis_results = dict()
        if incremental and os.path.exists(out_file_path):
            # Load existing analysis file (if it exists)
            analysis_results = load_from_json(out_file_path)
            click.echo(
                "Continuing existing analysis (%i sounds already analysed)" %
                len(analysis_results))

        label = "Sending sounds to analyze..."
        asynchronous_job_objects = None
        wp = None
        if use_celery:
            asynchronous_job_objects = list()
        else:
            wp = WorkParallelizer(show_widgets=False)
        with click.progressbar(list(metadata.values())[:max_sounds],
                               label=label) as sounds_metadata:
            for count, sound in enumerate(sounds_metadata):
                if incremental:
                    if str(sound['id']) in analysis_results:
                        # If analysis for that sound already exists, continue iteration with the next sound
                        continue

                # If there is no existing analysis, run it
                try:
                    sound['file_path'] = glob.glob(dataset_path +
                                                   '/audio/wav/' +
                                                   str(sound['id']) +
                                                   '_*.wav')[0]
                    if not use_celery:
                        # Add jobs to work paralellizer
                        wp.add_task(analysis_algorithm,
                                    sound,
                                    task_id=sound['id'])
                    else:
                        task_object = run_analysis_algorithm.delay(
                            analysis_algorithm, sound)
                        asynchronous_job_objects.append(
                            (sound['id'], task_object))
                except Exception:
                    print("Could not find " + str(sound['id']))

        if use_celery and len(asynchronous_job_objects):
            # Enter a loop to check status of jobs and add jobs to metadata as they get done
            finished_asynchronous_job_objects = dict()
            n_total_jobs = len(asynchronous_job_objects)
            n_jobs_done_in_last_save = 0
            while len(asynchronous_job_objects) != 0:
                time.sleep(1)
                i = 0
                jobs_to_delete_from_list = list()  # store position of job_objects in asynchronous_job_objects
                # that will be deleted after each while iteration
                for sound_id, job_object in asynchronous_job_objects:
                    max_objects_to_check = 100
                    if job_object.ready():
                        # Job is finished
                        finished_asynchronous_job_objects[
                            sound_id] = job_object
                        jobs_to_delete_from_list.append(i)
                        try:
                            analysis = job_object.get()
                            if analysis:
                                analysis_results[sound_id] = analysis
                            else:
                                continue
                        except RuntimeError:
                            pass
                    i += 1
                    if i >= max_objects_to_check:
                        # We only check the first 'max_objects_to_check' in each iteration.
                        # Assuming that jobs are being processed approximately in the order they were sent,
                        # we should be fine.
                        break
                for index in sorted(jobs_to_delete_from_list, reverse=True):
                    del asynchronous_job_objects[index]
                print_progress('Analyzing sounds...',
                               len(finished_asynchronous_job_objects),
                               n_total_jobs,
                               start_time=start_timestamp,
                               show_progress_bar=True)
                # Estimate time remaining
                if len(finished_asynchronous_job_objects) - n_jobs_done_in_last_save >= save_every:
                    # Save progress every save_every analysed sounds
                    n_jobs_done_in_last_save += len(
                        finished_asynchronous_job_objects)
                    save_to_json(out_file_path, analysis_results)
            print('')
        else:
            wp.start(num_workers=workers)
            while True:
                time.sleep(1)
                finished = wp.show_progress()

                tasks_succeeded = wp.tasks_succeeded()
                for task in tasks_succeeded:
                    sound_id = task.id
                    if sound_id not in analysis_results:
                        analysis_results[sound_id] = task.result(timeout=0.0)

                if len(tasks_succeeded) % save_every == 0:
                    save_to_json(out_file_path, analysis_results)

                if finished:
                    break

            wp.show_errors()

        # Report number of correctly analyzed sounds
        end_timestamp = time.time()
        click.echo(
            'Analyzed dataset with %i sounds (%i sounds correctly analysed), done in %s'
            % (len(metadata), len(analysis_results),
               seconds_to_day_hour_minute_second(end_timestamp -
                                                 start_timestamp)))
        save_to_json(out_file_path, analysis_results, verbose=True)
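These functions read like click commands: they rely on click.echo and click.progressbar, and their parameters map one-to-one onto command-line arguments and options. Below is a hedged sketch of how the last analyze_dataset variant might be wired into a CLI; the group name, command name, option names and defaults are illustrative assumptions, not the project's actual command definitions.

import click


@click.group()
def cli():
    """Hypothetical entry point grouping the dataset commands."""


@cli.command(name='analyze_dataset')
@click.argument('dataset_path', type=click.Path(exists=True))
@click.option('--algorithms', default='', help='Comma-separated names of algorithms to run.')
@click.option('--max_sounds', default=None, help='Maximum number of sounds to analyze.')
@click.option('--save_every', default=100, type=int, help='Save partial results every N sounds.')
@click.option('--incremental', is_flag=True, help='Continue an existing analysis file.')
@click.option('--use_celery', is_flag=True, help='Dispatch analysis jobs through Celery.')
@click.option('--force', is_flag=True, help='Overwrite existing analysis files without asking.')
@click.option('--workers', default=4, type=int, help='Number of local worker processes.')
def analyze_dataset_command(dataset_path, algorithms, max_sounds, save_every,
                            incremental, use_celery, force, workers):
    # Delegate to the analyze_dataset function shown above.
    analyze_dataset(dataset_path, algorithms, max_sounds, save_every,
                    incremental, use_celery, force, workers)


if __name__ == '__main__':
    cli()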