def rekordbox_file_to_analysis_file(dataset_path):
    """Convert rekordbox_rhythm.xml in dataset_path into analysis_rhythm_rekordbox.json.

    Reads the rekordbox XML annotations, matches each sound from metadata.json
    against its rekordbox entry and stores the annotated BPM (0.0 when the entry
    has no TEMPO element) in a JSON file compatible with our evaluation
    framework, saved in the same folder.
    """
    xml_root = xml.etree.ElementTree.parse(
        os.path.join(dataset_path, 'rekordbox_rhythm.xml')).getroot()
    dataset_metadata = load_from_json(os.path.join(dataset_path, 'metadata.json'))
    output_path = os.path.join(dataset_path, 'analysis_rhythm_rekordbox.json')
    analysis = dict()
    with click.progressbar(dataset_metadata.keys(), label="Converting...") as sound_keys:
        for sound_key in sound_keys:
            rb_entry = find_corresponding_rekordbox_entry(
                dataset_metadata[sound_key], xml_root)
            if rb_entry is False:
                # No matching rekordbox entry for this sound; leave it out.
                continue
            tempo = rb_entry.find('TEMPO')
            bpm = float(tempo.attrib['Bpm']) if tempo is not None else 0.0
            analysis[sound_key] = {"RekBox": {"bpm": bpm}}
    save_to_json(output_path, analysis, verbose=True)
def rekordbox_file_to_analysis_file(dataset_path):
    """Translate rekordbox_rhythm.xml annotations into analysis_rhythm_rekordbox.json.

    For every sound listed in metadata.json the matching rekordbox entry is
    looked up; its annotated BPM (taken from the TEMPO element, 0.0 when the
    element is absent) is written to a JSON file in the same folder that our
    evaluation framework can read.
    """
    rhythm_xml_path = os.path.join(dataset_path, 'rekordbox_rhythm.xml')
    root_element = xml.etree.ElementTree.parse(rhythm_xml_path).getroot()
    metadata = load_from_json(os.path.join(dataset_path, 'metadata.json'))
    destination = os.path.join(dataset_path, 'analysis_rhythm_rekordbox.json')
    converted = dict()
    with click.progressbar(metadata.keys(), label="Converting...") as sound_ids:
        for sound_id in sound_ids:
            match = find_corresponding_rekordbox_entry(metadata[sound_id], root_element)
            if match is not False:
                tempo_element = match.find('TEMPO')
                if tempo_element is None:
                    # No TEMPO annotation in the rekordbox entry.
                    annotated_bpm = 0.0
                else:
                    annotated_bpm = float(tempo_element.attrib['Bpm'])
                converted[sound_id] = {"RekBox": {"bpm": annotated_bpm}}
    save_to_json(destination, converted, verbose=True)
def create_mixcraft_loops_dataset(dataset_path):
    """Build metadata.json for the mixcraft_loops dataset, converting audio to wav if needed.

    dataset_path must contain an original_metadata.csv file (semicolon-separated,
    with a header row) with metadata from the Mixcraft library, as well as the
    original .ogg audio as present in the 'loops' folder of the application.
    Missing 44.1kHz/16bit/mono wav versions are created under audio/wav.
    """
    converted_audio_path = os.path.join(dataset_path, 'audio', 'wav')
    create_directories([dataset_path, converted_audio_path])
    # Fix: use a context manager so the CSV file handle is closed deterministically
    # (the original passed a bare open() to csv.reader and leaked the handle).
    with open(os.path.join(dataset_path, 'original_metadata.csv')) as csv_file:
        csv_data = list(csv.reader(csv_file, delimiter=';'))[1:]  # Remove header
    metadata = dict()
    with click.progressbar(
            csv_data,
            label="Gathering metadata and converting to wav (if needed)...") as rows:
        for csv_row in rows:
            sound_metadata = parse_audio_file_metatada(csv_row)
            if not sound_metadata:
                continue
            sound_metadata['original_sound_path'] = os.path.join(
                dataset_path, 'audio', 'original', sound_metadata['style'],
                sound_metadata['name'] + '.ogg')
            wav_sound_path = "audio/wav/%i.wav" % sound_metadata['id']
            sound_metadata['wav_sound_path'] = wav_sound_path
            if not os.path.exists(os.path.join(dataset_path, wav_sound_path)):
                out_filename = os.path.join(converted_audio_path,
                                            "%i.wav" % sound_metadata['id'])
                convert_to_wav(sound_metadata['original_sound_path'], out_filename,
                               samplerate=44100, nbits=16, nchannels=1)
            metadata[sound_metadata['id']] = sound_metadata
    save_to_json(os.path.join(dataset_path, 'metadata.json'), metadata)
    click.echo('Created dataset with %i sounds' % len(metadata))
    click.echo('Saved in %s' % dataset_path)
def create_giantsteps_dataset(dataset_path):
    """Build metadata.json for the GiantSteps key dataset.

    Before running this script, get the annotations and data from the dataset
    repository:
    1) clone https://github.com/GiantSteps/giantsteps-key-dataset
    2) cd to directory and run ./audio_dl.sh

    Downloaded mp3s are relocated under audio/original, converted to
    44.1kHz/16bit/mono wav under audio/wav when missing, and per-file key
    annotations are read from annotations/key/<basename>.key.
    """
    original_audio_path = os.path.join(dataset_path, 'audio', 'original')
    converted_audio_path = os.path.join(dataset_path, 'audio', 'wav')
    metadata_files_path = os.path.join(dataset_path, 'annotations', 'key')
    if not os.path.exists(original_audio_path):
        click.echo("Moving downloaded audio files to new original path")
        # NOTE(review): two-step move of 'audio' -> 'original' -> 'audio/original';
        # assumes shutil.move can create the destination path after 'audio' was
        # moved away — confirm the parent directory exists at this point.
        shutil.move(os.path.join(dataset_path, 'audio'),
                    os.path.join(dataset_path, 'original'))
        shutil.move(os.path.join(dataset_path, 'original'), original_audio_path)
    if not os.path.exists(converted_audio_path):
        click.echo(
            "Audio files in the dataset have not been converted to wav, we'll convert them now"
        )
        create_directories([converted_audio_path])
        convert_audio_files_to_wav(original_audio_path, converted_audio_path,
                                   44100, 16, 1, '')
    else:
        click.echo(
            "Converted audio files already exist, no need to convert them")
    metadata = dict()
    with click.progressbar(
            [fname for fname in os.listdir(original_audio_path)],
            label="Gathering metadata...") as dir_filenames:
        for filename in dir_filenames:
            # Fix: endswith('mp3') also matched names like 'foomp3'; require the dot.
            if not filename.endswith('.mp3'):
                continue
            basename, _ = os.path.splitext(filename)
            # Fix: build the .key annotation path from the basename instead of
            # str.replace, which would also rewrite '.mp3' occurring mid-path.
            sound_metadata = parse_audio_file_metatada(
                os.path.join(metadata_files_path, basename + '.key'))
            if sound_metadata:
                sound_metadata['original_sound_path'] = 'audio/original/%s' % filename
                wav_sound_path = "audio/wav/%s.wav" % basename
                sound_metadata['wav_sound_path'] = wav_sound_path
                # Only keep sounds whose wav conversion actually exists.
                if os.path.exists(os.path.join(dataset_path, wav_sound_path)):
                    metadata[sound_metadata['id']] = sound_metadata
    save_to_json(os.path.join(dataset_path, 'metadata.json'), metadata)
    click.echo('Created dataset with %i sounds' % len(metadata))
    click.echo('Saved in %s' % dataset_path)
def create_apple_loops_dataset(dataset_path):
    """Build metadata.json for the apple_loops dataset, creating wav files when missing.

    The dataset_path must already contain a 'metadata' directory with files
    extracted from .caf files (we used
    https://github.com/jhorology/apple-loops-meta-reader and added a file_path
    property to each extracted file). Missing 44.1kHz/16bit/mono wav versions
    are created under audio/wav.
    """
    metadata_files_path = os.path.join(dataset_path, 'metadata')
    converted_audio_path = os.path.join(dataset_path, 'audio', 'wav')
    create_directories([dataset_path, converted_audio_path])
    metadata = dict()
    with click.progressbar(
            [fname for fname in os.listdir(metadata_files_path)],
            label="Gathering metadata and convertig to wav (if needed)..."
    ) as entries:
        for entry_name in entries:
            if entry_name.startswith('.'):
                continue  # skip hidden files (e.g. .DS_Store)
            sound_metadata = parse_audio_file_metatada(
                os.path.join(metadata_files_path, entry_name))
            if not sound_metadata:
                continue
            source_path = sound_metadata['file_path']
            sound_metadata['original_sound_path'] = source_path
            if os.path.getsize(source_path) < 1024 * 5:
                # Original filesize is lower than 5KB, ignore this file as it
                # probably is just midi data.
                continue
            base_name = os.path.splitext(entry_name)[0]
            wav_rel_path = "audio/wav/%s.wav" % base_name
            sound_metadata['wav_sound_path'] = wav_rel_path
            if not os.path.exists(os.path.join(dataset_path, wav_rel_path)):
                convert_to_wav(source_path,
                               os.path.join(converted_audio_path,
                                            "%s.wav" % base_name),
                               samplerate=44100, nbits=16, nchannels=1)
            metadata[sound_metadata['id']] = sound_metadata
    save_to_json(os.path.join(dataset_path, 'metadata.json'), metadata)
    click.echo('Created dataset with %i sounds' % len(metadata))
    click.echo('Saved in %s' % dataset_path)
def create_mixcraft_loops_dataset(dataset_path):
    """Build metadata.json for the mixcraft_loops dataset, converting audio to wav if needed.

    dataset_path must contain an original_metadata.csv file (semicolon-separated,
    with a header row) with metadata from the Mixcraft library, as well as the
    original .ogg audio as present in the 'loops' folder of the application.
    Missing 44.1kHz/16bit/mono wav versions are created under audio/wav.
    """
    converted_audio_path = os.path.join(dataset_path, 'audio', 'wav')
    create_directories([dataset_path, converted_audio_path])
    # Fix: use a context manager so the CSV file handle is closed deterministically
    # (the original passed a bare open() to csv.reader and leaked the handle).
    with open(os.path.join(dataset_path, 'original_metadata.csv')) as csv_file:
        csv_data = list(csv.reader(csv_file, delimiter=';'))[1:]  # Remove header
    metadata = dict()
    with click.progressbar(
            csv_data,
            label="Gathering metadata and converting to wav (if needed)...") as rows:
        for csv_row in rows:
            sound_metadata = parse_audio_file_metatada(csv_row)
            if not sound_metadata:
                continue
            sound_metadata['original_sound_path'] = os.path.join(
                dataset_path, 'audio', 'original', sound_metadata['style'],
                sound_metadata['name'] + '.ogg')
            wav_sound_path = "audio/wav/%i.wav" % sound_metadata['id']
            sound_metadata['wav_sound_path'] = wav_sound_path
            if not os.path.exists(os.path.join(dataset_path, wav_sound_path)):
                out_filename = os.path.join(converted_audio_path,
                                            "%i.wav" % sound_metadata['id'])
                convert_to_wav(sound_metadata['original_sound_path'], out_filename,
                               samplerate=44100, nbits=16, nchannels=1)
            metadata[sound_metadata['id']] = sound_metadata
    save_to_json(os.path.join(dataset_path, 'metadata.json'), metadata)
    click.echo('Created dataset with %i sounds' % len(metadata))
    click.echo('Saved in %s' % dataset_path)
def create_looperman_dataset(dataset_path):
    """Build metadata.json for the looperman dataset, converting audio to wav if needed.

    The dataset_path that is given needs to already contain the original audio
    under audio/original and one metadata .txt file per sound (named after the
    audio file) under metadata/. Wav conversions are 44.1kHz/16bit/mono.
    """
    metadata_files_path = os.path.join(dataset_path, 'metadata')
    original_audio_path = os.path.join(dataset_path, 'audio', 'original')
    converted_audio_path = os.path.join(dataset_path, 'audio', 'wav')
    if os.path.exists(converted_audio_path):
        click.echo(
            "Converted audio files already exist, no need to convert them")
    else:
        click.echo(
            "Audio files in the dataset have not been converted to wav, we'll convert them now"
        )
        create_directories([dataset_path, converted_audio_path])
        convert_audio_files_to_wav(original_audio_path, converted_audio_path,
                                   44100, 16, 1, '')
    metadata = dict()
    with click.progressbar(
            [fname for fname in os.listdir(original_audio_path)],
            label="Gathering metadata...") as dir_filenames:
        for original_name in dir_filenames:
            parsed = parse_audio_file_metatada(
                os.path.join(metadata_files_path, "%s.txt" % original_name))
            if not parsed:
                continue
            parsed['original_sound_path'] = 'audio/original/%s' % original_name
            base_name = os.path.splitext(original_name)[0]
            wav_rel_path = "audio/wav/%s.wav" % base_name
            parsed['wav_sound_path'] = wav_rel_path
            # Only keep sounds whose wav conversion actually exists.
            if os.path.exists(os.path.join(dataset_path, wav_rel_path)):
                metadata[parsed['id']] = parsed
    save_to_json(os.path.join(dataset_path, 'metadata.json'), metadata)
    click.echo('Created dataset with %i sounds' % len(metadata))
    click.echo('Saved in %s' % dataset_path)
def create_philarmonia_dataset(dataset_path):
    """Build metadata.json for the philarmonia dataset, converting audio to wav if needed.

    Expects original mp3 files under <dataset_path>/audio/original; creates
    44.1kHz/16bit/mono wav copies under audio/wav when missing, then gathers
    per-file metadata for every mp3 whose wav conversion exists.
    """
    original_audio_path = os.path.join(dataset_path, 'audio', 'original')
    converted_audio_path = os.path.join(dataset_path, 'audio', 'wav')
    if not os.path.exists(converted_audio_path):
        click.echo(
            "Audio files in the dataset have not been converted to wav, we'll convert them now"
        )
        create_directories([converted_audio_path])
        convert_audio_files_to_wav(original_audio_path, converted_audio_path,
                                   44100, 16, 1, '')
    else:
        click.echo(
            "Converted audio files already exist, no need to convert them")
    metadata = dict()
    with click.progressbar(
            [fname for fname in os.listdir(original_audio_path)],
            label="Gathering metadata...") as dir_filenames:
        for filename in dir_filenames:
            # Fix: endswith('mp3') also matched names like 'foomp3'; require the dot.
            if not filename.endswith('.mp3'):
                continue
            sound_metadata = parse_audio_file_metatada(filename)
            if sound_metadata:
                sound_metadata['original_sound_path'] = 'audio/original/%s' % filename
                basename = os.path.splitext(filename)[0]
                wav_sound_path = "audio/wav/%s.wav" % basename
                sound_metadata['wav_sound_path'] = wav_sound_path
                # Only keep sounds whose wav conversion actually exists.
                if os.path.exists(os.path.join(dataset_path, wav_sound_path)):
                    metadata[sound_metadata['id']] = sound_metadata
    save_to_json(os.path.join(dataset_path, 'metadata.json'), metadata)
    click.echo('Created dataset with %i sounds' % len(metadata))
    click.echo('Saved in %s' % dataset_path)
def create_dataset(dataset_path):
    """Build metadata.json for the goodsounds dataset.

    Reads goodsounds_sounds_metadata.csv and goodsounds_packs_metadata.csv from
    dataset_path, moves each referenced 'neumann' recording into audio/wav under
    a '<pack>-<filename>' name, and keeps metadata for every sound whose wav
    file exists.
    """
    converted_audio_path = os.path.join(dataset_path, 'audio', 'wav')
    # Fix: read both CSV files with context managers so the handles are closed
    # (the original passed bare open() calls to csv.reader and leaked them).
    with open(os.path.join(dataset_path,
                           'goodsounds_sounds_metadata.csv')) as sounds_csv:
        sounds_metadata = list(csv.reader(sounds_csv))
    with open(os.path.join(dataset_path,
                           'goodsounds_packs_metadata.csv')) as packs_csv:
        packs_metadata = list(csv.reader(packs_csv))
    create_directories([converted_audio_path])
    metadata = dict()
    with click.progressbar(sounds_metadata,
                           label="Gathering metadata...") as csv_rows:
        for csv_row in csv_rows:
            filename = csv_row[13]  # column 13: sound file name — TODO confirm schema
            pack_row = get_pack_by_id(csv_row[14], packs_metadata)  # column 14: pack id
            if pack_row is None:
                continue
            pack_name = pack_row[1]
            file_path = os.path.join(dataset_path, 'sound_files', pack_name,
                                     'neumann', filename)
            new_filename = '%s-%s' % (pack_name, filename)
            dest_path = os.path.join(converted_audio_path, new_filename)
            if not os.path.exists(dest_path):
                try:
                    shutil.move(file_path, dest_path)
                except IOError:
                    # Source recording missing; skip this sound (best-effort).
                    continue
            sound_metadata = parse_audio_file_metatada(csv_row)
            if sound_metadata:
                wav_sound_path = "audio/wav/%s" % new_filename
                sound_metadata['wav_sound_path'] = wav_sound_path
                # Only keep sounds whose wav file actually exists.
                if os.path.exists(os.path.join(dataset_path, wav_sound_path)):
                    metadata[sound_metadata['id']] = sound_metadata
    save_to_json(os.path.join(dataset_path, 'metadata.json'), metadata)
    click.echo('Created dataset with %i sounds' % len(metadata))
    click.echo('Saved in %s' % dataset_path)
def create_apple_loops_dataset(dataset_path):
    """Build metadata.json for the apple_loops dataset, creating wav files when missing.

    The dataset_path that is given needs to already contain a directory
    structure with metadata files extracted from .caf files (we used
    https://github.com/jhorology/apple-loops-meta-reader and added file_path
    property to each extracted file).
    """
    metadata_files_path = os.path.join(dataset_path, 'metadata')
    converted_audio_path = os.path.join(dataset_path, 'audio', 'wav')
    create_directories([dataset_path, converted_audio_path])
    metadata = dict()
    with click.progressbar(
            [fname for fname in os.listdir(metadata_files_path)],
            label="Gathering metadata and convertig to wav (if needed)...") as names:
        for name in names:
            if name.startswith('.'):
                continue  # hidden files (e.g. .DS_Store)
            parsed = parse_audio_file_metatada(
                os.path.join(metadata_files_path, name))
            if not parsed:
                continue
            parsed['original_sound_path'] = parsed['file_path']
            if os.path.getsize(parsed['original_sound_path']) < 1024 * 5:
                # Original filesize is lower than 5KB; probably just midi data.
                continue
            stem = os.path.splitext(name)[0]
            parsed['wav_sound_path'] = "audio/wav/%s.wav" % stem
            target_wav = os.path.join(dataset_path, parsed['wav_sound_path'])
            if not os.path.exists(target_wav):
                convert_to_wav(parsed['original_sound_path'],
                               os.path.join(converted_audio_path, "%s.wav" % stem),
                               samplerate=44100, nbits=16, nchannels=1)
            metadata[parsed['id']] = parsed
    save_to_json(os.path.join(dataset_path, 'metadata.json'), metadata)
    click.echo('Created dataset with %i sounds' % len(metadata))
    click.echo('Saved in %s' % dataset_path)
def analyze_dataset(dataset_path, algorithms, max_sounds, save_every,
                    incremental, use_celery):
    """Analyze the audio files in a dataset with the specified algorithms.

    Args:
        dataset_path: dataset folder containing metadata.json and the wav files.
        algorithms: comma-separated algorithm names (without 'algorithm_' prefix).
        max_sounds: analyze at most this many sounds (None = all).
        save_every: checkpoint the analysis results every N analyzed sounds.
        incremental: resume from an existing analysis_<name>.json file.
        use_celery: dispatch per-sound analysis jobs through celery workers.

    Results are written to analysis_<algorithm>.json inside dataset_path.
    """
    # Discover every 'algorithm_*' callable exposed by the analysis modules.
    available_algorithms = list()
    for algorithms_set in [rhythm_algorithms, general_algorithms]:
        for item in dir(algorithms_set):
            if item.startswith('algorithm_'):
                available_algorithms.append(
                    [item.replace('algorithm_', ''), getattr(algorithms_set, item)])

    analysis_algorithms_to_run = list()
    algorithm_names = algorithms.split(',')
    for algorithm_name, algorithm_function in available_algorithms:
        if algorithm_name in algorithm_names:
            out_file_path = os.path.join(dataset_path,
                                         "analysis_%s.json" % algorithm_name)
            if not incremental:
                # In non-incremental mode, ask before overwriting existing results.
                if promt_user_to_abort_if_file_exists(out_file_path,
                                                      throw_exception=False):
                    analysis_algorithms_to_run.append(
                        [algorithm_name, algorithm_function])
            else:
                analysis_algorithms_to_run.append(
                    [algorithm_name, algorithm_function])
    if not analysis_algorithms_to_run:
        click.echo('No analysis algorithms to run.')
        return  # Fix: previously fell through and kept running with no algorithms.

    metadata = load_from_json(os.path.join(dataset_path, "metadata.json"))
    algorithms_run = 0
    for analysis_algorithm_name, analysis_algorithm in analysis_algorithms_to_run:
        start_timestamp = time.time()
        algorithms_run += 1
        click.echo("Analyzing '%s' dataset [%i/%i] - %s" %
                   (dataset_path, algorithms_run,
                    len(analysis_algorithms_to_run), analysis_algorithm_name))
        if max_sounds is None:
            max_sounds = len(metadata)
        else:
            max_sounds = int(max_sounds)
        out_file_path = os.path.join(
            dataset_path, "analysis_%s.json" % analysis_algorithm_name)
        analysis_results = dict()
        if incremental and os.path.exists(out_file_path):
            # Load existing analysis file (if it exists) and continue from there.
            analysis_results = load_from_json(out_file_path)
            click.echo("Continuing existing analysis (%i sounds already analysed)"
                       % len(analysis_results))

        label = "Analyzing sounds..."
        asynchronous_job_objects = None
        if use_celery:
            label = "Sending sounds to analyze..."
            asynchronous_job_objects = list()

        with click.progressbar(metadata.values()[:max_sounds],
                               label=label) as sounds_metadata:
            for count, sound in enumerate(sounds_metadata):
                if incremental and unicode(sound['id']) in analysis_results:
                    # Analysis for that sound already exists; skip it.
                    continue
                sound['file_path'] = os.path.join(dataset_path,
                                                  sound['wav_sound_path'])
                if not use_celery:
                    try:
                        analysis = analysis_algorithm(sound)
                        if analysis:
                            analysis_results[sound['id']] = analysis
                        else:
                            continue
                    except RuntimeError:
                        continue
                    if count % save_every == 0:
                        # Checkpoint progress every 'save_every' analysed sounds.
                        save_to_json(out_file_path, analysis_results)
                else:
                    task_object = run_analysis_algorithm.delay(
                        analysis_algorithm, sound)
                    asynchronous_job_objects.append((sound['id'], task_object))

        if use_celery and len(asynchronous_job_objects):
            # Poll job status and collect results as they get done.
            finished_asynchronous_job_objects = dict()
            n_total_jobs = len(asynchronous_job_objects)
            n_jobs_done_in_last_save = 0
            # Only inspect the first N pending jobs per pass; jobs finish roughly
            # in submission order so this is good enough.
            max_objects_to_check = 100
            while len(asynchronous_job_objects) != 0:
                time.sleep(1)
                jobs_to_delete_from_list = list()  # positions to drop after this pass
                # Fix: use enumerate so the index always advances; the original
                # 'continue' on an empty result skipped 'i += 1' and corrupted
                # the deletion indices.
                for i, (sound_id, job_object) in enumerate(asynchronous_job_objects):
                    if job_object.ready():
                        # Job is finished.
                        finished_asynchronous_job_objects[sound_id] = job_object
                        jobs_to_delete_from_list.append(i)
                        try:
                            analysis = job_object.get()
                            if analysis:
                                analysis_results[sound_id] = analysis
                        except RuntimeError:
                            pass
                    if i + 1 >= max_objects_to_check:
                        break
                for index in sorted(jobs_to_delete_from_list, reverse=True):
                    del asynchronous_job_objects[index]
                print_progress('Analyzing sounds...',
                               len(finished_asynchronous_job_objects),
                               n_total_jobs,
                               start_time=start_timestamp,
                               show_progress_bar=True)  # Estimate time remaining
                if len(finished_asynchronous_job_objects) - n_jobs_done_in_last_save >= save_every:
                    # Fix: record the absolute finished count at the last save
                    # instead of accumulating (+=), which made later checkpoints
                    # effectively never fire.
                    n_jobs_done_in_last_save = len(finished_asynchronous_job_objects)
                    save_to_json(out_file_path, analysis_results)
            print('')

        # Report number of correctly analyzed sounds.
        end_timestamp = time.time()
        click.echo('Analyzed dataset with %i sounds (%i sounds correctly analysed), done in %s'
                   % (len(metadata), len(analysis_results),
                      seconds_to_day_hour_minute_second(end_timestamp - start_timestamp)))
        save_to_json(out_file_path, analysis_results, verbose=True)
def create_freesound_loops_dataset(dataset_path, max_sounds, group_by_pack):
    """Download and index the Freesound Loops (FSL4) dataset.

    Retrieves the sounds listed in FSL4_IDS from Freesound (originals when
    OAuth credentials work, HQ mp3 previews otherwise), keeps only sounds whose
    BPM can be estimated from their textual metadata, and writes metadata.json
    plus 44.1kHz/16bit/mono wav conversions under audio/wav.

    Args:
        dataset_path: destination folder for metadata and audio.
        max_sounds: retrieve at most this many ids from FSL4_IDS.
        group_by_pack: currently unused — kept for interface compatibility.
    """
    print(title('Creating freesound loops dataset'))
    out_file_path = os.path.join(dataset_path, "metadata.json")
    promt_user_to_abort_if_file_exists(out_file_path)
    original_sounds_path = os.path.join(dataset_path, 'audio', 'original')
    create_directories([dataset_path, original_sounds_path])
    metadata = dict()
    fs_client = FreesoundClient()
    try:
        fs_client.set_token(FREESOUND_ACCESS_TOKEN, auth_type="oauth")
        auth_type = 'oauth'
    except Exception:  # Fix: was a bare 'except:' which also swallowed SystemExit
        # Fall back to token auth (previews only, no original downloads).
        fs_client.set_token(FREESOUND_API_KEY)
        auth_type = 'token'
    ids_to_retrieve = FSL4_IDS[:max_sounds]
    page_size = 100
    for i in range(0, len(ids_to_retrieve), page_size):
        current_ids = ids_to_retrieve[i:i + page_size]
        ids_filter = ' OR '.join(['id:%i' % sid for sid in current_ids])
        pager = fs_client.text_search(
            fields="id,name,tags,description,previews,username,pack,type,license",
            filter=ids_filter, page_size=page_size)
        for sound in pager:  # Fix: dropped unused enumerate index
            print_progress('Downloading and/or processing sound %i' % sound.id,
                           len(metadata) + 1, len(ids_to_retrieve))
            estimated_bpm = estimate_bpm_from_metadata(sound.name,
                                                       sound.description,
                                                       sound.tags)
            if estimated_bpm:
                metadata[sound.id] = {
                    'id': sound.id,
                    'name': sound.name,
                    'tags': sound.tags,
                    'annotations': {'bpm': estimated_bpm},
                    'description': sound.description,
                    'preview_url': sound.previews.preview_hq_mp3,
                    'username': sound.username,
                    'pack': sound.pack,
                    'type': sound.type,
                    'license': sound.license,
                }
                original_sound_path = os.path.join(
                    dataset_path, "audio/original/%i.%s" % (sound.id, sound.type))
                if not os.path.exists(original_sound_path) and auth_type == 'oauth':
                    # Retrieve original sound
                    try:
                        uri = URIS.uri(URIS.DOWNLOAD, sound.id)
                        FSRequest.retrieve(uri, fs_client, original_sound_path)
                    except Exception as e:
                        # If original sound could not be retrieved, try with preview
                        try:
                            preview_path = os.path.join(
                                dataset_path, "audio/original/%i.mp3" % sound.id)
                            urllib.urlretrieve(sound.previews.preview_hq_mp3,
                                               preview_path)
                            original_sound_path = preview_path
                        except urllib.ContentTooShortError:
                            # Skip this sound (no preview or original could be downloaded)
                            del metadata[sound.id]
                            continue
                metadata[sound.id]['original_sound_path'] = \
                    original_sound_path.replace(dataset_path, '')
                # NOTE(review): unlike other datasets, wav_sound_path is stored
                # as an absolute path here — confirm consumers expect that.
                metadata[sound.id]['wav_sound_path'] = os.path.join(
                    dataset_path, 'audio', 'wav', '%i.wav' % sound.id)
                # Checkpoint after every processed sound (downloads are slow).
                save_to_json(out_file_path, metadata)
    save_to_json(out_file_path, metadata)
    print('')
    # Create wav versions of the sounds if these do not already exist
    wav_sounds_path = os.path.join(dataset_path, 'audio', 'wav')
    if not os.path.exists(wav_sounds_path):
        print('Creating wav versions of downloaded files...')
        create_directories([wav_sounds_path])
        convert_audio_files_to_wav(original_sounds_path, wav_sounds_path,
                                   44100, 16, 1, '')
    print('Created dataset with %i sounds' % len(metadata))
    print('Saved in %s' % dataset_path)
def analyze_dataset(dataset_path, algorithms, max_sounds, save_every,
                    incremental, use_celery):
    """Analyze the audio files in a dataset with the specified algorithms.

    Args:
        dataset_path: dataset folder containing metadata.json and the wav files.
        algorithms: comma-separated algorithm names (without 'algorithm_' prefix).
        max_sounds: analyze at most this many sounds (None = all).
        save_every: checkpoint the analysis results every N analyzed sounds.
        incremental: resume from an existing analysis_<name>.json file.
        use_celery: dispatch per-sound analysis jobs through celery workers.

    Results are written to analysis_<algorithm>.json inside dataset_path.
    """
    # Discover every 'algorithm_*' callable exposed by the analysis modules.
    available_algorithms = list()
    for algorithms_set in [rhythm_algorithms, general_algorithms]:
        for item in dir(algorithms_set):
            if item.startswith('algorithm_'):
                available_algorithms.append(
                    [item.replace('algorithm_', ''), getattr(algorithms_set, item)])

    analysis_algorithms_to_run = list()
    algorithm_names = algorithms.split(',')
    for algorithm_name, algorithm_function in available_algorithms:
        if algorithm_name in algorithm_names:
            out_file_path = os.path.join(dataset_path,
                                         "analysis_%s.json" % algorithm_name)
            if not incremental:
                # In non-incremental mode, ask before overwriting existing results.
                if promt_user_to_abort_if_file_exists(out_file_path,
                                                      throw_exception=False):
                    analysis_algorithms_to_run.append(
                        [algorithm_name, algorithm_function])
            else:
                analysis_algorithms_to_run.append(
                    [algorithm_name, algorithm_function])
    if not analysis_algorithms_to_run:
        click.echo('No analysis algorithms to run.')
        return  # Fix: previously fell through and kept running with no algorithms.

    metadata = load_from_json(os.path.join(dataset_path, "metadata.json"))
    algorithms_run = 0
    for analysis_algorithm_name, analysis_algorithm in analysis_algorithms_to_run:
        start_timestamp = time.time()
        algorithms_run += 1
        click.echo("Analyzing '%s' dataset [%i/%i] - %s" %
                   (dataset_path, algorithms_run,
                    len(analysis_algorithms_to_run), analysis_algorithm_name))
        if max_sounds is None:
            max_sounds = len(metadata)
        else:
            max_sounds = int(max_sounds)
        out_file_path = os.path.join(
            dataset_path, "analysis_%s.json" % analysis_algorithm_name)
        analysis_results = dict()
        if incremental and os.path.exists(out_file_path):
            # Load existing analysis file (if it exists) and continue from there.
            analysis_results = load_from_json(out_file_path)
            click.echo("Continuing existing analysis (%i sounds already analysed)"
                       % len(analysis_results))

        label = "Analyzing sounds..."
        asynchronous_job_objects = None
        if use_celery:
            label = "Sending sounds to analyze..."
            asynchronous_job_objects = list()

        with click.progressbar(metadata.values()[:max_sounds],
                               label=label) as sounds_metadata:
            for count, sound in enumerate(sounds_metadata):
                if incremental and unicode(sound['id']) in analysis_results:
                    # Analysis for that sound already exists; skip it.
                    continue
                sound['file_path'] = os.path.join(dataset_path,
                                                  sound['wav_sound_path'])
                if not use_celery:
                    try:
                        analysis = analysis_algorithm(sound)
                        if analysis:
                            analysis_results[sound['id']] = analysis
                        else:
                            continue
                    except RuntimeError:
                        continue
                    if count % save_every == 0:
                        # Checkpoint progress every 'save_every' analysed sounds.
                        save_to_json(out_file_path, analysis_results)
                else:
                    task_object = run_analysis_algorithm.delay(
                        analysis_algorithm, sound)
                    asynchronous_job_objects.append((sound['id'], task_object))

        if use_celery and len(asynchronous_job_objects):
            # Poll job status and collect results as they get done.
            finished_asynchronous_job_objects = dict()
            n_total_jobs = len(asynchronous_job_objects)
            n_jobs_done_in_last_save = 0
            # Only inspect the first N pending jobs per pass; jobs finish roughly
            # in submission order so this is good enough.
            max_objects_to_check = 100
            while len(asynchronous_job_objects) != 0:
                time.sleep(1)
                jobs_to_delete_from_list = list()  # positions to drop after this pass
                # Fix: use enumerate so the index always advances; the original
                # 'continue' on an empty result skipped 'i += 1' and corrupted
                # the deletion indices.
                for i, (sound_id, job_object) in enumerate(asynchronous_job_objects):
                    if job_object.ready():
                        # Job is finished.
                        finished_asynchronous_job_objects[sound_id] = job_object
                        jobs_to_delete_from_list.append(i)
                        try:
                            analysis = job_object.get()
                            if analysis:
                                analysis_results[sound_id] = analysis
                        except RuntimeError:
                            pass
                    if i + 1 >= max_objects_to_check:
                        break
                for index in sorted(jobs_to_delete_from_list, reverse=True):
                    del asynchronous_job_objects[index]
                print_progress('Analyzing sounds...',
                               len(finished_asynchronous_job_objects),
                               n_total_jobs,
                               start_time=start_timestamp,
                               show_progress_bar=True)  # Estimate time remaining
                if len(finished_asynchronous_job_objects) - n_jobs_done_in_last_save >= save_every:
                    # Fix: record the absolute finished count at the last save
                    # instead of accumulating (+=), which made later checkpoints
                    # effectively never fire.
                    n_jobs_done_in_last_save = len(finished_asynchronous_job_objects)
                    save_to_json(out_file_path, analysis_results)
            print('')

        # Report number of correctly analyzed sounds.
        end_timestamp = time.time()
        click.echo('Analyzed dataset with %i sounds (%i sounds correctly analysed), done in %s'
                   % (len(metadata), len(analysis_results),
                      seconds_to_day_hour_minute_second(end_timestamp - start_timestamp)))
        save_to_json(out_file_path, analysis_results, verbose=True)
def analyze_dataset(dataset_path, algorithms, max_sounds, save_every,
                    incremental, use_celery, force, workers):
    """Analyze the audio files in a dataset with the specified algorithms.

    Args:
        dataset_path: dataset folder containing metadata.json and the wav files.
        algorithms: comma-separated algorithm names (without 'algorithm_' prefix).
        max_sounds: analyze at most this many sounds (None = all).
        save_every: checkpoint the analysis results every N analyzed sounds.
        incremental: resume from an existing analysis_<name>.json file.
        use_celery: dispatch jobs through celery workers (falls back to the
            local WorkParallelizer when celery is unavailable).
        force: overwrite existing analysis files without prompting.
        workers: number of local workers when not using celery.

    Results are written to analysis_<algorithm>.json inside dataset_path.
    """
    if use_celery and not celery_available:
        print('Celery not found, will analyze ignoring --use_celery option')
        use_celery = False

    # Discover every 'algorithm_*' callable exposed by the analysis modules.
    available_algorithms = list()
    for algorithms_set in [rhythm_algorithms, general_algorithms,
                           tonal_algorithms, pitch_algorithms]:
        for item in dir(algorithms_set):
            if item.startswith('algorithm_'):
                available_algorithms.append(
                    [item.replace('algorithm_', ''), getattr(algorithms_set, item)])

    analysis_algorithms_to_run = list()
    algorithm_names = algorithms.split(',')
    for algorithm_name, algorithm_function in available_algorithms:
        if algorithm_name in algorithm_names:
            out_file_path = os.path.join(dataset_path,
                                         "analysis_%s.json" % algorithm_name)
            if not incremental and not force:
                # Ask before overwriting an existing analysis file.
                if promt_user_to_abort_if_file_exists(out_file_path,
                                                      throw_exception=False):
                    analysis_algorithms_to_run.append(
                        [algorithm_name, algorithm_function])
            else:
                analysis_algorithms_to_run.append(
                    [algorithm_name, algorithm_function])
    if not analysis_algorithms_to_run:
        # Fix: the message was a broken/garbled string literal in the original.
        click.echo('No analysis algorithms to run. Available algorithms are:')
        for name, _ in available_algorithms:
            click.echo('\t%s' % name)
        return

    if use_celery:
        click.echo(
            'We detected you\'re using celery to run the tasks, here are some useful commands:'
        )
        click.echo('\tsudo rabbitmq-server -detached            (start rabbitmq broker)')
        click.echo('\tsudo rabbitmqctl stop                     (stop rabbitmq broker)')
        click.echo('\tcelery -A tasks worker --concurrency=4    (start Celery workers)')
        click.echo('\tcelery -A tasks purge                     (clear Celery queue)')

    metadata = load_from_json(os.path.join(dataset_path, "metadata.json"))
    algorithms_run = 0
    for analysis_algorithm_name, analysis_algorithm in analysis_algorithms_to_run:
        start_timestamp = time.time()
        algorithms_run += 1
        click.echo("Analyzing '%s' dataset [%i/%i] - %s" %
                   (dataset_path, algorithms_run,
                    len(analysis_algorithms_to_run), analysis_algorithm_name))
        if max_sounds is None:
            max_sounds = len(metadata)
        else:
            max_sounds = int(max_sounds)
        out_file_path = os.path.join(
            dataset_path, "analysis_%s.json" % analysis_algorithm_name)
        analysis_results = dict()
        if incremental and os.path.exists(out_file_path):
            # Load existing analysis file (if it exists) and continue from there.
            analysis_results = load_from_json(out_file_path)
            click.echo("Continuing existing analysis (%i sounds already analysed)"
                       % len(analysis_results))

        label = "Sending sounds to analyze..."
        asynchronous_job_objects = None
        wp = None
        if use_celery:
            asynchronous_job_objects = list()
        else:
            wp = WorkParallelizer(show_widgets=False)

        with click.progressbar(list(metadata.values())[:max_sounds],
                               label=label) as sounds_metadata:
            for count, sound in enumerate(sounds_metadata):
                if incremental and str(sound['id']) in analysis_results:
                    # Analysis for that sound already exists; skip it.
                    continue
                try:
                    # Wav files are stored as '<id>_*.wav'; glob resolves the suffix.
                    sound['file_path'] = glob.glob(
                        dataset_path + '/audio/wav/' + str(sound['id']) + '_*.wav')[0]
                    if not use_celery:
                        # Queue the job in the local work parallelizer.
                        wp.add_task(analysis_algorithm, sound, task_id=sound['id'])
                    else:
                        task_object = run_analysis_algorithm.delay(
                            analysis_algorithm, sound)
                        asynchronous_job_objects.append((sound['id'], task_object))
                except Exception:
                    print("Could not find " + str(sound['id']))

        if use_celery and len(asynchronous_job_objects):
            # Poll job status and collect results as they get done.
            finished_asynchronous_job_objects = dict()
            n_total_jobs = len(asynchronous_job_objects)
            n_jobs_done_in_last_save = 0
            # Only inspect the first N pending jobs per pass; jobs finish roughly
            # in submission order so this is good enough.
            max_objects_to_check = 100
            while len(asynchronous_job_objects) != 0:
                time.sleep(1)
                jobs_to_delete_from_list = list()  # positions to drop after this pass
                # Fix: use enumerate so the index always advances; the original
                # 'continue' on an empty result skipped 'i += 1' and corrupted
                # the deletion indices.
                for i, (sound_id, job_object) in enumerate(asynchronous_job_objects):
                    if job_object.ready():
                        # Job is finished.
                        finished_asynchronous_job_objects[sound_id] = job_object
                        jobs_to_delete_from_list.append(i)
                        try:
                            analysis = job_object.get()
                            if analysis:
                                analysis_results[sound_id] = analysis
                        except RuntimeError:
                            pass
                    if i + 1 >= max_objects_to_check:
                        break
                for index in sorted(jobs_to_delete_from_list, reverse=True):
                    del asynchronous_job_objects[index]
                print_progress('Analyzing sounds...',
                               len(finished_asynchronous_job_objects),
                               n_total_jobs,
                               start_time=start_timestamp,
                               show_progress_bar=True)  # Estimate time remaining
                if len(finished_asynchronous_job_objects) - n_jobs_done_in_last_save >= save_every:
                    # Fix: record the absolute finished count at the last save
                    # instead of accumulating (+=), which made later checkpoints
                    # effectively never fire.
                    n_jobs_done_in_last_save = len(finished_asynchronous_job_objects)
                    save_to_json(out_file_path, analysis_results)
            print('')
        else:
            wp.start(num_workers=workers)
            while True:
                time.sleep(1)
                finished = wp.show_progress()
                tasks_succeeded = wp.tasks_succeeded()
                for task in tasks_succeeded:
                    sound_id = task.id
                    if sound_id not in analysis_results:
                        analysis_results[sound_id] = task.result(timeout=0.0)
                if tasks_succeeded and len(tasks_succeeded) % save_every == 0:
                    # Fix: guard against the empty case (0 % save_every == 0),
                    # which re-saved every poll before any task had finished.
                    save_to_json(out_file_path, analysis_results)
                if finished:
                    break
            wp.show_errors()

        # Report number of correctly analyzed sounds.
        end_timestamp = time.time()
        click.echo('Analyzed dataset with %i sounds (%i sounds correctly analysed), done in %s'
                   % (len(metadata), len(analysis_results),
                      seconds_to_day_hour_minute_second(end_timestamp - start_timestamp)))
        save_to_json(out_file_path, analysis_results, verbose=True)