from functools import partial
from math import ceil
from os import getcwd, makedirs
from os.path import exists
from pathlib import Path

import numpy as np
from halo import Halo

# NOTE: TrackList, get_data_dir, have_datapoint, count_data_points, get_datapoint_ids,
# AUDIO_FEATURES, AUDIO_ANALYSIS and the Spotify fetch/extract helpers used below are
# assumed to come from this project's sibling modules; their import paths are not
# shown in this excerpt.


def get_obsolete(output_dir, data_type):
    """Return ids of pickles on disk that are no longer part of the track list."""
    _track_list = TrackList.load_from_dir(output_dir)
    return [
        track_file.stem
        for track_file in Path(get_data_dir(output_dir, data_type)).glob('*.pickle')
        if not _track_list.have_track_id(track_file.stem)
    ]
def get_missing(output_dir, data_type):
    """Return ids from the track list that do not have a datapoint on disk yet."""
    _track_list = TrackList.load_from_dir(output_dir)
    return [
        track_id
        for track_id in _track_list.get_track_ids()
        if not have_datapoint(output_dir, data_type, track_id)
    ]
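# A minimal sketch (hypothetical helper, not part of the original module) of how the
# two queries above can be combined: drop cached pickles that fell out of the track
# list, then report which listed tracks still need to be fetched.
def _example_reconcile(output_dir, data_type):
    for track_id in get_obsolete(output_dir, data_type):
        (Path(get_data_dir(output_dir, data_type)) / f'{track_id}.pickle').unlink()
    return get_missing(output_dir, data_type)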
def count(output_dir):
    track_list = TrackList.load_from_dir(output_dir)
    N = len(track_list.track_ids)
    print(f'expecting {N} tracks')
    N_feat = count_data_points(output_dir, AUDIO_FEATURES)
    print(f'found {N_feat} features objects')
    N_ana = count_data_points(output_dir, AUDIO_ANALYSIS)
    print(f'found {N_ana} analysis objects')
def fetch(output_dir):
    """Fetch an audio analysis for every listed track that is still missing one."""
    spinner = Halo('Fetching tracks', spinner='dots')
    spinner.start()
    _track_list = TrackList.load_from_dir(output_dir)
    track_ids = get_missing(output_dir, AUDIO_ANALYSIS)
    if not exists(get_data_dir(output_dir, AUDIO_ANALYSIS)):
        makedirs(get_data_dir(output_dir, AUDIO_ANALYSIS))
    track_analyses = n_track_analyses_generator(track_ids)
    # Resume the progress count from whatever is already on disk.
    fetched = count_data_points(output_dir, AUDIO_ANALYSIS)
    for track_analysis in track_analyses:
        if 'track_not_found' in track_analysis:
            # No analysis exists for this track; drop it from the dataset.
            _track_list.remove_track_id(track_analysis['track_not_found'])
            print(f"removed {track_analysis['track_not_found']} from dataset")
            continue
        fetched += 1
        extracted = extract_track_analysis(track_analysis)
        progress = (fetched / _track_list.get_desired_tracks_amount()) * 100
        spinner.text = f'Fetching tracks ({progress:.2f}%)'
        store_extracted_analysis(output_dir, extracted)
    spinner.stop()
    _track_list.dump(output_dir)
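# A sketch of the contract `fetch` assumes from `n_track_analyses_generator`: for each
# requested id it yields either the raw analysis object or the sentinel dict
# {'track_not_found': <track_id>} when the API has no analysis. `_fetch_one` is a
# hypothetical stand-in for the actual API call.
def _example_analyses_generator(track_ids, _fetch_one):
    for track_id in track_ids:
        analysis = _fetch_one(track_id)
        yield analysis if analysis is not None else {'track_not_found': track_id}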
def collect_data(data_dir, test_split, test_split_index=0, verbose=False, dry=False,
                 subset=10000, min_conf=0):
    """Load the dataset and split it into training/testing dicts via k-fold CV."""
    track_list = TrackList.load_from_dir(data_dir)
    all_tracks = track_list.get_track_ids()
    N = len(all_tracks)
    if dry:
        # use virtually no data at all - just test the program execution
        N = subset

    # Load all data
    if verbose:
        print("Collecting data...")
    data = load_data_dict(data_dir, np.array(all_tracks))

    # Filter by minimum confidence level
    data_minconf = dict()
    for (track_id, track) in data.items():
        if track['key_confidence'] >= min_conf / 100:
            data_minconf[track_id] = track

    # k-fold CV splits: `test_split` folds, fold `test_split_index` held out
    n = np.min([N, len(data_minconf)])
    chunks = np.array_split(np.arange(n), test_split)
    test_indices = chunks[test_split_index]
    train_indices = np.concatenate(chunks[:test_split_index] + chunks[test_split_index + 1:])

    # Split data
    trackids = np.array(list(data_minconf.keys()))
    tracks = np.array(list(data_minconf.values()))
    testing_data = dict(zip(trackids[test_indices], tracks[test_indices]))
    training_data = dict(zip(trackids[train_indices], tracks[train_indices]))

    if verbose:
        print("Data collected. [min_conf={}, n={}, N={}]".format(min_conf, n, N))
    return training_data, testing_data
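# A worked example of the k-fold split above, independent of any project state:
# with n=10 datapoints and test_split=5 folds, fold 0 holds out indices [0, 1]
# and trains on the remaining eight.
#
# >>> chunks = np.array_split(np.arange(10), 5)
# >>> chunks[0]
# array([0, 1])
# >>> np.concatenate(chunks[:0] + chunks[1:])
# array([2, 3, 4, 5, 6, 7, 8, 9])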
def list_tracks(mpl_data_path, output_dir, n, list_dir='',
                _track_list: TrackList = None) -> None:
    """List `n` tracks, balanced across the 24 key/mode classes."""
    spinner = Halo(text='Listing tracks', spinner='dots')
    spinner.start()
    if not exists(get_data_dir(output_dir, AUDIO_FEATURES)):
        makedirs(get_data_dir(output_dir, AUDIO_FEATURES))

    # We will count the amount of tracks per key and mode
    key_counts = dict()
    required_per_key = ceil(n / 24)
    for key in range(24):
        key_counts[key] = 0

    # Start from a provided track list or create a new one
    track_list_complete = False
    if _track_list is not None:
        track_list_complete = True
        track_id_gen = listing_track_id_generator(output_dir, mpl_data_path,
                                                  _track_list.have_track_id)
        # Seed the per-key counts from the features already on disk.
        for track_id in get_datapoint_ids(output_dir, AUDIO_FEATURES):
            f = load_features(output_dir, track_id)
            key = f['key'] + (f['mode'] * 12)
            key_counts[key] += 1
    else:
        track_id_gen = track_id_generator(mpl_data_path)
        _track_list = TrackList()
        _track_list.set_desired_tracks_amount(n)

    total = count_data_points(output_dir, AUDIO_FEATURES)
    _track_list.dump(output_dir)

    def finished():
        return all(key_counts[key] >= required_per_key for key in range(24)) or \
            count_data_points(output_dir, AUDIO_FEATURES) >= n

    while not finished():
        have_track = partial(have_datapoint, output_dir, AUDIO_FEATURES)
        track_ids = get_n_track_ids(track_id_gen, 100, have_track)
        track_feats = n_track_features(track_ids)
        for track_feat in track_feats:
            extracted_track_features = extract_audio_features(track_feat)
            key = extracted_track_features['key'] + (extracted_track_features['mode'] * 12)
            if key_counts[key] < required_per_key:
                key_counts[key] += 1
                if not track_list_complete:
                    _track_list.add_track_id(extracted_track_features['id'])
                store_extracted_features(output_dir, extracted_track_features)
                total += 1
                _track_list.dump(output_dir)
                if finished():
                    break
        perc = 100 * (total / n)
        spinner.start(f'Listing tracks ({perc:.2f}%)')

    spinner.stop()
    _track_list.dump(output_dir)
    if list_dir:
        _track_list.dump(list_dir)
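# Worked example of the key bucketing used above: Spotify's audio features report a
# pitch class `key` in 0-11 (0 = C) and a `mode` flag (1 = major, 0 = minor), and
# `key + mode * 12` folds both into one class index in 0-23.
def _example_key_class(key, mode):
    # _example_key_class(0, 1) == 12 (C major); _example_key_class(9, 0) == 9 (A minor)
    return key + mode * 12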
def create_track_list(track_ids) -> TrackList:
    """Wrap a plain list of track ids in a TrackList."""
    _track_list = TrackList()
    _track_list.set_track_ids(track_ids)
    return _track_list
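# Example usage (hypothetical): rebuild a missing track_list.pickle from the feature
# pickles already on disk.
#
#     create_track_list(get_datapoint_ids(output_dir, AUDIO_FEATURES)).dump(output_dir)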
if __name__ == '__main__':
    args = get_args()
    if args.command == 'list':
        track_list = None
        if args.use_list:
            track_list = TrackList.load(args.use_list)
            args.N = track_list.get_desired_tracks_amount()
        else:
            output_dir = (Path(getcwd()) / args.output_dir).absolute()
            if len(list(Path(get_data_dir(args.output_dir, AUDIO_FEATURES)).glob('*.pickle'))) > 0:
                print(f'there are already audio features downloaded and stored in the output '
                      f'directory ({output_dir}). Either provide the path to a track_list.pickle '
                      f'with --use-list or make sure that {output_dir} is empty')
                exit()
        list_tracks(args.mpl_dir, args.output_dir, args.N, args.list_dir, track_list)
    elif args.command == 'fetch':
        fetch(args.output_dir)
    elif args.command == 'check':
        check(args.output_dir)
    elif args.command == 'count':
        count(args.output_dir)