def _load(self, path):
    """Read the ESC-50 corpus at ``path`` and build per-fold and 'esc-10' subviews."""
    corpus = audiomate.Corpus(path=path)
    records = ESC50Reader.load_meta_data(path)

    fold_map = collections.defaultdict(list)
    esc10_ids = []

    for entry in records:
        audio_name = entry[0]
        utt_idx = os.path.splitext(audio_name)[0]
        audio_path = os.path.abspath(os.path.join(path, 'audio', audio_name))

        corpus.new_file(audio_path, utt_idx)
        utterance = corpus.new_utterance(utt_idx, utt_idx)
        utterance.set_label_list(annotations.LabelList.create_single(
            entry[3], idx=audiomate.corpus.LL_SOUND_CLASS))

        fold_map['fold-{}'.format(entry[1])].append(utt_idx)

        # The esc10 column is a textual boolean in the metadata file.
        if entry[4] == 'True':
            esc10_ids.append(utt_idx)

    for fold_name, fold_ids in fold_map.items():
        criteria = subset.MatchingUtteranceIdxFilter(utterance_idxs=set(fold_ids))
        corpus.import_subview(
            fold_name, subset.Subview(corpus, filter_criteria=[criteria]))

    esc10_criteria = subset.MatchingUtteranceIdxFilter(utterance_idxs=set(esc10_ids))
    corpus.import_subview(
        'esc-10', subset.Subview(corpus, filter_criteria=[esc10_criteria]))

    return corpus
def prepare_corpus(corpus, name):
    """Replace all subviews of ``corpus`` with a train/dev/test split for ``name``."""
    if name == 'common_voice':
        too_long = set()
    else:
        print(' - {}: Find utterances that are too long'.format(name))
        too_long = utts_too_long(corpus)

    if name == 'mailabs':
        # we only use mailabs for training
        # since we don't know the speakers
        train_utts = set(corpus.utterances.keys()) - too_long
        dev_utts = set()
        test_utts = set()
    elif name == 'tuda':
        # we only use kinect-raw files
        # otherwise sentence of the tuda would occur multiple times
        # in contrast to other datasets
        train_utts = set(corpus.subviews['train_kinect-raw'].utterances.keys()) - too_long
        dev_utts = set(corpus.subviews['dev_kinect-raw'].utterances.keys())
        test_utts = set(corpus.subviews['test_kinect-raw'].utterances.keys())
    elif name == 'common_voice':
        train_utts = set(corpus.subviews['train'].utterances.keys()) - too_long
        dev_utts = set(corpus.subviews['dev'].utterances.keys())
        test_utts = set(corpus.subviews['test'].utterances.keys())
    else:
        # No predefined split: filter out over-long utterances and split randomly.
        keep_filter = subset.MatchingUtteranceIdxFilter(too_long, inverse=True)
        usable = subset.Subview(corpus, filter_criteria=[keep_filter])
        train, dev, test = create_train_dev_test(usable)
        train_utts = set(train.utterances.keys())
        dev_utts = set(dev.utterances.keys())
        test_utts = set(test.utterances.keys())

    # Drop every existing subview before installing the new split.
    for existing_name in list(corpus.subviews.keys()):
        del corpus.subviews[existing_name]

    # Add new subviews
    for part_name, part_utts in (('train', train_utts),
                                 ('dev', dev_utts),
                                 ('test', test_utts)):
        part_filter = subset.MatchingUtteranceIdxFilter(part_utts)
        part_subview = subset.Subview(corpus, filter_criteria=[part_filter])
        corpus.import_subview(part_name, part_subview)
def _load(self, path):
    """Load the UrbanSound8K corpus and create one subview per fold."""
    corpus = audiomate.Corpus(path=path)

    meta_path = os.path.join(path, 'metadata', 'UrbanSound8K.csv')
    # Skip the CSV header row.
    records = textfile.read_separated_lines(
        meta_path, separator=',', max_columns=8)[1:]

    fold_map = collections.defaultdict(set)

    for entry in records:
        audio_name = entry[0]
        fold = entry[5]
        label = entry[7]

        audio_path = os.path.join(
            path, 'audio', 'fold{}'.format(fold), audio_name)

        # Only register files that actually exist on disk.
        if not os.path.isfile(audio_path):
            continue

        utt_idx = os.path.splitext(audio_name)[0]
        corpus.new_file(audio_path, utt_idx)
        utterance = corpus.new_utterance(utt_idx, utt_idx)
        utterance.set_label_list(
            annotations.LabelList.create_single(
                label, idx=audiomate.corpus.LL_SOUND_CLASS))
        fold_map['fold{}'.format(fold)].add(utt_idx)

    for fold_name, fold_utt_ids in fold_map.items():
        criteria = subset.MatchingUtteranceIdxFilter(
            utterance_idxs=fold_utt_ids)
        corpus.import_subview(
            fold_name, subset.Subview(corpus, filter_criteria=[criteria]))

    return corpus
def load_subset(corpus, path, subset_idx):
    """
    Load the subset named ``subset_idx`` (read from ``<subset_idx>.tsv``)
    into ``corpus`` and register it as a subview of the same name.
    """
    csv_file = os.path.join(path, '{}.tsv'.format(subset_idx))
    subset_utt_ids = []

    entries = textfile.read_separated_lines_generator(
        csv_file,
        separator='\t',
        max_columns=8,
        ignore_lines_starting_with=['client_id'],
        keep_empty=True
    )

    for entry in entries:
        file_idx = CommonVoiceReader.create_assets_if_needed(
            corpus, path, entry
        )
        subset_utt_ids.append(file_idx)

    # Renamed from ``filter`` to avoid shadowing the builtin.
    utt_filter = subset.MatchingUtteranceIdxFilter(
        utterance_idxs=set(subset_utt_ids))
    subview = subset.Subview(corpus, filter_criteria=[utt_filter])
    corpus.import_subview(subset_idx, subview)
def load_tag(self, corpus, path):
    """
    Iterate over all speakers and load them. Collect all utterance-idx
    and create a subset of them, named after the tag folder's basename.
    """
    tag_idx = os.path.basename(path)
    data_path = os.path.join(path, 'by_book')
    tag_utt_ids = []

    for gender_path in MailabsReader.get_folders(data_path):
        # IN MIX FOLDERS THERE ARE NO SPEAKERS
        # HANDLE EVERY UTT AS DIFFERENT ISSUER
        if os.path.basename(gender_path) == 'mix':
            utt_ids = self.load_books_of_speaker(corpus, gender_path, None)
            tag_utt_ids.extend(utt_ids)
        else:
            for speaker_path in MailabsReader.get_folders(gender_path):
                speaker = MailabsReader.load_speaker(corpus, speaker_path)
                utt_ids = self.load_books_of_speaker(
                    corpus, speaker_path, speaker)
                tag_utt_ids.extend(utt_ids)

    # Renamed from ``filter`` to avoid shadowing the builtin.
    tag_filter = subset.MatchingUtteranceIdxFilter(
        utterance_idxs=set(tag_utt_ids))
    subview = subset.Subview(corpus, filter_criteria=[tag_filter])
    corpus.import_subview(tag_idx, subview)
def _load(self, path):
    """Load the SWC corpus, then drop the known-invalid utterances."""
    corpus = super(SWCReader, self)._load(path)

    utt_filter = subset.MatchingUtteranceIdxFilter(
        utterance_idxs=set(INVALID_UTT_IDS),
        inverse=True)
    # Pass the filter inside a list: ``filter_criteria`` is an iterable of
    # filters everywhere else in this codebase; a bare filter object here
    # is at best inconsistent and at worst not iterable.
    filtered = subset.Subview(corpus, filter_criteria=[utt_filter])

    return audiomate.Corpus.from_corpus(filtered)
def load_subset(corpus, path, subset_idx):
    """
    Load the subset named ``subset_idx`` (read from ``<subset_idx>.csv``)
    into ``corpus``, creating one speaker per utterance, and register the
    utterances as a subview of the same name.
    """
    csv_file = os.path.join(path, '{}.csv'.format(subset_idx))
    utt_ids = []

    for entry in textfile.read_separated_lines_generator(
            csv_file, separator=',', max_columns=8,
            ignore_lines_starting_with=['filename']):
        rel_file_path = entry[0]
        filename = os.path.split(rel_file_path)[1]
        basename = os.path.splitext(filename)[0]
        transcription = entry[1]
        age = CommonVoiceReader.map_age(entry[4])
        gender = CommonVoiceReader.map_gender(entry[5])

        # Prefix with the subset name so ids are unique across subsets.
        idx = '{}-{}'.format(subset_idx, basename)
        file_path = os.path.join(path, rel_file_path)

        corpus.new_file(file_path, idx)
        issuer = assets.Speaker(idx, gender=gender, age_group=age)
        corpus.import_issuers(issuer)
        utterance = corpus.new_utterance(idx, idx, issuer.idx)
        utterance.set_label_list(
            assets.LabelList.create_single(
                transcription,
                idx=audiomate.corpus.LL_WORD_TRANSCRIPT))

        utt_ids.append(idx)

    # Renamed from ``filter`` to avoid shadowing the builtin.
    utt_filter = subset.MatchingUtteranceIdxFilter(utterance_idxs=set(utt_ids))
    subview = subset.Subview(corpus, filter_criteria=[utt_filter])
    corpus.import_subview(subset_idx, subview)
def run(corpus, filters):
    """Build a subview over ``corpus`` and touch each of its computed properties once."""
    view = subset.Subview(corpus, filters)

    for attribute in ('utterances', 'issuers', 'tracks',
                      'num_utterances', 'num_issuers', 'num_tracks'):
        getattr(view, attribute)
def _load(self, path):
    """Load the AED corpus, creating 'test' and 'train' subviews from the matching folders."""
    corpus = audiomate.Corpus(path=path)

    parts = {
        'test': AEDReader.load_folder(os.path.join(path, 'test'), corpus),
        'train': AEDReader.load_folder(os.path.join(path, 'train'), corpus),
    }

    for part_name, utt_ids in parts.items():
        part_filter = subset.MatchingUtteranceIdxFilter(utterance_idxs=utt_ids)
        part_view = subset.Subview(corpus, filter_criteria=[part_filter])
        corpus.import_subview(part_name, part_view)

    return corpus
def test_init_with_corpus_view(self):
    """A DataIterator over a subview exposes exactly the subview's utterance ids."""
    corpus = resources.create_dataset()

    keep_ids = {'utt-1', 'utt-2', 'utt-4'}
    subview = subset.Subview(corpus, filter_criteria=[
        subset.MatchingUtteranceIdxFilter(utterance_idxs=keep_ids),
    ])

    iterator = feeding.DataIterator(subview, [containers.Container('blub')])

    assert set(iterator.utt_ids) == set(subview.utterances.keys())
def load_part(base_path, part_name, corpus, speakers):
    """
    Load one part of the corpus from ``data/<part_name>_data.csv`` and
    register its utterances as a subview named ``part_name``.

    Each row provides the audio path, speaker, transcription and three
    optional semantic slots (action / object / location).
    """
    part_file_path = os.path.join(
        base_path, 'data', '{}_data.csv'.format(part_name))
    entries = textfile.read_separated_lines_generator(
        part_file_path,
        separator=',',
        max_columns=7,
        ignore_lines_starting_with=[','])

    part_ids = []

    for entry in entries:
        file_path = entry[1]
        file_base = os.path.basename(file_path)
        idx = os.path.splitext(file_base)[0]
        speaker_idx = entry[2]

        part_ids.append(idx)

        if speaker_idx not in corpus.issuers.keys():
            corpus.import_issuers(speakers[speaker_idx])

        track = corpus.new_file(os.path.join(base_path, file_path), idx)
        utt = corpus.new_utterance(idx, track.idx, speaker_idx)

        transcription = annotations.LabelList.create_single(
            entry[3], idx=audiomate.corpus.LL_WORD_TRANSCRIPT)
        utt.set_label_list(transcription)

        # Optional semantic annotations; the literal 'none' marks an
        # absent value. Deduplicated from three near-identical blocks.
        for column, ll_idx in ((4, 'action'), (5, 'object'), (6, 'location')):
            if entry[column] != 'none':
                utt.set_label_list(
                    annotations.LabelList.create_single(
                        entry[column], idx=ll_idx))

    # Renamed from ``filter`` to avoid shadowing the builtin.
    utt_filter = subset.MatchingUtteranceIdxFilter(
        utterance_idxs=set(part_ids))
    subview = subset.Subview(corpus, filter_criteria=[utt_filter])
    corpus.import_subview(part_name, subview)
def _load(self, path):
    """
    Load LibriSpeech: import speakers, register each utterance with its
    transcript, and create one subview per subset.
    """
    corpus = audiomate.Corpus(path=path)

    speaker_info_path = os.path.join(path, 'SPEAKERS.TXT')
    speakers = LibriSpeechReader.load_speakers(speaker_info_path)

    sf = LibriSpeechReader.available_subfolders

    for subset_idx, subset_path in sf(path, SUBSETS.keys()).items():
        subset_utt_ids = set()

        for speaker_idx, speaker_path in sf(subset_path).items():
            corpus.import_issuers(speakers[speaker_idx])

            for chapter_idx, chapter_path in sf(speaker_path).items():
                transcript_path = os.path.join(
                    chapter_path,
                    '{}-{}.trans.txt'.format(speaker_idx, chapter_idx)
                )
                transcripts = LibriSpeechReader.load_transcripts(transcript_path)

                for utt_idx, transcript in transcripts.items():
                    file_path = os.path.join(
                        chapter_path, '{}.flac'.format(utt_idx))
                    corpus.new_file(file_path, utt_idx)
                    utterance = corpus.new_utterance(
                        utt_idx, utt_idx, speaker_idx
                    )
                    utterance.set_label_list(
                        annotations.LabelList.create_single(
                            transcript,
                            idx=audiomate.corpus.LL_WORD_TRANSCRIPT
                        )
                    )
                    subset_utt_ids.add(utt_idx)

        # ``subset_utt_ids`` is already a set — no redundant copy.
        # Renamed from ``filter`` to avoid shadowing the builtin.
        utt_filter = subset.MatchingUtteranceIdxFilter(
            utterance_idxs=subset_utt_ids)
        subview = subset.Subview(corpus, filter_criteria=[utt_filter])
        corpus.import_subview(subset_idx, subview)

    return corpus
def test_init_with_corpus_view(self, tmpdir):
    """A Dataset over a subview lists only the subview's utterance ids, in order."""
    container = containers.Container(os.path.join(tmpdir.strpath, 'test.h5'))
    container.open()

    for index in range(1, 6):
        container.set('utt-{}'.format(index), data=np.arange(20))

    corpus = resources.create_dataset()
    subview = subset.Subview(corpus, filter_criteria=[
        subset.MatchingUtteranceIdxFilter(
            utterance_idxs={'utt-1', 'utt-2', 'utt-4'}),
    ])

    ds = feeding.Dataset(subview, [container])

    assert ds.utt_ids == ['utt-1', 'utt-2', 'utt-4']
def _load(self, path):
    """Load the TIMIT corpus: speakers, utterances and raw/word/phone transcripts.

    Walks TEST/TRAIN -> dialect region -> speaker -> sentence files and
    creates one subview per part.
    """
    corpus = audiomate.Corpus(path=path)

    for part in ['TEST', 'TRAIN']:
        part_path = os.path.join(path, part)
        part_utt_ids = set()

        for region in os.listdir(part_path):
            region_path = os.path.join(part_path, region)

            if os.path.isdir(region_path):
                for speaker_abbr in os.listdir(region_path):
                    speaker_path = os.path.join(region_path, speaker_abbr)
                    # Folder name is gender letter + speaker id (e.g. 'MABC0').
                    speaker_idx = speaker_abbr[1:]

                    if speaker_idx not in corpus.issuers.keys():
                        issuer = assets.Speaker(speaker_idx)

                        # First character of the folder encodes the gender.
                        if speaker_abbr[:1] == 'M':
                            issuer.gender = assets.Gender.MALE
                        elif speaker_abbr[:1] == 'F':
                            issuer.gender = assets.Gender.FEMALE

                        corpus.import_issuers(issuer)

                    for wav_path in glob.glob(
                            os.path.join(speaker_path, '*.WAV')):
                        sentence_idx = os.path.splitext(
                            os.path.basename(wav_path))[0]
                        utt_idx = '{}-{}-{}'.format(
                            region, speaker_abbr, sentence_idx).lower()
                        part_utt_ids.add(utt_idx)

                        # .TXT holds the raw sentence text after two
                        # leading sample-offset columns.
                        raw_text_path = os.path.join(
                            speaker_path, '{}.TXT'.format(sentence_idx))
                        raw_text = textfile.read_separated_lines(
                            raw_text_path, separator=' ', max_columns=3)[0][2]

                        # .WRD / .PHN: start-sample, end-sample, token.
                        words_path = os.path.join(
                            speaker_path, '{}.WRD'.format(sentence_idx))
                        words = textfile.read_separated_lines(
                            words_path, separator=' ', max_columns=3)

                        phones_path = os.path.join(
                            speaker_path, '{}.PHN'.format(sentence_idx))
                        phones = textfile.read_separated_lines(
                            phones_path, separator=' ', max_columns=3)

                        corpus.new_file(wav_path, utt_idx)
                        utt = corpus.new_utterance(utt_idx, utt_idx, speaker_idx)

                        raw_ll = assets.LabelList.create_single(
                            raw_text, idx=audiomate.corpus.LL_WORD_TRANSCRIPT_RAW)
                        utt.set_label_list(raw_ll)

                        word_ll = assets.LabelList(
                            idx=audiomate.corpus.LL_WORD_TRANSCRIPT)

                        for record in words:
                            # Sample offsets -> seconds (16 kHz sample rate).
                            start = int(record[0]) / 16000
                            end = int(record[1]) / 16000
                            word_ll.append(
                                assets.Label(record[2], start=start, end=end))

                        utt.set_label_list(word_ll)

                        phone_ll = assets.LabelList(
                            idx=audiomate.corpus.LL_PHONE_TRANSCRIPT)

                        for record in phones:
                            start = int(record[0]) / 16000
                            end = int(record[1]) / 16000
                            phone_ll.append(
                                assets.Label(record[2],
                                             start=start, end=end))

                        utt.set_label_list(phone_ll)

        filter = subset.MatchingUtteranceIdxFilter(
            utterance_idxs=part_utt_ids)
        subview = subset.Subview(corpus, filter_criteria=[filter])
        corpus.import_subview(part, subview)

    return corpus
def run(download_folder, output_folder):
    """
    Load the individual speech corpora from ``download_folder``, split each
    into train/dev/test, merge them into one corpus and save the result
    at ``output_folder``.
    """
    corpora_names = [
        ('common_voice', 'common-voice'),
        ('mailabs', 'mailabs'),
        ('swc', 'swc'),
        ('tuda', 'tuda'),
        ('voxforge', 'voxforge'),
    ]

    print('Load corpora')
    corpora = {}

    for name, reader_type in corpora_names:
        print(' - {} ...'.format(name))
        full_path = os.path.join(download_folder, name)
        c = audiomate.Corpus.load(full_path, reader=reader_type)
        corpora[name] = c

    print('Create Train/Dev/Test - if not already exist')

    for name, corpus in corpora.items():
        prepare_corpus(corpus, name)

    print('Insert full subviews')

    #
    # Insert subviews containing all utterances
    # so we have a reference when merged
    #
    for name, corpus in corpora.items():
        all_utts = set(corpus.utterances.keys())
        full_filter = subset.MatchingUtteranceIdxFilter(all_utts)
        full_subview = subset.Subview(corpus, filter_criteria=[full_filter])
        corpus.import_subview('full', full_subview)

    print('Suffix subviews')

    #
    # Suffix subviews to have the correct names when merging
    #
    for name, corpus in corpora.items():
        print(' - {} ...'.format(name))
        original_subview_names = list(corpus.subviews.keys())

        for subview_name in original_subview_names:
            new_subview_name = '{}_{}'.format(subview_name, name)
            corpus.subviews[new_subview_name] = corpus.subviews[subview_name]
            del corpus.subviews[subview_name]

    print('Merge corpora ...')
    full_corpus = audiomate.Corpus.merge_corpora(list(corpora.values()))

    print('Create merged train/test/dev subviews ...')

    for part in ['train', 'dev', 'test']:
        utt_ids = set()

        for name, corpus in corpora.items():
            sv = full_corpus.subviews['{}_{}'.format(part, name)]
            utt_ids.update(sv.utterances.keys())

        part_filter = subset.MatchingUtteranceIdxFilter(utt_ids)
        # BUG FIX: the merged subview must be built on ``full_corpus``,
        # not on the loop-leaked ``corpus`` from the inner iteration.
        part_subview = subset.Subview(full_corpus, filter_criteria=[part_filter])
        full_corpus.import_subview(part, part_subview)

    print('Save ...')
    os.makedirs(output_folder)
    full_corpus.save_at(output_folder)