Example no. 1
0
    def _load(self, path):
        """
        Load the corpus located at ``path`` and return it.

        Reads the tab-separated meta file (idx, speaker, language,
        transcript per record), registers one audio file and one utterance
        per record, creates each speaker on first sight, and attaches the
        transcript as a raw word-transcript label list.
        """
        corpus = audiomate.Corpus(path=path)

        meta_file = os.path.join(path, META_FILENAME)
        records = textfile.read_separated_lines_generator(meta_file,
                                                          separator='\t',
                                                          max_columns=4)

        for record in records:
            idx = record[0]
            speaker_idx = record[1]
            language = record[2]
            transcript = record[3]

            # Audio files are grouped per language under audio/<language>/.
            file_path = os.path.join(path, 'audio', language,
                                     '{}.mp3'.format(idx))
            corpus.new_file(file_path, idx)

            # Membership test directly on the mapping — no need for .keys().
            if speaker_idx not in corpus.issuers:
                issuer = assets.Speaker(speaker_idx)
                corpus.import_issuers(issuer)

            utterance = corpus.new_utterance(idx, idx, speaker_idx)
            utterance.set_label_list(
                assets.LabelList.create_single(
                    transcript, idx=audiomate.corpus.LL_WORD_TRANSCRIPT_RAW))

        return corpus
Example no. 2
0
    def read_issuers(file_path, corpus):
        """
        Read issuer definitions from the JSON file at ``file_path`` and
        import them into ``corpus``.

        Each entry's ``type`` field selects the issuer class ('speaker',
        'artist', or a plain issuer otherwise). Missing fields fall back to
        unknown/None defaults. Does nothing if the file does not exist.
        """
        if not os.path.isfile(file_path):
            return

        for idx, entry in jsonfile.read_json_file(file_path).items():
            kind = entry.get('type', None)
            extra_info = entry.get('info', {})

            if kind == 'speaker':
                issuer = assets.Speaker(
                    idx,
                    gender=assets.Gender(
                        entry.get('gender', 'unknown').lower()),
                    age_group=assets.AgeGroup(
                        entry.get('age_group', 'unknown').lower()),
                    native_language=entry.get('native_language', None),
                    info=extra_info)
            elif kind == 'artist':
                issuer = assets.Artist(idx,
                                       name=entry.get('name', None),
                                       info=extra_info)
            else:
                issuer = assets.Issuer(idx, info=extra_info)

            corpus.import_issuers(issuer)
Example no. 3
0
    def load_subset(corpus, path, subset_idx):
        """
        Load the Common-Voice subset named ``subset_idx`` into ``corpus``.

        Reads ``<subset_idx>.csv`` (skipping the header line), creates a
        file, speaker and utterance per row, and finally registers a
        subview over all utterances of this subset.
        """
        csv_file = os.path.join(path, '{}.csv'.format(subset_idx))
        utt_ids = []

        for entry in textfile.read_separated_lines_generator(
                csv_file,
                separator=',',
                max_columns=8,
                ignore_lines_starting_with=['filename']):
            rel_file_path = entry[0]
            filename = os.path.split(rel_file_path)[1]
            basename = os.path.splitext(filename)[0]
            transcription = entry[1]
            age = CommonVoiceReader.map_age(entry[4])
            gender = CommonVoiceReader.map_gender(entry[5])

            # Prefix with the subset name so ids are unique across subsets.
            idx = '{}-{}'.format(subset_idx, basename)
            file_path = os.path.join(path, rel_file_path)

            corpus.new_file(file_path, idx)
            issuer = assets.Speaker(idx, gender=gender, age_group=age)
            corpus.import_issuers(issuer)
            utterance = corpus.new_utterance(idx, idx, issuer.idx)
            utterance.set_label_list(
                assets.LabelList.create_single(
                    transcription, idx=audiomate.corpus.LL_WORD_TRANSCRIPT))

            utt_ids.append(idx)

        # Renamed from ``filter`` to avoid shadowing the builtin.
        idx_filter = subset.MatchingUtteranceIdxFilter(
            utterance_idxs=set(utt_ids))
        subview = subset.Subview(corpus, filter_criteria=[idx_filter])
        corpus.import_subview(subset_idx, subview)
Example no. 4
0
    def load_file(folder_path, idx, corpus):
        """
        Load speaker, file, utterance and labels for the file with the
        given id and add them to ``corpus``.

        Parses ``<idx>.xml`` for the transcription and speaker metadata,
        then registers the matching WAV file, the speaker (on first sight)
        and an utterance carrying cleaned and raw transcript label lists.
        """
        xml_path = os.path.join(folder_path, '{}.xml'.format(idx))
        wav_path = os.path.join(folder_path,
                                '{}{}.wav'.format(idx, WAV_SUFFIX))

        # Use a context manager so the XML file handle is always closed
        # (the previous version leaked it).
        with open(xml_path, 'r', encoding='utf-8') as xml_file:
            soup = BeautifulSoup(xml_file, 'lxml')

        transcription = soup.recording.cleaned_sentence.string
        transcription_raw = soup.recording.sentence.string
        gender = soup.recording.gender.string
        is_native = soup.recording.muttersprachler.string
        age_class = soup.recording.ageclass.string
        speaker_idx = soup.recording.speaker_id.string

        if speaker_idx not in corpus.issuers.keys():
            # Age classes look like "18-25"; classify by the lower bound.
            start_age_class = int(age_class.split('-')[0])

            if start_age_class < 12:
                age_group = assets.AgeGroup.CHILD
            elif start_age_class < 18:
                age_group = assets.AgeGroup.YOUTH
            elif start_age_class < 65:
                age_group = assets.AgeGroup.ADULT
            else:
                age_group = assets.AgeGroup.SENIOR

            native_lang = None

            # 'Ja' (German for yes) marks a native German speaker.
            if is_native == 'Ja':
                native_lang = 'deu'

            issuer = assets.Speaker(speaker_idx,
                                    gender=assets.Gender(gender),
                                    age_group=age_group,
                                    native_language=native_lang)
            corpus.import_issuers(issuer)

        corpus.new_file(wav_path, idx)
        utt = corpus.new_utterance(idx, idx, speaker_idx)
        utt.set_label_list(
            assets.LabelList.create_single(
                transcription, idx=audiomate.corpus.LL_WORD_TRANSCRIPT))
        utt.set_label_list(
            assets.LabelList.create_single(
                transcription_raw,
                idx=audiomate.corpus.LL_WORD_TRANSCRIPT_RAW))
Example no. 5
0
    def _create_or_get_speech_issuer(corpus, file_idx, annotations):
        """
        Create and import a speaker for ``file_idx`` based on its
        annotation record.

        Returns ``file_idx`` (used as the issuer idx) or ``None`` when no
        annotation exists for the file.
        """
        if file_idx not in annotations:
            return None

        issuer = assets.Speaker(file_idx)

        # The first annotation element encodes gender ('m'/'f'); any other
        # value leaves the speaker's default gender untouched.
        # (Removed the redundant re-check of ``file_idx in annotations`` —
        # the guard clause above already guarantees membership.)
        gender_code = annotations[file_idx][0]
        if gender_code == 'm':
            issuer.gender = assets.Gender.MALE
        elif gender_code == 'f':
            issuer.gender = assets.Gender.FEMALE

        corpus.import_issuers(issuer)

        return file_idx
Example no. 6
0
    def parse_speaker_info(readme_path):
        """
        Parse speaker info from a README file.

        Returns an ``assets.Speaker`` with the parsed idx, gender,
        age group and native language (unknown/None where the README
        gives no usable value).
        """
        idx = None
        gender = assets.Gender.UNKNOWN
        age_group = assets.AgeGroup.UNKNOWN
        native_lang = None

        with open(readme_path, 'r', errors='ignore') as f:
            for raw_line in f:
                line = raw_line.strip()

                # ``strip`` never returns None, and identity comparison
                # with a string literal (``is not ''``) is implementation
                # dependent — use a plain truthiness check instead.
                if line:
                    line = line.rstrip(';.')
                    parts = line.split(':', maxsplit=1)

                    if len(parts) > 1:
                        key = parts[0].strip().lower()
                        value = parts[1].strip()

                        # Keep the original casing only for the user name.
                        if key == 'user name':
                            idx = value

                        value = value.lower()

                        if key == 'gender':
                            if value in ['männlich', 'male', 'mnnlich']:
                                gender = assets.Gender.MALE
                            elif value in ['weiblich', 'female', '[female]']:
                                gender = assets.Gender.FEMALE

                        if key == 'age range':
                            if value in ['erwachsener', 'adult', '[adult]', '[erwachsener]']:
                                age_group = assets.AgeGroup.ADULT
                            elif value in ['senior', '[senior']:
                                age_group = assets.AgeGroup.SENIOR
                            elif value in ['youth', 'jugendlicher', '[youth]', '[jugendlicher]']:
                                age_group = assets.AgeGroup.YOUTH
                            elif value in ['kind', 'child']:
                                age_group = assets.AgeGroup.CHILD

                        if key == 'language':
                            if value in ['de', 'ger', 'deu', '[de]']:
                                native_lang = 'deu'
                            elif value in ['en', 'eng', '[en]']:
                                native_lang = 'eng'

        return assets.Speaker(idx, gender=gender, age_group=age_group, native_language=native_lang)
    def _load(self, path):
        """
        Build a corpus from the WAV recordings found under
        ``<path>/recordings``.

        Each filename encodes ``<digit>_<speaker...>_<take>``; the digit
        becomes the word transcript and the middle parts form the speaker
        id.
        """
        corpus = audiomate.Corpus(path=path)
        wav_pattern = os.path.join(path, 'recordings', '*.wav')

        for wav_file in glob.glob(wav_pattern):
            rec_idx = os.path.splitext(os.path.basename(wav_file))[0]
            corpus.new_file(wav_file, rec_idx)

            parts = rec_idx.split('_')
            spoken_digit = parts[0]
            spk_idx = '_'.join(parts[1:-1])

            if spk_idx not in corpus.issuers.keys():
                corpus.import_issuers(assets.Speaker(spk_idx))

            utt = corpus.new_utterance(rec_idx, rec_idx, spk_idx)
            utt.set_label_list(
                assets.LabelList.create_single(
                    str(spoken_digit),
                    idx=audiomate.corpus.LL_WORD_TRANSCRIPT))

        return corpus
Example no. 8
0
    def _load(self, path):
        """
        Load the TIMIT corpus located at ``path``.

        Walks TEST/TRAIN -> dialect region -> speaker directories. Each
        speaker directory's name starts with 'M' or 'F' (gender) followed
        by the speaker id. For every WAV a file and an utterance are
        created, carrying raw text, word and phone label lists. One
        subview per part (TEST/TRAIN) is registered.
        """
        corpus = audiomate.Corpus(path=path)

        def segment_label_list(records, ll_idx):
            # Records hold (start-sample, end-sample, value). TIMIT audio
            # is sampled at 16 kHz, hence the division to get seconds.
            ll = assets.LabelList(idx=ll_idx)
            for record in records:
                ll.append(assets.Label(record[2],
                                       start=int(record[0]) / 16000,
                                       end=int(record[1]) / 16000))
            return ll

        for part in ['TEST', 'TRAIN']:
            part_path = os.path.join(path, part)
            part_utt_ids = set()

            for region in os.listdir(part_path):
                region_path = os.path.join(part_path, region)

                # Skip stray files at the region level.
                if not os.path.isdir(region_path):
                    continue

                for speaker_abbr in os.listdir(region_path):
                    speaker_path = os.path.join(region_path, speaker_abbr)
                    speaker_idx = speaker_abbr[1:]

                    if speaker_idx not in corpus.issuers.keys():
                        issuer = assets.Speaker(speaker_idx)

                        if speaker_abbr[:1] == 'M':
                            issuer.gender = assets.Gender.MALE
                        elif speaker_abbr[:1] == 'F':
                            issuer.gender = assets.Gender.FEMALE

                        corpus.import_issuers(issuer)

                    for wav_path in glob.glob(
                            os.path.join(speaker_path, '*.WAV')):
                        sentence_idx = os.path.splitext(
                            os.path.basename(wav_path))[0]
                        utt_idx = '{}-{}-{}'.format(
                            region, speaker_abbr, sentence_idx).lower()
                        part_utt_ids.add(utt_idx)

                        # The .TXT line is "<start> <end> <sentence>" — take
                        # the sentence of the first (only) line.
                        raw_text_path = os.path.join(
                            speaker_path, '{}.TXT'.format(sentence_idx))
                        raw_text = textfile.read_separated_lines(
                            raw_text_path, separator=' ',
                            max_columns=3)[0][2]

                        words = textfile.read_separated_lines(
                            os.path.join(speaker_path,
                                         '{}.WRD'.format(sentence_idx)),
                            separator=' ', max_columns=3)
                        phones = textfile.read_separated_lines(
                            os.path.join(speaker_path,
                                         '{}.PHN'.format(sentence_idx)),
                            separator=' ', max_columns=3)

                        corpus.new_file(wav_path, utt_idx)
                        utt = corpus.new_utterance(utt_idx, utt_idx,
                                                   speaker_idx)

                        utt.set_label_list(assets.LabelList.create_single(
                            raw_text,
                            idx=audiomate.corpus.LL_WORD_TRANSCRIPT_RAW))

                        # Word and phone segments share the same format —
                        # build both via the helper above.
                        utt.set_label_list(segment_label_list(
                            words, audiomate.corpus.LL_WORD_TRANSCRIPT))
                        utt.set_label_list(segment_label_list(
                            phones, audiomate.corpus.LL_PHONE_TRANSCRIPT))

            # Renamed from ``filter`` to avoid shadowing the builtin.
            utt_filter = subset.MatchingUtteranceIdxFilter(
                utterance_idxs=part_utt_ids)
            subview = subset.Subview(corpus, filter_criteria=[utt_filter])
            corpus.import_subview(part, subview)

        return corpus
Example no. 9
0
def create_dataset():
    """
    Build a small in-memory sample corpus for testing.

    Creates four files, three issuers (two speakers, one plain issuer),
    five utterances with word transcripts, 'train'/'dev' subviews and two
    feature containers, all rooted in a fresh temporary directory.
    """
    ds = audiomate.Corpus(tempfile.mkdtemp())

    # File ids deliberately mix '-' and '_' separators.
    tracked_files = []
    for number, file_idx in enumerate(['wav-1', 'wav_2', 'wav_3', 'wav_4'],
                                      start=1):
        wav_path = sample_wav_file('wav_{}.wav'.format(number))
        tracked_files.append(ds.new_file(wav_path, file_idx=file_idx))

    speaker_a = assets.Speaker('spk-1', gender=assets.Gender.MALE)
    speaker_b = assets.Speaker('spk-2', gender=assets.Gender.FEMALE)
    plain_issuer = assets.Issuer('spk-3')
    ds.import_issuers([speaker_a, speaker_b, plain_issuer])

    # (utt-id, file, issuer, start, end) — None start/end means whole file.
    utt_specs = [
        ('utt-1', tracked_files[0], speaker_a, None, None),
        ('utt-2', tracked_files[1], speaker_a, None, None),
        ('utt-3', tracked_files[2], speaker_b, 0, 1.5),
        ('utt-4', tracked_files[2], speaker_b, 1.5, 2.5),
        ('utt-5', tracked_files[3], plain_issuer, None, None),
    ]

    utterances = []
    for utt_idx, rec, issuer, start, end in utt_specs:
        if start is None:
            utt = ds.new_utterance(utt_idx, rec.idx, issuer_idx=issuer.idx)
        else:
            utt = ds.new_utterance(utt_idx, rec.idx, issuer_idx=issuer.idx,
                                   start=start, end=end)
        utterances.append(utt)

    transcripts = ['who am i', 'who are you', 'who is he',
                   'who are they', 'who is she']
    # Only utt-2's label carries meta data.
    metas = [None, {'a': 'hey', 'b': 2}, None, None, None]

    for utt, text, meta in zip(utterances, transcripts, metas):
        if meta is None:
            label = assets.Label(text)
        else:
            label = assets.Label(text, meta=meta)
        utt.set_label_list(
            assets.LabelList(audiomate.corpus.LL_WORD_TRANSCRIPT,
                             labels=[label]))

    for name, idx_set in [('train', {'utt-1', 'utt-2', 'utt-3'}),
                          ('dev', {'utt-4', 'utt-5'})]:
        criterion = subview.MatchingUtteranceIdxFilter(
            utterance_idxs=idx_set)
        ds.import_subview(name,
                          subview.Subview(ds, filter_criteria=[criterion]))

    ds.new_feature_container('mfcc', '/some/dummy/path')
    ds.new_feature_container('mel', '/some/dummy/path_mel')

    return ds