Code example #1
    def _run_preparator(cls, args, preparator, output_dir=None):
        output_dir = ((
            cls._output_dir(args) if args.output_dir is None
            else os.path.abspath(os.path.join(args.output_dir, 'data')))
                      if output_dir is None else output_dir)
        preparator.log = utils.logger.get_log(
            os.path.join(output_dir, 'data_preparation.log'), args.verbose)

        # initialize corpus from raw with its preparator
        corpus = preparator.prepare(
            os.path.join(output_dir, 'wavs'),
            keep_short_utts=args.keep_short_utts)
        corpus.log = utils.logger.get_log(
            os.path.join(output_dir, 'data_validation.log'), args.verbose)

        # raise if the corpus is not in the correct abkhazia
        # format. Redirect the log to the preparator logger
        corpus.validate(njobs=args.njobs)

        # save the corpus to the output directory
        corpus.save(output_dir, no_wavs=True)

        # save the alignment
        if not args.no_alignment:
            alignment_file = os.path.join(output_dir, 'alignment.txt')
            utils.open_utf8(alignment_file, 'w').write(
                '\n'.join(
                    '{} {}'.format(k, ' '.join(str(v) for v in vv))
                    for k, v in sorted(preparator.alignment.items())
                    for vv in v)
                + '\n')
Code example #2
    def correct_dictionary(self):
        """Correct problems with the GlobalPhone Mandarin dictionary

        The corrections are completely ad hoc; the result is stored
        in a temporary file.

        """
        # the following words are in the dictionary but are not used
        # in the transcriptions, so they will be dropped
        words_to_drop = [u'#fragment#', u'#noise#', u'$', u'(', u')', u'SIL']

        # correct content
        correct_lines = []
        for line in open_utf8(self.dictionary, 'r').xreadlines():
            if all(
                [not (u'{' + word + u'}' in line) for word in words_to_drop]):
                line = line.replace(u'{lai2zhe3bu2ju4 }', u'{lai2zhe3bu2ju4}')
                correct_lines.append(line)

        # update self.dictionary with the corrected content
        fid, filename = tempfile.mkstemp()
        os.close(fid)
        with open_utf8(filename, 'w') as out:
            for line in correct_lines:
                out.write(line)

        self.dictionary = filename
        return True
Code example #3
File: language_model.py Project: syfengcuhk/abkhazia
    def create(self):
        """Initialize the recipe data in `self.recipe_dir`"""
        self.check_parameters()

        # setup phones
        self.a2k.setup_phones()
        self.a2k.setup_silences()
        self.a2k.setup_variants()

        # setup lexicon and text depending on word/phone ngram level
        text = self.a2k.setup_text()
        lm_text = os.path.join(self.a2k._local_path(), 'lm_text.txt')
        if self.level == 'word':
            shutil.copy(text, lm_text)
            self.a2k.setup_lexicon()
        else:  # phone level
            with utils.open_utf8(lm_text, 'w') as out:
                for k, v in sorted(self.corpus.phonemize_text().iteritems()):
                    out.write(u'{} {}\n'.format(k, v))
            self.a2k.setup_phone_lexicon()

        # setup data files common to both levels
        self.a2k.setup_kaldi_folders()
        self.a2k.setup_machine_specific_scripts()
        self._setup_prepare_lang_wpdpl()
Code example #4
File: arpa.py Project: colincwilson/abkhazia
    def load(cls, path):
        """Load an ARPA language model from the file `path`"""
        assert os.path.isfile(path)

        data = {}
        order = None
        for line in (l.strip() for l in utils.open_utf8(path, 'r') if l):
            if line.startswith('\\data\\'):
                order = 0
            elif line.startswith('\\end\\'):
                break
            elif line.startswith('\\') and line.endswith(':'):
                order = int(re.search('[0-9]+', line).group(0))
                if order not in data:
                    data[order] = {}
            elif line:
                if order == 0:  # still in \data\ section
                    pass
                elif order > 0:
                    line = line.split('\t')
                    prob = float(line[0])
                    ngram = tuple(line[1].split())
                    backoff = None if len(line) <= 2 else float(line[2])
                    data[order][ngram] = (prob, backoff)
                else:
                    raise IOError(
                        'unable to parse ARPA file line: {}'.format(line))
        return cls(data)
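
To make the parsing above concrete, here is a minimal sketch of the file format this loader expects and the structure it builds; the file contents are invented, and the class name ARPALanguageModel is taken from code example #28 below:

# build a tiny, hypothetical ARPA file: a \data\ header, one ngram
# section of tab-separated "logprob<TAB>ngram[<TAB>backoff]" lines,
# and the closing \end\ marker
with open('tiny.arpa', 'w') as fout:
    fout.write('\\data\\\n'
               'ngram 1=2\n'
               '\n'
               '\\1-grams:\n'
               '-0.301\thello\t-0.3\n'
               '-0.301\tworld\n'
               '\n'
               '\\end\\\n')

# ARPALanguageModel.load('tiny.arpa') (class name from code example #28,
# its import path is not shown on this page) would then build the model
# from the dict:
#   {1: {('hello',): (-0.301, -0.3), ('world',): (-0.301, None)}}
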
Code example #5
    def load_variants(path):
        """Return a list of variant symbols

        `path` is assumed to be a variants file, usually named 'variants.txt'.

        """
        return [line.strip() for line in utils.open_utf8(path, 'r')]
Code example #6
    def load_segments(path):
        """Return a dict of utterance ids mapped to (wav, tbegin, tend)
        and the set of all required wav files

        `path` is assumed to be a segments file, usually named
        'segments.txt'. If there is only one utterance per wav, tbegin
        and tend are None.

        Append the '.wav' extension to the segments wav-ids if they
        are missing.

        """
        def _wav_tuple(l):
            wav = l[0]
            if os.path.splitext(wav)[1] != '.wav':
                wav += '.wav'

            return ((wav, None, None) if len(l) == 1 else
                    (wav, float(l[1]), float(l[2])))

        lines = (line.strip().split() for line in utils.open_utf8(path, 'r'))
        segments = {line[0]: _wav_tuple(line[1:]) for line in lines}

        wavs = {w[0] for w in segments.values()}
        return segments, wavs
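
A quick illustration of the segments file layout this parser accepts and of the pair it returns; the utterance and wav names below are invented:

# hypothetical segments.txt content ('.wav' may be omitted from wav ids,
# and a wav holding a single utterance has no timestamps):
#
#   utt1 rec1 0.0 3.5
#   utt2 rec1 3.5 7.2
#   utt3 rec2
#
# load_segments would return the (segments, wavs) pair:
expected_segments = {
    'utt1': ('rec1.wav', 0.0, 3.5),
    'utt2': ('rec1.wav', 3.5, 7.2),
    'utt3': ('rec2.wav', None, None),
}
expected_wavs = {'rec1.wav', 'rec2.wav'}
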
Code example #7
    def load_silences(path):
        """Return a list of silence symbols

        `path` is assumed to be a silences file, usually named 'silences.txt'.

        """
        return [line.strip() for line in utils.open_utf8(path, 'r')]
Code example #8
File: arpa.py Project: syfengcuhk/abkhazia
    def save(self, path):
        """Save a language model to `path` in the ARPA format

        Do not write empty ngrams to the `path`

        """
        with utils.open_utf8(path, 'w') as fp:
            # write header
            fp.write('\n\\data\\\n')
            for order in range(1, self.order+1):
                size = len(self.ngrams[order])
                if size:
                    fp.write('ngram {}={}\n'.format(order, size))
            fp.write('\n')

            # write ngrams
            for order in range(1, self.order+1):
                if len(self.ngrams[order]):
                    fp.write('\\{}-grams:\n'.format(order))
                    for k, v in sorted(self.ngrams[order].items()):
                        ngram = ' '.join(k)
                        if v[1] is None:  # no backoff
                            fp.write(u'{}\t{}\n'.format(v[0], ngram))
                        else:
                            fp.write(u'{}\t{}\t{}\n'.format(v[0], ngram, v[1]))
                    fp.write('\n')
            fp.write('\\end\\\n')
Code example #9
    def make_lexicon(self):
        # parse dictionary lines
        words, transcripts = [], []
        for line in utils.open_utf8(self.dictionary, 'r').readlines():
            # suppress linebreaks (this does not take into account fancy
            # unicode linebreaks), see
            # http://stackoverflow.com/questions/3219014
            line = re.sub(u'\r\n?|\n', u'', line).split(u' ')
            # parse word
            word = self.strip_accolades(line[0])
            if u'{' in word or u'}' in word:
                raise RuntimeError('Bad formatting of word {}'.format(word))
            words.append(word)

            # parse phonetic transcription
            trs = self.strip_accolades(u' '.join(line[1:])).split(u' ')
            transcript = []
            for phone in trs:
                if phone[0] == u'{':
                    phn = phone[1:]
                    assert phn != u'WB', trs
                elif phone[-1] == u'}':
                    phn = phone[:-1]
                    assert phn == u'WB', trs
                else:
                    phn = phone
                    assert phn != u'WB', trs

                assert not (u'{' in phn), trs
                assert not (u'}' in phn), trs
                if phn != u'WB':
                    transcript.append(phn)
            transcripts.append(u' '.join(transcript))

        return dict(zip(words, transcripts))
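
To make the brace and WB handling concrete, here is a hypothetical GlobalPhone-style dictionary line (invented, not taken from the real data) and the lexicon entry the loop above would build from it:

# hypothetical (already corrected) dictionary line: the word and the
# transcription are each wrapped in braces, and the first and last
# phones carry a word-boundary 'WB' tag:
#
#   {hao3} {{h WB} a_3 {o_3 WB}}
#
# strip_accolades removes the outer braces, the WB placement is checked
# and the WB tags are dropped, giving the entry:
expected_entry = {'hao3': 'h a_3 o_3'}
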
Code example #10
File: test_align.py Project: colincwilson/abkhazia
def test_align(
        corpus, features, lm_word, am_mono, tmpdir, level, post):
    output_dir = str(tmpdir.mkdir('align-phones'))
    flog = os.path.join(output_dir, 'align-phones.log')
    log = utils.logger.get_log(flog)

    aligner = align.Align(corpus, output_dir=output_dir, log=log)
    aligner.feat_dir = features
    aligner.lm_dir = lm_word
    aligner.am_dir = am_mono
    aligner.level = level
    aligner.with_posteriors = post

    if level == 'words' and post:
        with pytest.raises(NotImplementedError):
            aligner.compute()
    else:
        aligner.compute()

        # check that there are no errors and that an alignment file exists
        assert_no_expr_in_log(flog, 'error')
        ali_file = os.path.join(output_dir, 'alignment.txt')
        assert os.path.isfile(ali_file)

        if not post:
            res = [l.strip() for l in utils.open_utf8(ali_file, 'r')
                   if l.startswith('s0102a-sent17')]
            assert res == expected_ali[level]
Code example #11
    def setup_wav(self):
        """Create wav.scp in data directory"""
        target = os.path.join(self._output_path(), 'wav.scp')
        wavs = set(w for w, _, _ in self.corpus.segments.itervalues())
        with open_utf8(target, 'w') as out:
            for wav in sorted(wavs):
                wav_path = os.path.join(self.corpus.wav_folder, wav)
                out.write(u'{} {}\n'.format(wav, wav_path))
Code example #12
File: language_model.py Project: syfengcuhk/abkhazia
    def _compute_lm(self, G_arpa):
        """Generate an ARPA n-gram from an abkhazia corpus

        This method relies on the following Kaldi programs:
        add-start-end.sh, build-lm.sh and compile-lm. It uses the
        IRSTLM library.

        """
        self.log.info('computing %s %s-gram in ARPA format', self.level,
                      self.order)

        # cut -d' ' -f2 lm_text > text_ready: training needs the
        # utt-id removed from the first column of the text file
        lm_text = os.path.join(self.a2k._local_path(), 'lm_text.txt')
        lm_lines = utils.open_utf8(lm_text, 'r').readlines()

        text_ready = os.path.join(self.a2k._local_path(), 'text_ready.txt')
        with utils.open_utf8(text_ready, 'w') as ready:
            ready.write('\n'.join(
                [' '.join(line.split()[1:]) for line in lm_lines]))

        text_se = os.path.join(self.a2k._local_path(), 'text_se.txt')
        utils.jobs.run('add-start-end.sh',
                       stdin=open(text_ready, 'r'),
                       stdout=open(text_se, 'w').write,
                       env=kaldi_path(),
                       cwd=self.recipe_dir)
        assert os.path.isfile(text_se), 'LM failed on add-start-end'

        # the -k option is the number of splits, useful for huge text files
        # build-lm.sh in kaldi/tools/irstlm/bin
        text_lm = os.path.join(self.a2k._local_path(), 'text_lm.gz')
        self._run_command(
            'build-lm.sh -i {0} -n {1} -o {2} -k 1 -s kneser-ney'.format(
                text_se, self.order, text_lm))
        assert os.path.isfile(text_lm), 'LM failed on build-lm'

        text_blm = os.path.join(self.a2k._local_path(), 'text_blm.gz')
        self._run_command(
            # was with the -i option
            'compile-lm {} --text=yes {}'.format(text_lm, text_blm))

        # gzip the compiled lm (from
        # https://docs.python.org/2/library/gzip.html#examples-of-usage)
        with open(text_blm, 'rb') as fin, gzip.open(G_arpa, 'wb') as fout:
            shutil.copyfileobj(fin, fout)
Code example #13
    def load_lexicon(path):
        """Return a dict of word to phones entries loaded from `path`

        `path` is assumed to be a lexicon file, usually named 'lexicon.txt'

        """
        lines = (line.strip().split() for line in utils.open_utf8(path, 'r'))
        return {line[0]: ' '.join(line[1:]) for line in lines}
Code example #14
    def load_phones(path):
        """Return a dict of phones mapped to their IPA equivalent

        `path` is assumed to be a phones file, usually named 'phones.txt'.

        """
        lines = (line.strip().split() for line in utils.open_utf8(path, 'r'))
        return {line[0]: line[1] for line in lines}
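
A small sketch of the phones file layout and of the mapping this returns; the symbols below are invented:

# hypothetical phones.txt content, one "<phone> <IPA>" pair per line:
#
#   a a
#   ng ŋ
#   sh ʃ
#
# load_phones would return:
expected_phones = {'a': 'a', 'ng': 'ŋ', 'sh': 'ʃ'}
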
Code example #15
    def load_text(path):
        """Return a dict of utterance ids mapped to their textual content

        `path` is assumed to be a text file, usually named 'text.txt'.

        """
        lines = (line.strip().split() for line in utils.open_utf8(path, 'r'))
        return {line[0]: ' '.join(line[1:]) for line in lines}
Code example #16
    def __init__(self, input_dir,
                 log=utils.logger.null_logger(),
                 cmu_dict=None):

        super(WallStreetJournalPreparator, self).__init__(
            input_dir, log=log, cmu_dict=cmu_dict)

        # select only a subset of the recordings and transcriptions,
        # listing files using the following 2 criteria: 1- files are
        # nested within self.directory_pattern and 2- the 4th letter
        # in the file name is self.file_pattern
        self.log.debug('directory pattern is {}, file pattern is {}'
                       .format(self.directory_pattern, self.file_pattern))

        # setup directory filter
        if self.directory_pattern is None:
            dir_filter = lambda d: True
        else:
            dir_filter = lambda d: d in self.directory_pattern

        # setup file pattern
        if self.file_pattern is None:
            filter_dot = lambda f: f[-4:] == '.dot'
            filter_wv1 = lambda f: f[-4:] == '.wv1'
        else:
            filter_dot = lambda f: (
                f[3] == self.file_pattern and f[-4:] == '.dot')
            filter_wv1 = lambda f: (
                f[3] == self.file_pattern and f[-4:] == '.wv1')

        # filter out the undesired input files
        self.input_recordings = self.filter_files(dir_filter, filter_wv1)
        self.input_transcriptions = self.filter_files(dir_filter, filter_dot)

        self.log.debug('selected {} speech files and {} transcription files'
                       .format(len(self.input_recordings),
                               len(self.input_transcriptions)))

        # filter out the corrupted utterances from input files. The
        # tag '[bad_recording]' in a transcript indicates a problem
        # with the associated recording (if it exists) so exclude it
        self.bad_utts = []
        for trs in self.input_transcriptions:
            for line in utils.open_utf8(trs, 'r').xreadlines():
                if '[bad_recording]' in line:
                    utt_id = re.match(r'(.*) \((.*)\)', line).group(2)
                    self.bad_utts.append(utt_id)

        self.log.debug('found {} corrupted utterances'
                       .format(len(self.bad_utts)))

        # filter out bad utterances
        self.sphs = [sph for sph in self.input_recordings
                     if (os.path.basename(sph).replace('.wv1', '')
                     not in self.bad_utts and
                     os.path.basename(sph).replace('.wv1', '')
                     not in self.exclude_wavs)]
Code example #17
    def setup_silences(self):
        """Create data/local/self.name/{silences, optional_silence}.txt"""
        local_path = self._local_path()
        CorpusSaver.save_silences(
            self.corpus, os.path.join(local_path, 'silence_phones.txt'))

        target = os.path.join(local_path, 'optional_silence.txt')
        with open_utf8(target, 'w') as out:
            out.write(u'SIL\n')
Code example #18
    def setup_utt2spk(self):
        """Create utt2spk and spk2utt in data directory"""
        target = os.path.join(self._output_path(), 'utt2spk')
        CorpusSaver.save_utt2spk(self.corpus, target)

        # create spk2utt
        target = os.path.join(self._output_path(), 'spk2utt')
        with open_utf8(target, 'w') as out:
            for spk, utt in sorted(self.corpus.spk2utt().iteritems()):
                out.write(u'{} {}\n'.format(spk, ' '.join(sorted(utt))))
Code example #19
    def correct_dictionary(self):
        """Correct problems with the GlobalPhone Mandarin dictionary

        The corrections are completely ad hoc; the result is stored
        in a temporary file.

        """
        # the following words are in the dictionary but are not used
        # in the transcriptions, so they will be dropped
        words_to_drop = [u'$', u'(', u')']

        # read input file
        with utils.open_utf8(self.dictionary, 'r') as inp:
            lines = inp.readlines()

        # generate output file
        fid, corrected_dictionary = tempfile.mkstemp()
        os.close(fid)

        # correct content
        with utils.open_utf8(corrected_dictionary, 'w') as out:
            for line in lines:
                # skip secondary pronunciations
                if u'(2)' not in line:
                    # skip some words
                    if all([(u'{'+word+u'}' not in line)
                            for word in words_to_drop]):
                        # rewrite tone markers in a manner consistent with
                        # GlobalPhone Mandarin pinyin markings
                        line = line.replace(u'WB ', u'WB')
                        line = line.replace(u'  ', u' ')
                        # ttd and t.t.d have wrongly formatted
                        # transcriptions
                        line = line.replace(u'{{t}', u'{t')

                        line = re.sub(r'\{(\w*) T(\d)\}', u'\\1_\\2', line)
                        line = re.sub(r'\{(\w*) T(\d) WB\}',
                                      u'{\\1_\\2 WB}', line)
                        out.write(line)

        self.dictionary = corrected_dictionary
        return True
Code example #20
    def setup_phone_lexicon(self):
        """Create data/local/self.name/lexicon.txt"""
        local_path = self._local_path()
        target = os.path.join(local_path, 'lexicon.txt')

        # get list of phones (including silence and non-silence phones)
        phones = []
        for origin in (os.path.join(local_path, 'silence_phones.txt'),
                       os.path.join(local_path, 'nonsilence_phones.txt')):
            phones += [line.strip() for line in open_utf8(origin, 'r')]

        # create 'phone' lexicon
        with open_utf8(target, 'w') as out:
            for word in phones:
                out.write(u'{0} {0}\n'.format(word))
            # add the <unk> word, in case one wants to use the phone loop
            # lexicon for training; it is also necessary if one doesn't
            # want to modify the validating scripts too much
            out.write(u'<unk> SPN\n')

        return target
Code example #21
    def correct_transcription(self):
        """Correct problems with the GlobalPhone Vietnamese transcripts

        The corrections are completely ad hoc and the results are
        stored in a temporary folder.

        - remove trailing spaces, all double spacings and '_' from
          transcriptions on every odd line but the first

        - double spacings and '_' are actually only found for speakers
          200 to 208

        """
        # generate temporary output folder
        corrected_transcription_dir = tempfile.mkdtemp()

        # get the list of transcription files
        trss = utils.list_directory(self.transcription_dir, abspath=True)

        for trs in trss:
            # read transcript file
            lines = utils.open_utf8(trs, 'r').readlines()

            # correct odd lines
            lines[2::2] = [line.replace(u'_', u' ').replace(
                u'  ', u' ').strip() +
                           u'\n' for line in lines[2::2]]

            # write corrected version to temp
            output_file = os.path.join(
                corrected_transcription_dir, os.path.basename(trs))

            with utils.open_utf8(output_file, 'w') as out:
                for line in lines:
                    out.write(line)

        self.transcription_dir = corrected_transcription_dir
        return True
Code example #22
def nadults(cha):
    """Return the number of adult speakers recorded in `cha`"""
    # speaker IDs, lines starting with '@ID', forced to lowercase
    spks = (l.strip().lower() for l in open_utf8(cha, 'r')
            if l.startswith('@ID'))

    # exclude non-adult speakers
    exclude = [
        'sibl', 'broth', 'sist', 'target', 'child', 'to', 'environ', 'cousin',
        'non_hum', 'play'
    ]

    # count non-excluded speakers
    return sum(all(e not in spk for e in exclude) for spk in spks)
Code example #23
    def make_transcription(self):
        text = dict()
        for trs in utils.list_directory(self.transcription_dir, abspath=True):
            spk_id = os.path.splitext(os.path.basename(trs))[0]
            lines = utils.open_utf8(trs, 'r').readlines()

            # add utterance id from even lines starting at line 2
            ids = [
                spk_id + u'_' + re.sub(ur'\s+|:|;', u'', e)
                for e in lines[1::2]
            ]
            # delete linebreaks on odd lines starting at line 3
            # (this does not take into account fancy unicode
            # linebreaks), see
            # http://stackoverflow.com/questions/3219014
            transcriptions = [
Code example #24
    def parse_kana_to_phone(self, kana_csv):
        """Parse katakana phone transcription and put it in a dict()"""
        kana_to_phon = dict()
        with open_utf8(kana_csv, 'r') as fin:
            kana_transcript = fin.read()
            kana_transcript = kana_transcript.split('\n')
            for line in kana_transcript[1:]:
                if line == '':
                    continue
                phones = line.split('\t')
                katakana = phones[0].decode('utf8')
                bootphon = phones[3]
                if bootphon == '':
                    # default to 'H' when the bootphon column is empty
                    bootphon = "H"
                kana_to_phon[katakana] = bootphon
        return kana_to_phon
Code example #25
File: align.py Project: colincwilson/abkhazia
    def export(self):
        int2phone = read_int2phone(self.lm_dir)
        ali = self._read_result_utts('ali')
        post = self._read_result_utts('post') if self.with_posteriors else None

        # retrieve the export function according to `level`
        func = {'phones': self._export_phones,
                'words': self._export_words,
                'both': self._export_phones_and_words}[self.level]
        aligned = func(int2phone, ali, post)

        # write it to the target file
        target = os.path.join(self.output_dir, 'alignment.txt')
        with utils.open_utf8(target, 'w') as out:
            out.write('\n'.join(line.strip() for line in aligned) + '\n')

        super(Align, self).export()
Code example #26
def model_type(am_dir):
    """Return the type of the trained model

    Read in meta.txt, or raise IOError if not found

    TODO could be parsed from final.mdl ?

    """
    meta = os.path.join(am_dir, 'meta.txt')
    if not os.path.isfile(meta):
        raise IOError('file not found: {}'.format(meta))

    for line in utils.open_utf8(meta, 'r'):
        if line.startswith('acoustic model type'):
            return line.split(':')[1].strip()

    raise IOError('acoustic model type not found in {}'.format(meta))
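
A rough sketch of what this parser looks for; the meta.txt line below is hypothetical, only the 'acoustic model type' prefix and the colon matter to the code:

# hypothetical meta.txt line inside the acoustic model directory:
#
#   acoustic model type: mono
#
# model_type(am_dir) would then return:
expected_type = 'mono'
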
Code example #27
    def make_lexicon(self):
        lexicon = dict()
        for line in utils.open_utf8(self.cmu_dict, 'r').readlines():
            # remove newline and trailing spaces
            line = line.strip()

            # skip comments
            if not (len(line) >= 3 and line[:3] == u';;;'):
                # parse line
                word, phones = line.split(u'  ')

                # skip alternative pronunciations, the first one
                # (with no parenthesized number at the end) is
                # supposed to be the most common and is retained
                if not re.match(ur'(.*)\([0-9]+\)$', word):
                    # ignore stress variants of phones
                    lexicon[word] = re.sub(u'[0-9]+', u'', phones).strip()
Code example #28
File: language_model.py Project: syfengcuhk/abkhazia
    def _change_lm_vocab(self, lm_txt, words_txt):
        """Create a LM from an existing one by changing its vocabulary

        All n-grams in the new vocab are retained with their original
        probabilities. Backoff weights are recomputed and backed-off
        unigrams for all new words are added. The -subset option performs
        subsetting of the vocabulary without adding new words.

        This is a reimplementation of the change-lm-vocab script from
        SRILM, modified in 3 ways:

         - no more -tolower option

         - the pruning step now updates the ngram counts in the header
           (disables an annoying warning)

         - the call to ngram is done in 2 steps if the 1-step call failed
           (the -renorm and -prune-lowprobs options fail together on
           librispeech-test-clean, requiring 2 calls)

        """
        out_lm = os.path.join(self.output_dir, 'out_lm.txt')
        self.log.debug('pruning vocabulary in %s', out_lm)

        words = set(w.split()[0] for w in utils.open_utf8(words_txt, 'r'))
        lm = ARPALanguageModel.load(lm_txt)
        lm.prune_vocabulary(words)
        lm_pruned = lm_txt + '.pruned'
        lm.save(lm_pruned)

        try:
            self._run_command(
                'ngram -lm {0} -vocab /dev/null -renorm -write-lm {1} '
                '-prune-lowprobs -unk -order {2}'.format(
                    lm_pruned, out_lm, self.order))
        except RuntimeError:
            self._run_command('utils/run.pl {3} '
                              'ngram -lm {0} -vocab /dev/null -write-lm - '
                              '-prune-lowprobs -unk -order {2} | '
                              'ngram -lm - -vocab /dev/null -renorm -unk '
                              '-order {2} -write-lm {1}'.format(
                                  lm_pruned, out_lm, self.order,
                                  os.path.join(self.output_dir, 'ngram.log')))

        return out_lm
Code example #29
File: language_model.py Project: syfengcuhk/abkhazia
def read_int2phone(lm_dir, word_position_dependent=True):
    """Return a int to phone mapping as a dict

    Kaldi internally codes phones as ints, so this method recovers the
    int to phone mapping from the phones.txt file in
    `lm_dir`. This file is assumed to exist.

    """
    phonemap = dict()
    for line in utils.open_utf8(os.path.join(lm_dir, 'phones.txt'), 'r'):
        phone, code = line.strip().split(' ')

        # remove word position markers
        if word_position_dependent and phone[-2:] in ['_I', '_B', '_E', '_S']:
            phone = phone[:-2]

        phonemap[code] = phone
    return phonemap
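
A short illustration of the phones.txt layout this reads and of the effect of stripping word-position markers; the symbols and codes below are invented:

# hypothetical phones.txt content, one "<phone> <int code>" pair per line,
# with word-position-dependent suffixes (_B, _E, _I, _S) on the phones:
#
#   SIL 1
#   a_B 2
#   a_E 3
#
# read_int2phone(lm_dir) would return (codes as strings, markers removed):
expected_map = {'1': 'SIL', '2': 'a', '3': 'a'}
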
Code example #30
    def _parse_chas(self, chas, exclude_spks):
        """Extract cleaned utterances from raw cha files

        Return a dict of utterances where keys are the utterances id and
        values are Utterance named tuples.

        Only the following utterances are considered:
        a) those marked with time-stamps,
        b) those whose timestamps fall within the wav duration,
        c) those that are not empty.

        """
        self.log.info('parsing %s cha files...', len(chas))
        utts = {}
        for cha, wav in chas.items():
            # duration of the wav in seconds
            duration = utils.wav.duration(wav)

            # get cleaned utterances from the raw cha file. At that
            # point timestamps are the last word of each line.
            text = utils.cha.clean(
                l.strip() for l in utils.open_utf8(cha, 'r')
                if not re.search(exclude_spks, l))

            cha_id = os.path.splitext(os.path.basename(cha))[0]
            counter = 0
            for words in (t.split() for t in text):
                if len(words) > 1:  # remove empty utterances
                    # parsing the timestamps
                    timestamp = words[-1].split('_')
                    tbegin = int(timestamp[0])/1000.
                    tend = int(timestamp[1])/1000.

                    # reject utterances with out-of-bounds
                    # timestamps
                    if not (tbegin > duration or tend > duration):
                        counter += 1
                        utt_id = cha_id + '-sent' + str(counter)

                        utts[utt_id] = self.Utterance(
                            ' '.join(words[:-1]),
                            os.path.basename(wav),
                            tbegin, tend)

        return utts