def process(corpus_path, output_datadir):
    """Convert a Common Voice validated TSV corpus into Kaldi data_dir format.

    Reads ``corpus_path + validated_filename`` (a TSV whose columns include
    [?, clip_filename, transcript, ...]), normalizes each transcript, and
    writes ``wav.scp``, ``utt2spk`` and ``text`` into ``output_datadir``.

    Args:
        corpus_path: directory of the Common Voice release; audio clips are
            expected under ``corpus_path + 'clips/'``. Assumes the string
            ends with a path separator — TODO confirm against callers.
        output_datadir: destination directory for the Kaldi files (created
            if missing); same trailing-separator assumption applies.
    """
    common_utils.make_sure_path_exists(output_datadir)
    nlp = spacy.load('de_core_news_sm')

    # Common Voice has repetitions and the text is not normalized;
    # cache text normalizations since they can be slow.
    normalize_cache = {}

    # Load the entire corpus into memory first, then sort by ID and write
    # it out in Kaldi's data_dir format.
    corpus = {}

    print('Loading', corpus_path + validated_filename)
    with open(corpus_path + validated_filename) as corpus_path_in:
        for line in corpus_path_in:
            split = line.split('\t')

            filename = split[1]
            text = split[2]

            # The numeric sequence embedded in the clip filename doubles as
            # the utterance ID; skip rows where it cannot be parsed.
            m = re.match(r'[^0-9]*([0-9]+)[^0-9]*mp3', filename)
            if not m:
                continue

            seq_num = int(m.group(1))
            # Zero-pad to 10 digits so lexicographic order == numeric order
            # (Kaldi data files are sorted as strings).
            myid = "%.10d" % seq_num

            if text in normalize_cache:
                normalized_text = normalize_cache[text]
            else:
                normalized_text = normalize_sentences.normalize(nlp, text)
                normalize_cache[text] = normalized_text

            corpus[myid] = (filename, normalized_text)

    print('done loading common voice tsv!')
    print('Now writing out to', output_datadir, 'in Kaldi format!')

    with open(output_datadir + 'wav.scp', 'w') as wav_scp, open(
            output_datadir + 'utt2spk',
            'w') as utt2spk, open(output_datadir + 'text', 'w') as text_out:
        for myid in sorted(corpus.keys()):
            # No speaker metadata is used here: each utterance becomes its
            # own speaker, a common Kaldi fallback.
            spk = myid
            fullid = spk + '_' + myid
            filename, normalized_text = corpus[myid]

            wav_scp.write(fullid + ' ' + wav_scp_template.replace(
                "$filepath", corpus_path + 'clips/' + filename) + '\n')
            utt2spk.write(fullid + ' ' + spk + '\n')
            text_out.write(fullid + ' ' + normalized_text + '\n')

    print('done!')
def process(text_kaldi_file):
    """Normalize all transcripts of a Kaldi ``text`` file in place.

    Each input line has the form ``<utt-id> <transcript>``. The file is
    first backed up to ``<file>.orig<timestamp>``, then rewritten with
    every transcript run through ``normalize_sentences.normalize``.
    Lines whose normalization raises are dropped with a warning.
    """
    nlp = spacy.load('de')
    texts = []
    normalize_cache = {}

    # Back up the original before rewriting it in place.
    nonce = str(int(time.time()))
    print('Making a backup of the original file:', text_kaldi_file, '=>',
          text_kaldi_file + '.orig' + nonce)
    shutil.copyfile(text_kaldi_file, text_kaldi_file + '.orig' + nonce)

    stopwords = load_lowercase_stopwords()

    print('Opening and processing', text_kaldi_file)
    with open(text_kaldi_file) as infile:
        for i, line in enumerate(infile, start=1):
            if i % 10000 == 0:
                print('At line:', i)

            if line.endswith('\n'):
                line = line[:-1]
            split = line.split()

            # First field is the utterance ID; the rest is the transcript.
            if len(split) <= 1:
                # BUG FIX: this branch previously printed `myid`, which was
                # either unbound (first line) or stale from a prior line.
                print('Warning,', split[0] if split else '<empty line>',
                      'has no text!')
                continue

            myid = split[0]
            text = ' '.join(split[1:])
            first_word = split[1]

            if text in normalize_cache:
                normalized_text = normalize_cache[text]
            else:
                should_lowercase = first_word in stopwords

                try:
                    normalized_text = normalize_sentences.normalize(
                        nlp, text)
                except Exception:
                    # Narrowed from a bare `except:` so KeyboardInterrupt /
                    # SystemExit are no longer swallowed.
                    print('Warning, error normalizing:', text)
                    continue

                # Note: the normalization step looks at POS tags to decide
                # whether the first word should be lowercased, but gets some
                # wrong — fall back to the stopword list here.
                if should_lowercase and len(normalized_text) > 1:
                    # Heuristic: an uppercase second character suggests an
                    # all-caps word/acronym; leave those untouched. The
                    # length guard fixes silent loss of 1-char results,
                    # which previously raised IndexError into the bare
                    # except and dropped the line.
                    if not normalized_text[1].isupper():
                        normalized_text = (normalized_text[0].lower() +
                                           normalized_text[1:])
                normalize_cache[text] = normalized_text

            texts.append(myid + ' ' + normalized_text)

    print('Rewrite', text_kaldi_file)
    with open(text_kaldi_file, 'w') as outfile:
        outfile.write('\n'.join(texts))
        # Terminate the last line — Kaldi text files are newline-delimited.
        outfile.write('\n')
# Example 3 (site-extraction artifact "Esempio n. 3" / vote count "0" removed
# so the file parses as Python)
def test_sent(test_sentence):
    """Normalize one sentence and print the before/after pair.

    Relies on a module-level spaCy pipeline ``nlp`` being defined
    elsewhere in the file — TODO confirm where it is initialized.
    """
    normalized = normalize_sentences.normalize(nlp, test_sentence)
    print(test_sentence, '->', normalized)