def process(corpus_path, output_datadir):
    """Convert a Common Voice validated TSV into a Kaldi data directory.

    Reads ``corpus_path + validated_filename`` (module-level constant),
    normalizes every transcript, and writes ``wav.scp``, ``utt2spk`` and
    ``text`` into ``output_datadir``, sorted by utterance ID.

    Parameters:
        corpus_path: path (with trailing slash) to the Common Voice corpus root.
        output_datadir: path (with trailing slash) of the Kaldi data dir to fill.
    """
    common_utils.make_sure_path_exists(output_datadir)
    nlp = spacy.load('de_core_news_sm')

    # Common Voice has repetitions and the text is not normalized;
    # we cache text normalizations since they can be slow.
    normalize_cache = {}

    # We first load the entire corpus text into memory, sort by ID and then
    # write it out into Kaldi's data_dir format.
    corpus = {}

    print('Loading', corpus_path + validated_filename)

    with open(corpus_path + validated_filename) as corpus_path_in:
        for line in corpus_path_in:
            split = line.split('\t')
            filename = split[1]
            text = split[2]

            m = re.match(r'[^0-9]*([0-9]+)[^0-9]*mp3', filename)
            # Only proceed if we can parse the sequence num from the filename.
            if m:
                seq_num = int(m.group(1))
                # Zero-pad so that lexicographic sort == numeric sort.
                myid = "%.10d" % seq_num

                if text not in normalize_cache:
                    normalized_text = normalize_sentences.normalize(nlp, text)
                    normalize_cache[text] = normalized_text
                else:
                    normalized_text = normalize_cache[text]

                # NOTE(review): duplicate sequence numbers silently overwrite
                # earlier entries here — assumed not to occur in validated.tsv.
                corpus[myid] = (filename, normalized_text)

    print('done loading common voice tsv!')
    print('Now writing out to', output_datadir, 'in Kaldi format!')

    with open(output_datadir + 'wav.scp', 'w') as wav_scp, open(
            output_datadir + 'utt2spk', 'w') as utt2spk, open(
            output_datadir + 'text', 'w') as text_out:
        for myid in sorted(corpus.keys()):
            # Speaker ID equals utterance ID (no per-speaker info is used).
            spk = myid
            fullid = spk + '_' + myid
            filename, normalized_text = corpus[myid]
            wav_scp.write(fullid + ' ' + wav_scp_template.replace(
                "$filepath", corpus_path + 'clips/' + filename) + '\n')
            utt2spk.write(fullid + ' ' + spk + '\n')
            text_out.write(fullid + ' ' + normalized_text + '\n')

    print('done!')
def process(text_kaldi_file):
    """Normalize every transcript line of a Kaldi 'text' file in place.

    Each line has the form ``<utt-id> <transcript>``. A timestamped backup of
    the original file is written next to it before rewriting. Normalization
    results are cached because transcripts repeat.

    Parameters:
        text_kaldi_file: path to the Kaldi 'text' file to rewrite.
    """
    nlp = spacy.load('de')

    texts = []
    normalize_cache = {}
    i = 0

    nonce = str(int(time.time()))
    print('Making a backup of the original file:', text_kaldi_file, '=>',
          text_kaldi_file + '.orig' + nonce)
    shutil.copyfile(text_kaldi_file, text_kaldi_file + '.orig' + nonce)

    stopwords = load_lowercase_stopwords()

    print('Opening and processing', text_kaldi_file)
    with open(text_kaldi_file) as infile:
        for line in infile:
            i += 1
            if i % 10000 == 0:
                print('At line:', i)
            # Strip the trailing newline. endswith() is safe on empty lines,
            # unlike indexing line[-1] which raises IndexError.
            if line.endswith('\n'):
                line = line[:-1]
            split = line.split()
            # The first element in the split is the ID.
            if len(split) > 1:
                myid = split[0]
                text = ' '.join(split[1:])
                first_word = split[1]
                if text not in normalize_cache:
                    should_lowercase = first_word in stopwords
                    try:
                        normalized_text = normalize_sentences.normalize(
                            nlp, text)
                        # Note: the normalization step looks at POS tags to
                        # decide if the first word should be lowercased, but
                        # presumably gets some wrong — known stopwords are
                        # patched up here (TODO confirm intent).
                        if should_lowercase:
                            # Check that the first word isn't in all upper case
                            # (heuristic: second char uppercase => acronym).
                            if not normalized_text[1].isupper():
                                normalized_text = normalized_text[0].lower(
                                ) + normalized_text[1:]
                        normalize_cache[text] = normalized_text
                    except Exception:
                        # Was a bare 'except:', which also swallowed
                        # KeyboardInterrupt/SystemExit.
                        print('Warning, error normalizing:', text)
                        continue
                else:
                    normalized_text = normalize_cache[text]
                texts.append(myid + ' ' + normalized_text)
            else:
                # Use split[0] (not myid, which may be unbound or stale from a
                # previous iteration) when reporting the malformed line.
                print('Warning,', split[0] if split else line, 'has no text!')

    print('Rewrite', text_kaldi_file)
    with open(text_kaldi_file, 'w') as outfile:
        outfile.write('\n'.join(texts))
def test_sent(test_sentence):
    """Print a sentence alongside its normalized form (uses the global nlp)."""
    normalized = normalize_sentences.normalize(nlp, test_sentence)
    print(test_sentence, '->', normalized)