def get_corpus(self):
    """Build the Corpus from per-book metadata.csv manifests under it_IT/.

    Each metadata.csv row is "<clip>|<raw text>|<normalized text>"; the clip
    audio lives in a sibling "wavs/" folder.

    Returns:
        Corpus: utterance map (audio path -> transcript) plus audio list.
    """
    # Sub-datasets to skip entirely (feature currently disabled).
    SKIP_LIST = []  # filter(None, CLI_ARGS.skiplist.split(","))
    utterances = {}
    audios = []
    wav_root_dir = os.path.join(self.origin_data_path, 'it_IT')
    # Find every metadata.csv recursively under the Italian root.
    glob_dir = os.path.join(wav_root_dir, "**/metadata.csv")
    for record in glob(glob_dir, recursive=True):
        if any(sk in record for sk in SKIP_LIST):
            continue
        # Manifests come in mixed encodings; detect before reading.
        enc = encoding_from_path(record)
        with open(record, "r", encoding=enc) as rec:
            # NOTE: loop variable renamed from `re`, which shadowed the
            # `re` module; also dropped the unused `samples` list.
            for line in rec:
                fields = line.strip().split("|")
                audio = os.path.join(
                    os.path.dirname(record), "wavs", fields[0] + ".wav")
                transcript = fields[2]
                ##append data manifest
                utterances[audio] = transcript
                audios.append(audio)
    ##collect corpus
    corpus = Corpus(utterances, audios)
    # Clips are already WAV 16000Hz mono - no resample required.
    corpus.make_wav_resample = False
    return corpus
def get_corpus(self):
    """Build the Corpus from the EVALITA2009 development/training lists.

    Each list line is "<rel/path>.wav <annotated transcript>"; annotated
    tokens are mapped to their literal form via EVALITA_DICT_CONVERSION.

    Returns:
        Corpus: utterance map (audio path -> transcript) plus audio list.
    """
    utterances = {}
    audios = []
    for split in ("development", "training"):
        # NOTE: removed the unused `wav_dir` local of the original.
        transcript_path = os.path.join(
            self.origin_data_path, self.archive_name, split + ".txt")
        with open(transcript_path) as fin:
            for line in fin:
                # Split once on the ".wav" extension: left part is the
                # relative audio path, right part the transcript.
                head, tail = line.split(".wav", maxsplit=1)
                audio_file_rel_path = head + '.wav'
                transcript = tail.strip()
                ###preprocess transcript - replace numbers from numeric to literal;
                # characters missing from the map are dropped.
                transcript = ' '.join(
                    EVALITA_DICT_CONVERSION.get(c, '') for c in transcript)
                transcript = re.sub(r'\s+', ' ', transcript)
                ## need absolute path for audios, separator cross os
                audio_file_abs_path = os.path.join(
                    self.origin_data_path,
                    audio_file_rel_path.replace('/', os.path.sep))
                ##append data manifest
                utterances[audio_file_abs_path] = transcript
                audios.append(audio_file_abs_path)
    ##collect corpus
    corpus = Corpus(utterances, audios)
    # evalita2009 clips are WAV 16000Hz mono - no resample required.
    corpus.make_wav_resample = False
    return corpus
def get_corpus(self):
    """Build the Corpus by walking wav/IT and pairing each clip with its
    .txt transcript from txt/IT, detecting the transcript encoding per file.

    Returns:
        Corpus: utterance map (audio path -> transcript) plus audio list.
    """
    utterances = {}
    audios = []
    wav_dir = os.path.join(self.origin_data_path, self.extract_dir, "wav", "IT")
    text_dir = os.path.join(self.origin_data_path, self.extract_dir, "txt", "IT")
    for subdir, dirs, files in os.walk(wav_dir):
        for _dir in dirs:
            curr_wav_dir = os.path.join(subdir, _dir)
            curr_txt_dir = os.path.join(text_dir, _dir)
            ##iterate wav file current folder
            for fname in os.listdir(curr_wav_dir):
                fname = os.fsdecode(fname)
                wav_file_path = os.path.join(wav_dir, _dir, fname)
                txt_file_path = os.path.join(
                    curr_txt_dir, fname.split('.')[0] + '.txt')
                if not os.path.isfile(txt_file_path):
                    # BUG FIX: the original print was missing the
                    # .format() argument and printed a literal "{}".
                    print('audio file {} doesn\'t have a file transcript'
                          .format(wav_file_path))
                    continue
                ##files have different encoding (utf-8, utf_16_be, etc..)
                ##need check to open file with correct encoding
                file_encoding = CnM.from_path(txt_file_path).best().first().encoding
                # Some detections are known to be wrong; force utf-8 there.
                if file_encoding in ('big5', 'cp1252'):
                    file_encoding = 'utf-8'
                with open(txt_file_path, "r", encoding=file_encoding) as f:
                    transcript = f.readline().strip()
                ##append data manifest
                utterances[wav_file_path] = transcript
                audios.append(wav_file_path)
    ##collect corpus
    corpus = Corpus(utterances, audios)
    # Clips are 44100Hz mono - resample required.
    corpus.make_wav_resample = True
    return corpus
def get_corpus(self):
    """Build the Corpus from MLS-style metadata.csv manifests under it_IT/,
    filtering known-bad examples and repairing wrong apostrophe
    normalization in the transcripts.

    Returns:
        Corpus: utterance map (audio path -> transcript) plus audio list.
    """
    # Sub-datasets to skip entirely (feature currently disabled).
    SKIP_LIST = []  # filter(None, CLI_ARGS.skiplist.split(","))
    utterances = {}
    audios = []
    fixed_token = {}
    wav_root_dir = os.path.join(self.origin_data_path, 'it_IT')
    bad_examples = self.get_bad_examples()
    # Find every metadata.csv recursively under the Italian root.
    glob_dir = os.path.join(wav_root_dir, "**/metadata.csv")
    for record in glob(glob_dir, recursive=True):
        if any(sk in record for sk in SKIP_LIST):
            continue
        # Manifests come in mixed encodings; detect before reading.
        enc = encoding_from_path(record)
        with open(record, "r", encoding=enc) as rec:
            # NOTE: loop variable renamed from `re`, which shadowed the
            # `re` module.
            for line in rec:
                fields = line.strip().split("|")
                filename = fields[0]
                ##filter bad examples (https://github.com/MozillaItalia/DeepSpeech-Italian-Model/issues/124#issuecomment-798613031)
                if filename in bad_examples:
                    continue
                audio = os.path.join(
                    os.path.dirname(record), "wavs", filename + ".wav")
                transcript_source = fields[1]
                transcript = fields[2]
                ##in MLS normalization of character '’' is wrong in transcription normalization
                transcript = fix_apostrophe(
                    transcript_source, transcript, fixed_token)
                ##append data manifest
                utterances[audio] = transcript
                audios.append(audio)
    ##collect corpus
    corpus = Corpus(utterances, audios)
    # Clips require resampling.
    corpus.make_wav_resample = True
    return corpus
def get_corpus(self):
    """Build the Corpus from MLS-style train/dev/test transcripts.txt files.

    Each transcripts.txt line is "<speaker>_<book>_<id>\\t<transcript>"; the
    flac file lives under audio/<speaker>/<book>/.

    Returns:
        Corpus: utterance map (audio path -> transcript) plus audio list.
    """
    # NOTE: removed the dead commented-out debug code and the unused
    # `count` accumulator of the original.
    utterances = {}
    audios = []
    for split in ("train", "dev", "test"):
        wav_dir = os.path.join(
            self.origin_data_path, self.archive_name, split, "audio")
        transcript_path = os.path.join(
            self.origin_data_path, self.archive_name, split, "transcripts.txt")
        with open(transcript_path, encoding="utf-8") as fin:
            for line in fin:
                t_s = re.split(r'\t+', line)
                # "<speaker>_<book>_<id>" encodes the directory layout.
                id_parts = t_s[0].split('_')
                file_name = t_s[0] + ".flac"
                audio_file_path = os.path.join(
                    wav_dir, id_parts[0], id_parts[1], file_name)
                transcript = t_s[1].strip()
                ##append data manifest
                utterances[audio_file_path] = transcript
                audios.append(audio_file_path)
    ##collect corpus
    corpus = Corpus(utterances, audios)
    ## audio .flac require resample
    corpus.make_wav_resample = True
    return corpus
def get_corpus(self):
    """Build the Corpus from MLS-style train/dev/test transcripts.txt files.

    Each transcripts.txt line is "<speaker>_<book>_<id>\\t<transcript>"; the
    flac file lives under audio/<speaker>/<book>/.

    Returns:
        Corpus: utterance map (audio path -> transcript) plus audio list.
    """
    # NOTE: removed the dead commented-out debug code and the unused
    # `count` accumulator of the original.
    utterances = {}
    audios = []
    for split in ("train", "dev", "test"):
        wav_dir = os.path.join(
            self.origin_data_path, self.archive_name, split, "audio")
        transcript_path = os.path.join(
            self.origin_data_path, self.archive_name, split, "transcripts.txt")
        # encoding made explicit for consistency with the sibling importer,
        # which opens the same file layout with utf-8.
        with open(transcript_path, encoding="utf-8") as fin:
            for line in fin:
                t_s = re.split(r'\t+', line)
                # "<speaker>_<book>_<id>" encodes the directory layout.
                id_parts = t_s[0].split('_')
                file_name = t_s[0] + ".flac"
                audio_file_path = os.path.join(
                    wav_dir, id_parts[0], id_parts[1], file_name)
                transcript = t_s[1].strip()
                ##append data manifest
                utterances[audio_file_path] = transcript
                audios.append(audio_file_path)
    ##collect corpus
    corpus = Corpus(utterances, audios)
    ## audio .flac require resample
    corpus.make_wav_resample = True
    return corpus
def get_corpus(self):
    """Build the VoxForge Corpus by matching PROMPTS entries to wav files.

    Each PROMPTS line is "<speaker/path/id> <transcript>"; the first wav
    whose filename contains the prompt id is taken as its audio.

    Returns:
        Corpus: utterance map (audio path -> transcript) plus audio list.
    """
    utterances = {}
    audios = []
    wav_dir = os.path.join(self.origin_data_path, self.archive_name, "wav")
    prompts_file = os.path.join(
        self.origin_data_path, self.archive_name, "etc", "PROMPTS")
    # Snapshot of the actual files present in the wav folder.
    wav_files = [entry for entry in os.listdir(wav_dir)
                 if os.path.isfile(os.path.join(wav_dir, entry))]
    with open(prompts_file, encoding='utf-8') as prompts:
        for prompt_line in prompts:
            parts = prompt_line.split(" ", 1)
            ref_url, transcript = parts[0], parts[1].lower().replace('\n', '')
            prompt_id = ref_url.split('/')[-1]
            # First wav file whose name contains the prompt id, if any.
            matched = next((w for w in wav_files if prompt_id in w), None)
            if matched is not None:
                wav_file_path = os.path.join(wav_dir, matched)
                utterances[wav_file_path] = transcript
                audios.append(wav_file_path)
    ##collect corpus
    corpus = Corpus(utterances, audios)
    ## VoxForge need wav resample
    corpus.make_wav_resample = True
    return corpus
def get_corpus(self):
    """Build the MSPKA Corpus from wav_1.0.0 clips and lab_1.0.0 annotation
    files, stripping silence annotations to obtain clean transcripts.

    Returns:
        Corpus: utterance map (audio path -> transcript) plus audio list.
    """
    # NOTE: removed the unused `file_encoding` and `count` locals and
    # fixed the `transcript_annotaded` typo of the original.
    utterances = {}
    audios = []
    wav_dir = os.path.join(self.origin_data_path, self.archive_name, "wav_1.0.0")
    text_dir = os.path.join(self.origin_data_path, self.archive_name, "lab_1.0.0")
    ##iterate wav file current folder
    for fname in os.listdir(wav_dir):
        fname = os.fsdecode(fname)
        if not fname.lower().endswith('.wav'):
            continue
        wav_file_path = os.path.join(wav_dir, fname)
        txt_file_path = os.path.join(text_dir, fname.split('.')[0] + '.lab')
        if not os.path.isfile(txt_file_path):
            print('audio file {} doesn\'t have a file transcript'.format(
                str(wav_file_path)))
            continue
        ##read file transcript
        with open(txt_file_path) as f:
            annotated_lines = f.readlines()
        ##parse annotation - build a clean transcript
        # Each .lab row: "<start> <end> <annotation> <tokens...>".
        transcript_toks = []
        for line in annotated_lines:
            row_data = line.split()
            if len(row_data) <= 3:
                ##no transcript here
                continue
            if row_data[2] == SILENCE_ANNOTATION:
                continue
            curr_text = ' '.join(row_data[3:])
            ##clear text - accented char escape
            transcript_toks.append(string_escape(curr_text))
        transcript = ' '.join(transcript_toks).strip()
        ##append data manifest
        utterances[wav_file_path] = transcript
        audios.append(wav_file_path)
    ##collect corpus
    corpus = Corpus(utterances, audios)
    ## MSPKA need wav resample - clips is 22050Hz 353 kb/s (1 chnl)
    corpus.make_wav_resample = True
    return corpus
def get_corpus(self):
    """Build the SIWIS Corpus: transcripts come from the global prompts file
    (cp1252) when available, otherwise from the per-clip txt file.

    Returns:
        Corpus: utterance map (audio path -> transcript) plus audio list.

    Raises:
        ValueError: if a wav file has no transcript in either location.
    """
    utterances = {}
    audios = []
    wav_dir = os.path.join(self.origin_data_path, self.extract_dir, "wav", "IT")
    text_dir = os.path.join(self.origin_data_path, self.extract_dir, "txt", "IT")
    ##read transcript in prompts.txt
    transcripts = {}
    ##encoding prompts files is cp1252
    prompts_path = os.path.join(
        self.origin_data_path, self.extract_dir,
        "prompts", "ALL_IT_prompts_iso.txt")
    with open(prompts_path, "r", encoding='cp1252') as f:
        for line in f:
            temp = re.split(r'\t', line)
            transcripts[temp[0]] = temp[1].strip()
    for subdir, dirs, files in os.walk(wav_dir):
        for _dir in dirs:
            if _dir == 'converted':
                ##wav converted in a previous session run
                continue
            curr_wav_dir = os.path.join(subdir, _dir)
            ##iterate wav file current folder
            for fname in os.listdir(curr_wav_dir):
                fname = os.fsdecode(fname)
                if fname == 'converted':
                    ##skip wav converted by importer
                    continue
                wav_file_path = os.path.join(wav_dir, _dir, fname)
                try:
                    transcript = transcripts[fname.replace('.wav', '.txt')]
                except KeyError:
                    # Narrowed from a bare `except:`; only a missing prompts
                    # entry should trigger the per-file fallback.
                    txt_file_path = os.path.join(
                        text_dir, _dir, fname.split('.')[0] + '.txt')
                    if not os.path.isfile(txt_file_path):
                        raise ValueError(
                            'audio file {} doesn\'t have a file transcript'
                            .format(wav_file_path))
                    transcript = self.read_txt_file(txt_file_path)
                ##append data manifest
                utterances[wav_file_path] = transcript
                audios.append(wav_file_path)
    ##collect corpus
    corpus = Corpus(utterances, audios)
    ## SIWIS clips need resample wav - clips is 44100Hz 706 kb/s (1 chnl)
    corpus.make_wav_resample = True
    return corpus