def convert_audio_and_split_transcript(dataset_dir, subset, out_csv_file):
    """Convert tar.gz to WAV and split the transcript.

    Args:
        dataset_dir : the directory which holds the input dataset.
        subset : the name of the specified dataset. e.g. dev.
        out_csv_file : the resulting output csv file.
    """
    gfile = tf.compat.v1.gfile
    logging.info("Processing audio and transcript for {}".format(subset))
    audio_dir = os.path.join(dataset_dir, "wav/")
    trans_dir = os.path.join(dataset_dir, "transcript/")

    files = []
    if not gfile.Exists(os.path.join(audio_dir, subset)):
        # WAV archives not unpacked yet; extract every tarball in place.
        for filename in os.listdir(audio_dir):
            os.system("tar -zxvf " + audio_dir + filename + " -C " + audio_dir)

    # Each transcript line is "<utt_id> <char> <char> ...".
    with codecs.open(os.path.join(trans_dir, "aishell_transcript_v0.8.txt"),
                     "r", encoding="utf-8") as f:
        for line in f:
            items = line.strip().split(" ")
            wav_filename = items[0]
            # BUG FIX: replaced the quadratic `labels += item` loop with a
            # join, and dropped the dead `char_dict` counters (never read,
            # and their first-occurrence count was 0 instead of 1 anyway).
            labels = "".join(items[1:])
            files.append((wav_filename + ".wav", labels))

    # Map wav basename -> (length_ms, speaker subdir).
    files_size_dict = {}
    output_wav_dir = os.path.join(audio_dir, subset)
    for root, subdirs, _ in gfile.Walk(output_wav_dir):
        for subdir in subdirs:
            for filename in os.listdir(os.path.join(root, subdir)):
                files_size_dict[filename] = (
                    get_wave_file_length(os.path.join(root, subdir, filename)),
                    subdir,
                )

    content = []
    for wav_filename, trans in files:
        if wav_filename in files_size_dict:  # wav which has trans is valid
            filesize, subdir = files_size_dict[wav_filename]
            abspath = os.path.join(output_wav_dir, subdir, wav_filename)
            content.append((abspath, filesize, trans, subdir))
    files = content

    # Write to CSV file which contains four columns:
    # "wav_filename", "wav_length_ms", "transcript", "speaker".
    df = pandas.DataFrame(
        data=files,
        columns=["wav_filename", "wav_length_ms", "transcript", "speaker"])
    df.to_csv(out_csv_file, index=False, sep="\t")
    logging.info("Successfully generated csv file {}".format(out_csv_file))
def convert_audio_and_split_transcript(dataset_dir, subset, out_csv_file):
    """Convert tar.gz to WAV and split the transcript.

    Reads "<dataset_dir>/<subset>/wav.scp" ("<utt_id> <wav_path>" per line),
    takes the transcript from the "<wav_path minus .wav>.txt" sibling and the
    speaker id from the ".metadata" sibling, then writes a tab-separated csv.

    Args:
        dataset_dir : the directory which holds the input dataset.
        subset : the name of the specified dataset. e.g. dev.
        out_csv_file : the resulting output csv file.
    """
    logging.info("Processing audio and transcript for {}".format(subset))

    content = []
    with open(os.path.join(dataset_dir, subset, 'wav.scp'), 'r') as fin:
        for line in fin:
            wav_file = line.strip().split()[1]
            base = wav_file[:-4]  # path without the ".wav" extension
            # BUG FIX: original `text = read(text_f).open()` followed by
            # `text = ' '.text` raised NameError/AttributeError; read the
            # transcript file directly instead.
            # NOTE(review): if the original intent was char-level
            # tokenization (' '.join(text)), confirm with downstream users.
            with open(base + '.txt', 'r') as ftxt:
                text = ftxt.read().strip()
            # BUG FIX: original used `.read()[22]`, which yields a single
            # character; the speaker id lives on the third line of the
            # metadata file (cf. the ST-CMDS branch elsewhere in this file).
            with open(base + '.metadata') as fmeta:
                spk_line = fmeta.readlines()[2]
            speaker = spk_line.strip().split()[1]
            wav_len = get_wave_file_length(wav_file)
            content.append((wav_file, wav_len, text, speaker))

    df = pandas.DataFrame(
        data=content,
        columns=["wav_filename", "wav_length_ms", "transcript", "speaker"]
    )
    df.to_csv(out_csv_file, index=False, sep="\t")
    logging.info("Successfully generated csv file {}".format(out_csv_file))
def convert_audio_and_split_transcript(input_dir, source_name, target_name,
                                       output_dir, output_file):
    """Convert FLAC to WAV and split the transcript.

    Args:
        input_dir: the directory which holds the input dataset.
        source_name: the name of the specified dataset. e.g. test-clean
        target_name: the directory name for the newly generated audio files.
            e.g. test-clean-wav
        output_dir: the directory to place the newly generated csv files.
        output_file: the name of the newly generated csv file.
            e.g. test-clean.csv
    """
    logging.info("Processing audio and transcript for %s" % source_name)
    source_dir = os.path.join(input_dir, source_name)
    target_dir = os.path.join(input_dir, target_name)
    if not gfile.Exists(target_dir):
        gfile.MakeDirs(target_dir)

    entries = []
    tfm = Transformer()
    # Walk every *.trans.txt; each line holds "<seqid> <transcript>".
    for root, _, filenames in gfile.Walk(source_dir):
        for trans_name in fnmatch.filter(filenames, "*.trans.txt"):
            trans_file = os.path.join(root, trans_name)
            with codecs.open(trans_file, "r", "utf-8") as fin:
                for line in fin:
                    seqid, transcript = line.split(" ", 1)
                    # NFKD-decompose, drop non-ASCII bytes, then decode back
                    # so the result is a plain lowercase str.
                    normalized = unicodedata.normalize("NFKD", transcript)
                    transcript = (normalized
                                  .encode("ascii", "ignore")
                                  .decode("ascii", "ignore")
                                  .strip()
                                  .lower())
                    # Transcode the matching FLAC once per utterance.
                    flac_file = os.path.join(root, seqid + ".flac")
                    wav_file = os.path.join(target_dir, seqid + ".wav")
                    if not gfile.Exists(wav_file):
                        tfm.build(flac_file, wav_file)
                    wav_length = get_wave_file_length(wav_file)
                    entries.append(
                        (os.path.abspath(wav_file), wav_length, transcript))

    # Write to CSV file which contains three columns:
    # "wav_filename", "wav_length_ms", "transcript".
    csv_file_path = os.path.join(output_dir, output_file)
    df = pandas.DataFrame(
        data=entries,
        columns=["wav_filename", "wav_length_ms", "transcript"])
    df.to_csv(csv_file_path, index=False, sep="\t")
    logging.info("Successfully generated csv file {}".format(csv_file_path))
def convert_audio_and_split_transcript(directory, subset, out_csv_file):
    """Convert aidatatang tar.gz archives to WAV and build a csv.

    Args:
        directory : the directory which holds the input dataset.
        subset : the name of the specified dataset. e.g. dev.
        out_csv_file : the resulting output csv file.
    """
    gfile = tf.compat.v1.gfile
    logging.info("Processing audio and transcript for {}".format(subset))
    audio_dir = os.path.join(directory, 'corpus', subset)
    trans_dir = os.path.join(directory, 'transcript')

    files = []
    if not gfile.Exists(os.path.join(directory, subset)):
        data_file = os.path.join(directory, subset)
        os.makedirs(data_file, exist_ok=True)
        for filename in os.listdir(audio_dir):
            # BUG FIX: the original built "tar -zxvf" + audio_dir + filename
            # + " -C" + data_file with no separators, yielding a broken
            # command like "tar -zxvf/.../devT00X.tar.gz -C/.../dev".
            os.system("tar -zxvf " + os.path.join(audio_dir, filename) +
                      " -C " + data_file)

    # Transcript line format: "<utt_id> <char> <char> ...".
    with codecs.open(os.path.join(trans_dir,
                                  "aidatatang_200_zh_transcript.txt"),
                     "r", encoding="utf-8") as f:
        for line in f:
            items = line.strip().split(" ")
            wav_filename = items[0]
            # Join per-character tokens; the old `char_dict` counters were
            # dead code (never read) and have been removed.
            labels = "".join(items[1:])
            files.append((wav_filename + ".wav", labels))

    # Map wav basename -> (length_ms, speaker subdir).
    files_size_dict = {}
    output_wav_dir = os.path.join(directory, subset)
    for root, subdirs, _ in gfile.Walk(output_wav_dir):
        for subdir in subdirs:
            for filename in os.listdir(os.path.join(root, subdir)):
                if filename.strip().split('.')[-1] != 'wav':
                    continue
                files_size_dict[filename] = (
                    get_wave_file_length(os.path.join(root, subdir, filename)),
                    subdir,
                )

    content = []
    for wav_filename, trans in files:
        if wav_filename in files_size_dict:
            filesize, subdir = files_size_dict[wav_filename]
            abspath = os.path.join(output_wav_dir, subdir, wav_filename)
            content.append((abspath, filesize, trans, subdir))
    files = content

    # Write to CSV file which contains four columns:
    # "wav_filename", "wav_length_ms", "transcript", "speaker".
    df = pandas.DataFrame(
        data=files,
        columns=["wav_filename", "wav_length_ms", "transcript", "speaker"])
    df.to_csv(out_csv_file, index=False, sep="\t")
    logging.info("Successfully generated csv file {}".format(out_csv_file))
def convert_audio_and_split_transcript(dataset_dir, total_csv_path):
    """Convert rar to WAV and split the transcript.

    Args:
        dataset_dir : the directory which holds the input dataset.
        total_csv_path : the resulting output csv file.

    LJSpeech-1.1 dir Tree structure:
        LJSpeech-1.1
            -metadata.csv   ("LJ001-0002|raw text|normalized text" per line)
            -wavs
                -LJ001-0001.wav ...
            -pcms
                -audio-LJ001-0001.s16 ...
    """
    # BUG FIX: log message previously read "ProcessingA".
    logging.info("Processing audio and transcript for {}".format("all_files"))
    wav_dir = os.path.join(dataset_dir, "LJSpeech-1.1/wavs/")

    files = []
    with codecs.open(os.path.join(dataset_dir, "LJSpeech-1.1/metadata.csv"),
                     "r", encoding="utf-8") as f:
        for line in f:
            wav_name = line.split('|')[0] + '.wav'
            wav_file = os.path.join(wav_dir, wav_name)
            wav_length = get_wave_file_length(wav_file)
            # Build a char-level transcript from the normalized-text field.
            content = line.split('|')[2]
            clean_content = preprocess(content.rstrip())
            transcript = ' '.join(list(clean_content))
            # A word space becomes three spaces after the char join; turn it
            # into an explicit token.
            transcript = transcript.replace('   ', ' <space>')
            transcript = 'sp1 ' + transcript + ' sil'
            files.append((os.path.abspath(wav_file), wav_length, transcript))

    # BUG FIX: use a context manager instead of a bare open()/close() pair so
    # the handle is released even if a write raises; also iterate the rows
    # directly instead of range(len(...)). Output bytes are unchanged.
    with open(total_csv_path, 'w', encoding="utf-8") as fp:
        fp.write("wav_filename" + '\t' + "wav_length_ms" + '\t' +
                 "transcript" + '\n')
        for wav_path, wav_ms, trans in files:
            fp.write(str(wav_path) + '\t' + str(wav_ms) + '\t' +
                     str(trans) + '\n')
    logging.info("Successfully generated csv file {}".format(total_csv_path))
def convert_audio_and_split_transcript(dataset_dir):
    """Convert rar to WAV and split the transcript.

    Args:
        dataset_dir : the directory which holds the input dataset.
            -----> "examples/asr/didispeech-2/data/
        total_csv_path : the resulting output csv file.
    """
    logging.info(
        "Processing convert_audio_and_split_transcript {}".format("all_files"))
    WAVE_dir = os.path.join(dataset_dir, "WAVE")
    # NOTE(review): SCRIPT_dir is computed but never used below -- it looks
    # like the transcript path should be derived from it; confirm.
    SCRIPT_dir = os.path.join(dataset_dir, "SCRIPT")
    csv_path = os.path.join(dataset_dir, 'total.csv')  #output_dir/adult_total.csv

    # Build a wav_id -> wav_file path dictionary.
    key = []
    value = []
    wav_lists = os.listdir(WAVE_dir)  #SPEAKERxxx
    for wav_list in wav_lists:
        transcript_id = wav_list[:-4]  #00004944-00000065
        wav_file = os.path.join(WAVE_dir, wav_list)  #00004944-00000065.wav
        key.append(transcript_id)
        value.append(wav_file)
    dic = dict(zip(key, value))

    files = []
    is_sentid_line = True
    # NOTE(review): `transcript_path` is never defined in this function, so
    # this loop raises NameError as written; it presumably should point at a
    # transcript file under SCRIPT_dir -- TODO confirm and fix.
    # Transcript alternates lines: "id<TAB>text<TAB>phones" then a
    # pronunciation line (similar to the data-baker corpus); only the first
    # of each pair is consumed.
    for line in open(transcript_path, encoding="UTF-8-sig", errors='ignore'):
        if is_sentid_line:
            transcript_id = line.split('\t')[0]
            speaker = transcript_id.split('-')[0]
            transcript = line.split('\t')[1].strip()
            # Convert full-width upper-case to half-width lower-case.
            transcript = stringpartQ2B(transcript)
            # Strip CJK and ASCII punctuation characters.
            punc = ',。!?【】()《》“‘:;[]{}&,.?()\%-+ ̄~$#@=_、/\\%-+~~$#@=_//,.!?'
            transcript = re.sub(r"[%s]+" % punc, "", transcript)
            wav_file = dic[str(transcript_id)]
            wav_length = get_wave_file_length(wav_file)
            files.append(
                (os.path.abspath(wav_file), wav_length, speaker, transcript))
        is_sentid_line = not is_sentid_line

    df = pandas.DataFrame(
        data=files,
        columns=["wav_filename", "wav_length_ms", "speaker", "transcript"])
    df.to_csv(csv_path, index=False)
    logging.info("Successfully generated total_csv file {}".format(
        csv_path))  ##output_dir/adult_total.csv
def convert_audio_and_split_transcript(dataset_dir, total_csv_path):
    """Build a 3-column csv for the data-baker (BZNSYP) corpus.

    Args:
        dataset_dir : the directory which holds the input dataset.
            -----> /nfs/project/datasets/data_baker_tts
        total_csv_path : the resulting output csv file.

    BZNSYP dir Tree structure:
        BZNSYP
            -ProsodyLabeling
                -000001-010000.txt
            -Wave
                -000001.wav ...
            -PhoneLabeling
                -000001.interval ...
    """
    logging.info("ProcessingA audio and transcript for {}".format("all_files"))
    audio_dir = os.path.join(dataset_dir, "BZNSYP/Wave/")
    prosodyLabel_dir = os.path.join(dataset_dir, "BZNSYP/ProsodyLabeling/")

    records = []
    strip_digits = str.maketrans('', '', digits)
    label_file = os.path.join(prosodyLabel_dir, "000001-010000.txt")
    with codecs.open(label_file, "r", encoding="utf-8") as f:
        for line in f:
            # Text lines start with the utterance id ("000001 ...#4。");
            # the interleaved pinyin lines are skipped.
            if line[0:1] != '0':
                continue
            trimmed = line[:-3]                 # drop the trailing "#4。"
            no_prosody = trimmed.replace('#', '')
            wav_id = no_prosody[0:6]
            wav_file = os.path.join(audio_dir, wav_id + '.wav')
            wav_length = get_wave_file_length(wav_file)
            # Remove the prosody-level digits, keep the raw characters.
            text = no_prosody.translate(strip_digits).strip()
            transcript = normalize_hkust_trans(text)
            records.append(
                (os.path.abspath(wav_file), wav_length, transcript))

    # Write to CSV file which contains three columns.
    df = pandas.DataFrame(
        data=records,
        columns=["wav_filename", "wav_length_ms", "transcript"])
    df.to_csv(total_csv_path, index=False)
    logging.info("Successfully generated csv file {}".format(total_csv_path))
def convert_audio_and_make_label(input_dir, subset, output_dir, output_file):
    """Optionally convert AAC to WAV and make speaker labels.

    Args:
        input_dir: the directory which holds the input dataset.
        subset: the name of the specified subset. e.g. vox1_dev_wav
        output_dir: the directory to place the newly generated csv files.
        output_file: the name of the newly generated csv file.
            e.g. vox1_dev_wav.csv
    """
    logging.info("Preprocessing audio and label for subset %s" % subset)
    source_dir = os.path.join(input_dir, subset)

    rows = []
    for root, _, filenames in gfile.Walk(source_dir):
        for filename in filenames:
            stem, ext = os.path.splitext(filename)
            if ext.lower() == ".wav":
                # Skip wavs generated by this function itself -- they keep
                # the ".m4a" part in their stem (e.g. "xxx.m4a.wav").
                if os.path.splitext(stem)[1]:
                    continue
                wav_file = os.path.join(root, filename)
            elif ext.lower() == ".m4a":
                # Convert AAC to WAV.
                aac_file = os.path.join(root, filename)
                wav_file = aac_file + ".wav"
                if not gfile.Exists(wav_file):
                    if not decode_aac_with_ffmpeg(aac_file, wav_file):
                        raise RuntimeError("Audio decoding failed.")
            else:
                continue
            # Second-to-last path component names the speaker; assign ids
            # in order of first appearance.
            speaker_name = root.split(os.path.sep)[-2]
            speaker_id_dict.setdefault(speaker_name, len(speaker_id_dict))
            wav_length = get_wave_file_length(wav_file)
            rows.append((os.path.abspath(wav_file), wav_length,
                         speaker_id_dict[speaker_name], speaker_name))

    # Write to CSV file which contains four columns:
    # "wav_filename", "wav_length_ms", "speaker_id", "speaker_name".
    csv_file_path = os.path.join(output_dir, output_file)
    df = pandas.DataFrame(data=rows, columns=[
        "wav_filename", "wav_length_ms", "speaker_id", "speaker_name"
    ])
    df.to_csv(csv_file_path, index=False, sep="\t")
    logging.info("Successfully generated csv file {}".format(csv_file_path))
def convert_audio_and_split_transcript(dataset_dir, total_csv_path):
    """Downsample BZNSYP wavs to 24 kHz and build a 3-column csv.

    Args:
        dataset_dir : the directory which holds the input dataset.
        total_csv_path : the resulting output csv file.

    BZNSYP dir Tree structure:
        BZNSYP
            -ProsodyLabeling
                -000001-010000.txt
                -biaobei_prosody.csv
            -Wave
                -000001.wav ...
            -PhoneLabeling
                -000001.interval ...
    """
    logging.info("ProcessingA audio and transcript for {}".format("all_files"))
    audio_dir = os.path.join(dataset_dir, "BZNSYP/")
    # Create the directory that receives the 24 kHz copies.
    subprocess.call(["mkdir", "Wave_24"], cwd=audio_dir)
    prosodyLabel_dir = os.path.join(dataset_dir, "BZNSYP/ProsodyLabeling/")

    records = []
    csv_in = os.path.join(prosodyLabel_dir, "biaobei_prosody.csv")
    with codecs.open(csv_in, "r", encoding="utf-8") as f:
        for line in f:
            fields = line.strip().split("|")
            utt_id = fields[0]
            transcript = fields[1]
            # Downsample 48 kHz -> 24 kHz mono 16-bit through sox.
            wav48k = os.path.join(audio_dir, 'Wave/' + utt_id + '.wav')
            wav24k = os.path.join(audio_dir, 'Wave_24/' + utt_id + '.wav')
            subprocess.getoutput(
                'cat %s | /usr/bin/sox -t wav - -c 1 -b 16 -t wav - rate 24000 > %s'
                % (wav48k, wav24k))
            wav_length = get_wave_file_length(wav24k)
            records.append((os.path.abspath(wav24k), wav_length, transcript))

    # Write to CSV file which contains three columns.
    df = pandas.DataFrame(
        data=records,
        columns=["wav_filename", "wav_length_ms", "transcript"])
    df.to_csv(total_csv_path, index=False)
    logging.info("Successfully generated csv file {}".format(total_csv_path))
def convert_audio_and_split_transcript(input_dir, source_name, output_dir,
                                       output_file):
    """Collect LibriTTS utterances into a 4-column csv.

    LibriTTS dir Tree structure:
        root
            -LibriTTS
                - subset
                    - book id?
                        - speaker id?
                            - bookid_speakerid_paragraphid_sentenceid.normalized.txt
                            - bookid_speakerid_paragraphid_sentenceid.original.txt
                            - bookid_speakerid_paragraphid_sentenceid.wav
                            ...
    """
    logging.info("Processing audio and transcript for %s" % source_name)
    source_dir = os.path.join(input_dir, source_name)

    rows = []
    for root, _, filenames in GFILE.Walk(source_dir):
        for trans_name in fnmatch.filter(filenames, "*.normalized.txt"):
            seqid = trans_name.split('.')[0]
            # The second underscore-separated field of the id is the speaker.
            speaker = seqid.split('_')[1]
            wav_file = os.path.join(root, seqid + '.wav')
            wav_length = get_wave_file_length(wav_file)
            # The normalized transcript is the single first line of the file.
            with codecs.open(os.path.join(root, trans_name), "r", "utf-8") as fin:
                transcript = fin.readline().strip().lower()
            rows.append((os.path.abspath(wav_file), wav_length, transcript,
                         speaker))

    # Write to CSV file which contains four columns:
    # "wav_filename", "wav_length_ms", "transcript", "speaker".
    csv_file_path = os.path.join(output_dir, output_file)
    df = pandas.DataFrame(
        data=rows,
        columns=["wav_filename", "wav_length_ms", "transcript", "speaker"])
    df.to_csv(csv_file_path, index=False, sep="\t")
    logging.info("Successfully generated csv file {}".format(csv_file_path))
def convert_audio_and_split_transcript(dataset_dir, subset, out_csv_file):
    """Convert tar.gz to WAV and split the transcript.

    Args:
        dataset_dir : the directory which holds the input dataset.
        subset : the name of the specified dataset. e.g. dev.
        out_csv_file : the resulting output csv file.
    """
    gfile = tf.compat.v1.gfile
    logging.info("Processing audio and transcript for {}".format(subset))
    src_dir = os.path.join(dataset_dir, subset)
    tar_dir = os.path.join(dataset_dir, 'data_copy')
    if not gfile.Exists(src_dir):
        raise ValueError(src_dir, "directory is not exists.")

    entries = []
    for trans_name in fnmatch.filter(os.listdir(src_dir), "*.wav.trn"):
        wav_name = trans_name.split('.')[0] + '.wav'
        trans_file = os.path.join(tar_dir, trans_name)
        wav_file = os.path.join(tar_dir, wav_name)
        wav_file_size = get_wave_file_length(wav_file)
        # The first line of the .trn file is the space-separated transcript.
        with codecs.open(trans_file, "r", "utf-8") as fin:
            first_line = fin.readlines()[0].strip('\n')
        labels = "".join(first_line.strip().split(" "))
        # Speaker id is the prefix before the first underscore.
        speaker = wav_name.split('_')[0]
        entries.append((wav_file, wav_file_size, labels, speaker))

    # Write to CSV file which contains four columns:
    # "wav_filename", "wav_length_ms", "transcript", "speaker".
    df = pandas.DataFrame(
        data=entries,
        columns=["wav_filename", "wav_length_ms", "transcript", "speaker"])
    df.to_csv(out_csv_file, index=False, sep="\t")
    logging.info("Successfully generated csv file {}".format(out_csv_file))
def convert_audio_and_split_transcript(directory, subset, out_csv_file):
    """Build a 4-column csv (wav, length, transcript, speaker) for MagicData.

    Reads "<subset>/TRANS.txt" (tab-separated: utterance, speaker,
    transcript) and pairs each entry with the wav found in the per-speaker
    subdirectories under "<subset>".
    """
    gfile = tf.compat.v1.gfile
    logging.info("Processing audio and transcript for {}".format(subset))
    audio_dir = os.path.join(directory, subset)
    trans_dir = os.path.join(directory, subset, "TRANS.txt")

    # (wav name, speaker, transcript) triples; first line is a header.
    entries = []
    with codecs.open(trans_dir, 'r', encoding='utf-8') as f:
        for line in f.readlines()[1:]:
            fields = line.strip().split('\t')
            entries.append((fields[0], fields[1], fields[2]))

    # Map wav basename -> (length_ms, containing subdir).
    wav_info = {}
    for root, subdirs, _ in gfile.Walk(audio_dir):
        for subdir in subdirs:
            for filename in os.listdir(os.path.join(root, subdir)):
                wav_path = os.path.join(root, subdir, filename)
                wav_info[filename] = (get_wave_file_length(wav_path), subdir)

    rows = []
    for wav_filename, speaker, trans in entries:
        if wav_filename not in wav_info:
            continue  # transcript entry without a matching wav
        filesize, subdir = wav_info[wav_filename]
        abspath = os.path.join(audio_dir, subdir, wav_filename)
        rows.append((abspath, filesize, trans, speaker))

    df = pandas.DataFrame(
        data=rows,
        columns=["wav_filename", "wav_length_ms", "transcript", "speaker"])
    df.to_csv(out_csv_file, index=False, sep="\t")
    logging.info("Successfully generated csv file {}".format(out_csv_file))
def convert_audio_and_split_transcript(dataset_dir, subset, out_csv_file,
                                       output_dir):
    """Convert SPH to WAV and split the transcript.

    Args:
        dataset_dir : the directory which holds the input dataset.
        subset : the name of the specified dataset. e.g. dev.
        out_csv_file : the resulting output csv file.
        output_dir : Athena working directory.
    """
    gfile = tf.compat.v1.gfile
    sph2pip = os.path.join(os.path.dirname(__file__),
                           "../../../../tools/sph2pipe/sph2pipe")
    text_featurizer = TextFeaturizer()
    logging.info("Processing audio and transcript for %s" % subset)
    audio_dir = os.path.join(dataset_dir, "LDC2005S15/")
    trans_dir = os.path.join(dataset_dir, "LDC2005T32/")
    output_wav_dir = os.path.join(output_dir, subset + "/wav")
    if not gfile.Exists(output_wav_dir):
        gfile.MakeDirs(output_wav_dir)

    files = []
    char_dict = {}

    # Index every .sph file belonging to this subset by basename so the
    # transcript loop below can look the audio up.
    sph_files_dict = {}
    for root, _, filenames in gfile.Walk(audio_dir):
        for filename in fnmatch.filter(filenames, "*.sph"):
            if subset in root:
                sph_key = os.path.splitext(filename)[0]
                sph_file = os.path.join(root, filename)
                sph_files_dict[sph_key] = sph_file

    # Convert all SPH file into WAV format.
    # Generate the JSON file and char dict file.
    with TemporaryDirectory(dir=output_dir) as output_tmp_wav_dir:
        for root, _, filenames in gfile.Walk(trans_dir):
            if not re.match('.*/' + subset + '.*', root):
                continue
            for filename in fnmatch.filter(filenames, "*.txt"):
                trans_file = os.path.join(root, filename)
                sph_key = ""
                speaker_A = ""
                speaker_B = ""
                # HKUST transcripts are GB18030-encoded.
                with codecs.open(trans_file, "r", "gb18030") as fin:
                    for line in fin:
                        line = line.strip()
                        if len(line.split(" ")) <= 1:
                            continue
                        if len(line.split(" ")) == 2:
                            # Header line "<idx> <sph_key>"; the key's 3rd
                            # and 4th underscore fields are the speaker ids.
                            sph_key = line.split(" ")[1]
                            speaker_A = sph_key.split("_")[2]
                            speaker_B = sph_key.split("_")[3]
                            continue
                        # Utterance line: "<start> <end> <spk:> <text>".
                        time_start, time_end, speaker, transcript = line.split(
                            " ", 3)
                        time_start = float(time_start)
                        time_end = float(time_end)
                        # too short, skip the wave file
                        if time_end - time_start <= 0.1:
                            continue
                        speaker = speaker[0]  # remove ':' in 'A:'
                        # Speaker A sits on channel 1 of the stereo SPH,
                        # speaker B on channel 2.
                        if speaker == "A":
                            channel = 1
                            speaker_id = speaker_A
                        else:
                            channel = 2
                            speaker_id = speaker_B

                        # Convert SPH to split WAV.
                        sph_file = sph_files_dict[sph_key]
                        wav_file = os.path.join(
                            output_tmp_wav_dir,
                            sph_key + "." + speaker[0] + ".wav")
                        if not gfile.Exists(sph_file):
                            raise ValueError(
                                "the sph file {} is not exists".format(
                                    sph_file))
                        if not gfile.Exists(wav_file):
                            # Extract this speaker's channel once per
                            # conversation into the temporary directory.
                            sph2pipe_cmd = (
                                sph2pip
                                + " -f wav -c {} -p ".format(str(channel))
                                + sph_file + " " + wav_file)
                            os.system(sph2pipe_cmd)
                        # Kaldi-style segment name with start/end in
                        # centiseconds.
                        sub_wav_filename = "{0}-{1}-{2:06d}-{3:06d}".format(
                            sph_key, speaker, int(time_start * 100),
                            int(time_end * 100))
                        sub_wav_file = os.path.join(output_wav_dir,
                                                    sub_wav_filename + ".wav")
                        if not gfile.Exists(sub_wav_file):
                            # Cut the utterance span out of the channel wav.
                            tfm = Transformer()
                            tfm.trim(time_start, time_end)
                            tfm.build(wav_file, sub_wav_file)
                        wav_length = get_wave_file_length(sub_wav_file)
                        transcript = normalize_hkust_trans(transcript)
                        transcript = text_featurizer.delete_punct(transcript)
                        if len(transcript) > 0:
                            # NOTE(review): first occurrence is counted as 0
                            # and char_dict is never consumed afterwards.
                            for char in transcript:
                                if char in char_dict:
                                    char_dict[char] += 1
                                else:
                                    char_dict[char] = 0
                            files.append((
                                os.path.abspath(sub_wav_file),
                                wav_length,
                                transcript,
                                speaker_id,
                            ))

    # Write to CSV file which contains three columns:
    # "wav_filename", "wav_length_ms", "labels".
    df = pandas.DataFrame(
        data=files,
        columns=["wav_filename", "wav_length_ms", "transcript", "speaker"])
    df.to_csv(out_csv_file, index=False, sep="\t")
    logging.info("Successfully generated csv file {}".format(out_csv_file))
def convert_audio_and_split_transcript(directory, subset, out_csv_file,
                                       DATASET):
    """Prepare one of several Mandarin corpora as a tab-separated csv.

    Dispatches on DATASET ('aidatatang', 'magic_data', 'primewords', and
    otherwise ST-CMDS) and writes a csv with columns wav_filename,
    wav_length_ms, transcript, speaker.
    """
    if DATASET not in DATASETS:
        raise ValueError(DATASET, 'is not in DATASETS')
    gfile = tf.compat.v1.gfile
    logging.info("Processing audio and transcript for {}_{}".format(
        DATASET, subset))
    # Prepare aidatatang data to csv
    if DATASET == 'aidatatang':
        audio_dir = os.path.join(directory, 'corpus', subset)
        trans_dir = os.path.join(directory, 'transcript')
        files = []
        char_dict = {}  # NOTE(review): built but never used
        if not gfile.Exists(os.path.join(directory, subset)):
            data_file = os.path.join(directory, subset)
            os.makedirs(data_file, exist_ok=True)
            for filename in os.listdir(audio_dir):
                # NOTE(review): missing spaces after -zxvf/-C and missing
                # path separator between audio_dir and filename -- this
                # command line looks broken; TODO confirm and fix.
                os.system("tar -zxvf" + audio_dir + filename + " -C" +
                          data_file)
        with codecs.open(os.path.join(
                trans_dir, "aidatatang_200_zh_transcript.txt"),
                         "r", encoding="utf-8") as f:
            for line in f:
                # Line format: "<utt_id> <char> <char> ...".
                items = line.strip().split(" ")
                wav_filename = items[0]
                labels = ""
                for item in items[1:]:
                    labels += item
                    if item in char_dict:
                        char_dict[item] += 1
                    else:
                        char_dict[item] = 0
                files.append((wav_filename + ".wav", labels))
        # Map wav basename -> (length_ms, speaker subdir).
        files_size_dict = {}
        output_wav_dir = os.path.join(directory, subset)
        for root, subdirs, _ in gfile.Walk(output_wav_dir):
            for subdir in subdirs:
                for filename in os.listdir(os.path.join(root, subdir)):
                    if filename.strip().split('.')[-1] != 'wav':
                        continue
                    files_size_dict[filename] = (
                        get_wave_file_length(
                            os.path.join(root, subdir, filename)),
                        subdir,
                    )
        content = []
        for wav_filename, trans in files:
            if wav_filename in files_size_dict:
                filesize, subdir = files_size_dict[wav_filename]
                abspath = os.path.join(output_wav_dir, subdir, wav_filename)
                content.append((abspath, filesize, trans, subdir))
    # Prepare magic_data data to csv
    elif DATASET == 'magic_data':
        audio_dir = os.path.join(directory, subset)
        trans_dir = os.path.join(directory, subset, "TRANS.txt")
        files = []
        with codecs.open(trans_dir, 'r', encoding='utf-8') as f:
            lines = f.readlines()
            # Skip the header row; fields are utt, speaker, transcript.
            for line in lines[1:]:
                items = line.strip().split('\t')
                wav_filename = items[0]
                labels = items[2]
                speaker = items[1]
                files.append((wav_filename, speaker, labels))
        files_size_dict = {}
        for root, subdirs, _ in gfile.Walk(audio_dir):
            for subdir in subdirs:
                for filename in os.listdir(os.path.join(root, subdir)):
                    files_size_dict[filename] = (
                        get_wave_file_length(
                            os.path.join(root, subdir, filename)),
                        subdir,
                    )
        content = []
        for wav_filename, speaker, trans in files:
            if wav_filename in files_size_dict:
                filesize, subdir = files_size_dict[wav_filename]
                abspath = os.path.join(audio_dir, subdir, wav_filename)
                content.append((abspath, filesize, trans, speaker))
    # Prepare primewords data to csv
    elif DATASET == 'primewords':
        audio_dir = os.path.join(directory, "audio_files")
        trans_dir = os.path.join(directory, "set1_transcript.json")
        files = []
        with codecs.open(trans_dir, 'r', encoding='utf-8') as f:
            # NOTE(review): eval() on file content -- json.load would be the
            # safer choice if the file is trusted JSON.
            items = eval(f.readline())
            for item in items:
                wav_filename = item['file']
                # labels = item['text']
                labels = ''.join([x for x in item['text'].split()])
                # speaker = item['user_id']
                files.append((wav_filename, labels))
        files_size_dict = {}
        for subdir in os.listdir(audio_dir):
            for root, subsubdirs, _ in gfile.Walk(
                    os.path.join(audio_dir, subdir)):
                for subsubdir in subsubdirs:
                    for filename in os.listdir(os.path.join(root, subsubdir)):
                        files_size_dict[filename] = (os.path.join(
                            root, subsubdir, filename), root, subsubdir)
        content = []
        for wav_filename, trans in files:
            if wav_filename in files_size_dict:
                # NOTE(review): "filesize" here is the file PATH stored in
                # the loop above (get_wave_file_length is never called in
                # this branch), so the wav_length_ms column ends up holding
                # a path; TODO confirm and fix.
                filesize, root, subsubdir = files_size_dict[wav_filename]
                abspath = os.path.join(root, subsubdir, wav_filename)
                content.append((abspath, filesize, trans, None))
    # Prepare ST-CMDS data to csv
    else:
        content = []
        for filename in os.listdir(directory):
            if filename.split('.')[-1] == 'wav':
                trans_file = os.path.join(directory,
                                          filename.split('.')[0] + '.txt')
                metadata_file = os.path.join(
                    directory, filename.split('.')[0] + '.metadata')
                with codecs.open(trans_file, 'r', encoding='utf-8') as f:
                    labels = f.readline()
                # Speaker id is the last token on the third metadata line.
                with codecs.open(metadata_file, 'r', encoding='utf-8') as f:
                    speaker = f.readlines()[2].split()[-1]
                abspath = os.path.join(directory, filename)
                filesize = get_wave_file_length(abspath)
                content.append((abspath, filesize, labels, speaker))
    files = content
    df = pandas.DataFrame(
        data=files,
        columns=["wav_filename", "wav_length_ms", "transcript", "speaker"])
    df.to_csv(out_csv_file, index=False, sep="\t")
    logging.info("Successfully generated csv file {}".format(out_csv_file))
def convert_audio_and_split_transcript(directory, subset, output_dir):
    """Convert SPH to WAV and split the transcript.

    Args:
        directory: the directory which holds the input dataset.
        subset: the name of the specified dataset.
            supports train (switchboard+fisher), switchboard, fisher,
            hub500 and rt03s.
        output_dir: the directory to place the newly generated csv files.
    """
    logging.info("Processing audio and transcript for %s" % subset)
    gfile = tf.compat.v1.gfile
    sph2pip = os.path.join(os.path.dirname(__file__), "../utils/sph2pipe")

    # LDC catalogue directories for the individual corpora.
    swd_audio_trans_dir = [os.path.join(directory, "LDC97S62")]
    fisher_audio_dirs = [
        os.path.join(directory, "LDC2004S13"),
        os.path.join(directory, "LDC2005S13"),
    ]
    fisher_trans_dirs = [
        os.path.join(directory, "LDC2004T19"),
        os.path.join(directory, "LDC2005T19"),
    ]
    hub_audio_dir = [os.path.join(directory, "LDC2002S09")]
    hub_trans_dir = [os.path.join(directory, "LDC2002T43")]
    rts_audio_trans_dir = [os.path.join(directory, "LDC2007S10")]

    if subset == "train":
        # Combination of switchboard corpus and fisher corpus.
        audio_dir = swd_audio_trans_dir + fisher_audio_dirs
        trans_dir = swd_audio_trans_dir + fisher_trans_dirs
    elif subset == "switchboard":
        audio_dir = swd_audio_trans_dir
        trans_dir = swd_audio_trans_dir
    elif subset == "fisher":
        audio_dir = fisher_audio_dirs
        trans_dir = fisher_trans_dirs
    elif subset == "hub500":
        audio_dir = hub_audio_dir
        trans_dir = hub_trans_dir
    elif subset == "rt03s":
        audio_dir = rts_audio_trans_dir
        trans_dir = rts_audio_trans_dir
    else:
        raise ValueError(subset, " is not in switchboard_fisher")

    subset_dir = os.path.join(directory, subset)
    if not gfile.Exists(subset_dir):
        gfile.MakeDirs(subset_dir)
    output_wav_dir = os.path.join(directory, subset + "/wav")
    if not gfile.Exists(output_wav_dir):
        gfile.MakeDirs(output_wav_dir)
    tmp_dir = os.path.join(directory, "tmp")
    if not gfile.Exists(tmp_dir):
        gfile.MakeDirs(tmp_dir)

    # Build SPH dict.
    files = []
    sph_files_dict = {}
    for sub_audio_dir in audio_dir:
        for root, _, filenames in gfile.Walk(sub_audio_dir):
            for filename in fnmatch.filter(filenames, "*.[Ss][Pp][Hh]"):
                sph_key = os.path.splitext(filename)[0]
                sph_file = os.path.join(root, filename)
                sph_files_dict[sph_key] = sph_file

    with TemporaryDirectory(dir=tmp_dir) as output_tmp_wav_dir:
        for sub_trans_dir in trans_dir:
            # Each corpus uses its own transcript naming scheme and
            # line-normalisation helper.
            if sub_trans_dir in swd_audio_trans_dir:
                fnmatch_pat = "*-trans.text"
                split_and_norm_func = split_line_and_norm_swd
            elif sub_trans_dir in fisher_trans_dirs:
                fnmatch_pat = "*.[Tt][Xx][Tt]"
                split_and_norm_func = split_line_and_norm_fisher
            elif sub_trans_dir in hub_trans_dir:
                fnmatch_pat = "hub5e00.english.000405.stm"
                split_and_norm_func = split_line_and_norm_hub_rts
            else:
                fnmatch_pat = "*.stm"
                split_and_norm_func = split_line_and_norm_hub_rts
            for root, _, filenames in gfile.Walk(sub_trans_dir):
                for filename in fnmatch.filter(filenames, fnmatch_pat):
                    trans_file = os.path.join(root, filename)
                    # Skip documentation folders and non-English material.
                    if 1 in [
                            ele in root for ele in [
                                "doc",
                                "DOC",
                                "mandarin",
                                "arabic",
                                "concatenated",
                                "bnews",
                            ]
                    ]:
                        continue
                    with codecs.open(trans_file, "r", "utf-8") as fin:
                        for line in fin:
                            line = line.strip()
                            (
                                sph_key,
                                speaker,
                                time_start,
                                time_end,
                                norm_trans,
                            ) = split_and_norm_func(line, filename)
                            # Too short, skip the wave file
                            if time_end - time_start <= 0.1:
                                continue
                            if norm_trans == "":
                                continue
                            # Speaker A is channel 1, anyone else channel 2.
                            if speaker == "A":
                                channel = 1
                            else:
                                channel = 2
                            # Convert SPH to split WAV.
                            if sph_key not in sph_files_dict:
                                print(sph_key + " not found, please check.")
                                continue
                            sph_file = sph_files_dict[sph_key]
                            wav_file = os.path.join(
                                output_tmp_wav_dir,
                                sph_key + "." + speaker + ".wav")
                            if not gfile.Exists(sph_file):
                                raise ValueError(
                                    "the sph file {} is not exists".format(
                                        sph_file))
                            # Segment name "<key>-<spk>-<start>-<end>" with
                            # times in centiseconds.
                            sub_wav_filename = "{0}-{1}-{2:06d}-{3:06d}".format(
                                sph_key,
                                speaker,
                                round(time_start * 100),
                                round(time_end * 100),
                            )
                            sub_wav_file = os.path.join(
                                output_wav_dir, sub_wav_filename + ".wav")
                            if not gfile.Exists(sub_wav_file):
                                if not gfile.Exists(wav_file):
                                    # Extract this speaker's channel once
                                    # per conversation.
                                    sph2pipe_cmd = (sph2pip +
                                                    " -f wav -c {} -p ".format(
                                                        str(channel)) +
                                                    sph_file + " " + wav_file)
                                    os.system(sph2pipe_cmd)
                                # Cut the utterance span out of the
                                # channel wav.
                                tfm = Transformer()
                                tfm.trim(time_start, time_end)
                                tfm.build(wav_file, sub_wav_file)
                            # wav_filesize = os.path.getsize(sub_wav_file)
                            wav_length = get_wave_file_length(sub_wav_file)
                            speaker_name = sph_key + "-" + speaker
                            files.append(
                                (os.path.abspath(sub_wav_file), wav_length,
                                 norm_trans, speaker_name))

    # Write to CSV file which contains four columns:
    # "wav_filename", "wav_length_ms", "transcript", "speaker".
    out_csv_file = os.path.join(output_dir, subset + ".csv")
    df = pandas.DataFrame(
        data=files,
        columns=["wav_filename", "wav_length_ms", "transcript", "speaker"])
    df.to_csv(out_csv_file, index=False, sep="\t")
    logging.info("Successfully generated csv file {}".format(out_csv_file))
def convert_audio_and_split_transcript(dataset_dir, subset, output_dir,
                                       trans_type="phn", phone_map_amount=None):
    """Convert SPH to WAV and split the transcript for TIMIT.

    Args:
        dataset_dir: the directory which holds the input dataset.
        subset: the name of the specified dataset. e.g. train
        output_dir: the directory to place the newly generated csv files.
        trans_type: the type of transcript. for timit: "char" or "phn".
        phone_map_amount: convert phones according to "phones.60-48-39.map",
            should be None or "48" or "39".

    Raises:
        ValueError: if trans_type is neither "char" nor "phn".
    """
    logging.info("Processing audio and transcript for %s" % subset)
    csv_file_path = os.path.join(output_dir, subset.lower() + ".csv")
    output_wav_dir = os.path.join(output_dir, "wav", subset)
    speaker_sets = get_speakers(subset.lower())
    sph2pipe = os.path.join(os.path.dirname(__file__),
                            "../../../../tools/sph2pipe/sph2pipe")
    phone_map_file = "examples/asr/timit/local/phones.60-48-39.map"
    if not os.path.exists(output_wav_dir):
        os.makedirs(output_wav_dir)

    if trans_type == "char":
        fnmatch_pattern = "*.TXT"
    elif trans_type == "phn":
        fnmatch_pattern = "*.PHN"
    else:
        # BUG FIX: the original raised ValueError(fnmatch_pattern, ...) here,
        # but `fnmatch_pattern` is unassigned on this path, so the call itself
        # crashed with NameError. Report the offending trans_type instead.
        raise ValueError(trans_type, "is not an effective transcript type in TIMIT")

    # utterances of "dev" and "test" (core-test) sets come from "test" set
    if subset == "DEV":
        subset = "TEST"
    subset_dir = os.path.join(dataset_dir, subset)

    # Build the 60->48 / 60->39 phone mapping (identity map when
    # phone_map_amount is None).
    phone_map = {}
    with codecs.open(phone_map_file, "r", "utf-8") as phones:
        for line in phones:
            # handle phones that are mapped to empty
            if len(line.strip().split("\t")) == 1:
                phone = line.strip()
                phone_48, phone_39 = "", ""
            else:
                phone, phone_48, phone_39 = line.strip().split("\t", 2)
            if phone_map_amount == "48":
                phone_map[phone] = phone_48
            elif phone_map_amount == "39":
                phone_map[phone] = phone_39
            else:
                phone_map[phone] = phone

    files = []
    for root, _, filenames in tf.compat.v1.gfile.Walk(subset_dir):
        for filename in fnmatch.filter(filenames, fnmatch_pattern):
            trans_file = os.path.join(root, filename)
            # SA* sentences are the dialect-calibration utterances that every
            # speaker reads; conventionally excluded from TIMIT recipes.
            if filename.startswith("SA") or filename.startswith("."):
                continue
            with codecs.open(trans_file, "r", "utf-8") as fin:
                if trans_type == "char":
                    for line in fin:
                        begin_sample, end_sample, transcript = line.strip().split(" ", 2)
                        transcript = transcript.lower()
                        transcript = re.sub('[.,?!;:"]', '', transcript)
                else:
                    phns = []
                    for line in fin:
                        begin_sample, end_sample, phn = line.strip().split(" ", 2)
                        phns.append(phn)
                    # join twice to remove extra spaces in transcript caused by
                    # phones mapped to the empty string
                    transcript = " ".join(
                        " ".join(phone_map[item] for item in phns).split())
            speaker = os.path.basename(root)
            file_name = os.path.splitext(filename)[0]
            # sphere to wav
            sph_file = os.path.join(root, file_name + ".WAV")
            wav_file = os.path.join(output_wav_dir,
                                    speaker + "-" + file_name + ".WAV")
            if not tf.compat.v1.gfile.Exists(wav_file):
                sph2pipe_cmd = (
                    sph2pipe + " -f wav -c 1 -p " + sph_file + " " + wav_file
                )
                os.system(sph2pipe_cmd)
            wav_length = get_wave_file_length(wav_file)
            # When a speaker whitelist exists, keep only whitelisted speakers;
            # otherwise keep everything.
            if speaker_sets:
                if speaker in speaker_sets:
                    files.append((os.path.abspath(wav_file), wav_length,
                                  transcript, speaker))
            else:
                files.append((os.path.abspath(wav_file), wav_length,
                              transcript, speaker))

    # Write to CSV file which contains four columns:
    # "wav_filename", "wav_length_ms", "transcript", "speaker".
    df = pandas.DataFrame(
        data=files,
        columns=["wav_filename", "wav_length_ms", "transcript", "speaker"]
    )
    df.to_csv(csv_file_path, index=False, sep="\t")
    logging.info("Successfully generated csv file {}".format(csv_file_path))
import sys, os
import pandas
import logging

# Build one csv per input list file. Each list line has the form
# "<speaker_name> <wav_file>". Speaker ids are assigned in first-seen order
# and the dict is shared across all three lists, so train/dev/test use one
# consistent speaker-id space.
_, train_list_file, dev_list_file, test_list_file = sys.argv

speaker_id_dict = {}
for list_file in (train_list_file, dev_list_file, test_list_file):
    files = []
    csv_file = list_file + ".csv"
    with open(list_file, "r") as LIST:
        # Iterate the file object directly instead of materializing the whole
        # file with readlines().
        for line in LIST:
            speaker_name, wav_file = line.strip().split()
            if speaker_name not in speaker_id_dict:
                # next free id == current dict size
                speaker_id_dict[speaker_name] = len(speaker_id_dict)
            # NOTE(review): get_wave_file_length is not defined or imported in
            # this visible script -- presumably provided elsewhere in the
            # module; confirm before running standalone.
            wav_length = get_wave_file_length(wav_file)
            # utt_key example: "spk1_utt001" for ".../utt001.wav"
            utt_key = speaker_name + "_" + wav_file.split("/")[-1].split(".")[0]
            files.append(
                (os.path.abspath(wav_file), wav_length,
                 speaker_id_dict[speaker_name], speaker_name, utt_key))
    df = pandas.DataFrame(data=files, columns=[
        "wav_filename", "wav_length_ms", "speaker_id", "speaker_name", "utt_key"
    ])
    df.to_csv(csv_file, index=False, sep="\t")
    print("Successfully generated csv file {}".format(csv_file))