Esempio n. 1
0
def convert_audio_and_split_transcript(dataset_dir, subset, out_csv_file):
    """Extract the audio archives if needed and write a CSV for one subset.

    Args:
        dataset_dir: directory holding the input dataset. It is expected to
            contain ``wav/`` (the ``tar.gz`` archives and, once extracted, one
            directory per subset) and
            ``transcript/aishell_transcript_v0.8.txt``.
        subset: name of the subset directory under ``wav/``, e.g. ``dev``.
        out_csv_file: path of the tab-separated CSV file to write.

    The CSV has four columns: ``wav_filename`` (path of the wav file),
    ``wav_length_ms`` (the value returned by ``get_wave_file_length``),
    ``transcript`` (the transcript tokens concatenated without separators) and
    ``speaker`` (the name of the directory that contains the wav file).
    Only wav files that have a transcript entry are listed.
    """
    # Local import: only the one-off extraction step needs it.
    import subprocess

    gfile = tf.compat.v1.gfile
    logging.info("Processing audio and transcript for {}".format(subset))
    audio_dir = os.path.join(dataset_dir, "wav/")
    trans_dir = os.path.join(dataset_dir, "transcript/")
    output_wav_dir = os.path.join(audio_dir, subset)

    if not gfile.Exists(output_wav_dir):  # archives not extracted yet
        for filename in os.listdir(audio_dir):
            archive = os.path.join(audio_dir, filename)
            # Argument list instead of a shell string, so paths containing
            # spaces or shell metacharacters are passed to tar unchanged.
            result = subprocess.run(
                ["tar", "-zxvf", archive, "-C", audio_dir], check=False)
            if result.returncode != 0:
                logging.warning(
                    "tar exited with status %d while extracting %s",
                    result.returncode, archive)

    # Each transcript line is "<utterance id> <token> <token> ...".
    files = []
    with codecs.open(os.path.join(trans_dir, "aishell_transcript_v0.8.txt"),
                     "r",
                     encoding="utf-8") as f:
        for line in f:
            items = line.strip().split(" ")
            files.append((items[0] + ".wav", "".join(items[1:])))

    # Map every file name found one level below a walked directory to
    # (length, name of the containing directory).
    files_size_dict = {}
    for root, subdirs, _ in gfile.Walk(output_wav_dir):
        for subdir in subdirs:
            for filename in os.listdir(os.path.join(root, subdir)):
                files_size_dict[filename] = (
                    get_wave_file_length(os.path.join(root, subdir, filename)),
                    subdir,
                )

    content = []
    for wav_filename, trans in files:
        if wav_filename in files_size_dict:  # wav which has trans is valid
            filesize, subdir = files_size_dict[wav_filename]
            abspath = os.path.join(output_wav_dir, subdir, wav_filename)
            content.append((abspath, filesize, trans, subdir))

    # Write the CSV file with four columns:
    # "wav_filename", "wav_length_ms", "transcript", "speaker".
    df = pandas.DataFrame(
        data=content,
        columns=["wav_filename", "wav_length_ms", "transcript", "speaker"])
    df.to_csv(out_csv_file, index=False, sep="\t")
    logging.info("Successfully generated csv file {}".format(out_csv_file))
Esempio n. 2
0
def convert_audio_and_split_transcript(dataset_dir, subset, out_csv_file):
    """Build a CSV for one subset from its ``wav.scp`` listing.

    Args:
        dataset_dir: directory which holds the input dataset.
        subset: name of the subset directory, e.g. ``dev``. The file
            ``<dataset_dir>/<subset>/wav.scp`` is read; the second
            whitespace-separated field of each line is used as the wav path.
        out_csv_file: path of the tab-separated CSV file to write.

    For every wav path, the transcript is read from the sibling ``.txt`` file
    and the speaker from the sibling ``.metadata`` file (same path with the
    last four characters replaced).

    The CSV has four columns: ``wav_filename``, ``wav_length_ms`` (the value
    returned by ``get_wave_file_length``), ``transcript`` and ``speaker``.
    """
    logging.info("Processing audio and transcript for {}".format(subset))

    content = []
    with open(os.path.join(dataset_dir, subset, 'wav.scp'), 'r') as fin:
        for line in fin:
            fields = line.strip().split()
            if len(fields) < 2:  # blank or malformed line: nothing to index
                continue
            wav_path = fields[1]
            stem = wav_path[:-4]  # drop the 4-character extension (".wav")

            # Transcript. The original code read `' '.text` here, which cannot
            # run; whitespace is normalised to single spaces instead.
            # NOTE(review): confirm this is the intended transcript format.
            with open(stem + '.txt', 'r', encoding='utf-8') as text_file:
                text = ' '.join(text_file.read().split())

            # Speaker id: second field of the 23rd line of the metadata file.
            # NOTE(review): the original indexed the file *content* with [22]
            # (a single character), which always failed on the split below;
            # indexing the *lines* is assumed to be the intent - confirm
            # against a real .metadata file.
            with open(stem + '.metadata', 'r') as meta_file:
                spk_line = meta_file.readlines()[22]
            speaker = spk_line.strip().split()[1]

            wav_len = get_wave_file_length(wav_path)
            content.append((wav_path, wav_len, text, speaker))

    df = pandas.DataFrame(
        data=content, columns=["wav_filename", "wav_length_ms", "transcript", "speaker"]
    )
    df.to_csv(out_csv_file, index=False, sep="\t")
    logging.info("Successfully generated csv file {}".format(out_csv_file))
Esempio n. 3
0
def convert_audio_and_split_transcript(input_dir,
                                       source_name,
                                       target_name,
                                       output_dir,
                                       output_file):
    """Convert every FLAC utterance to WAV and index it in a CSV file.

    Args:
        input_dir: the directory which holds the input dataset.
        source_name: name of the subset directory to read, e.g. test-clean.
        target_name: name of the directory (created under ``input_dir`` when
            missing) that receives the WAV files, e.g. test-clean-wav.
        output_dir: the directory to place the generated csv file in.
        output_file: the name of the generated csv file, e.g. test-clean.csv.

    Every ``*.trans.txt`` file below the subset directory is read; each of its
    lines is "<sequence id> <transcript>". The CSV is tab-separated with the
    columns ``wav_filename``, ``wav_length_ms`` and ``transcript``.
    """
    logging.info("Processing audio and transcript for %s" % source_name)
    flac_root = os.path.join(input_dir, source_name)
    wav_root = os.path.join(input_dir, target_name)
    if not gfile.Exists(wav_root):
        gfile.MakeDirs(wav_root)

    converter = Transformer()
    rows = []
    for folder, _, names in gfile.Walk(flac_root):
        for trans_name in fnmatch.filter(names, "*.trans.txt"):
            trans_path = os.path.join(folder, trans_name)
            with codecs.open(trans_path, "r", "utf-8") as handle:
                for entry in handle:
                    utt_id, raw_text = entry.split(" ", 1)

                    # Reduce the text to plain ASCII: decompose accented
                    # characters, drop what cannot be encoded, then go back
                    # from bytes to str.
                    decomposed = unicodedata.normalize("NFKD", raw_text)
                    ascii_bytes = decomposed.encode("ascii", "ignore")
                    text = ascii_bytes.decode("ascii", "ignore").strip().lower()

                    # Convert only utterances that have no WAV file yet.
                    src = os.path.join(folder, utt_id + ".flac")
                    dst = os.path.join(wav_root, utt_id + ".wav")
                    if not gfile.Exists(dst):
                        converter.build(src, dst)

                    duration = get_wave_file_length(dst)
                    rows.append((os.path.abspath(dst), duration, text))

    # Tab-separated CSV with the three columns named below.
    csv_file_path = os.path.join(output_dir, output_file)
    frame = pandas.DataFrame(
        data=rows, columns=["wav_filename", "wav_length_ms", "transcript"]
    )
    frame.to_csv(csv_file_path, index=False, sep="\t")
    logging.info("Successfully generated csv file {}".format(csv_file_path))
Esempio n. 4
0
def convert_audio_and_split_transcript(directory, subset, out_csv_file):
    """Extract one subset's audio archives if needed and write its CSV.

    Args:
        directory: dataset root. It is expected to contain
            ``corpus/<subset>/`` (the ``tar.gz`` archives) and
            ``transcript/aidatatang_200_zh_transcript.txt``.
        subset: name of the subset; the archives are extracted into
            ``<directory>/<subset>`` when that directory does not exist yet.
        out_csv_file: path of the tab-separated CSV file to write.

    The CSV has four columns: ``wav_filename`` (path of the wav file),
    ``wav_length_ms`` (the value returned by ``get_wave_file_length``),
    ``transcript`` (the transcript tokens concatenated without separators) and
    ``speaker`` (the name of the directory that contains the wav file).
    Only wav files that have a transcript entry are listed.
    """
    # Local import: only the one-off extraction step needs it.
    import subprocess

    gfile = tf.compat.v1.gfile
    logging.info("Processing audio and transcript for {}".format(subset))
    audio_dir = os.path.join(directory, 'corpus', subset)
    trans_dir = os.path.join(directory, 'transcript')
    output_wav_dir = os.path.join(directory, subset)

    if not gfile.Exists(output_wav_dir):
        os.makedirs(output_wav_dir, exist_ok=True)
        for filename in os.listdir(audio_dir):
            # The archive path must be joined with a separator: plain string
            # concatenation of audio_dir and filename names a file that does
            # not exist. An argument list also avoids shell quoting issues.
            archive = os.path.join(audio_dir, filename)
            result = subprocess.run(
                ["tar", "-zxvf", archive, "-C", output_wav_dir], check=False)
            if result.returncode != 0:
                logging.warning(
                    "tar exited with status %d while extracting %s",
                    result.returncode, archive)

    # Each transcript line is "<utterance id> <token> <token> ...".
    files = []
    with codecs.open(os.path.join(trans_dir,
                                  "aidatatang_200_zh_transcript.txt"),
                     "r",
                     encoding="utf-8") as f:
        for line in f:
            items = line.strip().split(" ")
            files.append((items[0] + ".wav", "".join(items[1:])))

    # Map every wav file name found one level below a walked directory to
    # (length, name of the containing directory); other files are ignored.
    files_size_dict = {}
    for root, subdirs, _ in gfile.Walk(output_wav_dir):
        for subdir in subdirs:
            for filename in os.listdir(os.path.join(root, subdir)):
                if filename.strip().split('.')[-1] != 'wav':
                    continue
                files_size_dict[filename] = (
                    get_wave_file_length(os.path.join(root, subdir, filename)),
                    subdir,
                )

    content = []
    for wav_filename, trans in files:
        if wav_filename in files_size_dict:  # keep only wavs with transcript
            filesize, subdir = files_size_dict[wav_filename]
            abspath = os.path.join(output_wav_dir, subdir, wav_filename)
            content.append((abspath, filesize, trans, subdir))

    df = pandas.DataFrame(
        data=content,
        columns=["wav_filename", "wav_length_ms", "transcript", "speaker"])
    df.to_csv(out_csv_file, index=False, sep="\t")
    logging.info("Successfully generated csv file {}".format(out_csv_file))
Esempio n. 5
0
def convert_audio_and_split_transcript(dataset_dir, total_csv_path):
    """Index the LJSpeech wav files and their transcripts in one file.

    Args:
        dataset_dir: the directory which holds the input dataset; it must
            contain the ``LJSpeech-1.1`` directory.
        total_csv_path: path of the tab-separated output file.

    LJSpeech-1.1 dir Tree structure:
    LJSpeech-1.1
        -metadata.csv
            -LJ001-0002|in being comparatively modern.|in being comparatively modern.
            ...
        -wavs
            -LJ001-0001.wav
            -LJ001-0002.wav
            ...
            -LJ050-0278
        -pcms
            -audio-LJ001-0001.s16
            -audio-LJ001-0002.s16
            ...

    The output starts with the header line
    ``wav_filename<TAB>wav_length_ms<TAB>transcript`` followed by one line per
    metadata entry. The transcript is built from the third ``|``-separated
    field: it is passed through ``preprocess``, its items are joined with
    single spaces, every double space is replaced by `` <space>``, and the
    result is wrapped as ``sp1 ... sil``.
    """
    logging.info("ProcessingA audio and transcript for {}".format("all_files"))
    wav_dir = os.path.join(dataset_dir, "LJSpeech-1.1/wavs/")

    files = []
    with codecs.open(os.path.join(dataset_dir, "LJSpeech-1.1/metadata.csv"),
                     "r",
                     encoding="utf-8") as f:
        for line in f:
            fields = line.split('|')  # split once, reuse for both columns
            wav_file = os.path.join(wav_dir, fields[0] + '.wav')
            wav_length = get_wave_file_length(wav_file)
            # get transcript
            clean_content = preprocess(fields[2].rstrip())
            transcript = ' '.join(clean_content)
            # A space item in the content yields three consecutive spaces
            # after the join; turning the first two into " <space>" leaves
            # "<space>" as its own space-separated token.
            transcript = transcript.replace('  ', ' <space>')
            transcript = 'sp1 ' + transcript + ' sil'
            files.append((os.path.abspath(wav_file), wav_length, transcript))

    # Write the tab-separated file; the context manager closes the handle
    # even when a write fails.
    with open(total_csv_path, 'w', encoding="utf-8") as fp:
        fp.write("wav_filename\twav_length_ms\ttranscript\n")
        for wav_path, wav_length, transcript in files:
            fp.write("{}\t{}\t{}\n".format(wav_path, wav_length, transcript))

    logging.info("Successfully generated csv file {}".format(total_csv_path))
0
def convert_audio_and_split_transcript(dataset_dir):
    """Index the wav files and their transcripts in ``<dataset_dir>/total.csv``.

    Args:
        dataset_dir: the directory which holds the input dataset, e.g.
            "examples/asr/didispeech-2/data/". It must contain a ``WAVE``
            directory with the audio files.

    The transcript file is read two lines at a time: the first line of each
    pair is "<transcript id><TAB><text>", the second line is skipped
    (presumably a phonetic annotation - TODO confirm). The speaker is the part
    of the transcript id before the first "-".

    The CSV is comma-separated with the columns ``wav_filename``,
    ``wav_length_ms``, ``speaker`` and ``transcript``.

    Raises:
        KeyError: when a transcript id has no matching file in ``WAVE``.
    """
    logging.info(
        "Processing convert_audio_and_split_transcript {}".format("all_files"))

    WAVE_dir = os.path.join(dataset_dir, "WAVE")
    # NOTE(review): SCRIPT_dir is computed but never used, while
    # `transcript_path` (read below) is not defined inside this function.
    # Presumably the transcript lives under SCRIPT_dir - confirm where
    # `transcript_path` is supposed to come from.
    SCRIPT_dir = os.path.join(dataset_dir, "SCRIPT")
    csv_path = os.path.join(dataset_dir, 'total.csv')

    # Map "file name minus its last four characters" -> path of the file,
    # e.g. "00004944-00000065" -> ".../WAVE/00004944-00000065.wav".
    wav_by_id = {
        wav_name[:-4]: os.path.join(WAVE_dir, wav_name)
        for wav_name in os.listdir(WAVE_dir)
    }

    # Characters to strip from the transcripts. They are escaped before being
    # placed in the character class: unescaped, the "]" inside the list closes
    # the class early and the resulting pattern never matches anything.
    punc = ',。!?【】()《》“‘:;[]{}&,.?()\\%-+ ̄~$#@=_、/\\%-+~~$#@=_//,.!?'
    punc_pattern = re.compile(r"[%s]+" % re.escape(punc))

    files = []
    is_sentid_line = True
    with open(transcript_path, encoding="UTF-8-sig",
              errors='ignore') as script_file:
        for line in script_file:
            if is_sentid_line:
                fields = line.split('\t')
                transcript_id = fields[0]
                speaker = transcript_id.split('-')[0]
                transcript = fields[1].strip()
                # Full-width -> half-width conversion (helper defined outside
                # this function).
                transcript = stringpartQ2B(transcript)
                transcript = punc_pattern.sub("", transcript)
                wav_file = wav_by_id[str(transcript_id)]
                wav_length = get_wave_file_length(wav_file)
                files.append(
                    (os.path.abspath(wav_file), wav_length, speaker,
                     transcript))
            # Only every other line carries a transcript.
            is_sentid_line = not is_sentid_line

    df = pandas.DataFrame(
        data=files,
        columns=["wav_filename", "wav_length_ms", "speaker", "transcript"])
    df.to_csv(csv_path, index=False)
    logging.info("Successfully generated total_csv file {}".format(
        csv_path))
Esempio n. 7
0
def convert_audio_and_split_transcript(dataset_dir, total_csv_path):
    """Index the BZNSYP wav files and their plain-text transcripts in a CSV.

    Args:
        dataset_dir: the directory which holds the input dataset (the parent
            of the ``BZNSYP`` directory), e.g.
            /nfs/project/datasets/data_baker_tts
        total_csv_path: the resulting output csv file.

    BZNSYP dir Tree structure:
    BZNSYP
        -ProsodyLabeling
            -000001-010000.txt
        -Wave
            -000001.wav
            -000002.wav
            ...
        -PhoneLabeling
            -000001.interval
            -000002.interval
            ...

    Only the lines of ``000001-010000.txt`` that start with "0" are used. The
    CSV is comma-separated with the columns ``wav_filename``,
    ``wav_length_ms`` and ``transcript``.
    """
    logging.info("ProcessingA audio and transcript for {}".format("all_files"))
    wave_root = os.path.join(dataset_dir, "BZNSYP/Wave/")
    label_root = os.path.join(dataset_dir, "BZNSYP/ProsodyLabeling/")
    label_path = os.path.join(label_root, "000001-010000.txt")

    # Translation table that deletes every digit.
    digit_stripper = str.maketrans('', '', digits)

    rows = []
    with codecs.open(label_path, "r", encoding="utf-8") as handle:
        for raw in handle:
            if not raw.startswith('0'):
                continue
            # A used line is "<6-digit id><TAB><text with #N prosody marks>".
            # Drop the last three characters, then every "#" marker.
            cleaned = raw[:-3].replace('#', '')
            utt_id = cleaned[:6]

            wav_path = os.path.join(wave_root, utt_id + '.wav')
            duration = get_wave_file_length(wav_path)

            # Removing the digits strips both the id and the prosody levels.
            text = cleaned.translate(digit_stripper).strip()
            text = normalize_hkust_trans(text)
            rows.append((os.path.abspath(wav_path), duration, text))

    # Comma-separated CSV with the three columns named below.
    frame = pandas.DataFrame(
        data=rows, columns=["wav_filename", "wav_length_ms", "transcript"])
    frame.to_csv(total_csv_path, index=False)
    logging.info("Successfully generated csv file {}".format(total_csv_path))
Esempio n. 8
0
def convert_audio_and_make_label(input_dir, subset, output_dir, output_file):
    """Index the audio of one subset with speaker labels, decoding AAC first.

    Args:
        input_dir: the directory which holds the input dataset.
        subset: the name of the specified subset. e.g. vox1_dev_wav
        output_dir: the directory to place the newly generated csv files.
        output_file: the name of the newly generated csv file.
            e.g. vox1_dev_wav.csv

    ``.wav`` files are listed as they are, except those whose name carries a
    second extension (such as ``x.m4a.wav``), which are skipped. ``.m4a``
    files are decoded to ``<file>.m4a.wav`` when that file does not exist yet.
    The speaker name is the second-to-last component of the containing
    directory path; new speakers get the next free integer id in the
    module-level ``speaker_id_dict``.

    Raises:
        RuntimeError: when ``decode_aac_with_ffmpeg`` reports a failure.
    """
    logging.info("Preprocessing audio and label for subset %s" % subset)
    subset_root = os.path.join(input_dir, subset)

    rows = []
    for folder, _, names in gfile.Walk(subset_root):
        for name in names:
            stem, suffix = os.path.splitext(name)
            suffix = suffix.lower()
            if suffix not in (".wav", ".m4a"):
                continue

            audio_path = os.path.join(folder, name)
            if suffix == ".wav":
                # A remaining extension in the stem means a double extension.
                if os.path.splitext(stem)[1]:
                    continue
                wav_file = audio_path
            else:
                # Convert AAC to WAV unless it was already decoded.
                wav_file = audio_path + ".wav"
                if not gfile.Exists(wav_file):
                    decoded = decode_aac_with_ffmpeg(audio_path, wav_file)
                    if not decoded:
                        raise RuntimeError("Audio decoding failed.")

            speaker_name = folder.split(os.path.sep)[-2]
            if speaker_name not in speaker_id_dict:
                speaker_id_dict[speaker_name] = len(speaker_id_dict)

            duration = get_wave_file_length(wav_file)
            rows.append((os.path.abspath(wav_file), duration,
                         speaker_id_dict[speaker_name], speaker_name))

    # Tab-separated CSV with the four columns named below.
    csv_file_path = os.path.join(output_dir, output_file)
    frame = pandas.DataFrame(
        data=rows,
        columns=["wav_filename", "wav_length_ms", "speaker_id", "speaker_name"])
    frame.to_csv(csv_file_path, index=False, sep="\t")
    logging.info("Successfully generated csv file {}".format(csv_file_path))
Esempio n. 9
0
def convert_audio_and_split_transcript(dataset_dir, total_csv_path):
    """Resample the BZNSYP wav files with sox and index them in a CSV.

    Args:
        dataset_dir: the directory which holds the input dataset (the parent
            of the ``BZNSYP`` directory).
        total_csv_path: the resulting output csv file.

    BZNSYP dir Tree structure:
    BZNSYP
        -ProsodyLabeling
            -000001-010000.txt
            -biaobei_prosody.csv
        -Wave
            -000001.wav
            -000002.wav
            ...
        -PhoneLabeling
            -000001.interval
            -000002.interval
            ...

    Each line of ``biaobei_prosody.csv`` is "<transcript id>|<transcript>".
    For every line, ``Wave/<id>.wav`` is converted with ``/usr/bin/sox`` to a
    mono, 16-bit, 24000 Hz file ``Wave_24/<id>.wav``. The CSV is
    comma-separated with the columns ``wav_filename``, ``wav_length_ms`` and
    ``transcript``.

    Raises:
        FileNotFoundError: when ``/usr/bin/sox`` is not installed.
    """
    logging.info("ProcessingA audio and transcript for {}".format("all_files"))
    audio_dir = os.path.join(dataset_dir, "BZNSYP/")
    # Create the output directory in-process; unlike spawning `mkdir` this is
    # silent when the directory already exists.
    wav24k_dir = os.path.join(audio_dir, "Wave_24")
    os.makedirs(wav24k_dir, exist_ok=True)

    prosodyLabel_dir = os.path.join(dataset_dir, "BZNSYP/ProsodyLabeling/")

    files = []
    with codecs.open(os.path.join(prosodyLabel_dir, "biaobei_prosody.csv"),
                     "r",
                     encoding="utf-8") as f:
        for line in f:
            fields = line.strip().split("|")
            transcript_id = fields[0]
            transcript = fields[1]

            # Downsample to 24 kHz. sox reads and writes the files directly
            # and receives an argument list, so paths containing spaces or
            # shell metacharacters are handled correctly.
            wav48k = os.path.join(audio_dir, 'Wave/' + transcript_id + '.wav')
            wav24k = os.path.join(wav24k_dir, transcript_id + '.wav')
            result = subprocess.run(
                ["/usr/bin/sox", "-t", "wav", wav48k,
                 "-c", "1", "-b", "16", "-t", "wav", wav24k,
                 "rate", "24000"],
                stdout=subprocess.PIPE,
                stderr=subprocess.STDOUT,
                check=False)
            if result.returncode != 0:
                # Report the failure instead of silently dropping sox output.
                logging.warning(
                    "sox exited with status %d for %s: %s",
                    result.returncode, wav48k,
                    result.stdout.decode(errors="replace").strip())

            wav_length = get_wave_file_length(wav24k)
            files.append((os.path.abspath(wav24k), wav_length, transcript))

    # Comma-separated CSV with the three columns named below.
    df = pandas.DataFrame(
        data=files, columns=["wav_filename", "wav_length_ms", "transcript"])
    df.to_csv(total_csv_path, index=False)
    logging.info("Successfully generated csv file {}".format(total_csv_path))
Esempio n. 10
0
def convert_audio_and_split_transcript(input_dir,
                                       source_name,
                                       output_dir,
                                       output_file):
    """Index the wav files and normalized transcripts of one subset.

    Args:
        input_dir: the directory which holds the input dataset.
        source_name: name of the subset directory below ``input_dir``.
        output_dir: the directory to place the generated csv file in.
        output_file: the name of the generated csv file.

    LibriTTS dir Tree structure:
    root
        -LibriTTS
            - subset
                - book id?
                    - speaker id?
                        - bookid_speakerid_paragraphid_sentenceid.normalized.txt
                        - bookid_speakerid_paragraphid_sentenceid.original.txt
                        - bookid_speakerid_paragraphid_sentenceid.wav
                        ...

    For every ``*.normalized.txt`` file, the part of the file name before the
    first "." is the sequence id, its second "_"-separated field is used as
    the speaker, and the first line of the file (stripped, lower-cased) is the
    transcript. The CSV is tab-separated with the columns ``wav_filename``,
    ``wav_length_ms``, ``transcript`` and ``speaker``.
    """
    logging.info("Processing audio and transcript for %s" % source_name)
    subset_root = os.path.join(input_dir, source_name)

    rows = []
    for folder, _, names in GFILE.Walk(subset_root):
        for norm_name in fnmatch.filter(names, "*.normalized.txt"):
            utt_id = norm_name.split('.')[0]
            speaker = utt_id.split('_')[1]
            wav_path = os.path.join(folder, utt_id + '.wav')
            duration = get_wave_file_length(wav_path)

            norm_path = os.path.join(folder, norm_name)
            with codecs.open(norm_path, "r", "utf-8") as handle:
                first_line = handle.readline()
            text = first_line.strip().lower()
            rows.append((os.path.abspath(wav_path), duration, text, speaker))

    # Tab-separated CSV with the four columns named below.
    csv_file_path = os.path.join(output_dir, output_file)
    frame = pandas.DataFrame(
        data=rows, columns=["wav_filename", "wav_length_ms", "transcript", "speaker"]
    )
    frame.to_csv(csv_file_path, index=False, sep="\t")
    logging.info("Successfully generated csv file {}".format(csv_file_path))
Esempio n. 11
0
def convert_audio_and_split_transcript(dataset_dir, subset, out_csv_file):
    """Index the utterances of one subset in a tab-separated CSV file.

    Args:
        dataset_dir: the directory which holds the input dataset.
        subset: the name of the specified dataset. e.g. dev.
        out_csv_file: the resulting output csv file.

    The ``*.wav.trn`` names are listed from ``<dataset_dir>/<subset>``, but
    both the transcript and the wav file are read from
    ``<dataset_dir>/data_copy``. The transcript is the first line of the
    ``.wav.trn`` file with its spaces removed; the speaker is the part of the
    wav file name before the first "_". The CSV columns are ``wav_filename``,
    ``wav_length_ms``, ``transcript`` and ``speaker``.

    Raises:
        ValueError: when the subset directory does not exist.
    """
    gfile = tf.compat.v1.gfile
    logging.info("Processing audio and transcript for {}".format(subset))
    listing_dir = os.path.join(dataset_dir, subset)
    data_dir = os.path.join(dataset_dir, 'data_copy')

    if not gfile.Exists(listing_dir):
        raise ValueError(listing_dir, "directory is not exists.")

    rows = []
    for trn_name in fnmatch.filter(os.listdir(listing_dir), "*.wav.trn"):
        wav_name = trn_name.split('.')[0] + '.wav'
        wav_file = os.path.join(data_dir, wav_name)
        duration = get_wave_file_length(wav_file)

        with codecs.open(os.path.join(data_dir, trn_name), "r",
                         "utf-8") as handle:
            first_line = handle.readlines()[0]
        # Drop the line break and surrounding blanks, then glue the
        # space-separated tokens together.
        tokens = first_line.strip('\n').strip().split(" ")
        text = "".join(tokens)

        rows.append((wav_file, duration, text, wav_name.split('_')[0]))

    # Tab-separated CSV with the four columns named below.
    frame = pandas.DataFrame(
        data=rows,
        columns=["wav_filename", "wav_length_ms", "transcript", "speaker"])
    frame.to_csv(out_csv_file, index=False, sep="\t")
    logging.info("Successfully generated csv file {}".format(out_csv_file))
Esempio n. 12
0
def convert_audio_and_split_transcript(directory, subset, out_csv_file):
    """Index the utterances listed in a subset's ``TRANS.txt`` in a CSV file.

    Args:
        directory: dataset root directory.
        subset: name of the subset directory; ``<directory>/<subset>`` holds
            ``TRANS.txt`` and the audio sub-directories.
        out_csv_file: path of the tab-separated CSV file to write.

    ``TRANS.txt`` is tab-separated; its first line is skipped and the
    remaining lines are read as "<wav file name>, <speaker>, <transcript>".
    Only entries whose wav file name is found in a sub-directory are written.
    The CSV columns are ``wav_filename``, ``wav_length_ms``, ``transcript``
    and ``speaker``.
    """
    gfile = tf.compat.v1.gfile
    logging.info("Processing audio and transcript for {}".format(subset))
    audio_dir = os.path.join(directory, subset)
    trans_path = os.path.join(directory, subset, "TRANS.txt")

    entries = []
    with codecs.open(trans_path, 'r', encoding='utf-8') as handle:
        for row in handle.readlines()[1:]:
            fields = row.strip().split('\t')
            entries.append((fields[0], fields[1], fields[2]))

    # File name -> (length, name of the directory that contains the file).
    durations = {}
    for parent, children, _ in gfile.Walk(audio_dir):
        for child in children:
            child_path = os.path.join(parent, child)
            for name in os.listdir(child_path):
                durations[name] = (
                    get_wave_file_length(os.path.join(child_path, name)),
                    child,
                )

    rows = []
    for wav_name, speaker, text in entries:
        if wav_name not in durations:
            continue
        duration, folder = durations[wav_name]
        rows.append(
            (os.path.join(audio_dir, folder, wav_name), duration, text,
             speaker))

    frame = pandas.DataFrame(
        data=rows,
        columns=["wav_filename", "wav_length_ms", "transcript", "speaker"])
    frame.to_csv(out_csv_file, index=False, sep="\t")
    logging.info("Successfully generated csv file {}".format(out_csv_file))
Esempio n. 13
0
def convert_audio_and_split_transcript(dataset_dir, subset, out_csv_file,
                                       output_dir):
    """Convert SPH to WAV, cut it into utterances and write a CSV index.

    Args:
        dataset_dir: the directory which holds the input dataset; audio is
            searched under ``LDC2005S15/`` and transcripts under
            ``LDC2005T32/``.
        subset: the name of the specified dataset. e.g. dev. Only audio and
            transcript directories whose path contains this name are used.
        out_csv_file: the resulting output csv file (tab-separated).
        output_dir: Athena working directory; the utterance wav files are
            written to ``<output_dir>/<subset>/wav``.

    The CSV has four columns: ``wav_filename``, ``wav_length_ms``,
    ``transcript`` and ``speaker``. Segments of at most 0.1 (in the units of
    the transcript time stamps) and segments whose normalized transcript is
    empty are not listed.

    Raises:
        ValueError: when the SPH file recorded for a transcript is missing.
    """
    gfile = tf.compat.v1.gfile
    # Path of the sph2pipe executable, relative to this source file.
    sph2pip = os.path.join(os.path.dirname(__file__),
                           "../../../../tools/sph2pipe/sph2pipe")
    text_featurizer = TextFeaturizer()

    logging.info("Processing audio and transcript for %s" % subset)
    audio_dir = os.path.join(dataset_dir, "LDC2005S15/")
    trans_dir = os.path.join(dataset_dir, "LDC2005T32/")

    output_wav_dir = os.path.join(output_dir, subset + "/wav")
    if not gfile.Exists(output_wav_dir):
        gfile.MakeDirs(output_wav_dir)

    files = []
    # Character counts of the kept transcripts. NOTE(review): the first
    # occurrence of a character is stored as 0, and the dict is not read
    # anywhere else in this function.
    char_dict = {}

    # Map "<file name without extension>" -> path for every .sph file whose
    # directory path contains the subset name.
    sph_files_dict = {}
    for root, _, filenames in gfile.Walk(audio_dir):
        for filename in fnmatch.filter(filenames, "*.sph"):
            if subset in root:
                sph_key = os.path.splitext(filename)[0]
                sph_file = os.path.join(root, filename)
                sph_files_dict[sph_key] = sph_file

    # Convert all SPH file into WAV format and collect the CSV rows.
    # The per-channel wav files go to a temporary directory below output_dir
    # (presumably tempfile.TemporaryDirectory, which is removed when the
    # `with` block exits - confirm against the file's imports); only the
    # trimmed utterances in output_wav_dir are kept.
    with TemporaryDirectory(dir=output_dir) as output_tmp_wav_dir:
        for root, _, filenames in gfile.Walk(trans_dir):
            # Only transcript directories below a path component that starts
            # with the subset name.
            if not re.match('.*/' + subset + '.*', root):
                continue
            for filename in fnmatch.filter(filenames, "*.txt"):
                trans_file = os.path.join(root, filename)
                # State taken from the header line of the transcript file.
                sph_key = ""
                speaker_A = ""
                speaker_B = ""
                with codecs.open(trans_file, "r", "gb18030") as fin:
                    for line in fin:
                        line = line.strip()
                        # Lines with at most one token carry no data.
                        if len(line.split(" ")) <= 1:
                            continue
                        # A two-token line is the header: its second token is
                        # the SPH key, whose 3rd and 4th "_"-separated fields
                        # are used as the ids of speakers A and B.
                        if len(line.split(" ")) == 2:
                            sph_key = line.split(" ")[1]
                            speaker_A = sph_key.split("_")[2]
                            speaker_B = sph_key.split("_")[3]
                            continue

                        # Segment line: "<start> <end> <speaker>: <text>".
                        time_start, time_end, speaker, transcript = line.split(
                            " ", 3)
                        time_start = float(time_start)
                        time_end = float(time_end)
                        # too short, skip the wave file
                        if time_end - time_start <= 0.1:
                            continue

                        speaker = speaker[0]  # remove ':' in 'A:'
                        # Speaker A is read from channel 1, anyone else from
                        # channel 2.
                        if speaker == "A":
                            channel = 1
                            speaker_id = speaker_A
                        else:
                            channel = 2
                            speaker_id = speaker_B

                        # Convert SPH to split WAV.
                        sph_file = sph_files_dict[sph_key]
                        wav_file = os.path.join(
                            output_tmp_wav_dir,
                            sph_key + "." + speaker[0] + ".wav")
                        if not gfile.Exists(sph_file):
                            raise ValueError(
                                "the sph file {} is not exists".format(
                                    sph_file))
                        # Extract the channel only once per (file, speaker).
                        if not gfile.Exists(wav_file):
                            sph2pipe_cmd = (
                                sph2pip +
                                " -f wav -c {} -p ".format(str(channel)) +
                                sph_file + " " + wav_file)
                            os.system(sph2pipe_cmd)

                        # Utterance file name: key, speaker and the start/end
                        # times multiplied by 100, zero-padded to six digits.
                        sub_wav_filename = "{0}-{1}-{2:06d}-{3:06d}".format(
                            sph_key, speaker, int(time_start * 100),
                            int(time_end * 100))
                        sub_wav_file = os.path.join(output_wav_dir,
                                                    sub_wav_filename + ".wav")
                        # Cut the segment out of the channel wav unless it
                        # already exists from an earlier run.
                        if not gfile.Exists(sub_wav_file):
                            tfm = Transformer()
                            tfm.trim(time_start, time_end)
                            tfm.build(wav_file, sub_wav_file)

                        wav_length = get_wave_file_length(sub_wav_file)

                        transcript = normalize_hkust_trans(transcript)
                        transcript = text_featurizer.delete_punct(transcript)

                        # Keep only segments with a non-empty transcript.
                        if len(transcript) > 0:
                            for char in transcript:
                                if char in char_dict:
                                    char_dict[char] += 1
                                else:
                                    char_dict[char] = 0
                            files.append((
                                os.path.abspath(sub_wav_file),
                                wav_length,
                                transcript,
                                speaker_id,
                            ))

    # Write to CSV file which contains four columns:
    # "wav_filename", "wav_length_ms", "transcript", "speaker".
    df = pandas.DataFrame(
        data=files,
        columns=["wav_filename", "wav_length_ms", "transcript", "speaker"])
    df.to_csv(out_csv_file, index=False, sep="\t")
    logging.info("Successfully generated csv file {}".format(out_csv_file))
Esempio n. 14
0
def convert_audio_and_split_transcript(directory, subset, out_csv_file,
                                       DATASET):
    """Index one corpus into a tab-separated csv of audio and transcripts.

    Each branch below handles the on-disk layout of one corpus and produces
    rows of (wav path, value of get_wave_file_length, transcript, speaker).

    Args:
        directory: root directory of the corpus.
        subset: name of the split to process, e.g. train. It is only used
            to locate files for 'aidatatang' and 'magic_data'.
        out_csv_file: path of the csv file to write.
        DATASET: corpus name; must be a member of DATASETS. 'aidatatang',
            'magic_data' and 'primewords' have dedicated branches, any other
            accepted name is handled with the ST-CMDS layout.

    Raises:
        ValueError: if DATASET is not in DATASETS.
    """
    # Function-scope imports: this block cannot see the file's import header.
    import json
    import subprocess

    if DATASET not in DATASETS:
        raise ValueError(DATASET, 'is not in DATASETS')
    gfile = tf.compat.v1.gfile
    logging.info("Processing audio and transcript for {}_{}".format(
        DATASET, subset))

    # Prepare aidatatang data to csv
    if DATASET == 'aidatatang':
        audio_dir = os.path.join(directory, 'corpus', subset)
        trans_dir = os.path.join(directory, 'transcript')
        output_wav_dir = os.path.join(directory, subset)

        if not gfile.Exists(output_wav_dir):  # archives not unpacked yet
            os.makedirs(output_wav_dir, exist_ok=True)
            for filename in os.listdir(audio_dir):
                # An argument list avoids the shell entirely; the archive
                # path is joined properly instead of being glued to the
                # directory name without a separator.
                subprocess.run([
                    "tar", "-zxvf",
                    os.path.join(audio_dir, filename), "-C", output_wav_dir
                ])

        files = []
        with codecs.open(os.path.join(trans_dir,
                                      "aidatatang_200_zh_transcript.txt"),
                         "r",
                         encoding="utf-8") as f:
            for line in f:
                # "<utterance id> <token> <token> ..." -> tokens are glued
                # back together without spaces.
                items = line.strip().split(" ")
                files.append((items[0] + ".wav", "".join(items[1:])))

        files_size_dict = {}
        for root, subdirs, _ in gfile.Walk(output_wav_dir):
            for subdir in subdirs:
                for filename in os.listdir(os.path.join(root, subdir)):
                    if filename.strip().split('.')[-1] != 'wav':
                        continue
                    files_size_dict[filename] = (
                        get_wave_file_length(
                            os.path.join(root, subdir, filename)),
                        subdir,
                    )
        content = []
        for wav_filename, trans in files:
            # Only audio that has a transcript is kept.
            if wav_filename in files_size_dict:
                filesize, subdir = files_size_dict[wav_filename]
                abspath = os.path.join(output_wav_dir, subdir, wav_filename)
                # The containing directory name is used as the speaker.
                content.append((abspath, filesize, trans, subdir))

    # Prepare magic_data data to csv
    elif DATASET == 'magic_data':
        audio_dir = os.path.join(directory, subset)
        trans_file = os.path.join(directory, subset, "TRANS.txt")

        files = []
        with codecs.open(trans_file, 'r', encoding='utf-8') as f:
            lines = f.readlines()
            # The first line is skipped; remaining lines are tab-separated
            # as file name, speaker, transcript.
            for line in lines[1:]:
                items = line.strip().split('\t')
                files.append((items[0], items[1], items[2]))

        files_size_dict = {}
        for root, subdirs, _ in gfile.Walk(audio_dir):
            for subdir in subdirs:
                for filename in os.listdir(os.path.join(root, subdir)):
                    files_size_dict[filename] = (
                        get_wave_file_length(
                            os.path.join(root, subdir, filename)),
                        subdir,
                    )
        content = []
        for wav_filename, speaker, trans in files:
            if wav_filename in files_size_dict:
                filesize, subdir = files_size_dict[wav_filename]
                abspath = os.path.join(audio_dir, subdir, wav_filename)
                content.append((abspath, filesize, trans, speaker))

    # Prepare primewords data to csv
    elif DATASET == 'primewords':
        audio_dir = os.path.join(directory, "audio_files")
        trans_file = os.path.join(directory, "set1_transcript.json")

        files = []
        with codecs.open(trans_file, 'r', encoding='utf-8') as f:
            # Parse the transcript as JSON rather than eval()-ing file
            # content, which would execute arbitrary expressions.
            items = json.load(f)
        for item in items:
            # Drop all whitespace inside the transcript.
            labels = ''.join(item['text'].split())
            files.append((item['file'], labels))

        # file name -> full path of every file two levels below audio_dir
        wav_path_dict = {}
        for subdir in os.listdir(audio_dir):
            for root, subsubdirs, _ in gfile.Walk(
                    os.path.join(audio_dir, subdir)):
                for subsubdir in subsubdirs:
                    for filename in os.listdir(os.path.join(root, subsubdir)):
                        wav_path_dict[filename] = os.path.join(
                            root, subsubdir, filename)
        content = []
        for wav_filename, trans in files:
            if wav_filename in wav_path_dict:
                abspath = wav_path_dict[wav_filename]
                # The length is measured here, only for files that have a
                # transcript, so the wav_length_ms column holds a length
                # and not the path string.
                filesize = get_wave_file_length(abspath)
                content.append((abspath, filesize, trans, None))

    # Prepare ST-CMDS data to csv
    else:
        content = []
        for filename in os.listdir(directory):
            if filename.split('.')[-1] == 'wav':
                stem = filename.split('.')[0]
                trans_file = os.path.join(directory, stem + '.txt')
                metadata_file = os.path.join(directory, stem + '.metadata')
                with codecs.open(trans_file, 'r', encoding='utf-8') as f:
                    # Strip the line terminator so it does not end up inside
                    # the transcript column.
                    labels = f.readline().strip()
                with codecs.open(metadata_file, 'r', encoding='utf-8') as f:
                    # The speaker is the last token of the third line.
                    speaker = f.readlines()[2].split()[-1]
                abspath = os.path.join(directory, filename)
                filesize = get_wave_file_length(abspath)
                content.append((abspath, filesize, labels, speaker))

    # Write to CSV file which contains four columns:
    # "wav_filename", "wav_length_ms", "transcript", "speaker".
    df = pandas.DataFrame(
        data=content,
        columns=["wav_filename", "wav_length_ms", "transcript", "speaker"])
    df.to_csv(out_csv_file, index=False, sep="\t")
    logging.info("Successfully generated csv file {}".format(out_csv_file))
Esempio n. 15
0
def convert_audio_and_split_transcript(directory, subset, output_dir):
    """Cut SPH recordings into per-utterance WAV files and index them.

    Args:
        directory: the directory which holds the input dataset.
        subset: the name of the specified dataset. supports train
                (switchboard+fisher), switchboard, fisher, hub500 and rt03s.
        output_dir: the directory to place the newly generated csv files.

    Raises:
        ValueError: if subset is not one of the supported names, or if an
            indexed SPH file does not exist when it is about to be used.
    """
    logging.info("Processing audio and transcript for %s" % subset)
    gfile = tf.compat.v1.gfile
    sph2pip = os.path.join(os.path.dirname(__file__), "../utils/sph2pipe")

    def corpus_dirs(*ldc_ids):
        # Resolve LDC catalogue ids to directories below the dataset root.
        return [os.path.join(directory, ldc_id) for ldc_id in ldc_ids]

    swd_audio_trans_dir = corpus_dirs("LDC97S62")
    fisher_audio_dirs = corpus_dirs("LDC2004S13", "LDC2005S13")
    fisher_trans_dirs = corpus_dirs("LDC2004T19", "LDC2005T19")
    hub_audio_dir = corpus_dirs("LDC2002S09")
    hub_trans_dir = corpus_dirs("LDC2002T43")
    rts_audio_trans_dir = corpus_dirs("LDC2007S10")

    # subset name -> (audio directories, transcript directories)
    subset_sources = {
        # Combination of switchboard corpus and fisher corpus.
        "train": (swd_audio_trans_dir + fisher_audio_dirs,
                  swd_audio_trans_dir + fisher_trans_dirs),
        "switchboard": (swd_audio_trans_dir, swd_audio_trans_dir),
        "fisher": (fisher_audio_dirs, fisher_trans_dirs),
        "hub500": (hub_audio_dir, hub_trans_dir),
        "rt03s": (rts_audio_trans_dir, rts_audio_trans_dir),
    }
    if subset not in subset_sources:
        raise ValueError(subset, " is not in switchboard_fisher")
    audio_dir, trans_dir = subset_sources[subset]

    output_wav_dir = os.path.join(directory, subset + "/wav")
    tmp_dir = os.path.join(directory, "tmp")
    for needed_dir in (os.path.join(directory, subset), output_wav_dir,
                       tmp_dir):
        if not gfile.Exists(needed_dir):
            gfile.MakeDirs(needed_dir)

    # Build SPH dict: file name without extension -> full path.
    sph_files_dict = {}
    for sub_audio_dir in audio_dir:
        for root, _, filenames in gfile.Walk(sub_audio_dir):
            for sph_name in fnmatch.filter(filenames, "*.[Ss][Pp][Hh]"):
                sph_files_dict[os.path.splitext(sph_name)[0]] = os.path.join(
                    root, sph_name)

    # Transcript files below a directory whose path contains any of these
    # substrings are ignored.
    skipped_markers = ("doc", "DOC", "mandarin", "arabic", "concatenated",
                       "bnews")

    rows = []
    with TemporaryDirectory(dir=tmp_dir) as output_tmp_wav_dir:
        for sub_trans_dir in trans_dir:
            # Pick the transcript file pattern and the line parser that
            # belong to this corpus.
            if sub_trans_dir in swd_audio_trans_dir:
                fnmatch_pat = "*-trans.text"
                split_and_norm_func = split_line_and_norm_swd
            elif sub_trans_dir in fisher_trans_dirs:
                fnmatch_pat = "*.[Tt][Xx][Tt]"
                split_and_norm_func = split_line_and_norm_fisher
            elif sub_trans_dir in hub_trans_dir:
                fnmatch_pat = "hub5e00.english.000405.stm"
                split_and_norm_func = split_line_and_norm_hub_rts
            else:
                fnmatch_pat = "*.stm"
                split_and_norm_func = split_line_and_norm_hub_rts

            for root, _, filenames in gfile.Walk(sub_trans_dir):
                if any(marker in root for marker in skipped_markers):
                    continue
                for filename in fnmatch.filter(filenames, fnmatch_pat):
                    trans_file = os.path.join(root, filename)
                    with codecs.open(trans_file, "r", "utf-8") as fin:
                        for raw_line in fin:
                            (sph_key, speaker, time_start, time_end,
                             norm_trans) = split_and_norm_func(
                                 raw_line.strip(), filename)

                            # Too short or empty transcript, skip the segment
                            if (time_end - time_start <= 0.1
                                    or norm_trans == ""):
                                continue

                            # Convert SPH to split WAV.
                            if sph_key not in sph_files_dict:
                                print(sph_key + " not found, please check.")
                                continue
                            sph_file = sph_files_dict[sph_key]
                            wav_file = os.path.join(
                                output_tmp_wav_dir,
                                sph_key + "." + speaker + ".wav")
                            if not gfile.Exists(sph_file):
                                raise ValueError(
                                    "the sph file {} is not exists".format(
                                        sph_file))

                            # Segment name carries the start and end times
                            # multiplied by 100 and rounded.
                            sub_wav_filename = "{0}-{1}-{2:06d}-{3:06d}".format(
                                sph_key,
                                speaker,
                                round(time_start * 100),
                                round(time_end * 100),
                            )
                            sub_wav_file = os.path.join(
                                output_wav_dir, sub_wav_filename + ".wav")

                            if not gfile.Exists(sub_wav_file):
                                if not gfile.Exists(wav_file):
                                    # Speaker "A" is read from channel 1,
                                    # any other speaker from channel 2.
                                    channel = 1 if speaker == "A" else 2
                                    sph2pipe_cmd = (sph2pip +
                                                    " -f wav -c {} -p ".format(
                                                        str(channel)) +
                                                    sph_file + " " + wav_file)
                                    os.system(sph2pipe_cmd)
                                trimmer = Transformer()
                                trimmer.trim(time_start, time_end)
                                trimmer.build(wav_file, sub_wav_file)

                            rows.append((
                                os.path.abspath(sub_wav_file),
                                get_wave_file_length(sub_wav_file),
                                norm_trans,
                                sph_key + "-" + speaker,
                            ))

    # Write to CSV file which contains four columns:
    # "wav_filename", "wav_length_ms", "transcript", "speaker".
    out_csv_file = os.path.join(output_dir, subset + ".csv")
    df = pandas.DataFrame(
        data=rows,
        columns=["wav_filename", "wav_length_ms", "transcript", "speaker"])
    df.to_csv(out_csv_file, index=False, sep="\t")
    logging.info("Successfully generated csv file {}".format(out_csv_file))
Esempio n. 16
0
def convert_audio_and_split_transcript(dataset_dir,
                                       subset,
                                       output_dir,
                                       trans_type="phn",
                                       phone_map_amount=None):
    """Convert SPH to WAV and split the transcript.
    Args:
        dataset_dir: the directory which holds the input dataset.
        subset: the name of the specified dataset. e.g. train
        output_dir: the directory to place the newly generated csv files.
        trans_type: the type of transcript. for timit: "char" or "phn".
        phone_map_amount: convert phones according to "phones.60-48-39.map",
                          should be None or "48" or "39".
    Raises:
        ValueError: if trans_type is neither "char" nor "phn".
    """
    # Validate before touching the file system, and report the offending
    # argument itself (the pattern variable is not bound on this path).
    if trans_type == "char":
        fnmatch_pattern = "*.TXT"
    elif trans_type == "phn":
        fnmatch_pattern = "*.PHN"
    else:
        raise ValueError(trans_type, "is not an effective transcript type in TIMIT")

    logging.info("Processing audio and transcript for %s" % subset)
    csv_file_path = os.path.join(output_dir, subset.lower() + ".csv")
    output_wav_dir = os.path.join(output_dir, "wav", subset)
    speaker_sets = get_speakers(subset.lower())
    sph2pipe = os.path.join(os.path.dirname(__file__), "../../../../tools/sph2pipe/sph2pipe")
    # NOTE: relative path, resolved against the current working directory.
    phone_map_file = "examples/asr/timit/local/phones.60-48-39.map"
    if not os.path.exists(output_wav_dir):
        os.makedirs(output_wav_dir)

    # utterances of "dev" and "test" (core-test) sets come from "test" set
    if subset == "DEV":
        subset = "TEST"
    subset_dir = os.path.join(dataset_dir, subset)

    # Map every phone in the map file to its 48-phone form, its 39-phone
    # form, or itself, depending on phone_map_amount.
    phone_map = {}
    with codecs.open(phone_map_file, "r", "utf-8") as phones:
        for line in phones:
            # handle phones that are mapped to empty
            if len(line.strip().split("\t")) == 1:
                phone = line.strip()
                phone_48, phone_39 = "", ""
            else:
                phone, phone_48, phone_39 = line.strip().split("\t", 2)
            if phone_map_amount == "48":
                phone_map[phone] = phone_48
            elif phone_map_amount == "39":
                phone_map[phone] = phone_39
            else:
                phone_map[phone] = phone

    files = []
    for root, _, filenames in tf.compat.v1.gfile.Walk(subset_dir):
        for filename in fnmatch.filter(filenames, fnmatch_pattern):
            trans_file = os.path.join(root, filename)
            # Files whose names start with "SA" and hidden files are skipped.
            if filename.startswith("SA") or filename.startswith("."):
                continue
            with codecs.open(trans_file, "r", "utf-8") as fin:
                if trans_type == "char":
                    # Each line is "<begin> <end> <text>"; the text of the
                    # last line read is kept, lower-cased, punctuation removed.
                    for line in fin:
                        begin_sample, end_sample, transcript = line.strip().split(" ", 2)
                        transcript = transcript.lower()
                        transcript = re.sub('[.,?!;:"]', '', transcript)
                else:
                    phns = []
                    for line in fin:
                        begin_sample, end_sample, phn = line.strip().split(" ", 2)
                        phns.append(phn)
                    # join twice to remove extra spaces in transcript caused by empty phones
                    transcript = " ".join(" ".join(phone_map[item] for item in phns).split())

            # The directory that holds the transcript names the speaker.
            speaker = os.path.basename(root)
            file_name = os.path.splitext(filename)[0]

            # sphere to wav
            sph_file = os.path.join(root, file_name + ".WAV")
            wav_file = os.path.join(output_wav_dir, speaker + "-" + file_name + ".WAV")
            if not tf.compat.v1.gfile.Exists(wav_file):
                sph2pipe_cmd = (
                    sph2pipe
                    + " -f wav -c 1 -p "
                    + sph_file
                    + " "
                    + wav_file
                )
                os.system(sph2pipe_cmd)

            wav_length = get_wave_file_length(wav_file)
            # An empty speaker list means "keep every speaker"; otherwise
            # only utterances of the listed speakers are kept.
            if speaker_sets == [] or speaker in speaker_sets:
                files.append((os.path.abspath(wav_file), wav_length, transcript, speaker))

    # Write to CSV file which contains four columns:
    # "wav_filename", "wav_length_ms", "transcript", "speaker".
    df = pandas.DataFrame(
        data=files, columns=["wav_filename", "wav_length_ms", "transcript", "speaker"]
    )
    df.to_csv(csv_file_path, index=False, sep="\t")
    logging.info("Successfully generated csv file {}".format(csv_file_path))
Esempio n. 17
0
import sys, os
import pandas
import logging

# Exactly three list files are expected on the command line: train, dev, test.
_, train_list_file, dev_list_file, test_list_file = sys.argv
# speaker name -> integer id, numbered in order of first appearance and
# shared by all three lists
speaker_id_dict = {}

for list_file in (train_list_file, dev_list_file, test_list_file):
    files = []
    csv_file = list_file + ".csv"
    with open(list_file, "r") as LIST:
        # Each line holds a speaker name and a wav path, whitespace-separated.
        for line in LIST:
            speaker_name, wav_file = line.strip().split()
            # Register unseen speakers with the next free id.
            speaker_id = speaker_id_dict.setdefault(speaker_name,
                                                    len(speaker_id_dict))
            wav_length = get_wave_file_length(wav_file)
            # Utterance key: speaker name plus the wav file name up to its
            # first dot.
            wav_stem = wav_file.split("/")[-1].split(".")[0]
            utt_key = speaker_name + "_" + wav_stem
            files.append((
                os.path.abspath(wav_file),
                wav_length,
                speaker_id,
                speaker_name,
                utt_key,
            ))

    column_names = [
        "wav_filename", "wav_length_ms", "speaker_id", "speaker_name",
        "utt_key"
    ]
    df = pandas.DataFrame(data=files, columns=column_names)
    df.to_csv(csv_file, index=False, sep="\t")
    print("Successfully generated csv file {}".format(csv_file))