def label_filter(label, language):
    """Clean and validate a transcript label for the given language.

    Returns a ``(label, reason)`` pair: the cleaned label and ``None`` on
    success, or ``None`` and a short string naming the stage that rejected
    it ("validation", "substitution rule", or "illegal character").
    """
    # Strip pre-filtered characters, then run the first validation pass.
    label = label.translate(PRE_FILTER)
    label = validate_label(label)
    if label is None:
        return None, "validation"

    # Apply language-specific substitution rules; a rule whose replacement
    # is None rejects any label its pattern matches.
    rules = SUBSTITUTIONS[language] if language in SUBSTITUTIONS else []
    for pattern, replacement in rules:
        if replacement is None:
            if pattern.match(label):
                return None, "substitution rule"
        else:
            label = pattern.sub(replacement, label)

    keep_as_is = DONT_NORMALIZE[language] if language in DONT_NORMALIZE else ""
    alphabet = get_alphabet(language)
    kept = []
    for char in label:
        # Optionally fold characters outside the alphabet down to ASCII
        # (unless the language explicitly protects them from normalization).
        if (CLI_ARGS.normalize and char not in keep_as_is
                and not in_alphabet(alphabet, char)):
            char = (unicodedata.normalize("NFKD", char)
                    .encode("ascii", "ignore")
                    .decode("ascii", "ignore"))
        # NFKD may expand one character into several (or drop it entirely),
        # so re-check every resulting sub-character individually.
        for sub in char:
            if not in_alphabet(alphabet, sub):
                return None, "illegal character"
            kept.append(sub)

    # Final validation pass over the rebuilt label.
    label = validate_label("".join(kept))
    return label, "validation" if label is None else None
def _parse_transcriptions(trans_file):
    """Parse one transcription file into a list of segment dicts.

    Each returned dict has "start_time" and "stop_time" (floats, seconds)
    and an ASCII-normalized "transcript" string. Comment lines (starting
    with "#") and empty lines are skipped, as are lines whose transcript
    fails ``validate_label``.
    """
    segments = []
    with codecs.open(trans_file, "r", "utf-8") as fin:
        for line in fin:
            # Skip comments and blank lines.
            if line.startswith("#") or len(line) <= 1:
                continue

            tokens = line.split()
            start_time = float(tokens[1])
            stop_time = float(tokens[2])
            transcript = validate_label(" ".join(tokens[3:]))

            # Fix: identity comparison with None (was `== None`).
            if transcript is None:
                continue

            # We need to do the encode-decode dance here because encode
            # returns a bytes() object on Python 3, and text_to_char_array
            # expects a string.
            transcript = (
                unicodedata.normalize("NFKD", transcript)
                .encode("ascii", "ignore")
                .decode("ascii", "ignore")
            )

            segments.append(
                {
                    "start_time": start_time,
                    "stop_time": stop_time,
                    "transcript": transcript,
                }
            )
    return segments
# Beispiel #3
# 0
def one_sample(sample):
    """Convert one (mp3_filename, transcript) pair and classify it.

    Converts the mp3 to a wav next to it, validates the transcript, and
    returns a ``(counter, rows)`` pair where ``counter`` tallies the
    sample into exactly one of failed / invalid_label / too_short /
    too_long / kept, and ``rows`` holds the kept (wav, size, label) row.
    """
    mp3_filename = sample[0]
    # Storing wav files next to the mp3 ones - just with a different suffix
    wav_filename = path.splitext(mp3_filename)[0] + ".wav"
    _maybe_convert_wav(mp3_filename, wav_filename)
    frames = 0
    file_size = -1
    # Fix: only probe the wav with soxi after confirming it exists.
    # The original called soxi unconditionally first (crashing when the
    # conversion failed instead of counting it as "failed") and then
    # ran the identical soxi call a second time inside this branch.
    if os.path.exists(wav_filename):
        file_size = path.getsize(wav_filename)
        frames = int(
            subprocess.check_output(["soxi", "-s", wav_filename],
                                    stderr=subprocess.STDOUT))
    label = validate_label(sample[1])
    rows = []
    counter = get_counter()
    if file_size == -1:
        # Excluding samples that failed upon conversion
        counter["failed"] += 1
    elif label is None:
        # Excluding samples that failed on label validation
        counter["invalid_label"] += 1
    elif int(frames / SAMPLE_RATE * 1000 / 10 / 2) < len(label):
        # Excluding samples that are too short to fit the transcript
        # (label is already a str here, so the redundant str() was dropped)
        counter["too_short"] += 1
    elif frames / SAMPLE_RATE > MAX_SECS:
        # Excluding very long samples to keep a reasonable batch-size
        counter["too_long"] += 1
    else:
        # This one is good - keep it for the target CSV
        rows.append((wav_filename, file_size, label))
    counter["all"] += 1
    counter["total_time"] += frames
    return (counter, rows)
def _split_wav_and_sentences(data_dir, trans_data, original_data,
                             converted_data):
    """Split two-channel source wavs into per-segment wavs with transcripts.

    Walks ``data_dir/trans_data`` for *.txt transcription files, loads the
    matching per-channel wavs from ``data_dir/original_data``, cuts out one
    resampled wav per transcription segment into ``data_dir/converted_data``,
    and returns a DataFrame with columns wav_filename / wav_filesize /
    transcript for every segment whose transcript validates.
    """
    trans_dir = os.path.join(data_dir, trans_data)
    source_dir = os.path.join(data_dir, original_data)
    target_dir = os.path.join(data_dir, converted_data)
    if not os.path.exists(target_dir):
        os.makedirs(target_dir)

    files = []

    # Loop over transcription files and split corresponding wav
    for root, dirnames, filenames in os.walk(trans_dir):
        for filename in fnmatch.filter(filenames, "*.txt"):
            trans_file = os.path.join(root, filename)
            segments = _parse_transcriptions(trans_file)

            # Open wav corresponding to transcription file
            # (one file per conversation channel: "_c1" and "_c2").
            wav_filenames = [
                os.path.splitext(os.path.basename(trans_file))[0] + "_c" +
                channel + ".wav" for channel in ["1", "2"]
            ]
            wav_files = [
                os.path.join(source_dir, wav_filename)
                for wav_filename in wav_filenames
            ]

            print("splitting {} according to {}".format(wav_files, trans_file))

            origAudios = [
                librosa.load(wav_file, sr=16000, mono=False)
                for wav_file in wav_files
            ]

            # Loop over segments and split wav_file for each segment
            for segment in segments:
                # Create wav segment filename
                start_time = segment["start_time"]
                stop_time = segment["stop_time"]
                new_wav_filename = (
                    os.path.splitext(os.path.basename(trans_file))[0] + "-" +
                    str(start_time) + "-" + str(stop_time) + ".wav")
                new_wav_file = os.path.join(target_dir, new_wav_filename)

                # Speaker "A:" is on channel 0, everyone else on channel 1.
                channel = 0 if segment["speaker"] == "A:" else 1
                _split_and_resample_wav(origAudios[channel], start_time,
                                        stop_time, new_wav_file)

                new_wav_filesize = os.path.getsize(new_wav_file)
                transcript = validate_label(segment["transcript"])
                # Fix: identity comparison with None (was `!= None`).
                if transcript is not None:
                    files.append((os.path.abspath(new_wav_file),
                                  new_wav_filesize, transcript))

    return pandas.DataFrame(
        data=files, columns=["wav_filename", "wav_filesize", "transcript"])
# Beispiel #5
# 0
def check_and_prepare_sentence(sentence):
    """Lower-case, normalize, and validate one German sentence.

    Returns the validated label, or None if any character falls outside
    the alphabet after normalization.
    """
    sentence = sentence.lower().replace("co2", "c o zwei")
    chars = []
    for c in sentence:
        # Fold non-alphabet characters to ASCII, except German umlauts/ß
        # which the alphabet covers directly.
        if CLI_ARGS.normalize and c not in "äöüß" and not in_alphabet(c):
            c = unicodedata.normalize("NFKD", c).encode("ascii", "ignore").decode("ascii", "ignore")
        # NFKD may expand c into several characters; check each one.
        for sc in c:
            # Fix: check the sub-character `sc`, not the whole (possibly
            # multi-character) normalized string `c`.
            if not in_alphabet(sc):
                return None
            chars.append(sc)
    return validate_label("".join(chars))
def check_and_prepare_sentence(sentence):
    """Lower-case, normalize, and validate one German sentence.

    Returns the validated label, or None if any character is missing from
    ALPHABET after normalization. (When ALPHABET is None, no character
    check is applied.)
    """
    sentence = sentence.lower().replace("co2", "c o zwei")
    chars = []
    for c in sentence:
        # Fold non-alphabet characters to ASCII, except German umlauts/ß
        # which the alphabet covers directly.
        if (CLI_ARGS.normalize and c not in "äöüß"
                and (ALPHABET is None or not ALPHABET.has_char(c))):
            c = (unicodedata.normalize("NFKD",
                                       c).encode("ascii", "ignore").decode(
                                           "ascii", "ignore"))
        # NFKD may expand c into several characters; check each one.
        for sc in c:
            # Fix: check the sub-character `sc`, not the whole (possibly
            # multi-character) normalized string `c`.
            if ALPHABET is not None and not ALPHABET.has_char(sc):
                return None
            chars.append(sc)
    return validate_label("".join(chars))