def label_filter(label, language):
    """Clean up *label* for *language* and report why it was rejected, if so.

    The label is translated through ``PRE_FILTER``, validated, run through
    the language's substitution rules, optionally ASCII-folded character by
    character, checked against the language alphabet and validated again.

    Returns:
        A ``(label, reason)`` tuple. On success ``reason`` is ``None``; on
        rejection ``label`` is ``None`` and ``reason`` is one of
        ``"validation"``, ``"substitution rule"`` or ``"illegal character"``.
    """
    cleaned = validate_label(label.translate(PRE_FILTER))
    if cleaned is None:
        return None, "validation"

    for pattern, replacement in SUBSTITUTIONS.get(language, []):
        if replacement is not None:
            cleaned = pattern.sub(replacement, cleaned)
        elif pattern.match(cleaned):
            # A rule without a replacement acts as a reject filter.
            return None, "substitution rule"

    keep_as_is = DONT_NORMALIZE.get(language, "")
    alphabet = get_alphabet(language)
    kept = []
    for char in cleaned:
        if (CLI_ARGS.normalize and char not in keep_as_is
                and not in_alphabet(alphabet, char)):
            # Fold to ASCII; the result may be empty or several characters.
            pieces = (unicodedata.normalize("NFKD", char)
                      .encode("ascii", "ignore")
                      .decode("ascii", "ignore"))
        else:
            pieces = char
        for piece in pieces:
            if not in_alphabet(alphabet, piece):
                return None, "illegal character"
            kept.append(piece)

    cleaned = validate_label("".join(kept))
    if cleaned is None:
        return None, "validation"
    return cleaned, None
def _parse_transcriptions(trans_file): segments = [] with codecs.open(trans_file, "r", "utf-8") as fin: for line in fin: if line.startswith("#") or len(line) <= 1: continue tokens = line.split() start_time = float(tokens[1]) stop_time = float(tokens[2]) transcript = validate_label(" ".join(tokens[3:])) if transcript == None: continue # We need to do the encode-decode dance here because encode # returns a bytes() object on Python 3, and text_to_char_array # expects a string. transcript = ( unicodedata.normalize("NFKD", transcript) .encode("ascii", "ignore") .decode("ascii", "ignore") ) segments.append( { "start_time": start_time, "stop_time": stop_time, "transcript": transcript, } ) return segments
def one_sample(sample):
    """Convert one MP3 sample to WAV and classify it for the target CSV.

    Args:
        sample: Sequence whose first item is the MP3 file name and whose
            second item is the raw transcript.

    Returns:
        A ``(counter, rows)`` tuple. ``counter`` comes from ``get_counter()``
        and has ``"all"``, ``"total_time"`` and at most one rejection key
        incremented. ``rows`` holds a single
        ``(wav_filename, file_size, label)`` tuple when the sample is kept
        and is empty otherwise.
    """
    mp3_filename = sample[0]
    # Storing wav files next to the mp3 ones - just with a different suffix
    wav_filename = path.splitext(mp3_filename)[0] + ".wav"
    _maybe_convert_wav(mp3_filename, wav_filename)

    file_size = -1
    frames = 0
    # Only query soxi once the WAV file is known to exist; calling it on a
    # missing file raises CalledProcessError and would prevent the sample
    # from ever being counted as "failed".
    if path.exists(wav_filename):
        file_size = path.getsize(wav_filename)
        frames = int(
            subprocess.check_output(
                ["soxi", "-s", wav_filename], stderr=subprocess.STDOUT
            )
        )

    label = validate_label(sample[1])
    rows = []
    counter = get_counter()
    if file_size == -1:
        # Excluding samples that failed upon conversion
        counter["failed"] += 1
    elif label is None:
        # Excluding samples that failed on label validation
        counter["invalid_label"] += 1
    elif int(frames / SAMPLE_RATE * 1000 / 10 / 2) < len(str(label)):
        # Excluding samples that are too short to fit the transcript
        counter["too_short"] += 1
    elif frames / SAMPLE_RATE > MAX_SECS:
        # Excluding very long samples to keep a reasonable batch-size
        counter["too_long"] += 1
    else:
        # This one is good - keep it for the target CSV
        rows.append((wav_filename, file_size, label))
    counter["all"] += 1
    counter["total_time"] += frames
    return (counter, rows)
def _split_wav_and_sentences(data_dir, trans_data, original_data, converted_data): trans_dir = os.path.join(data_dir, trans_data) source_dir = os.path.join(data_dir, original_data) target_dir = os.path.join(data_dir, converted_data) if not os.path.exists(target_dir): os.makedirs(target_dir) files = [] # Loop over transcription files and split corresponding wav for root, dirnames, filenames in os.walk(trans_dir): for filename in fnmatch.filter(filenames, "*.txt"): trans_file = os.path.join(root, filename) segments = _parse_transcriptions(trans_file) # Open wav corresponding to transcription file wav_filenames = [ os.path.splitext(os.path.basename(trans_file))[0] + "_c" + channel + ".wav" for channel in ["1", "2"] ] wav_files = [ os.path.join(source_dir, wav_filename) for wav_filename in wav_filenames ] print("splitting {} according to {}".format(wav_files, trans_file)) origAudios = [ librosa.load(wav_file, sr=16000, mono=False) for wav_file in wav_files ] # Loop over segments and split wav_file for each segment for segment in segments: # Create wav segment filename start_time = segment["start_time"] stop_time = segment["stop_time"] new_wav_filename = ( os.path.splitext(os.path.basename(trans_file))[0] + "-" + str(start_time) + "-" + str(stop_time) + ".wav") new_wav_file = os.path.join(target_dir, new_wav_filename) channel = 0 if segment["speaker"] == "A:" else 1 _split_and_resample_wav(origAudios[channel], start_time, stop_time, new_wav_file) new_wav_filesize = os.path.getsize(new_wav_file) transcript = validate_label(segment["transcript"]) if transcript != None: files.append((os.path.abspath(new_wav_file), new_wav_filesize, transcript)) return pandas.DataFrame( data=files, columns=["wav_filename", "wav_filesize", "transcript"])
def check_and_prepare_sentence(sentence):
    """Lower-case, normalize and validate *sentence*.

    ``"co2"`` is replaced by ``"c o zwei"``. When ``CLI_ARGS.normalize`` is
    set, every character outside the alphabet, except ``ä``, ``ö``, ``ü``
    and ``ß``, is ASCII-folded via NFKD decomposition.

    Returns:
        The result of ``validate_label`` on the cleaned sentence, or
        ``None`` as soon as a character outside the alphabet remains.
    """
    sentence = sentence.lower().replace("co2", "c o zwei")
    chars = []
    for c in sentence:
        if CLI_ARGS.normalize and c not in "äöüß" and not in_alphabet(c):
            c = (
                unicodedata.normalize("NFKD", c)
                .encode("ascii", "ignore")
                .decode("ascii", "ignore")
            )
        # Folding may yield zero, one or several characters; each resulting
        # character is checked on its own rather than the folded string.
        for sc in c:
            if not in_alphabet(sc):
                return None
            chars.append(sc)
    return validate_label("".join(chars))
def check_and_prepare_sentence(sentence):
    """Lower-case, normalize and validate *sentence*.

    ``"co2"`` is replaced by ``"c o zwei"``. When ``CLI_ARGS.normalize`` is
    set, every character that ``ALPHABET`` does not contain (or every
    character, when ``ALPHABET`` is ``None``), except ``ä``, ``ö``, ``ü``
    and ``ß``, is ASCII-folded via NFKD decomposition.

    Returns:
        The result of ``validate_label`` on the cleaned sentence, or
        ``None`` as soon as a character missing from ``ALPHABET`` remains.
        No alphabet check is done when ``ALPHABET`` is ``None``.
    """
    sentence = sentence.lower().replace("co2", "c o zwei")
    chars = []
    for c in sentence:
        if (CLI_ARGS.normalize and c not in "äöüß"
                and (ALPHABET is None or not ALPHABET.has_char(c))):
            c = (
                unicodedata.normalize("NFKD", c)
                .encode("ascii", "ignore")
                .decode("ascii", "ignore")
            )
        # Folding may yield zero, one or several characters; each resulting
        # character is checked on its own rather than the folded string.
        for sc in c:
            if ALPHABET is not None and not ALPHABET.has_char(sc):
                return None
            chars.append(sc)
    return validate_label("".join(chars))