def align_files(self, input_path: str, output_path: str, verbosity: int) -> None:
    """
    Aligns all given files in input_path and writes alignments into output_path
    :param input_path: Where to look for transcript files
    :param output_path: Where to write alignment files
    :param verbosity: Verbosity of debugging output
    :return: None
    """
    bin_print(verbosity, 1, "Reading files from", input_path)
    files = [f for f in listdir(input_path)
             if isfile(join(input_path, f)) and f.split(".")[1] == "wav"]
    bin_print(verbosity, 2, "WAVE files found:", "\n -", "\n - ".join(files))

    # Build (base name, transcript path, wav path) triples for every WAVE file.
    base_names = [f.split(".")[0] for f in files]
    file_pairs = [(name, join(input_path, name + ".txt"), join(input_path, name + ".wav"))
                  for name in base_names]

    bin_print(verbosity, 1, "Writing files to", output_path)
    for name, transcript_path, wav_path in file_pairs:
        bin_print(verbosity, 2, "Creating alignment for " + name + ".*")
        alignment = self.aligner.align(transcript_path, wav_path, verbosity)

        output_filename = join(output_path, name + "_audacity_" + self.alignment_type + ".txt")
        with open(output_filename, "w+", encoding="utf-8") as f:
            f.write("\n".join([sentence.to_audacity_label_format() for sentence in alignment]))
        bin_print(verbosity, 2, "Wrote " + output_filename)
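# The Audacity label format written above is plain text with one label per
# line: "<start>\t<end>\t<text>". A minimal sketch of what
# Sentence.to_audacity_label_format() plausibly produces; the Sentence and
# Interval classes here are simplified assumptions, not the project's actual
# definitions:
class Interval:
    def __init__(self, start: float = 0.0, end: float = 0.0) -> None:
        self.start = start
        self.end = end

class Sentence:
    def __init__(self, sentence: str, interval: Interval) -> None:
        self.sentence = sentence
        self.interval = interval

    def to_audacity_label_format(self) -> str:
        # Audacity expects "start<TAB>end<TAB>label" per line.
        return "%f\t%f\t%s" % (self.interval.start, self.interval.end, self.sentence)

# Example: Sentence("Hallo Welt.", Interval(1.0, 2.5)).to_audacity_label_format()
# yields "1.000000\t2.500000\tHallo Welt."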
def main(argv: list) -> None: title = "Get Google recognition" description = """ Gets the Speech Recognition result of Google Cloud API and stores it in a caching folder. Usage: python get_google_recognition_raw.py --path=<path> --authpath=<path> --bucket=<bucket name> --outpath=<path> [-v|-vv|-vvv] Args: --path: Path to read transcript files from (needed to filter which files to actually transcript) --authpath: Path containing the authentication files necessary to connect to Google Cloud API services --bucket: Name of the bucket containing all FLAC files --outpath: Path to write the raw JSON output to -v|-vv|-vvv: Verbosity level of the output -h: Prints this help """ args = ["path=", "config="] input_args = intro(title, description, args, argv) input_args = intro("Get Google recognition raw", "Gets the Speech Recognition result of Google Cloud API and stores it in a caching folder.\n\nget_google_recognition_raw.py --path=<path> --authpath=<path> --bucket=<bucket name> --outpath=<path> [-v|-vv|-vvv]", ["path=", "authpath=", "outpath=", "bucket="], argv) # Authenticate globally with specified client JSON os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = input_args["authpath"] start = time.time() get_and_save_raw(input_args["path"], input_args["bucket"], input_args["outpath"], input_args["verbosity"]) end = time.time() bin_print(input_args["verbosity"], 0, "Done.") bin_print(input_args["verbosity"], 1, "Time elapsed:", (end - start))
def main(argv: list) -> None: title = "Fix hand alignments" description = """ Fix hand alignments: Reshuffle training data and/or assign `-` to nonexisting sentences. Usage: python fix_hand_alignments.py --path=<path> [-v|-vv|-vvv] [--fix-nonexisting] [--reshuffle-training] Args: --path: Path to read alignment data -v|-vv|-vvv: Verbosity level of the output --fix-nonexisting: If non-existing sentences should be marked with `-` for interval start and end points --reshuffle-training: Select a new 70% of all sentences as training data -h: Prints this help """ args = ["path=", "config=", "fix-nonexisting", "reshuffle-training"] input_args = intro(title, description, args, argv) input_args[ "fix-nonexisting"] = True if "with-list" in input_args else False input_args[ "reshuffle-training"] = True if "get-low-means" in input_args else False start = time.time() fix_hand_alignments(input_args["path"], input_args["fix-nonexisting"], input_args["reshuffle-training"], input_args["verbosity"]) end = time.time() bin_print(input_args["verbosity"], 0, "Done.") bin_print(input_args["verbosity"], 1, "Time elapsed:", (end - start))
def perform_alignment(transcript: str, wav_path: str, verbosity: int) -> List[Sentence]:
    """
    Performs the alignment
    :param transcript: Transcript path
    :param wav_path: Path to wav file
    :param verbosity: Verbosity level
    :return: Aligned sentences
    """
    start_time = time.time()

    audio_segment = AudioSegment.from_wav(wav_path)
    duration = audio_segment.duration_seconds

    with open(transcript, encoding="utf-8") as f:
        transcript_text = f.read()

    transcript_text = transcript_text.replace("\n", " ")
    sentences = transcript_to_sentences(transcript_text)

    # Draw two random borders per sentence, sort them and scale them to the
    # audio duration, so every sentence gets a random, non-overlapping interval.
    precision = 1_000_000
    borders = random.sample(range(0, precision), len(sentences) * 2)
    borders.sort()
    borders = [border / precision * duration for border in borders]

    bin_print(verbosity, 3, "Borders for", wav_path, "are", borders)

    index = 0
    for sentence in sentences:
        sentence.interval.start = borders[index]
        sentence.interval.end = borders[index + 1]
        index += 2

    end_time = time.time()
    bin_print(verbosity, 2, "Time elapsed for", wav_path, ":", (end_time - start_time))

    return sentences
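# Worked example of the border sampling above (values illustrative): for two
# sentences in a 10 s file, four draws from range(0, 1_000_000) such as
# [12_000, 250_000, 400_000, 730_000] scale to [0.12, 2.5, 4.0, 7.3] seconds;
# sentence 1 then spans 0.12-2.5 s and sentence 2 spans 4.0-7.3 s. Pairing
# consecutive sorted borders keeps the random intervals disjoint.
import random

duration = 10.0
precision = 1_000_000
borders = sorted(random.sample(range(0, precision), 2 * 2))
borders = [b / precision * duration for b in borders]
intervals = list(zip(borders[0::2], borders[1::2]))  # [(start1, end1), (start2, end2)]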
def main(argv: list) -> None: title = "Create alignment" description = """ Creates an alignment based on configuration. See README.md for setting up a correct configuration. Usage: python create_alignment.py --path=<path> --config=<path> [-v|-vv|-vvv] Args: --path: Path to read raw data from and write alignments to --config: Path to configuration -v|-vv|-vvv: Verbosity level of the output -h: Prints this help """ args = ["path=", "config="] input_args = intro(title, description, args, argv) start = time.time() config = load_config(input_args["config"]) bin_print(input_args["verbosity"], 2, "Loaded configuration: ", config) aligner = get_aligner(config) aligner.align_files(input_args["path"], input_args["path"], input_args["verbosity"]) end = time.time() bin_print(input_args["verbosity"], 0, "Done.") bin_print(input_args["verbosity"], 1, "Time elapsed:", (end - start))
def align(cls, google_output: object, transcript: str, verbosity: int,
          alignment_parameters: Dict[str, Any]) -> List["Sentence"]:
    """
    Adjusted way of actually aligning with Google output.
    :param transcript: Transcript as string
    :param google_output: Google output as JSON object
    :param verbosity: Verbosity of output
    :param alignment_parameters: Dict of parameters loaded from a given YAML file. See README for full config.
    :return: List of aligned sentences
    """
    sentences = cls.get_sentences(transcript)
    transcript_text = cls.get_transcript_text(transcript)
    google_words = cls.get_google_words(google_output)
    google_text = cls.get_google_text(google_words)

    base_confidences = [r.alternatives[0]["confidence"] for r in google_output.results]

    bin_print(verbosity, 2, "Confidences of all results:", base_confidences)
    bin_print(verbosity, 3, "Preprocessed transcript text:", transcript_text)
    bin_print(verbosity, 3, "Preprocessed google text:", google_text)

    start_time = time()
    # Call actual implementation of the alignment.
    alignment = cls.perform_alignment(transcript_text, google_text, verbosity, alignment_parameters)
    end_time = time()
    cls.alignment_times.append((end_time - start_time) / len(sentences))

    google_alignment = alignment["google"]
    transcript_alignment = alignment["transcript"]
    alignment_score = alignment["score"]

    bin_print(verbosity, 3, prettify_alignment(google_alignment, transcript_alignment))

    return cls.align_per_sentence(sentences, transcript_alignment, google_alignment,
                                  google_words, alignment_parameters, alignment_score, verbosity)
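# Illustration of the two alignment strings consumed by align_per_sentence
# below: perform_alignment pads the transcript and the Google transcription to
# equal length, with "-" marking gaps (the sentence regex "-*".join(...) in
# align_per_sentence relies on exactly this convention). Toy strings, not real
# output:
transcript_alignment = "das wetter ist schoen"
google_alignment     = "das wetter ist scho-n"
assert len(transcript_alignment) == len(google_alignment)
# Column i of both strings belongs to the same alignment position; a "-" on
# one side means the other side's character has no counterpart there.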
def main(argv: list) -> None: title = "Optimize alignments" description = """ Tries to find the best alignment parameters based on Bayesian optimization. Usage: python optimize_parameters.py --path=<path> --config=<path> --convergence-plot-file=<path> [-v|-vv|-vvv] Args: --path: Path to read alignment data from --config: Path to configuration --convergence-plot-file: Filename for the plot of the convergence, PNG -v|-vv|-vvv: Verbosity level of the output -h: Prints this help """ args = ["path=", "config=", "convergence-plot-file="] input_args = intro(title, description, args, argv) start = time.time() config = load_config(input_args["config"]) bin_print(input_args["verbosity"], 2, "Loaded configuration: ", config) aligner = get_aligner(config) optimize_parameters( input_args["path"], input_args["path"], aligner, config, input_args["convergence-plot-file"], input_args["verbosity"] ) end = time.time() bin_print(input_args["verbosity"], 0, "Done.") bin_print(input_args["verbosity"], 1, "Time elapsed:", (end - start))
def main(argv: list) -> None: title = "Compare alignments" description = """ Compares two kinds of alignments Usage: python compare_alignment.py --path=<path> --type1=basic,hand,random,google --type2=basic,hand,random,google --config=<path> [-v|-vv|-vvv] [--with-list] [--get-low-means] [--training-only] Args: --path: Path to read alignment data --type1: First type to compare, one of basic, hand, random or google --type2: Second type to compare, one of basic, hand, random or google --config Path to config file -v|-vv|-vvv: Verbosity level of the output --with-list: Include a list with all calculated IOUs for copy/paste (to use in an EXCEL sheet, for example) --get-low-means: Includes a list of wav files with a mean IOU < 0.3, for debugging purposes --training-only: Only ever compares sentences marked with [TRAINING] in the first type of the alignment -h: Prints this help """ args = [ "path=", "type1=", "type2=", "config=", "with-list", "get-low-means", "training-only" ] input_args = intro(title, description, args, argv) config = load_config(input_args["config"]) input_args["with-list"] = True if "with-list" in input_args else False input_args[ "get-low-means"] = True if "get-low-means" in input_args else False input_args[ "training-only"] = True if "training-only" in input_args else False start = time.time() results = compare_alignments(input_args["path"], input_args["verbosity"], input_args["type1"], input_args["type2"], input_args["training-only"], config) verbosity = input_args["verbosity"] for file in results["ious"]["per_file"].items(): bin_print(verbosity, 3, "IOUs for", file[0], ":", file[1]["all"]) bin_print( verbosity, 0, file[0], ", " + input_args["type1"] + " vs. " + input_args["type2"] + ":") bin_print(verbosity, 0, " - Mean IOU: ", file[1]["mean"]) bin_print(verbosity, 0, " - Median IOU: ", file[1]["median"]) bin_print(verbosity, 3, "All IOUs:", results["ious"]["all"]) bin_print(verbosity, 0, "========") bin_print(verbosity, 0, input_args["type1"] + " vs. 
" + input_args["type2"] + ":") bin_print(verbosity, 0, "Total number of sentences:", results["no_sentences"]["total"]) bin_print(verbosity, 0, "--------") bin_print(verbosity, 0, "IOU") bin_print(verbosity, 0, " - Mean IOU: ", results["ious"]["mean"]) bin_print(verbosity, 0, " - Median IOU: ", results["ious"]["median"]) bin_print(verbosity, 0, "--------") bin_print(verbosity, 0, "Deviation (absolute)") bin_print(verbosity, 0, " - Mean deviation: ", results["scores"]["deviation"]["mean"]) bin_print(verbosity, 0, " - Median deviation: ", results["scores"]["deviation"]["median"]) bin_print(verbosity, 0, "--------") bin_print(verbosity, 0, "Calculated score") bin_print(verbosity, 0, " - Mean calculated score: ", np.mean(results["scores"]["calculated"]["all"])) bin_print(verbosity, 0, " - Median calculated score: ", np.median(results["scores"]["calculated"]["all"])) bin_print(verbosity, 0, "--------") bin_print(verbosity, 0, "Number of sentences appearing: ", results["no_sentences"]["appearing"]) tPrecisionRecall = PrettyTable() tPrecisionRecall.field_names = [ "", "Condition positive", "Condition negative" ] tPrecisionRecall.add_row([ "Predicted positive", results["appearance"]["true_positives"], results["appearance"]["false_positives"] ]) tPrecisionRecall.add_row([ "Predicted negative", results["appearance"]["false_negatives"], results["appearance"]["true_negatives"] ]) bin_print(verbosity, 0, "Sentences appearing") bin_print(verbosity, 0, "\n" + str(tPrecisionRecall)) bin_print(verbosity, 0, "Precision: ", results["appearance"]["precision"]) bin_print(verbosity, 0, "Recall: ", results["appearance"]["recall"]) bin_print(verbosity, 0, "F1 score: ", results["appearance"]["f1_score"]) if input_args["with-list"]: bin_print(verbosity, 0, "Outputting all values as copy/pastable list:") print("\n".join([ str(v[0]) for v in [v for v in results["ious"]["all"]] if v[0] <= 1.0 ])) if input_args["get-low-means"]: bin_print( verbosity, 0, "Outputting copy/pastable list of low (<0.3) mean IOU files:") print(results["ious"]["low"]) t_pearson = PrettyTable() t_pearson.field_names = [ "", "IOU", "Deviation", "Alignment score", "Google confidence", "Calculated confidence", "Google gaps percentage", "Transcript gaps percentage", "Calculated score" ] t_pearson.add_row([ "IOU", pearsonr_lists(results["ious"]["all_only"], results["ious"]["all_only"]), pearsonr_lists(results["ious"]["all_only"], results["scores"]["deviation"]["all"]), pearsonr_lists(results["ious"]["all_only"], results["scores"]["alignment_scores"]["all"]), pearsonr_lists(results["ious"]["all_only"], results["scores"]["google_confidence"]["all"]), pearsonr_lists(results["ious"]["all_only"], results["scores"]["calculated"]["all"]), pearsonr_lists(results["ious"]["all_only"], results["scores"]["google_gaps"]["all"]), pearsonr_lists(results["ious"]["all_only"], results["scores"]["transcript_gaps"]["all"]), pearsonr_lists(results["ious"]["all_only"], results["scores"]["calculated"]["all"]) ]) t_pearson.add_row([ "Deviation", pearsonr_lists(results["scores"]["deviation"]["all"], results["ious"]["all_only"]), pearsonr_lists(results["scores"]["deviation"]["all"], results["scores"]["deviation"]["all"]), pearsonr_lists(results["scores"]["deviation"]["all"], results["scores"]["alignment_scores"]["all"]), pearsonr_lists(results["scores"]["deviation"]["all"], results["scores"]["google_confidence"]["all"]), pearsonr_lists(results["scores"]["deviation"]["all"], results["scores"]["calculated"]["all"]), pearsonr_lists(results["scores"]["deviation"]["all"], 
results["scores"]["google_gaps"]["all"]), pearsonr_lists(results["scores"]["deviation"]["all"], results["scores"]["transcript_gaps"]["all"]), pearsonr_lists(results["scores"]["deviation"]["all"], results["scores"]["calculated"]["all"]) ]) t_pearson.add_row([ "Alignment score", pearsonr_lists(results["scores"]["alignment_scores"]["all"], results["ious"]["all_only"]), pearsonr_lists(results["scores"]["alignment_scores"]["all"], results["scores"]["deviation"]["all"]), pearsonr_lists(results["scores"]["alignment_scores"]["all"], results["scores"]["alignment_scores"]["all"]), pearsonr_lists(results["scores"]["alignment_scores"]["all"], results["scores"]["google_confidence"]["all"]), pearsonr_lists(results["scores"]["alignment_scores"]["all"], results["scores"]["calculated"]["all"]), pearsonr_lists(results["scores"]["alignment_scores"]["all"], results["scores"]["google_gaps"]["all"]), pearsonr_lists(results["scores"]["alignment_scores"]["all"], results["scores"]["transcript_gaps"]["all"]), pearsonr_lists(results["scores"]["alignment_scores"]["all"], results["scores"]["calculated"]["all"]) ]) t_pearson.add_row([ "Google confidence", pearsonr_lists(results["scores"]["google_confidence"]["all"], results["ious"]["all_only"]), pearsonr_lists(results["scores"]["google_confidence"]["all"], results["scores"]["deviation"]["all"]), pearsonr_lists(results["scores"]["google_confidence"]["all"], results["scores"]["alignment_scores"]["all"]), pearsonr_lists(results["scores"]["google_confidence"]["all"], results["scores"]["google_confidence"]["all"]), pearsonr_lists(results["scores"]["google_confidence"]["all"], results["scores"]["calculated"]["all"]), pearsonr_lists(results["scores"]["google_confidence"]["all"], results["scores"]["google_gaps"]["all"]), pearsonr_lists(results["scores"]["google_confidence"]["all"], results["scores"]["transcript_gaps"]["all"]), pearsonr_lists(results["scores"]["google_confidence"]["all"], results["scores"]["calculated"]["all"]) ]) t_pearson.add_row([ "Calculated confidence", pearsonr_lists(results["scores"]["calculated"]["all"], results["ious"]["all_only"]), pearsonr_lists(results["scores"]["calculated"]["all"], results["scores"]["deviation"]["all"]), pearsonr_lists(results["scores"]["calculated"]["all"], results["scores"]["alignment_scores"]["all"]), pearsonr_lists(results["scores"]["calculated"]["all"], results["scores"]["google_confidence"]["all"]), pearsonr_lists(results["scores"]["calculated"]["all"], results["scores"]["calculated"]["all"]), pearsonr_lists(results["scores"]["calculated"]["all"], results["scores"]["google_gaps"]["all"]), pearsonr_lists(results["scores"]["calculated"]["all"], results["scores"]["transcript_gaps"]["all"]), pearsonr_lists(results["scores"]["calculated"]["all"], results["scores"]["calculated"]["all"]) ]) t_pearson.add_row([ "Google gaps percentage", pearsonr_lists(results["scores"]["google_gaps"]["all"], results["ious"]["all_only"]), pearsonr_lists(results["scores"]["google_gaps"]["all"], results["scores"]["deviation"]["all"]), pearsonr_lists(results["scores"]["google_gaps"]["all"], results["scores"]["alignment_scores"]["all"]), pearsonr_lists(results["scores"]["google_gaps"]["all"], results["scores"]["google_confidence"]["all"]), pearsonr_lists(results["scores"]["google_gaps"]["all"], results["scores"]["calculated"]["all"]), pearsonr_lists(results["scores"]["google_gaps"]["all"], results["scores"]["google_gaps"]["all"]), pearsonr_lists(results["scores"]["google_gaps"]["all"], results["scores"]["transcript_gaps"]["all"]), 
pearsonr_lists(results["scores"]["google_gaps"]["all"], results["scores"]["calculated"]["all"]) ]) t_pearson.add_row([ "Transcript gaps percentage", pearsonr_lists(results["scores"]["transcript_gaps"]["all"], results["ious"]["all_only"]), pearsonr_lists(results["scores"]["transcript_gaps"]["all"], results["scores"]["deviation"]["all"]), pearsonr_lists(results["scores"]["transcript_gaps"]["all"], results["scores"]["alignment_scores"]["all"]), pearsonr_lists(results["scores"]["transcript_gaps"]["all"], results["scores"]["google_confidence"]["all"]), pearsonr_lists(results["scores"]["transcript_gaps"]["all"], results["scores"]["calculated"]["all"]), pearsonr_lists(results["scores"]["transcript_gaps"]["all"], results["scores"]["google_gaps"]["all"]), pearsonr_lists(results["scores"]["transcript_gaps"]["all"], results["scores"]["transcript_gaps"]["all"]), pearsonr_lists(results["scores"]["transcript_gaps"]["all"], results["scores"]["calculated"]["all"]) ]) t_pearson.add_row([ "Calculated score", pearsonr_lists(results["scores"]["calculated"]["all"], results["ious"]["all_only"]), pearsonr_lists(results["scores"]["calculated"]["all"], results["scores"]["deviation"]["all"]), pearsonr_lists(results["scores"]["calculated"]["all"], results["scores"]["alignment_scores"]["all"]), pearsonr_lists(results["scores"]["calculated"]["all"], results["scores"]["google_confidence"]["all"]), pearsonr_lists(results["scores"]["calculated"]["all"], results["scores"]["calculated"]["all"]), pearsonr_lists(results["scores"]["calculated"]["all"], results["scores"]["google_gaps"]["all"]), pearsonr_lists(results["scores"]["calculated"]["all"], results["scores"]["transcript_gaps"]["all"]), pearsonr_lists(results["scores"]["calculated"]["all"], results["scores"]["calculated"]["all"]) ]) bin_print(verbosity, 0, "Score correlations") bin_print(verbosity, 0, "\n" + str(t_pearson)) end = time.time() bin_print(input_args["verbosity"], 0, "Done.") bin_print(input_args["verbosity"], 1, "Time elapsed:", (end - start))
def get_and_save_raw(input_path: str, bucket_name: str, out_path: str, verbosity: int) -> None:
    """
    Gets raw JSON from Google Cloud Speech-to-text API
    :param input_path: Path to read files from
    :param bucket_name: Name of the GCS bucket
    :param out_path: Path to write the raw JSON output to
    :param verbosity: Verbosity level
    :return: None
    """
    bin_print(verbosity, 1, "Reading files from", input_path)
    bin_print(verbosity, 2, "Trying to find all .flac files...")
    flac_files = [
        f for f in listdir(input_path)
        if isfile(join(input_path, f)) and f.split(".")[1] == "flac"
    ]
    bin_print(verbosity, 3, "Found flac files:", flac_files)
    bin_print(verbosity, 3, "Total flac files:", len(flac_files))

    client = speech.SpeechClient()

    bin_print(verbosity, 1, "Running Google STT...")
    for flac_file in flac_files:
        # Only the "stadt_zuerich" corpus files are transcribed here.
        if "stadt_zuerich" in flac_file:
            bin_print(verbosity, 2, "Processing " + flac_file)
            try:
                raw_json = get_raw("gs://" + bucket_name + "/" + flac_file, client)
                json_path = out_path + "/" + flac_file.replace(".flac", "_google_output") + ".json"
                bin_print(verbosity, 2, "Writing " + json_path)
                with open(json_path, "w") as f:
                    f.write(raw_json)
            except _OperationNotComplete:
                bin_print(verbosity, 1, "Timeout for " + flac_file)
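# A sketch of what get_raw() plausibly does, assuming the pre-2.0
# google-cloud-speech API (the camelCase keys like "startTime" read elsewhere
# in this code match protobuf's MessageToJson output). Language code and
# recognition options here are illustrative assumptions, not the project's
# actual settings:
from google.cloud import speech_v1p1beta1 as speech
from google.protobuf.json_format import MessageToJson

def get_raw(gcs_uri: str, client: "speech.SpeechClient") -> str:
    """Runs asynchronous recognition on a GCS URI and returns the raw JSON response."""
    audio = speech.types.RecognitionAudio(uri=gcs_uri)
    config = speech.types.RecognitionConfig(
        encoding=speech.enums.RecognitionConfig.AudioEncoding.FLAC,
        language_code="de-CH",          # assumption
        enable_word_time_offsets=True,  # needed for per-word "startTime"/"endTime"
        enable_word_confidence=True,    # needed for per-word "confidence"
    )
    operation = client.long_running_recognize(config=config, audio=audio)
    response = operation.result(timeout=3600)
    return MessageToJson(response)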
def align_per_sentence(cls, sentences: List[Sentence], transcript_alignment: str,
                       google_alignment: str, google_words: List[object],
                       alignment_parameters: Dict[str, Any], alignment_score: int,
                       verbosity: int) -> List[Sentence]:
    """
    Assigns start and end times to sentences based on given alignments.
    :param sentences: All sentences
    :param transcript_alignment: Aligned transcript
    :param google_alignment: Aligned google output
    :param google_words: Google words, to get startTime and endTime
    :param alignment_parameters: Dict of parameters loaded from a given YAML file. See README for full config.
    :param alignment_score: Score of the alignment
    :param verbosity: Verbosity of output
    :return: List of aligned sentences
    """
    last_end_point = 0
    last_end_time = 0.0

    sentence_index = 0
    # Iterate over a copy: bad sentences may be deleted from the list below.
    for sentence in list(sentences):
        start_time = time()

        sentence_characters = list(preprocess_string(sentence.sentence))
        sentence_regex = "-*".join(sentence_characters)

        try:
            alignment_match = re.search(sentence_regex, transcript_alignment[last_end_point:])
            alignment_start_point = last_end_point + alignment_match.start()
            alignment_end_point = last_end_point + alignment_match.end()
            last_end_point = last_end_point + alignment_match.end()
        except AttributeError as e:
            bin_print(0, 0, "--------------------------------------------------------------------------")
            bin_print(0, 0, transcript_alignment[last_end_point:])
            bin_print(0, 0, "Attribute error", e, "".join(sentence_characters), sentence_regex)
            # _Shouldn't_ happen, as the regexp is basically part of the transcript we're
            # looking at. Characters don't vanish from the transcript, so there's always a match.
            cls.mark_sentence_not_appearing(sentence, alignment_parameters, last_end_time)
            last_end_time = last_end_time + alignment_parameters["no_appearance"]["interval_length"]
            sentence_index += 1  # the sentence stays in the list, so advance the index
            continue

        # Mostly None values on either side indicate a false positive; treat the sentence as not appearing.
        if is_mostly_none(list(google_alignment[alignment_start_point:alignment_end_point])) \
                or is_mostly_none(list(transcript_alignment[alignment_start_point:alignment_end_point])):
            cls.mark_sentence_not_appearing(sentence, alignment_parameters, last_end_time)
            last_end_time = last_end_time + alignment_parameters["no_appearance"]["interval_length"]
            sentence_index += 1  # the sentence stays in the list, so advance the index
            continue

        # Character offsets into the raw Google text, ignoring gaps and spaces.
        google_sub_start = len([
            c for c in google_alignment[0:alignment_start_point]
            if c != "-" and c != " "
        ])
        google_sub_end = len([
            c for c in google_alignment[0:alignment_end_point]
            if c != "-" and c != " "
        ])

        character_count = 0
        found_start = False
        start_word_confidence = 0.0
        end_word_confidence = 0.0
        for word in google_words:
            character_count += len(preprocess_string(word["word"]))
            word_start_time = float(word["startTime"].replace("s", ""))

            # Guarantee that there's no overlapping sentences
            if character_count >= google_sub_start and last_end_time <= word_start_time and not found_start:
                sentence.interval.start = word_start_time
                start_word_confidence = word["confidence"]
                found_start = True

            if found_start and character_count >= google_sub_end:
                sentence.interval.end = float(word["endTime"].replace("s", ""))
                last_end_time = sentence.interval.end
                end_word_confidence = word["confidence"]
                break

        sentence_confidence = get_sentence_confidence(
            start_word_confidence, end_word_confidence,
            transcript_alignment[alignment_start_point:alignment_end_point],
            google_alignment[alignment_start_point:alignment_end_point],
            alignment_parameters["algorithm"]["match_reward"],
            alignment_parameters["algorithm"]["mismatch_penalty"],
            alignment_parameters["algorithm"]["gap_penalty"])

        google_gaps_percentage = get_none_part(
            list(google_alignment[alignment_start_point:alignment_end_point]))
        transcript_gaps_percentage = get_none_part(
            list(transcript_alignment[alignment_start_point:alignment_end_point]))

        sentence.additional_data = AdditionalData(
            sentence_confidence["average_google_confidence"],
            sentence_confidence["normalized_sentence_score"],
            google_gaps_percentage, transcript_gaps_percentage)

        overall_score = calculate_overall_score(
            google_gaps_percentage, transcript_gaps_percentage,
            sentence_confidence["average_google_confidence"],
            sentence_confidence["normalized_sentence_score"],
            alignment_parameters["score_weights"]["gaps_google"],
            alignment_parameters["score_weights"]["gaps_transcript"],
            alignment_parameters["score_weights"]["alignment_score"],
            alignment_parameters["score_weights"]["google_confidence"])

        if overall_score > alignment_parameters["filtering"]["threshold"]:
            if alignment_parameters["filtering"]["method"] == "mark":
                sentence.sentence = "[BAD]" + sentence.sentence
                sentence_index += 1
            else:
                del sentences[sentence_index]
        else:
            sentence_index += 1

        end_time = time()
        cls.execution_times.append(end_time - start_time)

        bin_print(verbosity, 2, "Sentence confidence:", str(sentence_confidence))

    return sentences
def compare_alignments(input_path: str, verbosity: int, type1: str, type2: str,
                       training_only: bool, config: Dict[str, Any]) -> Dict[str, Any]:
    """
    Compares all found alignments
    :param input_path: Input path
    :param verbosity: Verbosity level
    :param type1: First type for comparison
    :param type2: Second type for comparison
    :param training_only: Determines if a sentence has to be prefixed with [TRAINING] in order to be considered.
    :param config: Configuration dict, see README
    :return: Dict of all results
    """
    if input_path.endswith(os.sep):
        input_path = input_path[:-1]

    epsilon = config["no_appearance"]["interval_length"]

    bin_print(verbosity, 1, "Reading files from", input_path)
    bin_print(verbosity, 2, "Trying to find all .txt files...")
    txt_files = [
        input_path + os.sep + f for f in listdir(input_path)
        if isfile(join(input_path, f)) and f.split(".")[1] == "txt"
    ]
    bin_print(verbosity, 3, "Found txt files:", txt_files)

    bin_print(verbosity, 2, "Filtering found files by ones containing alignment by " + type1 + "...")
    type1_alignments = [f for f in txt_files if "audacity_" + type1 in f]
    bin_print(verbosity, 3, "Found txt files containing alignment via " + type1 + ":", type1_alignments)

    ious = []
    low_ious = []
    google_confidences = []
    sentence_scores = []
    deviations = []
    google_gaps = []
    transcript_gaps = []
    total_sentences = 0
    sentences_appearing_true_positives = 0
    sentences_appearing_false_positives = 0
    sentences_appearing_true_negatives = 0
    sentences_appearing_false_negatives = 0
    ious_per_file = {}

    bin_print(verbosity, 2, "Processing all " + type1 + " alignments...")
    for type1_alignment in type1_alignments:
        file_name = type1_alignment.replace("audacity_" + type1, "").replace(input_path, "").replace("_.txt", "")
        bin_print(verbosity, 3, "Processing", file_name)
        type1_aligned_sentences = load_alignment(type1_alignment)
        try:
            type2_aligned_sentences = load_alignment(
                type1_alignment.replace("audacity_" + type1, "audacity_" + type2))
        except FileNotFoundError:
            # Corresponding file doesn't exist, skip it completely
            continue

        sentence_pairs = [
            pair for pair in list(zip(type1_aligned_sentences, type2_aligned_sentences))
            if (not training_only or pair[0].sentence.startswith("[TRAINING]"))
            and not pair[1].sentence.startswith("[BAD]")  # Filter out "bad" sentences.
        ]
        total_sentences += len(sentence_pairs)

        # Pairs in which both sentences appear, i.e. both intervals are longer than epsilon.
        appearing_pairs = [
            pair for pair in sentence_pairs
            if pair[0].interval.get_length() > epsilon and pair[1].interval.get_length() > epsilon
        ]

        current_ious = [
            (intersection_over_union(pair[0].interval, pair[1].interval),
             pair[0].interval.get_length(), pair[1].interval.get_length(),
             pair[0].sentence, file_name) for pair in appearing_pairs
        ]
        current_google_confidence = [
            (pair[1].additional_data.google_confidence if pair[1].additional_data else 0.001)
            for pair in appearing_pairs
        ]
        current_sentence_scores = [
            (pair[1].additional_data.normalized_sentence_score if pair[1].additional_data else 0.001)
            for pair in appearing_pairs
        ]
        current_transcript_gaps = [
            (pair[1].additional_data.gaps_transcript if pair[1].additional_data else 0.001)
            for pair in appearing_pairs
        ]
        current_google_gaps = [
            (pair[1].additional_data.gaps_google if pair[1].additional_data else 0.001)
            for pair in appearing_pairs
        ]
        current_deviations = [
            pair[0].interval.get_deviation(pair[1].interval) for pair in appearing_pairs
        ]

        # Find sentences that are marked on either side as not appearing at all.
        pairs_sentence_not_appearing = [
            pair for pair in sentence_pairs
            if pair[0].interval.get_length() <= epsilon or pair[1].interval.get_length() <= epsilon
        ]

        # Count those sentences: which of them appear in neither alignment, and which in only one?
        for pair in pairs_sentence_not_appearing:
            if not does_sentence_appear(pair[0], epsilon) and not does_sentence_appear(pair[1], epsilon):
                sentences_appearing_true_negatives += 1
            elif not does_sentence_appear(pair[0], epsilon) and does_sentence_appear(pair[1], epsilon):
                sentences_appearing_false_positives += 1
            elif does_sentence_appear(pair[0], epsilon) and not does_sentence_appear(pair[1], epsilon):
                sentences_appearing_false_negatives += 1

        # All sentences appearing in both are considered true positives
        sentences_appearing_true_positives += len(current_ious)

        if len(current_ious) == 0:
            bin_print(verbosity, 2, "No sentences found, skipping...")
            continue

        mean_iou = np.mean([v[0] for v in current_ious])
        median_iou = np.median([v[0] for v in current_ious])

        ious_per_file[file_name] = {
            "mean": mean_iou,
            "median": median_iou,
            "all": current_ious
        }

        if mean_iou <= 0.3:
            low_ious.append(file_name + ".wav")

        ious += current_ious
        google_confidences += current_google_confidence
        sentence_scores += current_sentence_scores
        deviations += current_deviations
        google_gaps += current_google_gaps
        transcript_gaps += current_transcript_gaps

    try:
        precision = sentences_appearing_true_positives / (
            sentences_appearing_true_positives + sentences_appearing_false_positives)
    except ZeroDivisionError:
        precision = 0.0

    try:
        recall = sentences_appearing_true_positives / (
            sentences_appearing_true_positives + sentences_appearing_false_negatives)
    except ZeroDivisionError:
        recall = 0.0

    try:
        f1_score = 2 * ((precision * recall) / (precision + recall))
    except ZeroDivisionError:
        f1_score = 0.0

    return {
        "no_sentences": {
            "appearing": len(ious),
            "total": total_sentences,
        },
        "ious": {
            "all": ious,
            "all_only": [iou[0] for iou in ious],
            "low": low_ious,
            "mean": np.mean([v[0] for v in ious]) if len(ious) > 0 else np.nan,
            "median": np.median([v[0] for v in ious]) if len(ious) > 0 else np.nan,
            "per_file": ious_per_file
        },
        "scores": {
            "deviation": {
                "all": deviations,
                "mean": np.mean(deviations) if len(deviations) > 0 else np.nan,
                "median": np.median(deviations) if len(deviations) > 0 else np.nan,
            },
            "google_confidence": {
                "all": google_confidences,
                "mean": np.mean(google_confidences) if len(google_confidences) > 0 else np.nan,
                "median": np.median(google_confidences) if len(google_confidences) > 0 else np.nan
            },
            "alignment_scores": {
                "all": sentence_scores,
                "mean": np.mean(sentence_scores) if len(sentence_scores) > 0 else np.nan,
                "median": np.median(sentence_scores) if len(sentence_scores) > 0 else np.nan
            },
            "google_gaps": {
                "all": google_gaps,
                "mean": np.mean(google_gaps) if len(google_gaps) > 0 else np.nan,
                "median": np.median(google_gaps) if len(google_gaps) > 0 else np.nan
            },
            "transcript_gaps": {
                "all": transcript_gaps,
                "mean": np.mean(transcript_gaps) if len(transcript_gaps) > 0 else np.nan,
                "median": np.median(transcript_gaps) if len(transcript_gaps) > 0 else np.nan
            },
            "calculated": {
                "all": [
                    calculate_overall_score(
                        g_gaps, t_gaps, g_confidence, score,
                        config["score_weights"]["gaps_google"],
                        config["score_weights"]["gaps_transcript"],
                        config["score_weights"]["alignment_score"],
                        config["score_weights"]["google_confidence"],
                    ) for g_gaps, t_gaps, g_confidence, score in zip(
                        google_gaps, transcript_gaps, google_confidences, sentence_scores)
                ]
            }
        },
        "appearance": {
            "true_positives": sentences_appearing_true_positives,
            "false_positives": sentences_appearing_false_positives,
            "true_negatives": sentences_appearing_true_negatives,
            "false_negatives": sentences_appearing_false_negatives,
            "precision": precision,
            "recall": recall,
            "f1_score": f1_score,
        }
    }
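# For reference, intersection over union on 1-D time intervals, as used above;
# a sketch matching the Interval interface (the project's implementation may
# differ in edge handling):
def intersection_over_union(a: "Interval", b: "Interval") -> float:
    intersection = max(0.0, min(a.end, b.end) - max(a.start, b.start))
    union = (a.end - a.start) + (b.end - b.start) - intersection
    return intersection / union if union > 0 else 0.0

# Example: intervals [0, 2] and [1, 3] overlap for 1 s and span 3 s in total,
# so their IOU is 1/3.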
def optimize_parameters(input_path: str, output_path: str,
                        google_files_aligner: GoogleFilesAligner,
                        alignment_parameters: Dict[str, Any],
                        convergence_plot_file: str, verbosity: int) -> None:
    """
    Tries to find the best parameters for google alignment.
    :param input_path: Path to load all alignments from
    :param output_path: Path to write the alignments to
    :param google_files_aligner: GoogleFilesAligner to re-align every epoch
    :param alignment_parameters: Alignment parameters for comparison
    :param convergence_plot_file: Where to save the convergence plot
    :param verbosity: Verbosity of the output
    :return: None
    """
    def optimize_function(params: List) -> float:
        """
        Function to optimize against
        :param params: Parameters given by BOpt
        :return: Calculated score
        """
        bin_print(verbosity, 1, "Starting new iteration...")

        google_files_aligner.alignment_parameters["algorithm"]["match_reward"] = params[0][0]
        google_files_aligner.alignment_parameters["algorithm"]["mismatch_penalty"] = params[0][1]
        google_files_aligner.alignment_parameters["algorithm"]["gap_penalty"] = params[0][2]

        bin_print(verbosity, 3, "Configured params: ", google_files_aligner.alignment_parameters)

        google_files_aligner.align_files(input_path, output_path, 0)

        # Not "training_only", because we're using a further boiled down training set.
        result = compare_alignments(input_path, 0, "hand", "google", False, alignment_parameters)

        # Configurable, see config.example.yml
        score = eval(
            google_files_aligner.alignment_parameters["optimize_params_formula"],
            {"__builtins__": None}, {
                "deviation": result["scores"]["deviation"]["mean"],
                "iou": result["ious"]["mean"],
                "f1": result["appearance"]["f1_score"],
                "precision": result["appearance"]["precision"],
                "recall": result["appearance"]["recall"],
            })

        bin_print(verbosity, 1, "Parameters: ", params)
        bin_print(verbosity, 1, "Achieved score (smaller == better): ", score)

        return score

    domain = [
        {"name": "match_reward", "type": "continuous", "domain": (0, 100)},
        {"name": "mismatch_penalty", "type": "continuous", "domain": (-100, 0)},
        {"name": "gap_penalty", "type": "continuous", "domain": (-100, 0)},
    ]

    bopt = BayesianOptimization(f=optimize_function,
                                domain=domain,
                                model_type="GP",
                                acquisition_type="EI",
                                acquisition_jitter=0.05)
    bopt.run_optimization(max_iter=25)
    bopt.plot_convergence(filename=convergence_plot_file)

    bin_print(verbosity, 0, "Best values:", bopt.x_opt)
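# A self-contained toy run of the same GPyOpt API used above, assuming GPyOpt
# is installed (pip install GPyOpt). It minimizes a 1-D quadratic, so the
# optimum reported in x_opt should approach 2.0:
import numpy as np
from GPyOpt.methods import BayesianOptimization

def toy_objective(params: np.ndarray) -> float:
    # GPyOpt passes a 2-D array of shape (1, n_dims), hence params[0][0].
    x = params[0][0]
    return (x - 2.0) ** 2

toy_domain = [{"name": "x", "type": "continuous", "domain": (-10, 10)}]
toy_bopt = BayesianOptimization(f=toy_objective, domain=toy_domain,
                                model_type="GP", acquisition_type="EI",
                                acquisition_jitter=0.05)
toy_bopt.run_optimization(max_iter=15)
print("Best x:", toy_bopt.x_opt, "best value:", toy_bopt.fx_opt)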
def optimize_score(input_path: str, alignment_parameters: Dict[str, Any],
                   convergence_plot_file: str, verbosity: int) -> None:
    """
    Tries to find the best parameters for overall score.
    :param input_path: Path to load all alignments from
    :param alignment_parameters: Alignment parameters for comparison
    :param convergence_plot_file: Where to save the convergence plot
    :param verbosity: Verbosity of the output
    :return: None
    """
    def optimize_function(params: List) -> float:
        """
        Function to optimize against
        :param params: Parameters given by BOpt
        :return: Calculated score
        """
        bin_print(verbosity, 2, "Parameters: ", params)

        alignment_parameters["score_weights"]["gaps_google"] = params[0][0]
        alignment_parameters["score_weights"]["gaps_transcript"] = params[0][1]
        alignment_parameters["score_weights"]["alignment_score"] = params[0][2]
        alignment_parameters["score_weights"]["google_confidence"] = params[0][3]

        results = compare_alignments(input_path, 0, "hand", "google", True, alignment_parameters)

        correlation_ious = pearsonr_lists(results["ious"]["all_only"],
                                          results["scores"]["calculated"]["all"])
        correlation_deviation = pearsonr_lists(results["scores"]["deviation"]["all"],
                                               results["scores"]["calculated"]["all"])

        bin_print(verbosity, 1, "Correlation IOUs: ", correlation_ious)
        bin_print(verbosity, 1, "Correlation deviation: ", correlation_deviation)

        # Only maximize correlation with IOU
        return abs(correlation_ious)

    domain = [
        {"name": "gaps_google", "type": "continuous", "domain": (-100, 100)},
        {"name": "gaps_transcript", "type": "continuous", "domain": (-100, 100)},
        {"name": "alignment_score", "type": "continuous", "domain": (-100, 100)},
        {"name": "google_confidence", "type": "continuous", "domain": (-100, 100)},
    ]

    bopt = BayesianOptimization(f=optimize_function,
                                domain=domain,
                                model_type="GP",
                                acquisition_type="EI",
                                acquisition_jitter=0.05,
                                maximize=True)
    bopt.run_optimization(max_iter=250)
    bopt.plot_convergence(filename=convergence_plot_file)

    bin_print(verbosity, 0, "Best values:", bopt.x_opt)
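# pearsonr_lists presumably wraps scipy's Pearson correlation and returns just
# the coefficient; a sketch under that assumption:
from scipy.stats import pearsonr

def pearsonr_lists(list_a: list, list_b: list) -> float:
    """Pearson correlation coefficient of two equally long numeric lists."""
    coefficient, _p_value = pearsonr(list_a, list_b)
    return coefficient

# Example: pearsonr_lists([1, 2, 3], [2, 4, 6]) == 1.0 (perfect positive correlation)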
def align_files(self, input_path: str, output_path: str, verbosity: int) -> None:
    """
    Aligns all given files in input_path and writes alignments into output_path
    :param input_path: Where to look for transcript files
    :param output_path: Where to write alignment files
    :param verbosity: Verbosity of debugging output
    :return: None
    """
    bin_print(verbosity, 1, "Loading all transcript files from " + input_path + "...")
    file_names = [f.replace(".wav", "") for f in listdir(input_path)
                  if isfile(join(input_path, f)) and f.split(".")[1] == "wav"]
    bin_print(verbosity, 3, "Found files:", file_names)

    for file in file_names:
        bin_print(verbosity, 2, "Aligning " + file + "...")

        transcript_file = file + ".txt"
        with open(join(input_path, transcript_file), encoding="utf-8-sig") as read_file:
            transcript = read_file.read()

        with open(join(input_path, file + "_google_output.json"), "r", encoding="utf-8-sig") as read_file:
            # Convert back to object-like structure, so the underlying
            # alignment function doesn't imply non-object like structures,
            # such as dicts. This is particularly useful when working with
            # Google's output directly.
            google_output = load(read_file)
            google_output = Struct(**google_output)
            google_output.results = [Struct(**r) for r in google_output.results]

        alignment = self.aligner.align(google_output, transcript, verbosity, self.alignment_parameters)

        output_filename = output_path + "/" + file + "_audacity_" + self.alignment_type + ".txt"
        with open(output_filename, "w+", encoding="utf-8") as f:
            f.write("\n".join([sentence.to_audacity_label_format() for sentence in alignment]))
        bin_print(verbosity, 2, "Wrote " + output_filename)

    bin_print(verbosity, 0, "Execution time per sentence (mean): ",
              (np.mean(self.aligner.execution_times) + np.mean(self.aligner.alignment_times)))
    bin_print(verbosity, 0, "Execution time per sentence (max): ",
              (np.max(self.aligner.execution_times) + np.max(self.aligner.alignment_times)))
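# The Struct helper used above is presumably the classic dict-to-attributes
# recipe; a sketch under that assumption:
class Struct:
    """Wraps a dict so its keys become attributes (e.g. google_output.results)."""
    def __init__(self, **entries) -> None:
        self.__dict__.update(entries)

# Example: Struct(**{"results": []}).results == []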