def get_transcript_text(transcript: str) -> str:
    """
    Return the transcript after running it through preprocess_string.

    :param transcript: Raw transcript string

    :return: The preprocessed transcript
    """
    preprocessed_transcript = preprocess_string(transcript)
    return preprocessed_transcript
def test_preprocess_string(self, input_data: str, expected_output: str) -> None:
    """
    Tests preprocess_string function's behaviour

    :param input_data: Input string that is passed to preprocess_string
    :param expected_output: Expected output string

    :return: None
    """
    # The function under test must receive the raw input; feeding it the
    # expected value would only check that preprocessing is idempotent.
    self.assertEqual(expected_output, preprocess_string(input_data))
def get_google_text(google_words: List) -> str:
    """
    Build one preprocessed text out of the Google output words.

    :param google_words: List of Google output words, straight from JSON object.
                         Each entry is indexed with the key "word".

    :return: Google output as a single preprocessed string
    """
    kept_tokens = []
    for entry in google_words:
        token = entry["word"]
        # Tokens made up solely of punctuation are dropped; an empty token is
        # dropped as well, because all() over nothing is True.
        if all(char in punctuation for char in token):
            continue
        kept_tokens.append(token)

    joined_text = " ".join(kept_tokens)
    return preprocess_string(joined_text)
def get_google_words(google_output: object) -> List[dict]:
    """
    Flatten the Google output into a list of per-word dicts.

    Only the first alternative of every result is used. Each word dict carries
    the preprocessed word, its start and end time, and the confidence of the
    alternative the word belongs to.

    :param google_output: JSON object exposing a "results" attribute

    :return: List of dict for all words in google_output
    """
    collected = []
    for result in google_output.results:
        first_alternative = result.alternatives[0]
        collected.extend(
            {
                "word": preprocess_string(entry["word"]),
                "startTime": entry["startTime"],
                "endTime": entry["endTime"],
                "confidence": first_alternative["confidence"],
            }
            for entry in first_alternative["words"]
        )
    return collected
def align_per_sentence(cls, sentences: List[Sentence], transcript_alignment: str, google_alignment: str,
                       google_words: List[object], alignment_parameters: Dict[str, Any],
                       alignment_score: int, verbosity: int) -> List[Sentence]:
    """
    Assigns start and end times to sentences based on given alignments.

    Every sentence is located inside the aligned transcript with a regular
    expression that tolerates gap characters ("-") between its characters.
    The matching span is then mapped onto the Google words to obtain start and
    end times. Sentences that cannot be located, or whose span consists mostly
    of gaps, are handed to ``cls.mark_sentence_not_appearing``. Sentences whose
    overall score exceeds the configured threshold are either prefixed with
    "[BAD]" (filtering method "mark") or removed from ``sentences``.

    :param sentences: All sentences; modified in place when sentences are filtered out
    :param transcript_alignment: Aligned transcript
    :param google_alignment: Aligned google output
    :param google_words: Google words, to get startTime and endTime
    :param alignment_parameters: Dict of parameters loaded from a given YAML file. See README for full config.
    :param alignment_score: Score of the alignment (not used by this method)
    :param verbosity: Verbosity of output

    :return: List of aligned sentences (the same list object that was passed in)
    """
    last_end_point = 0
    last_end_time = 0.0

    # Sentences that survive filtering. The input list is not mutated while it
    # is being iterated; deleting during iteration would skip the following
    # sentence and could remove the wrong element.
    kept_sentences = []

    for sentence in sentences:
        start_time = time()

        sentence_characters = list(preprocess_string(sentence.sentence))
        # Allow any number of gap characters between two sentence characters.
        # Characters are escaped so that regex metacharacters match literally.
        sentence_regex = "-*".join(re.escape(c) for c in sentence_characters)

        try:
            alignment_match = re.search(
                sentence_regex, transcript_alignment[last_end_point:])

            alignment_start_point = last_end_point + alignment_match.start()
            alignment_end_point = last_end_point + alignment_match.end()
            last_end_point = last_end_point + alignment_match.end()
        except AttributeError as e:
            # re.search returned None, i.e. the sentence was not found.
            bin_print(
                0, 0,
                "--------------------------------------------------------------------------"
            )
            bin_print(0, 0, transcript_alignment[last_end_point:])
            bin_print(0, 0, "Attribute error", e, "".join(sentence_characters), sentence_regex)

            # _Shouldn't_ happen, as the regexp is basically part of the transcript we're
            # looking at. Characters don't vanish from the transcript, so there's always a match.
            cls.mark_sentence_not_appearing(sentence, alignment_parameters, last_end_time)
            last_end_time = last_end_time + alignment_parameters[
                "no_appearance"]["interval_length"]
            kept_sentences.append(sentence)
            continue

        transcript_slice = transcript_alignment[alignment_start_point:alignment_end_point]
        google_slice = google_alignment[alignment_start_point:alignment_end_point]

        # Mostly none values on either side indicate a false positive match;
        # treat the sentence as not appearing.
        if is_mostly_none(list(google_slice)) \
                or is_mostly_none(list(transcript_slice)):
            cls.mark_sentence_not_appearing(sentence, alignment_parameters, last_end_time)
            last_end_time = last_end_time + alignment_parameters[
                "no_appearance"]["interval_length"]
            kept_sentences.append(sentence)
            continue

        # Number of real (non-gap, non-space) Google characters before the
        # start and before the end of the matched span. Strings are compared
        # by value, not by identity.
        google_sub_start = sum(
            1 for c in google_alignment[0:alignment_start_point]
            if c != "-" and c != " "
        )
        google_sub_end = sum(
            1 for c in google_alignment[0:alignment_end_point]
            if c != "-" and c != " "
        )

        character_count = 0
        found_start = False

        start_word_confidence = 0.0
        end_word_confidence = 0.0

        for word in google_words:
            character_count += len(preprocess_string(word["word"]))
            word_start_time = float(word["startTime"].replace("s", ""))

            # Guarantee that there's no overlapping sentences
            if character_count >= google_sub_start and last_end_time <= word_start_time and not found_start:
                sentence.interval.start = word_start_time
                start_word_confidence = word["confidence"]
                found_start = True

            if found_start and character_count >= google_sub_end:
                sentence.interval.end = float(word["endTime"].replace("s", ""))
                last_end_time = sentence.interval.end
                end_word_confidence = word["confidence"]
                break

        sentence_confidence = get_sentence_confidence(
            start_word_confidence, end_word_confidence,
            transcript_slice,
            google_slice,
            alignment_parameters["algorithm"]["match_reward"],
            alignment_parameters["algorithm"]["mismatch_penalty"],
            alignment_parameters["algorithm"]["gap_penalty"])

        google_gaps_percentage = get_none_part(list(google_slice))
        transcript_gaps_percentage = get_none_part(list(transcript_slice))

        sentence.additional_data = AdditionalData(
            sentence_confidence["average_google_confidence"],
            sentence_confidence["normalized_sentence_score"],
            google_gaps_percentage,
            transcript_gaps_percentage)

        overall_score = calculate_overall_score(
            google_gaps_percentage,
            transcript_gaps_percentage,
            sentence_confidence["average_google_confidence"],
            sentence_confidence["normalized_sentence_score"],
            alignment_parameters["score_weights"]["gaps_google"],
            alignment_parameters["score_weights"]["gaps_transcript"],
            alignment_parameters["score_weights"]["alignment_score"],
            alignment_parameters["score_weights"]["google_confidence"])

        if overall_score > alignment_parameters["filtering"]["threshold"]:
            if alignment_parameters["filtering"]["method"] == "mark":
                sentence.sentence = "[BAD]" + sentence.sentence
                kept_sentences.append(sentence)
            # Any other filtering method drops the sentence: it is simply not
            # added to kept_sentences.
        else:
            kept_sentences.append(sentence)

        end_time = time()
        cls.execution_times.append(end_time - start_time)

        bin_print(verbosity, 2, "Sentence confidence:", str(sentence_confidence))

    # Apply removals in place so callers holding a reference to the original
    # list observe the filtered result, exactly as with the former `del`.
    if len(kept_sentences) != len(sentences):
        sentences[:] = kept_sentences

    return sentences