def get_ids_from_property_value(data: dict, property_name: str, property_value: str, fix_data_delegate: Callable = None, return_on_first: bool = True) -> list:
    """Return the ids of all entries whose property contains the given value.

    Both the searched value and every entry's property are passed through
    ``fix_data_delegate`` before they are compared, so the delegate decides
    how forgiving the match is.

    Args:
        data: Mapping of ``{id: content}`` where ``content`` is a dict that
            may hold ``property_name``.
        property_name: Key to look up inside every entry.
        property_value: Value to search for.
        fix_data_delegate: Callable used to normalise values before
            comparison. Falls back to ``fix_property_value`` when omitted.
        return_on_first: When true and there is at least one match, only the
            ids with the highest score reported by ``util.get_similarity``
            are returned.

    Returns:
        A list of matching ids in the iteration order of ``data``; an empty
        list when any of the required arguments is empty.
    """
    # data structure: {id: content}
    # fixed_data structure: {id: fixed property value}
    if not data or not property_name or not property_value:
        print('- get_ids_from_property_value: invalid data or property info. Return empty list.')
        return []

    if not fix_data_delegate:
        fix_data_delegate = fix_property_value

    fixed_value = fix_data_delegate(property_value)
    # Entries without the property (or with an empty one) are skipped instead
    # of raising KeyError.
    fixed_data = {
        entry_id: fix_data_delegate(entry_data[property_name])
        for entry_id, entry_data in data.items()
        if entry_data.get(property_name)
    }

    # A substring test already covers the "starts with" case, and a single
    # pass yields every id once, in a deterministic order.
    results = [
        entry_id
        for entry_id, entry_property in fixed_data.items()
        if fixed_value in entry_property
    ]

    if results and return_on_first:
        # Reuse the already normalised values rather than fixing them again.
        similarity_data = {entry_id: fixed_data[entry_id] for entry_id in results}
        similarity_map = util.get_similarity(similarity_data, fixed_value)
        max_similarity = max(similarity_map.values())
        return [
            entry_id
            for entry_id, similarity in similarity_map.items()
            if similarity == max_similarity
        ]

    return results
def try_more_or_less_words(caption, i, similarity, words, words_in_captions, use_levishtein):
    """Grow or shrink the word list of a caption while its score keeps improving.

    On every step two candidates are scored with ``get_similarity``: the
    current selection plus ``words[i]``, and the current selection without
    its last word. The better candidate is adopted (moving ``i`` forward or
    backward by one) as long as at least one of them beats the current score.

    With ``use_levishtein`` the second value returned by ``get_similarity``
    is used and lower is better; otherwise the first value is used and higher
    is better.

    Returns:
        A tuple ``(i, similarity, words_in_captions)`` describing the final
        word index, the final score and the final selection.
    """
    selection = words_in_captions
    while i < len(words):
        grown = selection + [words[i]]
        shrunk = selection[:-1]
        grown_cosine, grown_distance = get_similarity(caption, grown)
        shrunk_cosine, shrunk_distance = get_similarity(caption, shrunk)

        if use_levishtein:
            grown_score, shrunk_score = grown_distance, shrunk_distance
            any_improvement = grown_score < similarity or shrunk_score < similarity
            prefer_grown = grown_score < shrunk_score
        else:
            grown_score, shrunk_score = grown_cosine, shrunk_cosine
            any_improvement = grown_score > similarity or shrunk_score > similarity
            prefer_grown = grown_score > shrunk_score

        if not any_improvement:
            break

        if prefer_grown:
            i += 1
            selection = grown
            similarity = grown_score
        else:
            i -= 1
            selection = shrunk
            similarity = shrunk_score

    return i, similarity, selection
def check_backwards(i: int, subtitle_pairs: List[SubtitlePairWords], total_diff: int):
    """Try moving trailing words of pair ``i`` to the front of pair ``i + 1``.

    Works on deep copies, so ``subtitle_pairs`` itself is never modified.
    After each move both copies are re-scored with the second value returned
    by ``get_similarity`` and the window score is computed as the sum of the
    ``similarity`` of pair ``i - 1``, the modified pair ``i`` and the modified
    pair ``i + 1``. Words keep being moved while that score strictly drops.

    Args:
        i: Index of the pair to shorten. Pair ``i - 1`` and pair ``i + 1``
            are read as well.
        subtitle_pairs: All subtitle pairs.
        total_diff: Score of the unmodified window; a candidate is only
            accepted when it scores lower.

    Returns:
        ``(pair_i, pair_i_plus_1, diff)``: the best modified copies with
        their window score, or the original two pairs with ``total_diff``
        when no move lowers the score.
    """
    this_pair = copy.deepcopy(subtitle_pairs[i])
    next_one = copy.deepcopy(subtitle_pairs[i + 1])
    # Pair i - 1 is never modified here, so its contribution is constant.
    previous_similarity = subtitle_pairs[i - 1].similarity

    best_one = None
    best_one_next = None
    diff_so_far = total_diff

    # An empty word list simply means there is nothing to move.
    while this_pair.words:
        # pop() takes the element at the last position; remove(words[-1])
        # would delete the first *equal* element and could scramble the order.
        next_one.words.insert(0, this_pair.words.pop())
        this_pair.similarity = get_similarity(this_pair.caption,
                                              this_pair.words)[1]
        next_one.similarity = get_similarity(next_one.caption,
                                             next_one.words)[1]
        new_diff = previous_similarity + this_pair.similarity + next_one.similarity
        if new_diff >= diff_so_far:
            break
        # Snapshot the improvement, the working copies keep being mutated.
        best_one = copy.deepcopy(this_pair)
        best_one_next = copy.deepcopy(next_one)
        diff_so_far = new_diff

    if best_one is None:
        return subtitle_pairs[i], subtitle_pairs[i + 1], total_diff
    return best_one, best_one_next, diff_so_far
def get_ids_from_property_value(data: dict, property_name: str, property_value: str, fix_data_delegate: Callable = None, match_exact: bool = False) -> list:
    """Return the ids of all entries whose property matches the given value.

    Both the searched value and every entry's property are passed through
    ``fix_data_delegate`` before they are compared.

    Args:
        data: Mapping of ``{id: content}`` where ``content`` is a dict that
            may hold ``property_name``.
        property_name: Key to look up inside every entry.
        property_value: Value to search for.
        fix_data_delegate: Callable used to normalise values before
            comparison. Falls back to ``_fix_property_value`` when omitted.
        match_exact: When true, only entries whose normalised property equals
            the normalised search value are returned. Otherwise every entry
            whose normalised property contains it is returned.

    Returns:
        A list of ids. For non-exact matches the list is ordered by the score
        of ``util.get_similarity`` (highest first) and, for equal scores, by
        the normalised property value. Empty when any required argument is
        empty.
    """
    # data structure: {id: content}
    # fixed_data structure: {id: fixed property value}
    if not data or not property_name or not property_value:
        print(
            '- get_ids_from_property_value: invalid data or property info. Return empty list.'
        )
        return []

    if not fix_data_delegate:
        fix_data_delegate = _fix_property_value

    fixed_value = fix_data_delegate(property_value)
    # Entries without the property (or with an empty one) are skipped instead
    # of raising KeyError.
    fixed_data = {
        entry_id: fix_data_delegate(entry_data[property_name])
        for entry_id, entry_data in data.items()
        if entry_data.get(property_name)
    }

    if match_exact:
        # Compare normalised against normalised; comparing with the raw
        # property_value misses every entry the delegate actually changes.
        return [
            entry_id for entry_id, entry_property in fixed_data.items()
            if entry_property == fixed_value
        ]

    # A substring test already covers the "starts with" case.
    matches = [
        (util.get_similarity(entry_property, fixed_value), entry_property, entry_id)
        for entry_id, entry_property in fixed_data.items()
        if fixed_value in entry_property
    ]
    # Two stable sorts: secondary key first, then the primary key descending.
    matches.sort(key=lambda match: match[1])
    matches.sort(key=lambda match: match[0], reverse=True)
    return [entry_id for _, _, entry_id in matches]
def process_subtitles(file_name: str, use_levishtein=False) -> List[SubtitlePairWords]:
    """Pair every caption of a WebVTT file with words from a JSON transcript.

    Reads ``data/{file_name}.vtt`` and ``data/{file_name}.json``. The words
    of all ``turns`` in the JSON sections of type ``"speech"`` are flattened
    into one sequence and consumed caption by caption: words are taken while
    ``is_word_in_caption`` accepts them, then ``try_more_or_less_words`` may
    adjust the selection.

    Args:
        file_name: Base name (without extension) of both input files.
        use_levishtein: Score with the second value of ``get_similarity``
            instead of the first one.

    Returns:
        One ``SubtitlePairWords`` per caption, in caption order.
    """
    subtitles = webvtt.read(f'data/{file_name}.vtt')
    with open(f'data/{file_name}.json', encoding='utf-8',
              errors='ignore') as fh:
        transcript = json.load(fh)

    # Flatten sections -> turns -> words into a single word sequence.
    words = []
    for section in transcript['sections']:
        if section['type'] != "speech" or "turns" not in section:
            continue
        for turn in section['turns']:
            words.extend(turn['words'])

    result: List[SubtitlePairWords] = []
    cursor = 0
    word_count = len(words)
    for caption in subtitles.captions:
        selected: List[Dict] = []
        while cursor < word_count and is_word_in_caption(caption, words[cursor]):
            selected.append(words[cursor])
            cursor += 1

        cos_similarity, lev_distance = get_similarity(caption, selected)
        if use_levishtein:
            score = lev_distance
        else:
            score = cos_similarity

        # Only refine the selection while there are words left to hand out.
        if cursor < word_count:
            cursor, score, selected = try_more_or_less_words(
                caption, cursor, score, words, selected, use_levishtein)

        result.append(SubtitlePairWords(selected, caption, score))
    return result
def moving_window(file_name):
    """Shift words between neighbouring subtitle pairs to lower the window score.

    The pairs come from ``process_subtitles(file_name, use_levishtein=True)``.
    For every pair whose ``similarity`` is at least 2, one word is tentatively
    moved across the border with the following pair, in both directions. The
    direction whose window score (sum of ``similarity`` values, lower is
    better) is strictly lower than both the unmodified window and the other
    direction replaces pairs ``i`` and ``i + 1``. When both directions score
    the same, nothing is changed.

    Pair 0 has no predecessor and is handled inline with a two-pair window;
    all other pairs are delegated to ``check_forward`` / ``check_backwards``,
    which score the window ``[i - 1, i, i + 1]``.

    Returns:
        The list of pairs, with improved pairs replaced in place.
    """
    pairs = process_subtitles(file_name, use_levishtein=True)
    window_size = 3
    half_window = window_size // 2
    for i, pair in enumerate(pairs):
        # The baseline must cover exactly the pairs the candidates are scored
        # on: [0, 1] for the first pair and [i - 1, i, i + 1] otherwise.
        # Using `<=` here gave pair 1 a two-pair baseline that was compared
        # against three-pair candidates.
        if i < half_window:
            start = i
            end = i + half_window + 1
        elif i + half_window >= len(pairs):
            start = i - window_size
            end = i
        else:
            start = i - half_window
            end = i + half_window + 1
        total_diff = sum(x.similarity for x in pairs[start:end])

        if pair.similarity < 2:
            continue
        if i == len(pairs) - 1:
            break
        if not pair.words or not pairs[i + 1].words:
            continue

        if i == 0:
            # Forward: take the first word of the next pair.
            forward_this_pair = copy.deepcopy(pair)
            forward_next_one = copy.deepcopy(pairs[i + 1])
            forward_this_pair.words.append(forward_next_one.words.pop(0))
            forward_this_pair.similarity = get_similarity(
                forward_this_pair.caption, forward_this_pair.words)[1]
            forward_next_one.similarity = get_similarity(
                forward_next_one.caption, forward_next_one.words)[1]
            forward_new_diff = (forward_this_pair.similarity +
                                forward_next_one.similarity)

            # Backwards: hand the last word over to the next pair. pop()
            # removes by position; remove(words[-1]) would delete the first
            # equal word instead.
            backwards_this_pair = copy.deepcopy(pair)
            backwards_next_one = copy.deepcopy(pairs[i + 1])
            backwards_next_one.words.insert(0, backwards_this_pair.words.pop())
            backwards_this_pair.similarity = get_similarity(
                backwards_this_pair.caption, backwards_this_pair.words)[1]
            backwards_next_one.similarity = get_similarity(
                backwards_next_one.caption, backwards_next_one.words)[1]
            backwards_new_diff = (backwards_this_pair.similarity +
                                  backwards_next_one.similarity)
        else:
            forward_this_pair, forward_next_one, forward_new_diff = check_forward(
                i, pairs, total_diff)
            backwards_this_pair, backwards_next_one, backwards_new_diff = check_backwards(
                i, pairs, total_diff)

        # The two conditions are mutually exclusive.
        if backwards_new_diff < total_diff and backwards_new_diff < forward_new_diff:
            pairs[i] = backwards_this_pair
            pairs[i + 1] = backwards_next_one
        elif forward_new_diff < total_diff and forward_new_diff < backwards_new_diff:
            pairs[i] = forward_this_pair
            pairs[i + 1] = forward_next_one
    return pairs
def moving_window(file_name):
    """Pull words forward from the following subtitle pair while that lowers the score.

    The pairs come from ``process_subtitles(file_name, True)``. For every
    pair whose ``similarity`` is at least 2, the first word of the following
    pair is tentatively appended to it and both pairs are re-scored with the
    second value returned by ``get_similarity``. A change is kept only when
    the sum of ``similarity`` over a three-pair window strictly drops.

    Pair 0 has no predecessor, so its window is ``[0, 1, 2]`` and a single
    word is tried. For every other pair the window is ``[i - 1, i, i + 1]``
    and words keep being moved while the score keeps dropping.

    Returns:
        The list of pairs, with improved pairs replaced in place.
    """

    def shift_first_word_forward(this_pair, next_one):
        # Move one word across the border and re-score both pairs.
        this_pair.words.append(next_one.words.pop(0))
        this_pair.similarity = get_similarity(this_pair.caption,
                                              this_pair.words)[1]
        next_one.similarity = get_similarity(next_one.caption,
                                             next_one.words)[1]

    pairs = process_subtitles(file_name, True)
    window_size = 3
    half_window = window_size // 2
    for i, pair in enumerate(pairs):
        # The baseline must cover the same pairs the candidate is scored on.
        # Using `<=` here gave pair 1 the baseline [1, 2, 3] although its
        # candidate window is [0, 1, 2].
        if i < half_window:
            start = i
            end = i + window_size
        elif i + half_window >= len(pairs):
            start = i - window_size
            end = i
        else:
            start = i - half_window
            end = i + half_window + 1
        total_diff = sum(x.similarity for x in pairs[start:end])

        if pair.similarity < 2:
            continue
        if i == len(pairs) - 1:
            break
        if not pair.words or not pairs[i + 1].words:
            continue

        this_pair = copy.deepcopy(pair)
        next_one = copy.deepcopy(pairs[i + 1])
        shift_first_word_forward(this_pair, next_one)

        if i == 0:
            # Slice instead of pairs[i + 2]: with only two pairs there is no
            # third one and indexing would raise IndexError.
            trailing = sum(x.similarity for x in pairs[i + 2:i + 3])
            new_diff = this_pair.similarity + next_one.similarity + trailing
            if new_diff < total_diff:
                pairs[i] = this_pair
                pairs[i + 1] = next_one
            continue

        # Pair i - 1 is not modified, so its contribution is constant.
        previous_similarity = pairs[i - 1].similarity
        new_diff = previous_similarity + this_pair.similarity + next_one.similarity
        best_one = None
        best_one_next = None
        while new_diff < total_diff:
            # Snapshot the improvement, the working copies keep being mutated.
            best_one = copy.deepcopy(this_pair)
            best_one_next = copy.deepcopy(next_one)
            total_diff = new_diff
            if not next_one.words:
                break
            shift_first_word_forward(this_pair, next_one)
            new_diff = previous_similarity + this_pair.similarity + next_one.similarity

        if best_one is not None:
            pairs[i] = best_one
            pairs[i + 1] = best_one_next
    return pairs