Ejemplo n.º 1
0
def get_ids_from_property_value(data: dict, property_name: str, property_value: str, fix_data_delegate: Callable = None, return_on_first: bool = True) -> list:
    """Return the ids of all entries whose normalised property contains the normalised search value.

    Args:
        data: Mapping ``{id: content}``; each ``content`` is indexed with ``property_name``.
        property_name: Key of the property to inspect on every entry.
        property_value: Value to search for.
        fix_data_delegate: Normaliser applied to the search value and to every
            entry's property before comparing. Falls back to the module-level
            ``fix_property_value`` when omitted.
        return_on_first: When true and there is at least one hit, only the hits
            with the highest score reported by ``util.get_similarity`` are returned.

    Returns:
        A list of matching ids in the iteration order of ``data``; empty when
        any of ``data``, ``property_name`` or ``property_value`` is falsy or
        when nothing matches.
    """
    # data structure: {id: content}
    # fixed_data structure: {id: normalised property value}
    if not data or not property_name or not property_value:
        print('- get_ids_from_property_value: invalid data or property info. Return empty list.')
        return []

    if not fix_data_delegate:
        fix_data_delegate = fix_property_value

    fixed_value = fix_data_delegate(property_value)
    # Entries with a falsy property are skipped before normalisation.
    fixed_data = {entry_id: fix_data_delegate(entry_data[property_name]) for entry_id, entry_data in data.items() if entry_data[property_name]}

    # A prefix match is always a substring match, so one containment test is
    # enough. Using a dict keeps every id once and in a deterministic order.
    matches = {entry_id: entry_property for entry_id, entry_property in fixed_data.items() if fixed_value in entry_property}

    if matches and return_on_first:
        # Reuse the already normalised values instead of running the delegate
        # a second time on every hit (assumes the delegate is deterministic).
        similarity_map = util.get_similarity(matches, fixed_value)
        max_similarity = max(similarity_map.values())
        return [key for key, value in similarity_map.items() if value == max_similarity]

    return list(matches)
Ejemplo n.º 2
0
def try_more_or_less_words(caption, i, similarity, words, words_in_captions,
                           use_levishtein):
    """Greedily grow or shrink the word list of a caption while its score improves.

    On every step two candidates are scored with ``get_similarity``: the
    current words plus ``words[i]``, and the current words without their last
    element. With ``use_levishtein`` the second value returned by
    ``get_similarity`` is used and lower is better; otherwise the first value
    is used and higher is better. The better candidate is adopted (ties go to
    the shorter list) and ``i`` is moved by one in the matching direction.
    The search stops when neither candidate beats the current score or when
    ``i`` reaches the end of ``words``.

    Returns:
        The tuple ``(i, similarity, words_in_captions)`` after the search.
    """
    while i < len(words):
        grown = words_in_captions + [words[i]]
        shrunk = words_in_captions[:-1]

        grown_cosine, grown_distance = get_similarity(caption, grown)
        shrunk_cosine, shrunk_distance = get_similarity(caption, shrunk)

        if use_levishtein:
            # Distance metric: a smaller value is an improvement.
            grown_score, shrunk_score = grown_distance, shrunk_distance
            any_improvement = (grown_score < similarity
                               or shrunk_score < similarity)
            prefer_grown = grown_score < shrunk_score
        else:
            # Similarity metric: a larger value is an improvement.
            grown_score, shrunk_score = grown_cosine, shrunk_cosine
            any_improvement = (grown_score > similarity
                               or shrunk_score > similarity)
            prefer_grown = grown_score > shrunk_score

        if not any_improvement:
            break

        if prefer_grown:
            i += 1
            words_in_captions = grown
            similarity = grown_score
        else:
            i -= 1
            words_in_captions = shrunk
            similarity = shrunk_score

    return i, similarity, words_in_captions
def _shift_last_word_to_next(this_pair, next_one):
    """Move the final word of ``this_pair`` to the front of ``next_one`` and rescore both."""
    # pop() takes the element at the last position; list.remove(words[-1])
    # would delete the first element that compares equal to it instead.
    next_one.words.insert(0, this_pair.words.pop())
    this_pair.similarity = get_similarity(this_pair.caption,
                                          this_pair.words)[1]
    next_one.similarity = get_similarity(next_one.caption, next_one.words)[1]


def check_backwards(i: int, subtitle_pairs: List[SubtitlePairWords],
                    total_diff: int):
    """Try moving trailing words of pair ``i`` to the start of pair ``i + 1``.

    Works on deep copies, so ``subtitle_pairs`` itself is never modified.
    Words are moved one at a time for as long as the summed ``similarity`` of
    the window ``[i - 1, i, i + 1]`` keeps decreasing (the score is index 1 of
    the ``get_similarity`` result, i.e. lower is better).

    Args:
        i: Index of the pair to take words from. The window reads
            ``subtitle_pairs[i - 1]``, so callers are expected to pass
            ``i >= 1``; pair ``i`` must have at least one word.
        subtitle_pairs: All caption/word pairs.
        total_diff: Current summed score of the window, used as the baseline.

    Returns:
        ``(pair_i, pair_i_plus_1, diff)``: the best copies found and their
        window score, or the original, uncopied pairs and ``total_diff`` when
        no move improves on the baseline.
    """
    this_pair = copy.deepcopy(subtitle_pairs[i])
    next_one = copy.deepcopy(subtitle_pairs[i + 1])

    def window_diff():
        window = [subtitle_pairs[i - 1], this_pair, next_one]
        return sum(x.similarity for x in window)

    _shift_last_word_to_next(this_pair, next_one)
    new_diff = window_diff()

    if new_diff < total_diff:
        diff_so_far = total_diff

        # The first iteration always runs because new_diff < diff_so_far here,
        # so best_one / best_one_next are always bound before the return.
        while new_diff < diff_so_far:
            best_one = copy.deepcopy(this_pair)
            best_one_next = copy.deepcopy(next_one)
            diff_so_far = new_diff

            if len(this_pair.words) == 0:
                # Nothing left to move.
                break

            _shift_last_word_to_next(this_pair, next_one)
            new_diff = window_diff()
        return best_one, best_one_next, diff_so_far
    return subtitle_pairs[i], subtitle_pairs[i + 1], total_diff
Ejemplo n.º 4
0
def get_ids_from_property_value(data: dict,
                                property_name: str,
                                property_value: str,
                                fix_data_delegate: Callable = None,
                                match_exact: bool = False) -> list:
    """Look up entry ids by the value of one of their properties.

    Args:
        data: Mapping ``{id: content}``; each ``content`` is indexed with
            ``property_name``.
        property_name: Key of the property to inspect on every entry.
        property_value: Value to search for.
        fix_data_delegate: Normaliser applied to the search value and to every
            entry's property. Falls back to the module-level
            ``_fix_property_value`` when omitted.
        match_exact: When true, return only entries whose normalised property
            equals ``property_value``. Otherwise return every entry whose
            normalised property contains the normalised search value.

    Returns:
        A list of ids. For non-exact matching the ids are ordered by the score
        from ``util.get_similarity`` (highest first) and, within the same
        score, by the normalised property value. Empty when any of ``data``,
        ``property_name`` or ``property_value`` is falsy.
    """
    # data structure: {id: content}
    # fixed_data structure: {id: normalised property value}
    if not data or not property_name or not property_value:
        print(
            '- get_ids_from_property_value: invalid data or property info. Return empty list.'
        )
        return []

    if not fix_data_delegate:
        fix_data_delegate = _fix_property_value

    fixed_value = fix_data_delegate(property_value)
    # Entries with a falsy property are skipped before normalisation.
    fixed_data = {
        entry_id: fix_data_delegate(entry_data[property_name])
        for entry_id, entry_data in data.items() if entry_data[property_name]
    }

    if match_exact:
        # NOTE(review): the normalised entry values are compared with the raw
        # property_value, not with fixed_value. Kept as is because callers may
        # rely on it; confirm whether fixed_value was intended.
        return [
            key for key, value in fixed_data.items() if value == property_value
        ]

    # A prefix match is always a substring match, so one containment test
    # covers both of the original conditions.
    matches = [(util.get_similarity(entry_property, fixed_value),
                entry_property, entry_id)
               for entry_id, entry_property in fixed_data.items()
               if fixed_value in entry_property]

    # Two stable sorts: by property value first, then by score descending, so
    # entries sharing a score stay ordered by their property value.
    matches.sort(key=lambda match: match[1])
    matches.sort(key=lambda match: match[0], reverse=True)

    return [entry_id for _, _, entry_id in matches]
Ejemplo n.º 5
0
def process_subtitles(file_name: str,
                      use_levishtein=False) -> List[SubtitlePairWords]:
    """Pair every caption of a WebVTT file with words from a JSON transcript.

    Reads ``data/<file_name>.vtt`` and ``data/<file_name>.json``. The words of
    all turns of all ``"speech"`` sections are flattened into one list and
    consumed in order: each caption first takes the words accepted by
    ``is_word_in_caption`` and is then refined by ``try_more_or_less_words``
    unless the word list is already exhausted.

    Args:
        file_name: Base name (no extension) of both input files.
        use_levishtein: Score pairs with the second value returned by
            ``get_similarity`` instead of the first one.

    Returns:
        One ``SubtitlePairWords`` per caption, in caption order.
    """
    subtitles = webvtt.read(f'data/{file_name}.vtt')

    with open(f'data/{file_name}.json', encoding='utf-8',
              errors='ignore') as json_file:
        transcript = json.load(json_file)

    # Flatten sections -> turns -> words, keeping only speech sections that
    # actually carry turns.
    words = [
        word
        for section in transcript['sections']
        if section['type'] == "speech" and "turns" in section
        for turn in section['turns']
        for word in turn['words']
    ]

    aligned = []
    cursor = 0  # index of the next transcript word not yet assigned

    for caption in subtitles.captions:
        caption_words = []

        while cursor < len(words) and is_word_in_caption(
                caption, words[cursor]):
            caption_words.append(words[cursor])
            cursor += 1

        cosine_score, distance_score = get_similarity(caption, caption_words)
        if use_levishtein:
            score = distance_score
        else:
            score = cosine_score

        if cursor < len(words):
            cursor, score, caption_words = try_more_or_less_words(
                caption, cursor, score, words, caption_words, use_levishtein)

        aligned.append(SubtitlePairWords(caption_words, caption, score))

    return aligned
def moving_window(file_name):
    """Re-balance words between neighbouring captions to lower the total distance.

    Pairs are built with ``process_subtitles(..., use_levishtein=True)``, so
    each ``similarity`` is a distance and lower is better. For every pair with
    a distance of at least 2, one candidate moves words forward (from the next
    pair into this one) and another moves them backwards (from this pair into
    the next one). The strictly better candidate replaces both pairs if it
    also beats the current window score.

    Args:
        file_name: Base name of the subtitle/transcript files, forwarded to
            ``process_subtitles``.

    Returns:
        The list of pairs, updated in place.
    """
    pairs = process_subtitles(file_name, use_levishtein=True)
    window_size = 3
    half_window = window_size // 2
    last_index = len(pairs) - 1

    def rescore(candidate):
        candidate.similarity = get_similarity(candidate.caption,
                                              candidate.words)[1]

    for i, pair in enumerate(pairs):
        if pair.similarity < 2:
            continue

        if i == last_index:
            # The last pair has no successor to exchange words with.
            break

        if len(pair.words) == 0 or len(pairs[i + 1].words) == 0:
            continue

        # The baseline must cover exactly the pairs the candidates are scored
        # on: [i, i + 1] for the first pair and [i - 1, i, i + 1] otherwise.
        # The old bounds used [i, i + 1] for i == 1 as well, so a two-pair
        # baseline was compared with three-pair candidate sums.
        window_start = max(i - half_window, 0)
        windowed_array = pairs[window_start:i + half_window + 1]
        total_diff = sum(x.similarity for x in windowed_array)

        if i == 0:
            # No predecessor: try a single one-word move in each direction.
            forward_this_pair = copy.deepcopy(pair)
            forward_next_one = copy.deepcopy(pairs[i + 1])
            forward_this_pair.words.append(forward_next_one.words.pop(0))
            rescore(forward_this_pair)
            rescore(forward_next_one)
            forward_new_diff = sum(
                x.similarity for x in (forward_this_pair, forward_next_one))

            backwards_this_pair = copy.deepcopy(pair)
            backwards_next_one = copy.deepcopy(pairs[i + 1])
            # pop() removes the element at the last position;
            # remove(words[-1]) would delete the first equal element.
            backwards_next_one.words.insert(0,
                                            backwards_this_pair.words.pop())
            rescore(backwards_this_pair)
            rescore(backwards_next_one)
            backwards_new_diff = sum(
                x.similarity
                for x in (backwards_this_pair, backwards_next_one))
        else:
            # check_forward is defined elsewhere; presumably it mirrors
            # check_backwards for the opposite direction.
            forward_this_pair, forward_next_one, forward_new_diff = check_forward(
                i, pairs, total_diff)
            backwards_this_pair, backwards_next_one, backwards_new_diff = check_backwards(
                i, pairs, total_diff)

        # The two conditions are mutually exclusive; on a tie between both
        # directions nothing is changed.
        if backwards_new_diff < total_diff and backwards_new_diff < forward_new_diff:
            pairs[i] = backwards_this_pair
            pairs[i + 1] = backwards_next_one
        elif forward_new_diff < total_diff and forward_new_diff < backwards_new_diff:
            pairs[i] = forward_this_pair
            pairs[i + 1] = forward_next_one

    return pairs
Ejemplo n.º 7
0
def moving_window(file_name):
    """Pull words forward from the following caption while that lowers the window distance.

    Pairs are built with ``process_subtitles(file_name, True)``, so each
    ``similarity`` is a distance and lower is better. For every pair with a
    distance of at least 2, the first word of the next pair is appended to
    this pair. The move is kept if the summed distance of the surrounding
    three-pair window drops. For every pair but the first, the move is
    repeated for as long as the sum keeps dropping.

    Args:
        file_name: Base name of the subtitle/transcript files, forwarded to
            ``process_subtitles``.

    Returns:
        The list of pairs, updated in place.
    """
    pairs = process_subtitles(file_name, True)
    window_size = 3
    half_window = window_size // 2
    last_index = len(pairs) - 1

    for i, pair in enumerate(pairs):
        if pair.similarity < 2:
            continue

        if i == last_index:
            # The last pair has no successor to take words from.
            break

        if len(pair.words) == 0 or len(pairs[i + 1].words) == 0:
            continue

        # Window is [0, 1, 2] for the first pair and [i - 1, i, i + 1]
        # otherwise, the same pairs the candidate is scored on. The old
        # bounds used [1, 2, 3] as the baseline for i == 1.
        window_start = max(i - half_window, 0)
        window_end = window_start + window_size
        total_diff = sum(x.similarity for x in pairs[window_start:window_end])

        # Untouched neighbours completing the window. Slices are used so that
        # a list of only two pairs no longer raises IndexError on pairs[i + 2].
        before = pairs[window_start:i]
        after = pairs[i + 2:window_end]

        this_pair = copy.deepcopy(pair)
        next_one = copy.deepcopy(pairs[i + 1])

        def shift_and_score():
            this_pair.words.append(next_one.words.pop(0))
            this_pair.similarity = get_similarity(this_pair.caption,
                                                  this_pair.words)[1]
            next_one.similarity = get_similarity(next_one.caption,
                                                 next_one.words)[1]
            window = [*before, this_pair, next_one, *after]
            return sum(x.similarity for x in window)

        new_diff = shift_and_score()

        # As before, the first pair gets a single one-word move only.
        keep_shifting = i != 0
        best_one = None
        best_one_next = None

        while new_diff < total_diff:
            best_one = copy.deepcopy(this_pair)
            best_one_next = copy.deepcopy(next_one)
            total_diff = new_diff

            if not keep_shifting or len(next_one.words) == 0:
                break

            new_diff = shift_and_score()

        if best_one is not None:
            pairs[i] = best_one
            pairs[i + 1] = best_one_next
    return pairs