Example #1
    def recursiveLevenshtein(self, mode, new_fragments,
                             words_to_substitute_in_new, old_list, start_after,
                             end_before):

        if not words_to_substitute_in_new:
            return

        words_to_substitute_in_old = old_list[start_after + 1:end_before]
        if words_to_substitute_in_old:

            distance_matrix = Utils.dtw(words_to_substitute_in_new,
                                        words_to_substitute_in_old)
            index = self.findMostSimilarWordIndex(distance_matrix,
                                                  words_to_substitute_in_new,
                                                  words_to_substitute_in_old)
            index_to_append = start_after + 1 + index[1]
            fragment_to_append = self.getFragmentID(index_to_append)

            if index[0] != 0:  # at least one word remains to the left of the match
                left = words_to_substitute_in_new[:index[0]]
                self.recursiveLevenshtein(False, new_fragments, left, old_list,
                                          start_after, index_to_append)

            if mode:  # append to the right of the fragment's current content
                new_fragments[fragment_to_append] += words_to_substitute_in_new[index[0]] + " "
            else:  # prepend to the left of the fragment's current content
                new_fragments[fragment_to_append] = (words_to_substitute_in_new[index[0]] + " " +
                                                     new_fragments[fragment_to_append])

            if len(words_to_substitute_in_new) - 1 > index[0]:  # at least one word remains to the right of the match
                right = words_to_substitute_in_new[index[0] + 1:]
                self.recursiveLevenshtein(True, new_fragments, right, old_list,
                                          index_to_append, end_before)

        else:  # add the remaining new words
            fragment_to_append = self.getFragmentID(start_after + 1)
            for word in words_to_substitute_in_new:
                new_fragments[fragment_to_append] += word + " "
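
Neither example shows `Utils.dtw` itself; both only assume that it returns a matrix whose rows correspond to the new words and whose columns correspond to the old words, with a word-level Levenshtein distance in each cell (0 marking an exact match). Below is a minimal sketch under that assumption; the helper names `word_levenshtein` and `dtw` are chosen here purely for illustration and are not taken from the original project.

import numpy as np


def word_levenshtein(a, b):
    # classic dynamic-programming edit distance between two words
    previous_row = list(range(len(b) + 1))
    for i, char_a in enumerate(a, 1):
        current_row = [i]
        for j, char_b in enumerate(b, 1):
            current_row.append(min(previous_row[j] + 1,                        # deletion
                                   current_row[j - 1] + 1,                     # insertion
                                   previous_row[j - 1] + (char_a != char_b)))  # substitution
        previous_row = current_row
    return previous_row[-1]


def dtw(new_list, old_list):
    # rows -> new words, columns -> old words, matching the loops in the examples
    matrix = np.zeros((len(new_list), len(old_list)), dtype=int)
    for row, new_word in enumerate(new_list):
        for col, old_word in enumerate(old_list):
            matrix[row, col] = word_levenshtein(new_word, old_word)
    return matrix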
Example #2
    def updateWithDynamicTimeWarping(self, new_paragraph):

        old_paragraph = self.getParagraph()
        if old_paragraph != new_paragraph:  # only update if the paragraph actually changed
            if not self.fragments:  # safety check: inserting into an empty paragraph is not supported
                print("Warning! Inserting into empty paragraphs is not available.")
            elif not new_paragraph:  # empty input, so all fragments must be erased
                self.eraseFragments()
            elif len(self.fragments) == 1:  # only one fragment, so there is no problem (optimal case)
                self.fragments[0] = new_paragraph
            else:  # more than one fragment, so this is where the fun begins

                old_paragraph = self.createSplitableParagraph()
                old_list = old_paragraph.split(" ")  # init word lists
                new_list = new_paragraph.split(" ")
                # run DTW, based on the Levenshtein edit distance
                distance_matrix = Utils.dtw(new_list, old_list)
                self.calculateWordFreqs()

                new_fragments = [""] * len(self.fragments)

                depth_limit = 0  # limit to save time and to avoid false matches caused by repeated words
                actual_fragment = 0  # index of the current fragment, kept for later use
                last_assigned_from_old = -1  # index of the old word handled at the last match

                for col_index in range(len(distance_matrix[0])):  # col_index -> old words
                    for row_index in range(depth_limit, len(distance_matrix)):  # row_index -> new words

                        if distance_matrix[row_index, col_index] == 0:  # a Levenshtein distance of 0 means the words are identical

                            # get the original fragment of the matched word
                            actual_fragment = self.getFragmentID(col_index)
                            # add the new words that precede the match
                            self.alignAddedWords(new_fragments, new_list, old_list,
                                                 actual_fragment, last_assigned_from_old,
                                                 col_index, depth_limit, row_index)

                            new_fragments[actual_fragment] += old_list[col_index] + " "  # add the matched word

                            last_assigned_from_old = col_index
                            depth_limit = row_index + 1  # new row limit: rows before it could only yield "fake" matches caused by repeated words
                            break

                self.alignAddedWords(new_fragments, new_list, old_list,
                                     actual_fragment, last_assigned_from_old,
                                     col_index, depth_limit,
                                     len(new_list))  # add the remaining new words

                self.removeUnwantedSpaces(new_fragments)
                self.fragments = new_fragments
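
To make the matching loop in Example #2 concrete, here is a small, self-contained trace with made-up word lists. It reuses the hypothetical `dtw` helper sketched after Example #1: exact matches (distance 0) anchor old words to positions in the new text, and `depth_limit` only moves forward, so a repeated word cannot be matched out of order.

old_list = ["the", "cat", "sat", "on", "the", "mat"]
new_list = ["the", "big", "cat", "sat", "on", "the", "mat"]
distance_matrix = dtw(new_list, old_list)  # hypothetical helper sketched above

depth_limit = 0
for col_index in range(len(distance_matrix[0])):                 # old words
    for row_index in range(depth_limit, len(distance_matrix)):   # new words
        if distance_matrix[row_index, col_index] == 0:
            print(f"old[{col_index}]={old_list[col_index]!r} matched new[{row_index}]")
            depth_limit = row_index + 1  # never look behind an earlier match again
            break
# The second "the" (old index 4) is matched to new index 5, not back to new index 0,
# because depth_limit has already advanced past the first match; the unmatched new
# word "big" is what alignAddedWords would merge into the surrounding fragments.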