def recursiveLevenshtein(self, mode, new_fragments, words_to_substitute_in_new,
                         old_list, start_after, end_before):
    if not words_to_substitute_in_new:
        return
    words_to_substitute_in_old = old_list[start_after + 1:end_before]
    if words_to_substitute_in_old:
        distance_matrix = Utils.dtw(words_to_substitute_in_new,
                                    words_to_substitute_in_old)
        index = self.findMostSimilarWordIndex(distance_matrix,
                                              words_to_substitute_in_new,
                                              words_to_substitute_in_old)
        index_to_append = start_after + 1 + index[1]
        fragment_to_append = self.getFragmentID(index_to_append)
        if index[0] != 0:  # there is at least one word to the left of the match
            left = words_to_substitute_in_new[:index[0]]
            self.recursiveLevenshtein(False, new_fragments, left, old_list,
                                      start_after, index_to_append)
        if mode:  # mode decides whether the matched word is appended or prepended
            new_fragments[fragment_to_append] += words_to_substitute_in_new[index[0]] + " "
        else:
            new_fragments[fragment_to_append] = (words_to_substitute_in_new[index[0]] + " "
                                                 + new_fragments[fragment_to_append])
        if len(words_to_substitute_in_new) - 1 > index[0]:  # there is at least one word to the right of the match
            right = words_to_substitute_in_new[index[0] + 1:]
            self.recursiveLevenshtein(True, new_fragments, right, old_list,
                                      index_to_append, end_before)
    else:  # no old words in this range, so add all the remaining new words
        fragment_to_append = self.getFragmentID(start_after + 1)
        for word in words_to_substitute_in_new:
            new_fragments[fragment_to_append] += word + " "
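# For reference, a minimal sketch of what Utils.dtw might compute, inferred from
# how it is used here: a matrix whose (i, j) entry is the word-level Levenshtein
# distance between the i-th new word and the j-th old word (so an entry of 0
# marks identical words). The names dtw_sketch and levenshtein_sketch are
# illustrative; the real Utils.dtw may be implemented differently.
import numpy as np

def dtw_sketch(new_words, old_words):
    """Pairwise Levenshtein matrix: rows are new words, columns are old words."""
    matrix = np.zeros((len(new_words), len(old_words)), dtype=int)
    for i, new_word in enumerate(new_words):
        for j, old_word in enumerate(old_words):
            matrix[i, j] = levenshtein_sketch(new_word, old_word)
    return matrix

def levenshtein_sketch(a, b):
    """Classic dynamic-programming edit distance between two strings."""
    previous = list(range(len(b) + 1))
    for i, char_a in enumerate(a, start=1):
        current = [i]
        for j, char_b in enumerate(b, start=1):
            cost = 0 if char_a == char_b else 1
            current.append(min(previous[j] + 1,          # deletion
                               current[j - 1] + 1,       # insertion
                               previous[j - 1] + cost))  # substitution
        previous = current
    return previous[-1]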
def updateWithDynamicTimeWarping(self, new_paragraph):
    old_paragraph = self.getParagraph()
    if old_paragraph != new_paragraph:  # if the text is unchanged, do nothing
        if not self.fragments:  # safety check to avoid errors on empty paragraphs
            print("Warning! Inserting into an empty paragraph is not available.")
        elif not new_paragraph:  # empty input: all fragments must be erased
            self.eraseFragments()
        elif len(self.fragments) == 1:  # a single fragment is no problem (optimal case)
            self.fragments[0] = new_paragraph
        else:  # more than one fragment, so this is where the fun begins
            old_paragraph = self.createSplitableParagraph()
            old_list = old_paragraph.split(" ")  # init word arrays
            new_list = new_paragraph.split(" ")
            # run dtw based on the edit distance (Levenshtein) algorithm
            distance_matrix = Utils.dtw(new_list, old_list)
            self.calculateWordFreqs()
            new_fragments = [""] * len(self.fragments)
            depth_limit = 0  # limit to save time and to avoid fake matches from repeated words
            actual_fragment = 0  # store the current fragment's index for further use
            last_assigned_from_old = -1  # points to the old word handled at the last match
            for col_index in range(len(distance_matrix[0])):  # col_index -> old words
                for row_index in range(depth_limit, len(distance_matrix)):  # row_index -> new words
                    if distance_matrix[row_index, col_index] == 0:  # a Levenshtein distance of 0 means the words are identical
                        # get the original place of the matched word
                        actual_fragment = self.getFragmentID(col_index)
                        # add the new words that precede the match
                        self.alignAddedWords(new_fragments, new_list, old_list,
                                             actual_fragment, last_assigned_from_old,
                                             col_index, depth_limit, row_index)
                        new_fragments[actual_fragment] += old_list[col_index] + " "  # add the match itself
                        last_assigned_from_old = col_index
                        # set a new limit: below it the matrix can contain "fake"
                        # matches (they come from repeated words)
                        depth_limit = row_index + 1
                        break
            # add the remaining words after the last match
            self.alignAddedWords(new_fragments, new_list, old_list,
                                 actual_fragment, last_assigned_from_old,
                                 col_index, depth_limit, len(new_list))
            self.removeUnwantedSpaces(new_fragments)
            self.fragments = new_fragments
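# A plausible sketch of the getFragmentID helper used in both methods above,
# written as a standalone function for clarity. It assumes the old paragraph is
# the space-join of the fragments, so a word index in the joined word list can
# be mapped back to the fragment that holds it; the real helper in this class
# may work differently (e.g. with cached fragment boundaries).
def get_fragment_id_sketch(fragments, word_index):
    """Map a word index in the joined paragraph to the fragment containing it."""
    words_seen = 0
    for fragment_id, fragment in enumerate(fragments):
        words_seen += len(fragment.split(" "))
        if word_index < words_seen:
            return fragment_id
    return len(fragments) - 1  # index past the end: fall back to the last fragment

# Example: with fragments ["the quick", "brown fox"], word index 2 ("brown")
# falls into fragment 1.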