Code Example #1
# Note: the excerpt's imports are not shown; Match behaves like difflib's
# (a, b, size) namedtuple and IPSMatch adds a word-length field (see the
# sketch after this example).
def find_common_substrings(content, dict_term, partial_match_min_size, partial_match_thresh):
    """
    Scan content for substrings it has in common with dict_term.  For each possible common substring, only the first occurrence is found.
    """
    results = []
    len_content = len(content)
    len_term = len(dict_term)
    i = 0
    while i < len_content:
        match_start = -1
        matched_chars = 0
        # Ignore white space
        if content[i].isspace():
            i += 1
            continue
        match = None
        for j in range(len_term):
            char_match = (i + matched_chars < len_content and content[i + matched_chars] == dict_term[j])
            if char_match and match_start == -1:
                match_start = j
            elif match_start > -1 and not char_match:
                match = Match(i, match_start, j - match_start)
                break
            if char_match:
                matched_chars += 1
        # Check for match at the end
        if match is None and match_start > -1:
            match = Match(i, match_start, len_term - match_start)
        # Process content match
        if match is not None:
            # Ignore matches if they aren't big enough
            # No partial matches for small terms
            if len_term <= partial_match_min_size:
                if match.size >= len_term:
                    results.append(match)
            # If the term is larger, we can have content partial match
            elif match.size >= int(len_term * partial_match_thresh):
                results.append(match)
            i += match.size
        else:
            i += 1

    # Compute word length for matched substrings
    # The word is terminated by whitespace, or /, unless the character in question is also present in the dictionary term at the same location
    results_mod = []
    for res in results:
        start_idx = res.a
        start_idx_b = res.b
        while (start_idx > 0 and (content[start_idx - 1].isalpha() or content[start_idx - 1] == '_')) or (start_idx > 0 and start_idx_b > 0 and content[start_idx - 1] == dict_term[start_idx_b - 1]):
            start_idx -= 1
            start_idx_b -= 1
        end_idx = res.a
        end_idx_b = res.b
        while (end_idx < len_content and (content[end_idx].isalpha() or content[end_idx] == '_')) or (end_idx < len_content and end_idx_b < len_term and content[end_idx] == dict_term[end_idx_b]):
            end_idx += 1
            end_idx_b += 1
        content_word_length = end_idx - start_idx
        results_mod.append(IPSMatch(res.a, res.b, res.size, content_word_length))
    return results_mod
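
The Match and IPSMatch types are not defined in the excerpt. Below is a minimal usage sketch, assuming Match has difflib's (a, b, size) shape and IPSMatch is a project-specific namedtuple whose fourth field is the surrounding word length; both definitions are assumptions, not the project's actual code.

from collections import namedtuple

# Assumed definitions; the original excerpt does not show them.
Match = namedtuple('Match', 'a b size')  # same shape as difflib.Match
IPSMatch = namedtuple('IPSMatch', 'a b size content_word_length')

matches = find_common_substrings(
    content="call get_user_name(id) here",
    dict_term="get_user_name",
    partial_match_min_size=4,   # terms this short must match in full
    partial_match_thresh=0.5)   # longer terms may match >= 50% of their length
for m in matches:
    print(m.a, m.b, m.size, m.content_word_length)  # -> 5 0 13 13
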
Code Example #2
File: mismatch.py Project: shwina/still-magic
def align(options, path, included):
    '''
    Display the included fragment and the actual source side by side.
    '''
    actual = get_src_file(path)
    if is_simple_inclusion(options, included, actual):
        return

    included = included.rstrip('\n').split('\n')
    actual = actual.rstrip('\n').split('\n')
    matches = [Match(0, 0, 0)] + \
        SequenceMatcher(a=included, b=actual).get_matching_blocks()

    result = []
    diffs_found = False
    fmt = '{{0}}|{{1:{}}}|{{2}}'.format(max([len(x) for x in included]))
    for i in range(len(matches) - 1):
        diffs_found |= align_one(result, options, fmt, included, actual,
                                 matches[i], matches[i + 1])

    if options['names_only']:
        if diffs_found:
            print(path)
    elif diffs_found or options['verbose']:
        print('\n-- {}'.format(path))
        for r in result:
            print(r)
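
Prepending the zero-size sentinel Match(0, 0, 0) lets the code walk consecutive pairs of matching blocks, so the gap before the first real match is aligned as well. A self-contained sketch of that pattern, on hypothetical data:

from difflib import Match, SequenceMatcher

included = ['a', 'x', 'c']
actual = ['a', 'y', 'c']
blocks = [Match(0, 0, 0)] + SequenceMatcher(a=included, b=actual).get_matching_blocks()

# Each consecutive pair brackets one gap: lines prev.a+prev.size:cur.a of
# included differ from lines prev.b+prev.size:cur.b of actual.
for prev, cur in zip(blocks, blocks[1:]):
    gap_a = included[prev.a + prev.size:cur.a]
    gap_b = actual[prev.b + prev.size:cur.b]
    if gap_a or gap_b:
        print('changed:', gap_a, '->', gap_b)  # changed: ['x'] -> ['y']
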
Code Example #3
File: util.py Project: runt18/git-cinnabar
def _iter_diff_blocks(a, b):
    # Yield the non-matching spans between consecutive matching blocks,
    # as (a_start, a_end, b_start, b_end) tuples.
    m = SequenceMatcher(a=a, b=b, autojunk=False).get_matching_blocks()
    for start, end in zip(chain((Match(0, 0, 0),), m), m):
        if start.a + start.size != end.a or start.b + start.size != end.b:
            yield start.a + start.size, end.a, start.b + start.size, end.b
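
A quick check of what the generator yields; each tuple is (a_start, a_end, b_start, b_end) for one non-matching span (hypothetical inputs, with the imports the original file is assumed to have):

from difflib import Match, SequenceMatcher
from itertools import chain

print(list(_iter_diff_blocks('abcdef', 'abXYef')))  # -> [(2, 4, 2, 4)]
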
Code Example #4
    def __calculate_blocks(self, rev, min_threshold=0.6):
        """ Calculate, line by line, which lines have changed based on min_threshold, then
        check for within-line changes (char by char) and return a list of matched code blocks
        that have remained the same.

        :param rev: text of new revision
        :type rev: str
        :param min_threshold: a percentage of similarity for line by line comparisons
        :type min_threshold: float bounded between 0.0 and 1.0

        :return: matched code blocks that have remained the same, as a list of tuples
        :rtype: list [(start position in original text, start pos in new text, length), (), ()...]
        """
        matches = []  # contains tuples of matched parts
        original = self.code_text.splitlines(True)  # original text with lines split (retains \n as a char)
        new = rev.splitlines(True)  # the new submitted text
        found = True

        # Calculate start positions for each line in original strings
        char_start_original = []
        char_start_new = []
        additive = 0
        for x in range(0, len(original)):
            char_start_original.append(additive)
            additive += len(original[x])
        additive = 0
        for y in range(0, len(new)):
            char_start_new.append(additive)
            additive += len(new[y])

        # Match identification code below
        # worst-case: O(x*y), or O(x^2) if x and y are of equal length and every line changed
        # best-case: O(x)

        # Construct a hash-multiset (Counter) to speed up the matching below
        cnt = Counter()
        for word in new:
            cnt[word] += 1

        diffs = []
        new_tmp = new[:]  # working copy for index lookups; matched entries are blanked
        y_list = list(range(len(new)))  # indices of lines in new that are still unmatched
        counter = 0
        for x in range(0, len(original)):
            diffs.append([])
            if cnt[original[x]] > 0:  # the line exists verbatim in new: O(1) lookup skips many comparisons
                y = y_list.index(new_tmp.index(original[x]))  # position of that line's index within y_list
                # Adding a matched record that simulates what difflib would find if it were to compare the two strings
                # Basically the whole new line matches the old, difflib always has a zero size match as the last
                # element.
                diffs[x].append([
                    x, y_list[y], 1.0,
                    [
                        Match(a=0, b=0, size=len(original[x])),
                        Match(a=len(original[x]), b=len(original[x]), size=0)
                    ]
                ])
                del y_list[y]  # remove the matched line from further consideration
                # Blank (rather than delete) the entry in new_tmp so that later .index()
                # lookups still return original positions; deleting would shift the indices.
                new_tmp[new_tmp.index(original[x])] = 0
                cnt[original[x]] -= 1  # decrement the multiset
            else:  # no duplicate so we have to compare the item with the rest of the list (code modified or removed)
                for y in range(0, len(y_list)):
                    counter += 1
                    line_diff_result = difflib.SequenceMatcher(None,
                                                               original[x],
                                                               new[y_list[y]],
                                                               autojunk=False)
                    # Sanity check below, the hash-multiset should have removed all identical lines
                    if line_diff_result.ratio() == 1:
                        diffs[x].append([
                            x, y_list[y],
                            line_diff_result.ratio(),
                            line_diff_result.get_matching_blocks()
                        ])
                        del y_list[y]
                        break
                    else:
                        diffs[x].append([
                            x, y_list[y],
                            line_diff_result.ratio(),
                            line_diff_result.get_matching_blocks()
                        ])
        print("Total comparisons: " + str(counter))  # to verify that the optimizations avoid O(n^2)
        del cnt
        del new_tmp

        # Iterate through all the calculated diffs and figure out the best matches.
        # The loop keeps going until all possible matches are found (it could be rewritten as a recursive function).
        # to_delete acts as a deletion marker so that, once a match is appended, the matched
        # lines are eliminated from further consideration.
        to_delete = -999  # initialize with a sentinel value
        while found is True:
            found = False
            max_match = [0, 0, 0, 0]  # will hold the best match between a line_x and a line_y
            for x in range(0, len(diffs)):
                for y in range(0, len(diffs[x])):
                    if diffs[x][y][1] == to_delete:
                        diffs[x][y] = [0, 0, 0, 0]
                    elif diffs[x][y][2] > min_threshold and max_match[2] < diffs[x][y][2]:
                        max_match = [
                            diffs[x][y][0], diffs[x][y][1], diffs[x][y][2],
                            diffs[x][y][3], x
                        ]
            if max_match[2] != 0:  # we found a line that looks similar enough and was likely moved
                found = True
                for m in max_match[3]:
                    if m[2] != 0:  # make sure that the matched content matches at least 1 char (sanity check)
                        matches.append([
                            char_start_original[max_match[0]] + m[0],
                            char_start_new[max_match[1]] + m[1], m[2]
                        ])
                del diffs[max_match[4]]
                to_delete = max_match[1]
        return matches
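
The Counter multiset is what keeps the common case cheap: lines that survive unchanged are matched with an O(1) lookup instead of being diffed against every candidate. A standalone sketch of just that idea, outside the class and with hypothetical inputs:

import difflib
from collections import Counter

original = ['def f():\n', '    return 1\n', '\n']
new = ['def f():\n', '    return 2\n', '\n']

cnt = Counter(new)
comparisons = 0
for line in original:
    if cnt[line] > 0:        # exact line survives: O(1) lookup, no diffing
        cnt[line] -= 1
        continue
    for candidate in new:    # only modified/removed lines pay the pairwise cost
        comparisons += 1
        difflib.SequenceMatcher(None, line, candidate, autojunk=False).ratio()

print('comparisons:', comparisons)  # 3 instead of 9 for this input
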
Code Example #5
    def add_match(self, page, match):
        # l('ADDING ' + str(match))
        info = RangeMatch(self, page, match)
        # l(info)
        pageno = page.info['number']
        pagenoval = rnum_to_int(pageno)
        if pagenoval == 0 and len(pageno) > 0:
            pagenoval = int(pageno)

        matchint = Interval.between(match.b, match.b + match.size)

        overlaps = [m for m in self.matches if m & matchint]

        # if a nearby number matches the page number, flag it and boost the score
        if pageno:
            nearnos = self.find_nearnos(match)
            # l("GREPME near is [%s] pagenoval %s" % (nearnos, pagenoval))
            # for no in nearnos[1], nearnos[0]:
            if nearnos is None:  # XXX SHOULDN'T BE NEEDED!
                nearnos = [None, None]  # keep the two-slot shape expected by the loop below
            for no in nearnos[1], nearnos[0]:
                # for no in nearnos:
                if no is not None:
                    # l(no.val)
                    if no.val == pagenoval:
                        info.notes += 'nearno: %s' % pageno
                        # l("GOODMATCH tc %s, %s %s" % (self.page.index, pageno, self.score))
                        self.score += 1
                        info.nearno = no.word_index
                        break
                    if no.val > pagenoval - 10 and match.a < 10:
                        self.score += .01
                        break

        # cases: no overlap
        if len(overlaps) == 0:
            self.matchinfo[matchint] = info
            self.matches = self.matches + IntervalSet([matchint])
        else:
            start = match.b
            end = match.b + match.size
            for i in overlaps:
                oinfo = self.matchinfo[i]
                ostart = oinfo.match.b
                oend = oinfo.match.b + oinfo.match.size
                scootback = 0
                if ostart < start:
                    scootback = start - ostart
                    start = ostart
                if oend > end:
                    end = oend
                info.match = Match(info.match.a - scootback, start,
                                   end - start)
                if oinfo.nearno != -1:
                    # assert(info.nearno == -1)
                    info.nearno = oinfo.nearno
                # info.score += oinfo.score
                # info.pageno = oinfo.pageno
                # info.notes = info.notes + ' ' + info.notes
                # for opageno in oinfo.pagenos:
                #     opagecount = oinfo.pagenos[opageno]
                #     if opageno in info.pagenos:
                #         info.pagenos[opageno] += opagecount
                #     else:
                #         info.pagenos[opageno] = opagecount
            self.matches += IntervalSet([matchint])
            (new_i, ) = [m for m in self.matches if m & matchint]
            self.matchinfo[new_i] = info
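
The overlap handling above leans on the third-party Interval/IntervalSet types, but the coalescing arithmetic itself is small. A simplified re-implementation of just that step, using difflib's Match and a plain list in place of the interval machinery (an illustration, not the project's code):

from difflib import Match

def merge_overlaps(match, overlapping):
    # Widen match in b-space to cover the overlapping matches, scooting
    # its a-offset back whenever an overlap starts earlier in b.
    start, end = match.b, match.b + match.size
    for o in overlapping:
        scootback = 0
        if o.b < start:
            scootback = start - o.b
            start = o.b
        end = max(end, o.b + o.size)
        match = Match(match.a - scootback, start, end - start)
    return match

print(merge_overlaps(Match(a=10, b=20, size=5), [Match(a=7, b=17, size=5)]))
# -> Match(a=7, b=17, size=8)
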
Code Example #6
File: difflib2.py Project: pombredanne/vhash
    def find_longest_match(self, alo, ahi, blo, bhi):
        """Find longest matching block in a[alo:ahi] and b[blo:bhi].

        If isjunk is not defined:

        Return (i,j,k) such that a[i:i+k] is equal to b[j:j+k], where
            alo <= i <= i+k <= ahi
            blo <= j <= j+k <= bhi
        and for all (i',j',k') meeting those conditions,
            k >= k'
            i <= i'
            and if i == i', j <= j'

        In other words, of all maximal matching blocks, return one that
        starts earliest in a, and of all those maximal matching blocks that
        start earliest in a, return the one that starts earliest in b.

        >>> s = SequenceMatcher(None, " abcd", "abcd abcd")
        >>> s.find_longest_match(0, 5, 0, 9)
        Match(a=0, b=4, size=5)

        If isjunk is defined, first the longest matching block is
        determined as above, but with the additional restriction that no
        junk element appears in the block.  Then that block is extended as
        far as possible by matching (only) junk elements on both sides.  So
        the resulting block never matches on junk except as identical junk
        happens to be adjacent to an "interesting" match.

        Here's the same example as before, but considering blanks to be
        junk.  That prevents " abcd" from matching the " abcd" at the tail
        end of the second sequence directly.  Instead only the "abcd" can
        match, and matches the leftmost "abcd" in the second sequence:

        >>> s = SequenceMatcher(lambda x: x==" ", " abcd", "abcd abcd")
        >>> s.find_longest_match(0, 5, 0, 9)
        Match(a=1, b=0, size=4)

        If no blocks match, return (alo, blo, 0).

        >>> s = SequenceMatcher(None, "ab", "c")
        >>> s.find_longest_match(0, 2, 0, 1)
        Match(a=0, b=0, size=0)
        """

        # CAUTION:  stripping common prefix or suffix would be incorrect.
        # E.g.,
        #    ab
        #    acab
        # Longest matching block is "ab", but if common prefix is
        # stripped, it's "a" (tied with "b").  UNIX(tm) diff does so
        # strip, so ends up claiming that ab is changed to acab by
        # inserting "ca" in the middle.  That's minimal but unintuitive:
        # "it's obvious" that someone inserted "ac" at the front.
        # Windiff ends up at the same place as diff, but by pairing up
        # the unique 'b's and then matching the first two 'a's.

        a, b, b2j, isbjunk = self.a, self.b, self.b2j, self.isbjunk
        besti, bestj, bestsize = alo, blo, 0
        # find longest junk-free match
        # during an iteration of the loop, j2len[j] = length of longest
        # junk-free match ending with a[i-1] and b[j]
        j2len = {}
        nothing = []
        for i in range(alo, ahi):
            # look at all instances of a[i] in b; note that because
            # b2j has no junk keys, the loop is skipped if a[i] is junk
            j2lenget = j2len.get
            newj2len = {}
            for j in b2j.get(a[i], nothing):
                # a[i] matches b[j]
                if j < blo:
                    continue
                if j >= bhi:
                    break
                k = newj2len[j] = j2lenget(j-1, 0) + 1
                if k > bestsize:
                    besti, bestj, bestsize = i-k+1, j-k+1, k
            j2len = newj2len

        # Extend the best by non-junk elements on each end.  In particular,
        # "popular" non-junk elements aren't in b2j, which greatly speeds
        # the inner loop above, but also means "the best" match so far
        # doesn't contain any junk *or* popular non-junk elements.
        while besti > alo and bestj > blo and \
              not isbjunk(b[bestj-1]) and \
              self.match_function(a[besti-1], b[bestj-1]):
            besti, bestj, bestsize = besti-1, bestj-1, bestsize+1
        while besti+bestsize < ahi and bestj+bestsize < bhi and \
              not isbjunk(b[bestj+bestsize]) and \
              self.match_function(a[besti+bestsize], b[bestj+bestsize]):
            bestsize += 1

        # Now that we have a wholly interesting match (albeit possibly
        # empty!), we may as well suck up the matching junk on each
        # side of it too.  Can't think of a good reason not to, and it
        # saves post-processing the (possibly considerable) expense of
        # figuring out what to do with it.  In the case of an empty
        # interesting match, this is clearly the right thing to do,
        # because no other kind of match is possible in the regions.
        while besti > alo and bestj > blo and \
              isbjunk(b[bestj-1]) and \
              self.match_function(a[besti-1], b[bestj-1]):
            besti, bestj, bestsize = besti-1, bestj-1, bestsize+1
        while besti+bestsize < ahi and bestj+bestsize < bhi and \
              isbjunk(b[bestj+bestsize]) and \
              self.match_function(a[besti+bestsize], b[bestj+bestsize]):
            bestsize += 1

        return Match(besti, bestj, bestsize)
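
This variant differs from CPython's difflib mainly in routing element comparisons through self.match_function instead of plain equality. With the stock SequenceMatcher the docstring's examples run as advertised:

from difflib import SequenceMatcher

s = SequenceMatcher(None, ' abcd', 'abcd abcd')
print(s.find_longest_match(0, 5, 0, 9))  # Match(a=0, b=4, size=5)

s = SequenceMatcher(lambda x: x == ' ', ' abcd', 'abcd abcd')
print(s.find_longest_match(0, 5, 0, 9))  # Match(a=1, b=0, size=4)
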
Code Example #7
File: search.py Project: zenokoller/sequence
def adapt_match(match: Match) -> Match:
    # lower and processing_offset are captured from the enclosing scope in the original file.
    return Match(a=match.a,
                 b=match.b + lower,
                 size=match.size + processing_offset)
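
lower and processing_offset come from the enclosing scope in search.py; the helper maps a match found on a trimmed window of b back into full-sequence coordinates. A hypothetical reconstruction of that context (the window bounds and offset below are invented for illustration):

from difflib import Match, SequenceMatcher

full_b = 'xxxxhelloyyyy'
lower, upper = 4, 9      # hypothetical window bounds into full_b
processing_offset = 0    # hypothetical size correction

def adapt_match(match: Match) -> Match:
    return Match(a=match.a, b=match.b + lower, size=match.size + processing_offset)

m = SequenceMatcher(a='hello', b=full_b[lower:upper]).find_longest_match(0, 5, 0, 5)
print(adapt_match(m))    # Match(a=0, b=4, size=5)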