Python SequenceMatcher Examples

Programming Language: Python

Namespace/Package Name: cdifflib

Class/Type: SequenceMatcher

Examples at hotexamples.com: 3

Python SequenceMatcher - 3 examples found. These are the top rated real world Python examples of cdifflib.SequenceMatcher extracted from open source projects. You can rate examples to help us improve the quality of examples.

Frequently Used Methods

Show Hide

SequenceMatcher(3)

Frequently Used Methods

SequenceMatcher (3)

Example #1

Show file

File: most_similar.py Project: codeman38/bin-similarity

def main():
    parser = argparse.ArgumentParser(
        description='Finds the most similar files to a given file.')
    parser.add_argument('target', help='file for which to find matches')
    parser.add_argument('other', nargs='+', help='other file(s) to compare')
    parser.add_argument(
        '-n',
        '--num',
        metavar='N',
        type=int,
        default=0,
        help='use quick_ratio and keep only the N best guesses '
        'before calculating the true similarity ratios')
    parser.add_argument('-l',
                        '--longest',
                        action='store_true',
                        help='use longest match instead of ratio')
    parser.add_argument('-s',
                        '--scaled',
                        action='store_true',
                        help='scale ratios relative to file sizes (including '
                        'initial filtering by rough ratio)')
    parser.add_argument(
        '-m',
        '--maxbytes',
        metavar='N',
        type=int,
        default=-1,
        help='limit comparisons to the first N bytes from each file '
        '(default: entire file)')
    args = parser.parse_args()

    with open(args.target, 'rb') as fp:
        seq1 = fp.read(args.maxbytes)

    matcher = SequenceMatcher()
    matcher.set_seq2(list(seq1))

    if args.num > 0:
        estimates = []
        for fname in args.other:
            if fname == args.target:
                continue
            with open(fname, 'rb') as fp:
                seq2 = fp.read(args.maxbytes)
            matcher.set_seq1(list(seq2))
            ratio = matcher.quick_ratio()
            estimates.append((fname, ratio))
        estimates.sort(key=lambda x: x[1])
        estimates = estimates[-args.num:]
        nbest = [x[0] for x in estimates]
    else:
        nbest = args.other

    actuals = []
    for idx, fname in enumerate(nbest):
        print('{0}/{1}'.format(idx, len(nbest)), file=sys.stderr)
        with open(fname, 'rb') as fp:
            seq2 = fp.read(args.maxbytes)
        matcher.set_seq1(list(seq2))
        metric = matcher.ratio()
        if args.longest:
            metric = max(x.size for x in matcher.get_matching_blocks())
        else:
            metric = matcher.ratio()
            if args.scaled:
                metric *= (len(seq1) + len(seq2)) / 2
        actuals.append((fname, metric))
    actuals.sort(key=lambda x: x[1])
    for stat in actuals:
        print('{0}\t{1}'.format(stat[1], stat[0]))

Example #2

Show file

File: linter.py Project: NiallRees/sqlfluff

    def fix_string(self, verbosity=0):
        """Obtain the changes to a path as a string.

        We use the file_mask to do a safe merge, avoiding any templated
        sections. First we need to detect where there have been changes
        between the fixed and templated versions. The file mask is of
        the format: (raw_file, templated_file, fixed_file).

        We use difflib.SequenceMatcher.get_opcodes
        See: https://docs.python.org/3.7/library/difflib.html#difflib.SequenceMatcher.get_opcodes
        It returns a list of tuples ('equal|replace|delete|insert', ia1, ia2, ib1, ib2).

        """
        bencher = BenchIt()
        bencher("fix_string: start")

        # Do we have enough information to actually fix the file?
        if any(elem is None for elem in self.file_mask):
            verbosity_logger(
                "Insufficient information to fix file: {0}".format(
                    self.file_mask),
                verbosity=verbosity)
            return None, False

        verbosity_logger("Persisting file masks: {0}".format(self.file_mask),
                         verbosity=verbosity)
        # Compare Templated with Raw
        diff_templ = SequenceMatcher(autojunk=None,
                                     a=self.file_mask[0],
                                     b=self.file_mask[1])
        bencher("fix_string: Match 0&1")
        diff_templ_codes = diff_templ.get_opcodes()
        verbosity_logger("Templater diff codes: {0}".format(diff_templ_codes),
                         verbosity=verbosity)

        bencher("fix_string: Got Opcodes 0&1")
        # Compare Fixed with Templated
        diff_fix = SequenceMatcher(autojunk=None,
                                   a=self.file_mask[1],
                                   b=self.file_mask[2])
        bencher("fix_string: Matched 1&2")
        # diff_fix = SequenceMatcher(autojunk=None, a=self.file_mask[1][0], b=self.file_mask[2][0])
        diff_fix_codes = diff_fix.get_opcodes()
        verbosity_logger("Fixing diff codes: {0}".format(diff_fix_codes),
                         verbosity=verbosity)
        bencher("fix_string: Got Opcodes 1&2")

        # If diff_templ isn't the same then we should just keep the template. If there *was*
        # a fix in that space, then we should raise an issue
        # If it is the same, then we can apply fixes as expected.
        write_buff = ''
        fixed_block = None
        templ_block = None
        # index in raw, templ and fix
        idx = (0, 0, 0)
        loop_idx = 0
        bencher("fix_string: Loop Setup")
        while True:
            loop_idx += 1
            verbosity_logger("{0:04d}: Write Loop: idx:{1}, buff:{2!r}".format(
                loop_idx, idx, write_buff),
                             verbosity=verbosity)

            if templ_block is None:
                if diff_templ_codes:
                    templ_block = diff_templ_codes.pop(0)
                # We've exhausted the template. Have we exhausted the fixes?
                elif fixed_block is None and not diff_fix_codes:
                    # Yes - excellent. DONE
                    break
                # Deal with the case that we only have inserts left.
                elif all(elem[0] == 'insert' for elem in diff_fix_codes):
                    for fixed_block in diff_fix_codes:
                        write_buff += self.file_mask[2][
                            fixed_block[3]:fixed_block[4]]
                    break
                else:
                    raise NotImplementedError(
                        "Fix Block(s) left over! Don't know how to handle this! aeflf8wh"
                    )
            if fixed_block is None:
                if diff_fix_codes:
                    fixed_block = diff_fix_codes.pop(0)
                # One case is that we just consumed the last block of both, so check indexes
                # to see if we're at the end of the raw file.
                elif idx[0] >= len(self.file_mask[0]):
                    # Yep we're at the end
                    break
                else:
                    raise NotImplementedError(
                        "Unexpectedly depleted the fixes. Panic!")
            verbosity_logger("{0:04d}: Blocks: template:{1}, fix:{2}".format(
                loop_idx, templ_block, fixed_block),
                             verbosity=verbosity)

            if templ_block[0] == 'equal':
                if fixed_block[0] == 'equal':
                    # No templating, no fixes, go with middle and advance indexes
                    # Find out how far we can advance (we use the middle version because it's common)
                    if templ_block[4] == fixed_block[2]:
                        buff = self.file_mask[1][idx[1]:fixed_block[2]]
                        # consume both blocks
                        fixed_block = None
                        templ_block = None
                    elif templ_block[4] > fixed_block[2]:
                        buff = self.file_mask[1][idx[1]:fixed_block[2]]
                        # consume fixed block
                        fixed_block = None
                    elif templ_block[4] < fixed_block[2]:
                        buff = self.file_mask[1][idx[1]:templ_block[4]]
                        # consume templ block
                        templ_block = None
                    idx = (idx[0] + len(buff), idx[1] + len(buff),
                           idx[2] + len(buff))
                    write_buff += buff
                    continue
                elif fixed_block[0] == 'replace':
                    # Consider how to apply fixes.
                    # Can we implement the fix while staying in the equal segment?
                    if fixed_block[2] <= templ_block[4]:
                        # Yes! Write from the fixed version.
                        write_buff += self.file_mask[2][idx[2]:fixed_block[4]]
                        idx = (idx[0] + (fixed_block[2] - fixed_block[1]),
                               fixed_block[2], fixed_block[4])
                        # Consume the fixed block because we've written the whole thing.
                        fixed_block = None
                        continue
                    else:
                        raise NotImplementedError("DEF")
                elif fixed_block[0] == 'delete':
                    # We're deleting items, nothing to write but we can consume some
                    # blocks and advance some indexes.
                    idx = (idx[0] + (fixed_block[2] - fixed_block[1]),
                           fixed_block[2], fixed_block[4])
                    fixed_block = None
                elif fixed_block[0] == 'insert':
                    # We're inserting items, Write from the fix block, but only that index moves.
                    write_buff += self.file_mask[2][idx[2]:fixed_block[4]]
                    idx = (idx[0], idx[1], fixed_block[4])
                    fixed_block = None
                else:
                    raise NotImplementedError((
                        "Unexpected opcode {0} for fix block! Please report this "
                        "issue on github with the query and rules you're trying to "
                        "fix.").format(fixed_block[0]))
            elif templ_block[0] == 'replace':
                # We're in a templated section - we should write the templated version.
                # we should consume the whole replace block and then deal with where
                # we end up.
                buff = self.file_mask[0][idx[0]:templ_block[2]]
                new_templ_idx = templ_block[4]

                # Fast forward through fix blocks until we catch up. We're not implementing
                # any changes in a templated section.
                while True:
                    if fixed_block[2] > new_templ_idx >= fixed_block[1]:
                        # this block contains the end point
                        break
                    else:
                        # We're not at the end point yet, continue to fast forward through.
                        if fixed_block[0] != 'equal':
                            print("WARNING: Skipping edit block: {0}".format(
                                fixed_block))
                        if diff_fix_codes:
                            fixed_block = diff_fix_codes.pop(0)
                        else:
                            raise NotImplementedError(
                                "Unexpectedly depleted the fixes. Panic!")
                # Are we exactly on a join?
                if new_templ_idx == fixed_block[1]:
                    # GREAT - this makes things easy because we have an equality point already
                    idx = (templ_block[2], new_templ_idx, fixed_block[3])
                else:
                    if fixed_block[0] == 'equal':
                        # If it's in an equal block, we can use the same offset from the end.
                        idx = (templ_block[2], new_templ_idx, fixed_block[3] +
                               (new_templ_idx - fixed_block[1]))
                    else:
                        # TODO: We're trying to move through an templated section, but end up
                        # in a fixed section. We've lost track of indexes.
                        # We might need to panic if this happens...
                        print("UMMMMMM!")
                        print(new_templ_idx)
                        print(fixed_block)
                        raise NotImplementedError("ABC")
                write_buff += buff
                # consume template block
                templ_block = None
            elif templ_block[0] == 'delete':
                # The comparison, things that the templater has deleted
                # some characters. This is just a quirk of the differ.
                # In reality this means we just write these characters
                # and don't worry about advancing the other indexes.
                buff = self.file_mask[0][idx[0]:templ_block[2]]
                # consume templ block
                templ_block = None
                idx = (idx[0] + len(buff), idx[1], idx[2])
                write_buff += buff
            elif templ_block[0] == 'insert':
                # The templater has inserted something here. We don't need
                # to write anything here (because whatever we're looking at
                # was inserted by the templater), but we do need to keep
                # track of what happened to the rest of the section we're in.
                # If nothing was fixed then it's easy because the indices
                # will be the same. Otherwise... great question...

                # For now let's just deal with the happy case where the fixed
                # block is equal
                if fixed_block[0] == 'equal':
                    # Let's make sure we can consume enough to get through the
                    # templ block and not get to the end of the fix block.
                    if templ_block[4] <= fixed_block[2]:
                        insert_len = templ_block[4] - templ_block[3]
                        idx = (idx[0], idx[1] + insert_len,
                               idx[2] + insert_len)
                        # if things matched up perfectly, consume the fixed block
                        if templ_block[4] == fixed_block[2]:
                            fixed_block = None
                        # always consume templ block in this case
                        templ_block = None
                    else:
                        raise NotImplementedError((
                            "Unexpected scenario during insert opcode! Please report "
                            "this issue on github with the query and rules you're trying "
                            "to fix."))
                else:
                    raise NotImplementedError((
                        "Unexpected opcode {0} for fix block! Please report this "
                        "issue on github with the query and rules you're trying to "
                        "fix.").format(fixed_block[0]))
            else:
                raise NotImplementedError((
                    "Unexpected opcode {0} for template block! Please report this "
                    "issue on github with the query and rules you're trying to "
                    "fix.").format(templ_block[0]))

        bencher("fix_string: Fixing loop done")
        # The success metric here is whether anything ACTUALLY changed.
        return write_buff, write_buff != self.file_mask[0]

Example #3

Show file

File: normalizer.py Project: ksheth3939/SectionNormalization

    def normalize(self, section, row):
        """normalize a single (section, row) input

        Given a (Section, Row) input, returns (section_id, row_id, valid)
        where
            section_id = int or None
            row_id = int or None
            valid = True or False

        Arguments:
            section {[type]} -- [description]
            row {[type]} -- [description]
        """
        # Strip whitespace and preceding zeros and put into lowercase
        row = row.strip().lstrip('0').lower()
        section = section.strip().lower()

        # Initialize values to return
        section_id = -1
        row_id = -1
        valid = False

        # Find section number for section argument
        section_int = ''.join(dig for dig in section if dig.isdigit())

        # Create a normalized version of section for use
        builder = ''
        splitted = section.split(' ')
        if len(splitted) == 1:
            letters = ''
            digits = ''
            for s in splitted[0]:
                if s.isdigit():
                    digits += s
                else:
                    letters += s
            builder = letters + ' ' + digits

        else:
            for word in splitted:
                if word.isalpha():
                    builder += word[0]
                elif word.isdigit():
                    if len(builder) == 0:
                        builder = word
                    else:
                        builder += ' ' + word

        # Searches for corresponding section through section number and abbreviated section name
        try:
            section_num_keys = self.data[section_int].keys()
            most_similar = section_num_keys[0]

            # Uses string similarity package to probabilistically determine the correct section
            max_accuracy = SequenceMatcher(None, builder, most_similar).ratio()

            # Determine which section is most likely
            for code in section_num_keys[1:]:
                acc = SequenceMatcher(None, builder, code).ratio()
                if acc > max_accuracy:
                    most_similar = code
                    max_accuracy = acc

            section_id = self.data[section_int][most_similar][0]
        except:
            return '', '', False

        # Searches for corresponding row id through row name
        try:
            row_id = self.data[section_int][most_similar][1][row]
            return section_id, row_id, True
        except:
            return section_id, '', False