Ejemplo n.º 1
0
def test_cred_form(url, username, password, host, port):
    """Submit one username/password pair to the login form found at ``url``.

    The page source is captured before the form is submitted and again
    three seconds after; the two snapshots are compared and a similarity
    ratio below 0.8 is reported as a hit.

    Args:
        url: Address of the page that holds the login form.
        username: Value typed into the username field.
        password: Value typed into the password field.
        host: Forwarded to ``get_new_selenium_driver``.
        port: Forwarded to ``get_new_selenium_driver``.

    Returns:
        ``[{'ratio': ..., 'username': ..., 'password': ...}]`` when the
        similarity ratio is below 0.8, otherwise ``None``. ``None`` is also
        returned when any exception occurs; the exception is logged and not
        propagated.
    """
    # Bind the name before entering the try block: if creating the driver
    # fails, the finally clause must still be able to test it without
    # raising UnboundLocalError and hiding the original error.
    driver = None
    try:
        driver = get_new_selenium_driver(host, port)
        driver.get(url)
        initial_page = driver.page_source
        username_input, password_input, click_button = get_form_objects(driver)
        username_input.clear()
        username_input.send_keys(username)
        password_input.clear()
        password_input.send_keys(password)
        click_button.click()
        # Fixed wait before the page source is read again.
        sleep(3)
        # Compute the ratio once and reuse it for logging, the threshold
        # check and the result.
        ratio = CSequenceMatcher(None, initial_page,
                                 driver.page_source).ratio()
        logger.debug(f"{username}:{password} ratio: {ratio}")
        if ratio < 0.8:
            return [{
                'ratio': ratio,
                'username': username,
                'password': password
            }]
        return None
    except Exception as e:
        logger.error(e)
        return None
    finally:
        if driver is not None:
            driver.close()
Ejemplo n.º 2
0
    def _diff(cls, base, cmp, check_variant=True, label=None):
        """Diff ``base`` against ``cmp`` and return a flat list of segments.

        Each segment is a dict with ``line_no``, ``is_same``, the base text
        (under the key ``label['base']``, default ``'base'``), usually the
        cmp text (under ``label['cmp']``, default ``'cmp'``) and ``range``,
        the ``(start, end)`` offsets of the base text within its line.
        Line-break segments carry ``'\\n'`` as base text and no cmp key.

        :param base: reference text; its line breaks drive ``line_no``
        :param cmp: text compared against ``base``
        :param check_variant: when true, a one-character replacement for
            which ``is_variant`` returns true is flagged ``is_variant``
        :param label: optional dict overriding the ``'base'``/``'cmp'`` keys
        :return: list of segment dicts in document order
        """
        keys = {'base': 'base', 'cmp': 'cmp'}
        if label:
            keys.update(label)
        base_key, cmp_key = keys['base'], keys['cmp']

        segments = []
        current_line = 1
        matcher = CSequenceMatcher(None, base, cmp, autojunk=False)
        for op, a_lo, a_hi, b_lo, b_hi in matcher.get_opcodes():
            base_text, cmp_text = base[a_lo:a_hi], cmp[b_lo:b_hi]

            if '\n' not in base_text:
                # Single-line opcode: one segment, optionally flagged as a
                # variant-character substitution.
                entry = {
                    'line_no': current_line,
                    'is_same': op == 'equal',
                    base_key: base_text,
                    cmp_key: cmp_text
                }
                if (check_variant and len(base_text) == 1
                        and len(cmp_text) == 1 and base_text != cmp_text
                        and is_variant(base_text, cmp_text)):
                    entry['is_variant'] = True
                segments.append(entry)
                continue

            # The base side spans at least one line break: emit each piece
            # separately, followed by an explicit line-break segment.
            pieces = base_text.split('\n')
            last = len(pieces) - 1
            for pos, piece in enumerate(pieces):
                if piece != '':
                    segments.append({
                        'line_no': current_line,
                        'is_same': False,
                        base_key: piece,
                        cmp_key: cmp_text
                    })
                    # The cmp text is attached to the first non-empty piece
                    # only.
                    cmp_text = ''
                elif pos == last and cmp_text:
                    # Trailing empty piece with cmp text still unassigned.
                    segments.append({
                        'line_no': current_line,
                        'is_same': False,
                        base_key: piece,
                        cmp_key: cmp_text
                    })
                if pos < last:  # line break
                    segments.append({
                        'line_no': current_line,
                        'is_same': True,
                        base_key: '\n'
                    })
                    current_line += 1

        # Set the start/end offsets of every segment within its line.
        current_line, offset = 1, 0
        for entry in segments:
            if entry['line_no'] != current_line:  # moved on to the next line
                current_line += 1
                offset = 0
            stop = offset + len(entry[base_key])
            entry['range'] = (offset, stop)
            offset = stop

        return segments
Ejemplo n.º 3
0
 def find_books(self, txt):
     """Return every book as a tuple, best match for ``txt`` first.

     A book's score is the higher of the case-insensitive similarity
     ratios between ``txt`` and the book's name and between ``txt`` and
     the book's author. The score is used for ordering only; each
     returned tuple is ``('@' + owner name, book name, book author)``.
     """
     scored = []
     for book in self.books:
         query = txt.lower()
         by_name = CSequenceMatcher(None, query, book.name.lower()).ratio()
         by_author = CSequenceMatcher(None, query,
                                      book.author.lower()).ratio()
         owner = '@' + self.get_user(book.owner_id).name
         scored.append(
             (max(by_name, by_author), owner, book.name, book.author))
     # Highest score first; ties fall back to the remaining tuple fields.
     ranked = sorted(scored, reverse=True)
     return [entry[1:] for entry in ranked]
Ejemplo n.º 4
0
def get_group_similarity_alignment(group_a,
                                   group_b,
                                   early_stopping_threshold=0.0,
                                   w=None,
                                   min_alert_match_similarity=0.0,
                                   alignment_weight=0.0,
                                   partial=False):
    """Score how similarly two groups order their matched alerts.

    Alerts of both groups are paired via ``find_alert_matching``; each
    ``merge_seq`` is then rewritten as a sequence of pair indices (with
    distinct filler indices for unmatched alerts on either side) and the
    two index sequences are compared with ``CSequenceMatcher``.

    Args:
        group_a: Object exposing ``merge_seq`` and ``bag_of_alerts``.
        group_b: Object exposing ``merge_seq`` and ``bag_of_alerts``.
        early_stopping_threshold: If the ratio of the shorter to the longer
            ``merge_seq`` length is below this value, 0.0 is returned
            without matching any alerts.
        w: Forwarded to ``find_alert_matching``.
        min_alert_match_similarity: Forwarded to ``find_alert_matching``.
        alignment_weight: When exactly 0.0 the result is always 0.0.
        partial: When ``False`` the matched length is normalised by the
            shorter sequence, otherwise by the length of ``group_a``'s
            sequence.

    Returns:
        A float: total size of the matching blocks divided by the chosen
        normaliser, or 0.0 on early stop, zero weight or an empty sequence.
    """
    len_a, len_b = len(group_a.merge_seq), len(group_b.merge_seq)
    longest = max(len_a, len_b)
    if longest == 0:
        # Both sequences are empty. The length ratio below would divide by
        # zero, and the final result for empty alignments is 0.0 anyway.
        return 0.0
    if min(len_a, len_b) / longest < early_stopping_threshold:
        return 0.0
    alert_matching = find_alert_matching(
        group_a.bag_of_alerts.keys(),
        group_b.bag_of_alerts.keys(),
        early_stopping_threshold=0.0,
        w=w,
        min_alert_match_similarity=min_alert_match_similarity
    )  # Set early stopping to 0.0 for bag since grouping criteria do not match

    # Keep the first pairing for every alert. Alerts are already used as
    # dict keys by the callers' data, so dict lookups replace the repeated
    # linear list scans.
    index_of_a = {}
    b_to_a = {}
    for a, b in alert_matching:
        if a not in index_of_a and b not in b_to_a:
            index_of_a[a] = len(index_of_a)
            b_to_a[b] = a

    # Unmatched alerts get filler indices that can never coincide between
    # the two sides: max index + 1 for a, max index + 2 for b.
    unmatched_a = len(index_of_a)
    unmatched_b = unmatched_a + 1
    alignment_a = [index_of_a.get(a, unmatched_a) for a in group_a.merge_seq]
    alignment_b = [
        index_of_a[b_to_a[b]] if b in b_to_a else unmatched_b
        for b in group_b.merge_seq
    ]

    if alignment_weight != 0.0 and alignment_a and alignment_b:
        sm = CSequenceMatcher(None, alignment_a, alignment_b, autojunk=False)
        lcs_len = sum(block.size for block in sm.get_matching_blocks())
        if partial is False:
            return lcs_len / min(len(alignment_a), len(alignment_b))
        return lcs_len / len(alignment_a)
    return 0.0
Ejemplo n.º 5
0
def merge_seq_alignment(groups, merged_bags, merged_bags_inv):
  """Build the alert sequence shared by the merge sequences of all groups.

  Every alert of a group is mapped through ``merged_bags_inv`` to a key of
  ``merged_bags`` and replaced by that key's position. The first group's
  index sequence is the starting point; it is then narrowed to the blocks
  it has in common with each following group. The surviving indices are
  translated back to ``merged_bags`` keys.

  For efficiency the alignment is created incrementally, which does not
  guarantee an optimal alignment.
  """
  bag_order = list(merged_bags.keys())
  common = None  # None until the first group has been seen
  for group in groups:
    indices = [bag_order.index(merged_bags_inv[alert]) for alert in group.merge_seq]
    if common is None:
      common = indices
      continue
    # During testing, autojunk=True sometimes incorrectly returned empty lists
    matcher = CSequenceMatcher(None, common, indices, autojunk=False)
    common = [
      index
      for block in matcher.get_matching_blocks()
      for index in common[block.a:(block.a + block.size)]
    ]
  return [bag_order[index] for index in (common or [])]
Ejemplo n.º 6
0
    def testCDifflibWithBug5Data(self):
        """Check cdifflib returns the same result for bug #5

        The bug concerned autojunk handling; the matching blocks computed
        by ``SequenceMatcher`` and ``CSequenceMatcher`` on the bug's data
        must be equal.
        """
        from . import testdata

        # Both results are materialised as lists (needed for Python 3.3).
        expected_blocks = list(
            SequenceMatcher(None, testdata.a5,
                            testdata.b5).get_matching_blocks())
        actual_blocks = list(
            CSequenceMatcher(None, testdata.a5,
                             testdata.b5).get_matching_blocks())

        self.assertEqual(expected_blocks, actual_blocks)
Ejemplo n.º 7
0
    def fix_string(self, verbosity=0):
        """Obtain the changes to a path as a string.

        We use the file_mask to do a safe merge, avoiding any templated
        sections. First we need to detect where there have been changes
        between the fixed and templated versions. The file mask is of
        the format: (raw_file, templated_file, fixed_file).

        We use difflib.SequenceMatcher.get_opcodes
        See: https://docs.python.org/3.7/library/difflib.html#difflib.SequenceMatcher.get_opcodes
        It returns a list of tuples ('equal|replace|delete|insert', ia1, ia2, ib1, ib2).

        Args:
            verbosity: Passed through to every ``verbosity_logger`` call.

        Returns:
            ``(None, False)`` when any element of ``self.file_mask`` is
            ``None``; otherwise ``(write_buff, changed)`` where
            ``write_buff`` is the merged string and ``changed`` is whether
            it differs from the raw file (``self.file_mask[0]``).

        Raises:
            NotImplementedError: for every opcode combination or index
                situation the merge loop does not handle.

        """
        bencher = BenchIt()
        bencher("fix_string: start")

        # Do we have enough information to actually fix the file?
        if any(elem is None for elem in self.file_mask):
            verbosity_logger(
                "Insufficient information to fix file: {0}".format(
                    self.file_mask),
                verbosity=verbosity)
            return None, False

        verbosity_logger("Persisting file masks: {0}".format(self.file_mask),
                         verbosity=verbosity)
        # Compare Templated with Raw
        # NOTE(review): autojunk=None is falsy, which presumably switches
        # difflib's autojunk heuristic off - confirm.
        diff_templ = SequenceMatcher(autojunk=None,
                                     a=self.file_mask[0],
                                     b=self.file_mask[1])
        bencher("fix_string: Match 0&1")
        diff_templ_codes = diff_templ.get_opcodes()
        verbosity_logger("Templater diff codes: {0}".format(diff_templ_codes),
                         verbosity=verbosity)

        bencher("fix_string: Got Opcodes 0&1")
        # Compare Fixed with Templated
        diff_fix = SequenceMatcher(autojunk=None,
                                   a=self.file_mask[1],
                                   b=self.file_mask[2])
        bencher("fix_string: Matched 1&2")
        # diff_fix = SequenceMatcher(autojunk=None, a=self.file_mask[1][0], b=self.file_mask[2][0])
        diff_fix_codes = diff_fix.get_opcodes()
        verbosity_logger("Fixing diff codes: {0}".format(diff_fix_codes),
                         verbosity=verbosity)
        bencher("fix_string: Got Opcodes 1&2")

        # If diff_templ isn't the same then we should just keep the template. If there *was*
        # a fix in that space, then we should raise an issue
        # If it is the same, then we can apply fixes as expected.
        write_buff = ''
        # The opcode currently being worked through from each diff; None
        # means "consumed, fetch the next one at the top of the loop".
        fixed_block = None
        templ_block = None
        # index in raw, templ and fix
        idx = (0, 0, 0)
        loop_idx = 0
        bencher("fix_string: Loop Setup")
        while True:
            loop_idx += 1
            verbosity_logger("{0:04d}: Write Loop: idx:{1}, buff:{2!r}".format(
                loop_idx, idx, write_buff),
                             verbosity=verbosity)

            if templ_block is None:
                if diff_templ_codes:
                    templ_block = diff_templ_codes.pop(0)
                # We've exhausted the template. Have we exhausted the fixes?
                elif fixed_block is None and not diff_fix_codes:
                    # Yes - excellent. DONE
                    break
                # Deal with the case that we only have inserts left.
                elif all(elem[0] == 'insert' for elem in diff_fix_codes):
                    for fixed_block in diff_fix_codes:
                        write_buff += self.file_mask[2][
                            fixed_block[3]:fixed_block[4]]
                    break
                else:
                    raise NotImplementedError(
                        "Fix Block(s) left over! Don't know how to handle this! aeflf8wh"
                    )
            if fixed_block is None:
                if diff_fix_codes:
                    fixed_block = diff_fix_codes.pop(0)
                # One case is that we just consumed the last block of both, so check indexes
                # to see if we're at the end of the raw file.
                elif idx[0] >= len(self.file_mask[0]):
                    # Yep we're at the end
                    break
                else:
                    raise NotImplementedError(
                        "Unexpectedly depleted the fixes. Panic!")
            # From here on both templ_block and fixed_block are set.
            verbosity_logger("{0:04d}: Blocks: template:{1}, fix:{2}".format(
                loop_idx, templ_block, fixed_block),
                             verbosity=verbosity)

            if templ_block[0] == 'equal':
                if fixed_block[0] == 'equal':
                    # No templating, no fixes, go with middle and advance indexes
                    # Find out how far we can advance (we use the middle version because it's common)
                    # templ_block[4] and fixed_block[2] are both end offsets
                    # in the templated file, so they can be compared directly.
                    if templ_block[4] == fixed_block[2]:
                        buff = self.file_mask[1][idx[1]:fixed_block[2]]
                        # consume both blocks
                        fixed_block = None
                        templ_block = None
                    elif templ_block[4] > fixed_block[2]:
                        buff = self.file_mask[1][idx[1]:fixed_block[2]]
                        # consume fixed block
                        fixed_block = None
                    elif templ_block[4] < fixed_block[2]:
                        buff = self.file_mask[1][idx[1]:templ_block[4]]
                        # consume templ block
                        templ_block = None
                    # All three files share this text, so all three indexes
                    # advance by the same amount.
                    idx = (idx[0] + len(buff), idx[1] + len(buff),
                           idx[2] + len(buff))
                    write_buff += buff
                    continue
                elif fixed_block[0] == 'replace':
                    # Consider how to apply fixes.
                    # Can we implement the fix while staying in the equal segment?
                    if fixed_block[2] <= templ_block[4]:
                        # Yes! Write from the fixed version.
                        write_buff += self.file_mask[2][idx[2]:fixed_block[4]]
                        idx = (idx[0] + (fixed_block[2] - fixed_block[1]),
                               fixed_block[2], fixed_block[4])
                        # Consume the fixed block because we've written the whole thing.
                        fixed_block = None
                        continue
                    else:
                        raise NotImplementedError("DEF")
                elif fixed_block[0] == 'delete':
                    # We're deleting items, nothing to write but we can consume some
                    # blocks and advance some indexes.
                    idx = (idx[0] + (fixed_block[2] - fixed_block[1]),
                           fixed_block[2], fixed_block[4])
                    fixed_block = None
                elif fixed_block[0] == 'insert':
                    # We're inserting items, Write from the fix block, but only that index moves.
                    write_buff += self.file_mask[2][idx[2]:fixed_block[4]]
                    idx = (idx[0], idx[1], fixed_block[4])
                    fixed_block = None
                else:
                    raise NotImplementedError((
                        "Unexpected opcode {0} for fix block! Please report this "
                        "issue on github with the query and rules you're trying to "
                        "fix.").format(fixed_block[0]))
            elif templ_block[0] == 'replace':
                # We're in a templated section - we should write the templated version.
                # we should consume the whole replace block and then deal with where
                # we end up.
                # The text written is taken from the raw file (file_mask[0]),
                # up to the end of the replaced raw span.
                buff = self.file_mask[0][idx[0]:templ_block[2]]
                new_templ_idx = templ_block[4]

                # Fast forward through fix blocks until we catch up. We're not implementing
                # any changes in a templated section.
                while True:
                    if fixed_block[2] > new_templ_idx >= fixed_block[1]:
                        # this block contains the end point
                        break
                    else:
                        # We're not at the end point yet, continue to fast forward through.
                        if fixed_block[0] != 'equal':
                            # Skipped edits are reported with print, not
                            # through verbosity_logger.
                            print("WARNING: Skipping edit block: {0}".format(
                                fixed_block))
                        if diff_fix_codes:
                            fixed_block = diff_fix_codes.pop(0)
                        else:
                            raise NotImplementedError(
                                "Unexpectedly depleted the fixes. Panic!")
                # Are we exactly on a join?
                if new_templ_idx == fixed_block[1]:
                    # GREAT - this makes things easy because we have an equality point already
                    idx = (templ_block[2], new_templ_idx, fixed_block[3])
                else:
                    if fixed_block[0] == 'equal':
                        # If it's in an equal block, we can use the same offset from the end.
                        idx = (templ_block[2], new_templ_idx, fixed_block[3] +
                               (new_templ_idx - fixed_block[1]))
                    else:
                        # TODO: We're trying to move through an templated section, but end up
                        # in a fixed section. We've lost track of indexes.
                        # We might need to panic if this happens...
                        print("UMMMMMM!")
                        print(new_templ_idx)
                        print(fixed_block)
                        raise NotImplementedError("ABC")
                write_buff += buff
                # consume template block
                templ_block = None
            elif templ_block[0] == 'delete':
                # The comparison, things that the templater has deleted
                # some characters. This is just a quirk of the differ.
                # In reality this means we just write these characters
                # and don't worry about advancing the other indexes.
                buff = self.file_mask[0][idx[0]:templ_block[2]]
                # consume templ block
                templ_block = None
                idx = (idx[0] + len(buff), idx[1], idx[2])
                write_buff += buff
            elif templ_block[0] == 'insert':
                # The templater has inserted something here. We don't need
                # to write anything here (because whatever we're looking at
                # was inserted by the templater), but we do need to keep
                # track of what happened to the rest of the section we're in.
                # If nothing was fixed then it's easy because the indices
                # will be the same. Otherwise... great question...

                # For now let's just deal with the happy case where the fixed
                # block is equal
                if fixed_block[0] == 'equal':
                    # Let's make sure we can consume enough to get through the
                    # templ block and not get to the end of the fix block.
                    if templ_block[4] <= fixed_block[2]:
                        insert_len = templ_block[4] - templ_block[3]
                        # Only the templated and fixed indexes move; the raw
                        # file has no counterpart for the inserted text.
                        idx = (idx[0], idx[1] + insert_len,
                               idx[2] + insert_len)
                        # if things matched up perfectly, consume the fixed block
                        if templ_block[4] == fixed_block[2]:
                            fixed_block = None
                        # always consume templ block in this case
                        templ_block = None
                    else:
                        raise NotImplementedError((
                            "Unexpected scenario during insert opcode! Please report "
                            "this issue on github with the query and rules you're trying "
                            "to fix."))
                else:
                    raise NotImplementedError((
                        "Unexpected opcode {0} for fix block! Please report this "
                        "issue on github with the query and rules you're trying to "
                        "fix.").format(fixed_block[0]))
            else:
                raise NotImplementedError((
                    "Unexpected opcode {0} for template block! Please report this "
                    "issue on github with the query and rules you're trying to "
                    "fix.").format(templ_block[0]))

        bencher("fix_string: Fixing loop done")
        # The success metric here is whether anything ACTUALLY changed.
        return write_buff, write_buff != self.file_mask[0]
Ejemplo n.º 8
0
    def _diff_two_v2(cls, base, cmp, label=None):
        """Diff ``base`` against ``cmp`` and lay the result out on base's lines.

        ``'|'`` in ``base`` is treated as a line separator. Both texts are
        preprocessed (``cls.pre_base`` / ``cls.pre_cmp``) before they are
        compared with ``CSequenceMatcher``; the resulting segments are then
        distributed over the lines of ``base``, with an explicit line-break
        segment after each line.

        :param base: reference text; its line structure is kept
        :param cmp: text compared against ``base``
        :param label: optional dict overriding the ``'base'``/``'cmp'`` keys
            used in the returned segments
        :return: list of dicts with ``line_no``, ``is_same``, the base text,
            the cmp text and ``range`` (start/end offsets of the base text
            within its line)
        """
        lbl = {'base': 'base', 'cmp': 'cmp'}
        if label and isinstance(label, dict):
            lbl.update(label)

        # Unlike v1, v2 removes the line breaks before comparing so that
        # they do not interfere with the diff algorithm.
        # NOTE(review): the removal itself presumably happens in
        # cls.pre_base - confirm.
        base = base.replace('|', '\n').rstrip('\n')
        base_lines = base.split('\n')
        base = cls.pre_base(base, False)
        cmp = cls.pre_cmp(cmp)
        segments = []
        s = CSequenceMatcher(None, base, cmp, autojunk=False)
        for tag, i1, i2, j1, j2 in s.get_opcodes():
            t1, t2 = base[i1:i2], cmp[j1:j2]
            # print('{:7}   a[{}:{}] --> b[{}:{}] {!r:>8} --> {!r}'.format(tag, i1, i2, j1, j2, t1, t2))
            is_same = True if tag == 'equal' else False
            r = {
                'line_no': None,
                'is_same': is_same,
                lbl['base']: t1,
                lbl['cmp']: t2
            }
            segments.append(r)

        # Merge identical-text segments that the diff may have split apart
        # around variant characters.
        for i, seg in enumerate(segments):
            if seg.get('is_same'):
                # Walk backwards to the nearest identical segment that has
                # not been deleted and merge into it.
                j = i - 1
                while j >= 0:
                    pre = segments[j]
                    if not pre['is_same']:
                        break
                    if not pre.get('deleted'):
                        pre[lbl['base']] += seg[lbl['base']]
                        pre[lbl['cmp']] += seg[lbl['cmp']]
                        seg['deleted'] = True
                        break
                    j -= 1
        segments = [s for s in segments if not s.get('deleted')]

        # Using the diff result, re-insert the line breaks according to base.
        line_segments, idx = [], 0
        for i, line in enumerate(base_lines):
            if not len(line):  # empty line: just add a line break
                line_segments.append({
                    'line_no': i + 1,
                    'is_same': True,
                    lbl['base']: '\n',
                    lbl['cmp']: '\n'
                })
                continue
            # Take len(line) characters' worth of segments as line i+1.
            start, left_len = 0, len(line)
            while idx < len(segments) and left_len > 0:
                seg = segments[idx]
                if len(seg[lbl['base']]) <= left_len:  # seg fits in what is left: take it whole
                    seg['line_no'] = i + 1
                    seg_len = len(seg[lbl['base']])
                    line_segments.append(seg)
                    # update the bookkeeping variables
                    left_len -= seg_len
                    start += seg_len
                    idx += 1
                else:  # seg is longer than what is left: split it
                    front_part = {
                        'line_no': i + 1,
                        'is_same': seg['is_same'],
                        lbl['base']: seg[lbl['base']][:left_len],
                        lbl['cmp']: seg[lbl['cmp']][:left_len],
                    }
                    line_segments.append(front_part)
                    # The remainder stays in segments[idx] (idx is not
                    # advanced) and is picked up by the next line.
                    seg.update({
                        lbl['cmp']:
                        seg[lbl['cmp']][left_len:]
                        if len(seg[lbl['cmp']]) > left_len else '',
                        lbl['base']:
                        seg[lbl['base']][left_len:],
                    })
                    # update the bookkeeping variables
                    left_len = 0
                    start = 0

                if left_len == 0:  # line break
                    line_segments.append({
                        'line_no': i + 1,
                        'is_same': True,
                        lbl['base']: '\n',
                        lbl['cmp']: '\n'
                    })

        # Check whether a line break is followed by a differing segment whose
        # base text is empty; if so, move that segment in front of the break.
        for i, seg in enumerate(line_segments):
            # NOTE(review): the guard is `i > 1`, so the segment at index 1
            # is never compared with index 0 - confirm `i > 0` was not
            # intended.
            pre = line_segments[i - 1] if i > 1 else {}
            if seg[lbl['base']] == '' and pre.get('is_same') and pre.get(
                    lbl['base']) == '\n':
                # Current segment has empty base text and the previous one is
                # a line break: swap their contents.
                temp = seg.copy()
                seg.update(pre)
                pre.update(temp)

        # Set range: offsets of the base text within the line, restarting at
        # 0 after every line-break segment.
        start = 0
        for seg in line_segments:
            seg['range'] = (start, start + len(seg[lbl['base']]))
            start += len(seg[lbl['base']])
            if seg['is_same'] and seg[lbl['base']] == '\n':
                start = 0

        return line_segments
Ejemplo n.º 9
0
def main():
    """Rank files by their similarity to a target file.

    Command line: ``target other [other ...] [-n N] [-l] [-s] [-m N]``.

    With ``-n N`` the candidates are first reduced to the N best by
    ``quick_ratio``; in that pass a candidate whose name equals the target
    is skipped. Each remaining candidate is then scored with either the
    full ``ratio`` or, with ``-l``, the size of its longest matching block.
    With ``-s`` ratios (including the quick-ratio estimates) are multiplied
    by the mean length of the two inputs. ``-m N`` limits every read to the
    first N bytes.

    Progress (``i/total``) is written to stderr; the result is written to
    stdout as ``metric<TAB>filename`` lines in ascending metric order, so
    the best match is printed last.
    """
    parser = argparse.ArgumentParser(
        description='Finds the most similar files to a given file.')
    parser.add_argument('target', help='file for which to find matches')
    parser.add_argument('other', nargs='+', help='other file(s) to compare')
    parser.add_argument(
        '-n',
        '--num',
        metavar='N',
        type=int,
        default=0,
        help='use quick_ratio and keep only the N best guesses '
        'before calculating the true similarity ratios')
    parser.add_argument('-l',
                        '--longest',
                        action='store_true',
                        help='use longest match instead of ratio')
    parser.add_argument('-s',
                        '--scaled',
                        action='store_true',
                        help='scale ratios relative to file sizes (including '
                        'initial filtering by rough ratio)')
    parser.add_argument(
        '-m',
        '--maxbytes',
        metavar='N',
        type=int,
        default=-1,
        help='limit comparisons to the first N bytes from each file '
        '(default: entire file)')
    args = parser.parse_args()

    with open(args.target, 'rb') as fp:
        seq1 = fp.read(args.maxbytes)

    # SequenceMatcher caches its analysis of the second sequence, so the
    # target is installed once as seq2 and every candidate becomes seq1.
    matcher = SequenceMatcher()
    matcher.set_seq2(list(seq1))

    if args.num > 0:
        estimates = []
        for fname in args.other:
            if fname == args.target:
                continue
            with open(fname, 'rb') as fp:
                seq2 = fp.read(args.maxbytes)
            matcher.set_seq1(list(seq2))
            ratio = matcher.quick_ratio()
            if args.scaled:
                # The help text promises that scaling also applies to the
                # rough pre-filter, so use the same factor as below.
                ratio *= (len(seq1) + len(seq2)) / 2
            estimates.append((fname, ratio))
        estimates.sort(key=lambda x: x[1])
        nbest = [x[0] for x in estimates[-args.num:]]
    else:
        nbest = args.other

    actuals = []
    # Count from 1 so the progress display ends at "total/total".
    for idx, fname in enumerate(nbest, 1):
        print('{0}/{1}'.format(idx, len(nbest)), file=sys.stderr)
        with open(fname, 'rb') as fp:
            seq2 = fp.read(args.maxbytes)
        matcher.set_seq1(list(seq2))
        # Compute only the metric that was asked for; ratio() is expensive
        # and its result is not needed for --longest.
        if args.longest:
            # get_matching_blocks() always ends with a zero-size sentinel
            # block, so max() never sees an empty sequence.
            metric = max(x.size for x in matcher.get_matching_blocks())
        else:
            metric = matcher.ratio()
            if args.scaled:
                metric *= (len(seq1) + len(seq2)) / 2
        actuals.append((fname, metric))
    actuals.sort(key=lambda x: x[1])
    for stat in actuals:
        print('{0}\t{1}'.format(stat[1], stat[0]))
Ejemplo n.º 10
0
def processFiles(directory):
    """Fill in column 12 of the rows that lack it in every CSV of ``directory``.

    For each non-directory entry the rows are split into the module-level
    lists ``itemDetails`` (column 12, stripped of ``'``, is in
    ``nullValues``) and ``itemDetailsWithModel`` (all others). A value is
    generated for every row of ``itemDetails`` from the initials of a
    product title plus part of the file name. All rows are then written to
    ``<name without last 4 chars>.csv`` in the same directory.

    Side effects: clears and refills ``itemDetails`` and
    ``itemDetailsWithModel`` per file, appends to ``modelList``, prints
    progress, logs failures. A file that fails to read or write is skipped.

    Args:
        directory: Path-like object with a ``parts`` attribute (presumably
            a ``pathlib.Path`` - confirm against the caller).
    """
    ind = 1
    # str() is the public way to get the path text; the private ``_str``
    # attribute is not guaranteed to exist.
    dirPath = str(directory)
    for itemFile in os.listdir(directory):
        ind += 1
        subCat = os.path.basename(itemFile)[:-4]

        itemDetails.clear()
        itemDetailsWithModel.clear()
        # os.path.join uses the platform separator instead of a hard-coded
        # backslash.
        wFile = os.path.join(dirPath, itemFile)

        if os.path.isdir(wFile):
            continue

        pcent = getCurrentPercentage(directory, ind)
        print("Working in ", directory.parts[2], "-", itemFile, " ",
              round(pcent, 2), "% complete")

        try:
            with open(wFile, encoding='iso-8859-2', newline='') as csv_file:
                csv_reader = csv.reader(csv_file, delimiter=',')
                for row in csv_reader:
                    if row[12].strip("'") in nullValues:
                        itemDetails.append(row)
                    else:
                        itemDetailsWithModel.append(row)

                totals = len(itemDetails)
                if totals == 0:
                    continue
                iDetailsIndex = 0
                titleList = [processString(row[1]) for row in itemDetails]

                # modelList is never cleared here, so remember where this
                # file's entries start; indexing from 0 would reuse values
                # generated for earlier files.
                modelStart = len(modelList)

                # One modelList entry is appended per row: consecutive rows
                # get the initials of the title that opened the current run,
                # and a run ends at the first comparison with ratio <= 0.75.
                while iDetailsIndex < totals:
                    productTitle = processString(itemDetails[iDetailsIndex][1])
                    if iDetailsIndex == (totals - 1):
                        modelList.append(getInitials(productTitle))
                        break
                    for y in range(iDetailsIndex + 1, len(titleList)):
                        ratio = CSequenceMatcher(None, productTitle,
                                                 titleList[y]).ratio()
                        modelList.append(getInitials(productTitle))
                        if ratio <= 0.75:
                            iDetailsIndex += 1
                            break
                        iDetailsIndex += 1

                for offset, row in enumerate(itemDetails):
                    row[12] = "'" + modelList[modelStart + offset] + subCat[3:] + "'"
        except Exception:
            # Deliberately broad: any failure skips this file. A bare
            # ``except:`` would also swallow KeyboardInterrupt/SystemExit.
            logging.exception("Error processing file")
            print("Error with file. Skipping...")
            continue

        itemDetails.extend(itemDetailsWithModel)
        #outputFile = "OUT" + '\\' + dirString + '\\' + upperCat + '.csv'
        outputFile = os.path.join(dirPath, subCat + '.csv')

        try:
            with open(outputFile, mode='w', newline='',
                      encoding='iso-8859-2') as csv_file:
                csv_writer = csv.writer(csv_file, delimiter=',')
                for row in itemDetails:
                    csv_writer.writerow(row)
        except OSError:
            logging.exception("Could not write output file")
            print("Could not write output file. Skipping...")