def find_books(self, txt):
    """Rank every known book by fuzzy similarity between *txt* and the
    book's title or author.

    Returns a list of (owner handle, title, author) tuples, best match
    first. The owner handle is '@' + the owner's user name.
    """
    query = txt.lower()  # hoisted: lower-case the query once, not per book
    scored = [
        (
            max(
                CSequenceMatcher(None, query, book.name.lower()).ratio(),
                CSequenceMatcher(None, query, book.author.lower()).ratio(),
            ),
            '@' + self.get_user(book.owner_id).name,
            book.name,
            book.author,
        )
        for book in self.books
    ]
    # Highest similarity first; sort(reverse=True) replaces sort()+reverse()
    # (equal tuples are indistinguishable, so the result is the same).
    scored.sort(reverse=True)
    # Drop the score, keep (owner, title, author).
    return [entry[1:] for entry in scored]
def test_cred_form(url, username, password, host, port):
    """Try one username/password pair against the login form at *url*.

    Loads the page, fills in and submits the form, then compares the page
    source before and after submission. A similarity ratio below 0.8 is
    treated as a significant page change, i.e. a likely successful login.

    Returns a one-element list of {'ratio', 'username', 'password'} on a
    hit, None otherwise (including when any step raises; errors are logged).
    """
    # Bug fix: define driver before the try block, otherwise a failure in
    # get_new_selenium_driver() makes the finally clause raise
    # UnboundLocalError and mask the original exception.
    driver = None
    try:
        driver = get_new_selenium_driver(host, port)
        driver.get(url)
        initial_page = driver.page_source
        username_input, password_input, click_button = get_form_objects(driver)
        username_input.clear()
        username_input.send_keys(username)
        password_input.clear()
        password_input.send_keys(password)
        click_button.click()
        sleep(3)  # crude fixed wait for the post-submit page to settle
        m = CSequenceMatcher(None, initial_page, driver.page_source)
        logger.debug(f"{username}:{password} ratio: {m.ratio()}")
        if m.ratio() < 0.8:
            return [{
                'ratio': m.ratio(),
                'username': username,
                'password': password
            }]
        return None
    except Exception as e:
        logger.error(e)
    finally:
        if driver is not None:
            driver.close()
def _diff(cls, base, cmp, check_variant=True, label=None):
    """Character-level diff of *base* against *cmp*.

    Returns a list of segment dicts, one per diff opcode (split further
    wherever the base side contains a newline):
    {'line_no', 'is_same', <base-key>, <cmp-key>, 'range', ['is_variant']}.
    The two text keys default to 'base' and 'cmp' and can be renamed via
    *label* (a dict like {'base': ..., 'cmp': ...}). When *check_variant*
    is true, single-character replacements where is_variant() reports the
    two characters as variant forms get an extra 'is_variant': True.
    Line numbers and (start, end) ranges are computed against *base* only.
    """
    lbl = {'base': 'base', 'cmp': 'cmp'}
    if label:
        lbl.update(label)
    ret, line_no = [], 1
    s = CSequenceMatcher(None, base, cmp, autojunk=False)
    for tag, i1, i2, j1, j2 in s.get_opcodes():
        t1, t2 = base[i1:i2], cmp[j1:j2]
        if '\n' in t1:  # base segment spans one or more line breaks
            lst1 = t1.split('\n')
            for k, _t1 in enumerate(lst1):
                if _t1 != '':
                    # Non-empty piece: pair it with all of t2; t2 is then
                    # consumed so later pieces get an empty cmp side.
                    ret.append({
                        'line_no': line_no, 'is_same': False,
                        lbl['base']: _t1, lbl['cmp']: t2
                    })
                    t2 = ''
                elif k == len(lst1) - 1 and t2:
                    # Trailing empty piece with leftover cmp text: emit it
                    # as an insertion-only segment.
                    ret.append({
                        'line_no': line_no, 'is_same': False,
                        lbl['base']: _t1, lbl['cmp']: t2
                    })
                if k < len(lst1) - 1:  # line break between pieces
                    ret.append({
                        'line_no': line_no, 'is_same': True,
                        lbl['base']: '\n'
                    })
                    line_no += 1
        else:
            is_same = True if tag == 'equal' else False
            r = {
                'line_no': line_no, 'is_same': is_same,
                lbl['base']: t1, lbl['cmp']: t2
            }
            if check_variant and len(t1) == 1 and len(
                    t2) == 1 and t1 != t2 and is_variant(t1, t2):
                r['is_variant'] = True
            ret.append(r)
    # Assign each segment's start/end offsets within its line.
    line_no, start = 1, 0
    for r in ret:
        if r['line_no'] != line_no:  # line changed: reset the offset
            line_no += 1
            start = 0
        end = start + len(r[lbl['base']])
        r['range'] = (start, end)
        start = end
    return ret
def testCDifflibWithBug5Data(self):
    """Check cdifflib returns the same result for bug #5 (autojunk handling issues)"""
    from . import testdata
    # Run the pure-Python and the C implementation over the same inputs
    # and require identical matching blocks.
    # note: convert both to lists for Python 3.3
    expected = list(
        SequenceMatcher(None, testdata.a5, testdata.b5).get_matching_blocks())
    actual = list(
        CSequenceMatcher(None, testdata.a5, testdata.b5).get_matching_blocks())
    self.assertEqual(expected, actual)
def get_group_similarity_alignment(group_a, group_b, early_stopping_threshold=0.0, w=None, min_alert_match_similarity=0.0, alignment_weight=0.0, partial=False):
    """Compute an alignment-based similarity score between two alert groups.

    Matches alerts across the two groups' bags, maps both merge sequences
    onto match indices, and scores the longest common subsequence of the
    two index sequences. Returns a float in [0, 1]; 0.0 when the length
    ratio of the merge sequences falls below *early_stopping_threshold*
    or when *alignment_weight* is 0.0 / a sequence is empty. With
    *partial* the LCS length is normalised by len(alignment_a) instead of
    the shorter sequence.
    """
    # Early stopping: merge sequences whose lengths differ too much cannot
    # reach the threshold, so skip the expensive matching altogether.
    if min(len(group_a.merge_seq), len(group_b.merge_seq)) / max(
            len(group_a.merge_seq), len(
                group_b.merge_seq)) < early_stopping_threshold:
        return 0.0
    alert_matching = find_alert_matching(
        group_a.bag_of_alerts.keys(), group_b.bag_of_alerts.keys(),
        early_stopping_threshold=0.0, w=w,
        min_alert_match_similarity=min_alert_match_similarity
    )  # Set early stopping to 0.0 for bag since grouping criteria do not match
    # Keep only a 1:1 matching, first come first served.
    used_a = []
    used_b = []
    b_to_a = {}
    for a, b in alert_matching:
        if a not in used_a and b not in used_b:
            used_a.append(a)
            used_b.append(b)
            b_to_a[b] = a
    # Precomputed index map: O(1) lookups replace the original repeated
    # used_a.index() scans (same values — matched alerts are unique).
    a_index = {a: i for i, a in enumerate(used_a)}
    alignment_a = []
    alignment_b = []
    for a in group_a.merge_seq:
        # No match found: use max index + 1.
        alignment_a.append(a_index.get(a, len(used_a)))
    for b in group_b.merge_seq:
        if b in b_to_a:
            alignment_b.append(a_index[b_to_a[b]])
        else:
            # No match found: use max index + 2 so it can never pair with
            # an unmatched alert from group_a.
            alignment_b.append(len(used_a) + 1)
    if alignment_weight != 0.0 and len(alignment_a) > 0 and len(
            alignment_b) > 0:
        sm = CSequenceMatcher(None, alignment_a, alignment_b, autojunk=False)
        lcs_len = sum(block.size for block in sm.get_matching_blocks())
        if partial is False:
            return lcs_len / min(len(alignment_a), len(alignment_b))
        else:
            return lcs_len / len(alignment_a)
    return 0.0
def merge_seq_alignment(groups, merged_bags, merged_bags_inv):
    """Compute a common alignment of all groups' merge sequences.

    Each group's merge sequence is translated into positions within
    merged_bags' key order (via merged_bags_inv), and the running LCS is
    intersected with each successive sequence. Returns the surviving
    subsequence translated back into merged_bags keys.

    For efficiency, the alignment is built incrementally; this does not
    guarantee a globally optimal alignment.
    """
    merge_list = list(merged_bags.keys())
    # O(1) dict lookups replace the original merge_list.index() scans
    # inside the loop (identical results: dict keys are unique).
    merge_pos = {key: pos for pos, key in enumerate(merge_list)}
    lcs = []
    first_alignment = True
    for group in groups:
        alignment = [merge_pos[merged_bags_inv[alert]]
                     for alert in group.merge_seq]
        if first_alignment:
            lcs = alignment
            first_alignment = False
        else:
            # autojunk=False: during testing, autojunk=True sometimes
            # incorrectly returned empty lists.
            sm = CSequenceMatcher(None, lcs, alignment, autojunk=False)
            # Keep only the matched stretches of the current LCS.
            lcs = [item
                   for block in sm.get_matching_blocks()
                   for item in lcs[block.a:block.a + block.size]]
    return [merge_list[pos] for pos in lcs]
def _diff_two_v2(cls, base, cmp, label=None):
    """Diff *base* against *cmp* and re-flow the result into base's lines (v2).

    '|' in *base* is treated as a line separator. Unlike v1, v2 strips
    line breaks from *base* before comparing so they do not interfere with
    the diff algorithm, then re-inserts them afterwards according to
    base's original lines. Returns a list of segment dicts
    {'line_no', 'is_same', <base-key>, <cmp-key>, 'range'}; the two text
    keys default to 'base'/'cmp' and can be renamed via *label*.
    NOTE(review): assumes cls.pre_base / cls.pre_cmp normalise the texts
    without changing base's per-line lengths — confirm against those
    helpers.
    """
    lbl = {'base': 'base', 'cmp': 'cmp'}
    if label and isinstance(label, dict):
        lbl.update(label)
    # Unlike v1, v2 removes line breaks before diffing so they do not
    # disturb the diff algorithm.
    base = base.replace('|', '\n').rstrip('\n')
    base_lines = base.split('\n')
    base = cls.pre_base(base, False)
    cmp = cls.pre_cmp(cmp)
    segments = []
    s = CSequenceMatcher(None, base, cmp, autojunk=False)
    for tag, i1, i2, j1, j2 in s.get_opcodes():
        t1, t2 = base[i1:i2], cmp[j1:j2]
        is_same = True if tag == 'equal' else False
        r = {
            'line_no': None, 'is_same': is_same,
            lbl['base']: t1, lbl['cmp']: t2
        }
        segments.append(r)
    # Merge equal-text segments that the diff may have split apart around
    # variant characters.
    for i, seg in enumerate(segments):
        if seg.get('is_same'):
            # Search backwards for a non-deleted equal segment to merge into.
            j = i - 1
            while j >= 0:
                pre = segments[j]
                if not pre['is_same']:
                    break
                if not pre.get('deleted'):
                    pre[lbl['base']] += seg[lbl['base']]
                    pre[lbl['cmp']] += seg[lbl['cmp']]
                    seg['deleted'] = True
                    break
                j -= 1
    segments = [s for s in segments if not s.get('deleted')]
    # Using the diff result, re-insert line breaks following base's lines.
    line_segments, idx = [], 0
    for i, line in enumerate(base_lines):
        if not len(line):
            # Empty line: just emit a line-break segment.
            line_segments.append({
                'line_no': i + 1, 'is_same': True,
                lbl['base']: '\n', lbl['cmp']: '\n'
            })
            continue
        # Consume len(line) characters from segments to build line i+1.
        start, left_len = 0, len(line)
        while idx < len(segments) and left_len > 0:
            seg = segments[idx]
            if len(seg[lbl['base']]) <= left_len:
                # Segment fits within the remaining length: take it whole.
                seg['line_no'] = i + 1
                seg_len = len(seg[lbl['base']])
                line_segments.append(seg)
                # Bookkeeping.
                left_len -= seg_len
                start += seg_len
                idx += 1
            else:
                # Segment is longer than the remaining length: split it and
                # leave the tail in place for the next line.
                front_part = {
                    'line_no': i + 1,
                    'is_same': seg['is_same'],
                    lbl['base']: seg[lbl['base']][:left_len],
                    lbl['cmp']: seg[lbl['cmp']][:left_len],
                }
                line_segments.append(front_part)
                seg.update({
                    lbl['cmp']: seg[lbl['cmp']][left_len:] if len(seg[lbl['cmp']]) > left_len else '',
                    lbl['base']: seg[lbl['base']][left_len:],
                })
                # Bookkeeping.
                left_len = 0
                start = 0
        if left_len == 0:
            # End of line: emit a line-break segment.
            line_segments.append({
                'line_no': i + 1, 'is_same': True,
                lbl['base']: '\n', lbl['cmp']: '\n'
            })
    # If a segment with an empty base side sits right after a line break,
    # move it in front of the break.
    for i, seg in enumerate(line_segments):
        pre = line_segments[i - 1] if i > 1 else {}
        if seg[lbl['base']] == '' and pre.get('is_same') and pre.get(
                lbl['base']) == '\n':
            # Current is an empty-base difference and the previous segment
            # is a line break: swap the two in place.
            temp = seg.copy()
            seg.update(pre)
            pre.update(temp)
    # Set each segment's (start, end) range within its line.
    start = 0
    for seg in line_segments:
        seg['range'] = (start, start + len(seg[lbl['base']]))
        start += len(seg[lbl['base']])
        if seg['is_same'] and seg[lbl['base']] == '\n':
            start = 0
    return line_segments
def processFiles(directory):
    """Fill in missing model codes (column 12) for every CSV file in *directory*.

    Each CSV file is read with iso-8859-2 encoding. Rows whose model column
    holds a null placeholder go into the module-level itemDetails buffer;
    the rest into itemDetailsWithModel. Runs of rows with similar processed
    titles (SequenceMatcher ratio > 0.75) receive a generated model string
    built from the title initials plus the file's subcategory suffix, and
    the file is rewritten in place with all rows.

    Relies on module-level state: itemDetails, itemDetailsWithModel,
    modelList, nullValues and the helpers getCurrentPercentage,
    processString, getInitials.
    NOTE(review): modelList is never cleared between files yet is indexed
    by the current file's row positions — confirm the alignment against
    the module-level state.
    """
    ind = 1
    for itemFile in os.listdir(directory):
        ind += 1
        subCat = os.path.basename(itemFile)[:-4]  # file name minus ".csv"
        itemDetails.clear()
        itemDetailsWithModel.clear()
        # os.path.join replaces the private Path._str + "\\" concatenation
        # (same result on Windows, portable elsewhere).
        wFile = os.path.join(str(directory), itemFile)
        if os.path.isdir(wFile):
            continue
        pcent = getCurrentPercentage(directory, ind)
        # round() replaces the direct dunder call pcent.__round__(2).
        print("Working in ", directory.parts[2], "-", itemFile, " ", round(pcent, 2), "% complete")
        try:
            with open(wFile, encoding='iso-8859-2', newline='') as csv_file:
                csv_reader = csv.reader(csv_file, delimiter=',')
                for row in csv_reader:
                    # Rows lacking a real model go to itemDetails for inference.
                    if row[12].strip("'") in nullValues:
                        itemDetails.append(row)
                    else:
                        itemDetailsWithModel.append(row)
            totals = len(itemDetails)
            if totals == 0:
                continue
            iDetailsIndex = 0
            titleList = [processString(row[1]) for row in itemDetails]
            while iDetailsIndex < totals:
                productTitle = processString(itemDetails[iDetailsIndex][1])
                if iDetailsIndex == (totals - 1):
                    modelList.append(getInitials(productTitle))
                    break
                for y in range(iDetailsIndex + 1, len(titleList)):
                    ratio = CSequenceMatcher(None, productTitle, titleList[y]).ratio()
                    modelList.append(getInitials(productTitle))
                    if ratio <= 0.75:
                        # Titles diverged: restart grouping from the next row.
                        iDetailsIndex += 1
                        break
                    iDetailsIndex += 1
            for i in range(len(itemDetails)):
                itemDetails[i][12] = "\'" + modelList[i] + subCat[3:] + "\'"
        except Exception:
            # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit
            # still propagate; the per-file best-effort behaviour is kept.
            logging.exception("Error processing file")
            print("Error with file. Skipping...")
            continue
        itemDetails.extend(itemDetailsWithModel)
        outputFile = os.path.join(str(directory), subCat + '.csv')
        try:
            with open(outputFile, mode='w', newline='', encoding='iso-8859-2') as csv_file:
                csv_writer = csv.writer(csv_file, delimiter=',')
                for row in itemDetails:
                    csv_writer.writerow(row)
        except OSError:
            logging.exception("Could not write output file")
            print("Could not write output file. Skipping...")
            continue