Example 1
def get_group_similarity_alignment(group_a,
                                   group_b,
                                   early_stopping_threshold=0.0,
                                   w=None,
                                   min_alert_match_similarity=0.0,
                                   alignment_weight=0.0,
                                   partial=False):
    """Compute a sequence-alignment similarity score between two groups.

    Alerts from the two groups' bags are first matched 1:1 greedily; each
    group's merge_seq is then translated into a sequence of shared integer
    symbols, and the length of their longest common subsequence (via
    CSequenceMatcher) is normalized into a ratio.

    Args:
        group_a, group_b: groups exposing ``merge_seq`` (iterable of alerts)
            and ``bag_of_alerts`` (dict keyed by alert).
        early_stopping_threshold: skip the expensive alignment and return 0.0
            when the merge_seq length ratio is below this value.
        w: weight parameter forwarded to ``find_alert_matching``.
        min_alert_match_similarity: forwarded to ``find_alert_matching``.
        alignment_weight: the alignment ratio is only computed when this is
            non-zero; otherwise 0.0 is returned.
        partial: if True, normalize the LCS length by len(alignment_a)
            instead of the shorter of the two alignments.

    Returns:
        float similarity in [0, 1], or 0.0 when skipped.
    """
    len_a = len(group_a.merge_seq)
    len_b = len(group_b.merge_seq)
    # Guard: the original divided by max(len_a, len_b) and raised
    # ZeroDivisionError when both merge sequences were empty.
    if len_a == 0 or len_b == 0:
        return 0.0
    if min(len_a, len_b) / max(len_a, len_b) < early_stopping_threshold:
        return 0.0
    alert_matching = find_alert_matching(
        group_a.bag_of_alerts.keys(),
        group_b.bag_of_alerts.keys(),
        early_stopping_threshold=0.0,
        w=w,
        min_alert_match_similarity=min_alert_match_similarity
    )  # Set early stopping to 0.0 for bag since grouping criteria do not match
    # Greedy 1:1 matching; a_to_index records each matched a-alert's symbol
    # (its match order), replacing the original O(n) list .index()/in lookups.
    a_to_index = {}
    matched_b = set()
    b_to_a = {}
    for a, b in alert_matching:
        if a not in a_to_index and b not in matched_b:
            a_to_index[a] = len(a_to_index)
            matched_b.add(b)
            b_to_a[b] = a
    num_matched = len(a_to_index)
    alignment_a = []
    alignment_b = []
    for a in group_a.merge_seq:
        # Unmatched a-alerts share the out-of-band symbol num_matched
        # (max index + 1), as in the original implementation.
        alignment_a.append(a_to_index.get(a, num_matched))
    for b in group_b.merge_seq:
        if b in b_to_a:
            alignment_b.append(a_to_index[b_to_a[b]])
        else:
            # Unmatched b-alerts use a distinct symbol (max index + 2) so
            # they never align with unmatched a-alerts.
            alignment_b.append(num_matched + 1)
    if alignment_weight != 0.0 and len(alignment_a) > 0 and len(
            alignment_b) > 0:
        sm = CSequenceMatcher(None, alignment_a, alignment_b, autojunk=False)
        lcs_len = sum([block.size for block in sm.get_matching_blocks()])
        if partial is False:
            return lcs_len / min(len(alignment_a), len(alignment_b))
        else:
            return lcs_len / len(alignment_a)
    return 0.0
Example 2
def merge_seq_alignment(groups, merged_bags, merged_bags_inv):
  """Compute an approximate common alignment over all groups' merge sequences.

  Each group's merge_seq is mapped to a sequence of integer symbols (the
  position of its merged bag), and a longest common subsequence is folded in
  incrementally across groups. For efficiency, alignment is created
  incrementally; this does not guarantee an optimal alignment.

  Args:
      groups: iterable of groups exposing ``merge_seq``.
      merged_bags: dict whose keys are the merged bags (defines symbol order).
      merged_bags_inv: mapping from alert to its merged bag.

  Returns:
      List of merged bags forming the incremental common subsequence.
  """
  merge_list = list(merged_bags.keys())
  # Precompute each bag's position once; the original called
  # merge_list.index() per alert, which is O(n) per lookup.
  position = {bag: idx for idx, bag in enumerate(merge_list)}
  lcs = []
  first_alignment = True
  for group in groups:
    alignment = [position[merged_bags_inv[alert]] for alert in group.merge_seq]
    if first_alignment:
      lcs = alignment
      first_alignment = False
    else:
      # During testing, autojunk=True sometimes incorrectly returned empty lists.
      sm = CSequenceMatcher(None, lcs, alignment, autojunk=False)
      lcs = [item
             for block in sm.get_matching_blocks()
             for item in lcs[block.a:(block.a + block.size)]]
  return [merge_list[alert_index] for alert_index in lcs]
Example 3
def main():
    """CLI entry point: rank files by similarity to a target file.

    Reads the target and each candidate file as bytes (optionally truncated
    by --maxbytes), optionally pre-filters candidates with quick_ratio
    (--num), then prints "<metric>\t<filename>" lines sorted ascending by
    metric. Progress is written to stderr.
    """
    parser = argparse.ArgumentParser(
        description='Finds the most similar files to a given file.')
    parser.add_argument('target', help='file for which to find matches')
    parser.add_argument('other', nargs='+', help='other file(s) to compare')
    parser.add_argument(
        '-n',
        '--num',
        metavar='N',
        type=int,
        default=0,
        help='use quick_ratio and keep only the N best guesses '
        'before calculating the true similarity ratios')
    parser.add_argument('-l',
                        '--longest',
                        action='store_true',
                        help='use longest match instead of ratio')
    parser.add_argument('-s',
                        '--scaled',
                        action='store_true',
                        help='scale ratios relative to file sizes (including '
                        'initial filtering by rough ratio)')
    parser.add_argument(
        '-m',
        '--maxbytes',
        metavar='N',
        type=int,
        default=-1,
        help='limit comparisons to the first N bytes from each file '
        '(default: entire file)')
    args = parser.parse_args()

    with open(args.target, 'rb') as fp:
        seq1 = fp.read(args.maxbytes)

    # SequenceMatcher caches per-element statistics for seq2, so the fixed
    # target goes there and each candidate is swapped in as seq1.
    matcher = SequenceMatcher()
    matcher.set_seq2(list(seq1))

    if args.num > 0:
        estimates = []
        for fname in args.other:
            if fname == args.target:
                continue
            with open(fname, 'rb') as fp:
                seq2 = fp.read(args.maxbytes)
            matcher.set_seq1(list(seq2))
            ratio = matcher.quick_ratio()
            estimates.append((fname, ratio))
        estimates.sort(key=lambda x: x[1])
        estimates = estimates[-args.num:]
        nbest = [x[0] for x in estimates]
    else:
        nbest = args.other

    actuals = []
    for idx, fname in enumerate(nbest):
        # Skip the target itself; previously only the --num pre-filter branch
        # excluded it, so without --num the target matched itself at 1.0.
        if fname == args.target:
            continue
        print('{0}/{1}'.format(idx, len(nbest)), file=sys.stderr)
        with open(fname, 'rb') as fp:
            seq2 = fp.read(args.maxbytes)
        matcher.set_seq1(list(seq2))
        # Note: the original computed matcher.ratio() here unconditionally and
        # then overwrote it in both branches below — that dead (and expensive)
        # call has been removed.
        if args.longest:
            metric = max(x.size for x in matcher.get_matching_blocks())
        else:
            metric = matcher.ratio()
            if args.scaled:
                metric *= (len(seq1) + len(seq2)) / 2
        actuals.append((fname, metric))
    actuals.sort(key=lambda x: x[1])
    for stat in actuals:
        print('{0}\t{1}'.format(stat[1], stat[0]))