def load_fuzzy_groups_xml(logger):
    """Read fuzzy clone groups from the XML file named by ``args.fuzzy_xml``
    and hand them, together with the source file, to ``clones.initdata``.

    Also forces the module settings that fuzzy groups require (no reformatted
    sources, no markup checking) and propagates the ``--only-ui`` flag.
    """
    from lxml import etree
    global args, only_generate_for_ui
    import clones

    # default required settings for fuzzy groups
    clones.write_reformatted_sources = False
    clones.checkmarkup = False
    only_generate_for_ui = clones.only_generate_for_ui = args.only_ui == "yes"

    inputfile = clones.InputFile(args.source_xml)
    tree = etree.parse(args.fuzzy_xml)  # type: ElementTree

    groups = []
    for group_node in tree.xpath('/fuzzygroups/fuzzygroup'):
        # One <fuzzygroup> element per group of near-duplicate fragments.
        intervals = []
        texts = []
        words = []
        for clone_node in group_node.xpath('./fuzzyclone'):
            start = int(clone_node.attrib['offset'])
            end = start + int(clone_node.attrib['length'])
            # Interval triple: (file index, start offset, end offset).
            intervals.append((0, start, end))
            texts.append(clone_node.xpath('./sourcetext')[0].text)
            words.append(clone_node.xpath('./sourcewords')[0].text)
        groups.append(
            clones.FuzzyCloneGroup(group_node.attrib['id'], intervals, texts, words))

    clones.initdata([inputfile], groups)
def loadfuzzyinputs(logger):
    """Detect near-duplicate groups directly from the source text and feed
    them to ``clones.initdata``.

    Side effect: writes the (reformatted) source text next to the input as
    ``<source>.reformatted``.
    """
    global args

    # default required settings for fuzzy groups
    clones.write_reformatted_sources = False
    clones.checkmarkup = False
    clones.only_generate_for_ui = args.only_ui == "yes"

    inputfile = clones.InputFile(args.source_xml)

    # Persist the text we actually analysed so results can be mapped back.
    with open(args.source_xml + ".reformatted", "w", encoding='utf-8') as out:
        out.write(inputfile.text)

    groups = extract_near_duplicates(inputfile.text, logger)
    clones.initdata([inputfile], groups)
def organize_search(logger, args):
    """Search the input document for fragments similar to ``args.pattern``
    and register each hit as a single-clone fuzzy group.
    """
    import clones
    import itertools

    # default required settings for fuzzy groups
    clones.write_reformatted_sources = False
    clones.checkmarkup = False
    clones.only_generate_for_ui = args.only_ui == "yes"

    inputfile = clones.InputFile(args.input_document)
    matches = find_like_pattern(inputfile, args.pattern, args.minimal_similarity)

    clones.inputfiles = [inputfile]  # to get texts and ratios properly
    groups = []
    for (beg, end, ratio, text, words), number in zip(matches, itertools.count(1)):
        # Texts/words are deliberately NOT passed here.
        # !! TODO: Don't hack, implement!
        groups.append(clones.FuzzyCloneGroup(str(number), [(0, beg, end)]))
    clones.initdata([inputfile], groups)
def load_dups_benchmark_json(logger):
    """Load duplicate groups from a benchmark JSON file (``args.neardup_json``,
    top-level key ``"Benchmarks"``) and pass them to ``clones.initdata``.
    """
    global args
    import clones
    import util
    import json

    # default required settings for fuzzy groups
    clones.write_reformatted_sources = False
    clones.checkmarkup = False
    clones.only_generate_for_ui = args.only_ui == "yes"

    inputfile = clones.InputFile(args.source_xml)
    with open(args.neardup_json, encoding='utf-8') as ndj:
        benchmark_data = json.load(ndj)

    groups = []
    for bench in benchmark_data['Benchmarks']:
        gid = bench['name']
        intervals = []
        texts = []
        words = []
        for entry in bench['group_ids']:
            start, end = entry['position']
            # entry['name2'] is not reliably filled, so slice the fragment
            # out of the source text instead.
            snippet = inputfile.text[start:end]
            intervals.append((0, int(start), int(end)))
            texts.append(snippet)
            words.append(util.ctokens(snippet))
        groups.append(
            clones.FuzzyCloneGroup(gid, intervals, texts, words))

    clones.initdata([inputfile], groups)
def organize_search(logger):
    """Search the input document for fragments similar to ``args.pattern``;
    each hit becomes a one-clone fuzzy group carrying its similarity ratio.
    """
    global args
    import clones
    import itertools

    # default required settings for fuzzy groups
    clones.write_reformatted_sources = False
    clones.checkmarkup = False
    clones.only_generate_for_ui = args.only_ui == "yes"

    inputfile = clones.InputFile(args.input_document)
    matches = find_like_pattern(inputfile, args.pattern, args.minimal_similarity)

    groups = []
    for number, (beg, end, ratio, text, words) in zip(itertools.count(1), matches):
        groups.append(clones.FuzzyCloneGroup(
            str(number),
            [(0, beg, end)],
            [' '.join(words)],  # [ctxt], !! TODO: Don't hack, implement!
            [words],
            ratio=ratio))
    clones.initdata([inputfile], groups)
def load_near_duplicates_json(logger):
    """Load near-duplicate groups from ``args.neardup_json`` (top-level key
    ``"groups"``) and hand them to ``clones.initdata``.

    JSON example:
    {
        "groups": [
            {
                "group_id": 1,
                "duplicates": [
                    {
                        "start_index": 404,
                        "end_index": 604,
                        "text": "ZEXTERN int ZEXPORT deflateInit OF((z_streamp strm, int level)); Initializes the internal stream state for compression. The fields zalloc, zfree and opaque must be initialized before by the caller."
                    },
                    {
                        "start_index": 8148,
                        "end_index": 8358,
                        "text": "ZEXTERN int ZEXPORT inflateInit OF((z_streamp strm)); Initializes the internal stream state for decompression. The fields next_in, avail_in, zalloc, zfree and opaque must be initialized before by the caller."
                    }
                ]
            },
            {
                "group_id": 2,
                "duplicates": [
                    {
                        "start_index": 605,
                        "end_index": 705,
                        "text": "If zalloc and zfree are set to Z_NULL, deflateInit updates them to use default allocation functions."
                    },
                    {
                        "start_index": 8579,
                        "end_index": 8679,
                        "text": "If zalloc and zfree are set to Z_NULL, inflateInit updates them to use default allocation functions."
                    }
                ]
            },
            ...
        ]
    }
    """
    global args
    import clones
    import util
    import json

    # default required settings for fuzzy groups
    clones.write_reformatted_sources = False
    clones.checkmarkup = False
    clones.only_generate_for_ui = args.only_ui == "yes"

    inputfile = clones.InputFile(args.source_xml)
    clones.initdata([inputfile], [])

    with open(args.neardup_json, encoding='utf-8') as ndj:
        neardup_data = json.load(ndj)

    groups = []
    for grp in neardup_data['groups']:
        gid = grp['group_id']
        intervals = []
        texts = []
        words = []
        for dup in grp['duplicates']:
            start = dup['start_index']
            end = dup['end_index']
            body = dup['text']
            intervals.append((0, int(start), int(end)))
            texts.append(body)
            words.append(util.ctokens(body))
        # Texts/words are collected but deliberately NOT passed to the
        # group constructor (matches the original's commented-out arguments).
        groups.append(clones.FuzzyCloneGroup(gid, intervals))

    clones.initdata([inputfile], groups)
import textwrap import os import sys import cgi import string import collections import argparse import shutil import time import errno import csv import intertree import clones clones.initdata() def initargs(): argpar = argparse.ArgumentParser() argpar.add_argument("-nb", "--findnearby", help="Find clones nearby each other, specify maximal distance (if clones theirselves are shorter)") argpar.add_argument("-wv", "--writevariations", help="Detect and write clone variations") argpar.add_argument("-sd", "--subdir", help="Subdir for output") argpar.add_argument("-bl", "--blacklist", help="Group ID's (as Clone Miner prints) to throw away") argpar.add_argument("-wl", "--whitelist", help="Group ID's (as Clone Miner prints) to keep (consider others blacklisted)") argpar.add_argument("-minl", "--minimalclonelength", help="Minimal clone length in symbols. Default = 0") argpar.add_argument("-mino", "--minimalgrouppower", help="Minimal count of clones in group. Default = 2") argpar.add_argument("-cmup", "--checkmarkup", help="Allow (no) Filter (yes) and Shrink-fix (shrink) groups, containing broken markup",
import sys import html import string import collections import argparse import shutil import time import errno import csv import util import intertree import clones import extra_report clones.initdata() def initargs(): def str2bool(v): if v.lower() in ('yes', 'true', 't', 'y', '1', '2'): return True elif v.lower() in ('no', 'false', 'f', 'n', '0'): return False else: raise argparse.ArgumentTypeError('Boolean value expected.') argpar = argparse.ArgumentParser() argpar.add_argument( "-nb", "--findnearby",