def load_fuzzy_groups_xml(logger):
    """Populate the global clone data from an XML fuzzy-group file.

    Reads the source document named by ``args.source_xml`` and the group
    descriptions from ``args.fuzzy_xml``, builds one
    ``clones.FuzzyCloneGroup`` per ``<fuzzygroup>`` element, and hands
    everything to ``clones.initdata``.  Also forces the module-level
    settings that fuzzy groups require and mirrors ``args.only_ui`` into
    both this module's and ``clones``' ``only_generate_for_ui`` flags.
    """
    from lxml import etree
    global args, only_generate_for_ui
    import clones

    # default required settings for fuzzy groups
    clones.write_reformatted_sources = False
    clones.checkmarkup = False
    only_generate_for_ui = clones.only_generate_for_ui = args.only_ui == "yes"

    source_file = clones.InputFile(args.source_xml)
    group_tree = etree.parse(args.fuzzy_xml)  # type: ElementTree

    groups = []
    for group_node in group_tree.xpath('/fuzzygroups/fuzzygroup'):
        spans = []
        texts = []
        word_lists = []
        for clone_node in group_node.xpath('./fuzzyclone'):
            # each clone is described by its character offset and length;
            # spans are (file_index, begin, end) triples with file_index 0
            begin = int(clone_node.attrib['offset'])
            end = begin + int(clone_node.attrib['length'])
            spans.append((0, begin, end))
            texts.append(clone_node.xpath('./sourcetext')[0].text)
            word_lists.append(clone_node.xpath('./sourcewords')[0].text)
        groups.append(
            clones.FuzzyCloneGroup(group_node.attrib['id'], spans, texts, word_lists))

    clones.initdata([source_file], groups)
def extract_near_duplicates(src, logger):
    """Collect previously marked (ACCEPTed) near-duplicate groups from *src*.

    Scans the text for marker intervals via ``sourcemarkers``, keeps only
    those of type ``'ACCEPT'``, strips the marker characters from both ends,
    and buckets the remaining payload spans by the id embedded in the
    opening marker.  Returns a list of ``clones.FuzzyCloneGroup`` objects,
    numbered "1", "2", ... in first-seen order of marker id.
    """
    marker_len = sourcemarkers.markerlen
    clones_by_id = defaultdict(list)
    for open_begin, close_end, marker_type in sourcemarkers.find_marked_intervals(src):
        if marker_type != 'ACCEPT':
            continue
        # the id lives inside the opening marker itself
        marker_id = sourcemarkers.open_marker_id(src[open_begin:open_begin + marker_len])
        # payload runs from just after the opening marker to just before the closing one
        clones_by_id[marker_id].append((open_begin + marker_len, close_end - marker_len))

    groups = []
    for group_no, spans in enumerate(clones_by_id.values(), start=1):
        texts = [src[b:e] for b, e in spans]
        groups.append(clones.FuzzyCloneGroup(
            str(group_no),
            [(0, b, e) for b, e in spans],
            texts,
            [words(t) for t in texts]))
    return groups
def organize_search(logger, args):
    """Search ``args.input_document`` for fragments similar to ``args.pattern``
    (at similarity >= ``args.minimal_similarity``) and register each hit as a
    single-clone FuzzyCloneGroup via ``clones.initdata``.

    NOTE(review): relies on ``find_like_pattern`` defined elsewhere in this
    module; returned tuples are assumed to be
    (begin, end, ratio, text, words) — confirm against its definition.
    """
    import clones
    import itertools
    # default required settings for fuzzy groups
    clones.write_reformatted_sources = False
    clones.checkmarkup = False
    clones.only_generate_for_ui = args.only_ui == "yes"
    inputfile = clones.InputFile(args.input_document)
    fuzzyclonedata = find_like_pattern(inputfile, args.pattern, args.minimal_similarity)
    fgrps = []
    clones.inputfiles = [inputfile]  # to get texts and ratios properly
    # number groups 1, 2, ... alongside the found matches
    for (cbeg, cend, clr, ctxt, cwrds), ctr in zip(fuzzyclonedata, itertools.count(1)):
        # groups are built from the span only; texts/words are left for
        # FuzzyCloneGroup to derive (see TODO below)
        fgrps.append(clones.FuzzyCloneGroup(
            str(ctr), [(0, cbeg, cend)]
            #,
            #[' '.join(cwrds)],
            # [ctxt], !! TODO: Don't hack, implement!
            #[cwrds]
        ))
    clones.initdata([inputfile], fgrps)
def load_dups_benchmark_json(logger):
    """Load near-duplicate groups from a benchmark-format JSON file.

    Reads the source document named by ``args.source_xml`` and the group
    data from ``args.neardup_json`` (top-level key ``'Benchmarks'``, each
    entry carrying a ``'name'`` and a list of ``'group_ids'`` records with
    a two-element ``'position'``).  Clone texts are sliced from the source
    document rather than taken from the JSON, then tokenized with
    ``util.ctokens``.  Finally registers everything via ``clones.initdata``.
    """
    global args
    import clones
    import util
    import json

    # default required settings for fuzzy groups
    clones.write_reformatted_sources = False
    clones.checkmarkup = False
    clones.only_generate_for_ui = args.only_ui == "yes"

    source_file = clones.InputFile(args.source_xml)
    with open(args.neardup_json, encoding='utf-8') as handle:
        benchmark_data = json.load(handle)

    # Then the data... OMG...
    # It is all wrong now...
    groups = []
    for bench in benchmark_data['Benchmarks']:
        spans = []
        texts = []
        token_lists = []
        for record in bench['group_ids']:
            start, end = record['position']
            # record['name2'] also holds the text, but it is not filled
            # everywhere, so slice it out of the source document instead
            text = source_file.text[start:end]
            spans.append((0, int(start), int(end)))
            texts.append(text)
            token_lists.append(util.ctokens(text))
        groups.append(
            clones.FuzzyCloneGroup(bench['name'], spans, texts, token_lists))

    clones.initdata([source_file], groups)
def load_near_duplicates_json(logger):
    """Load near-duplicate groups from the JSON file named by
    ``args.neardup_json`` (see the format example below) and register them
    with ``clones.initdata``.  The source document comes from
    ``args.source_xml``.
    """
    global args
    import clones
    import util
    import json
    # NOTE(review): the literal below is a no-op expression kept as inline
    # format documentation, not the function docstring.
    """ JSON example:
    {
      "groups": [
        {
          "group_id": 1,
          "duplicates": [
            {
              "start_index": 404,
              "end_index": 604,
              "text": "ZEXTERN int ZEXPORT deflateInit OF((z_streamp strm, int level)); Initializes the internal stream state for compression. The fields zalloc, zfree and opaque must be initialized before by the caller."
            },
            {
              "start_index": 8148,
              "end_index": 8358,
              "text": "ZEXTERN int ZEXPORT inflateInit OF((z_streamp strm)); Initializes the internal stream state for decompression. The fields next_in, avail_in, zalloc, zfree and opaque must be initialized before by the caller."
            }
          ]
        },
        {
          "group_id": 2,
          "duplicates": [
            {
              "start_index": 605,
              "end_index": 705,
              "text": "If zalloc and zfree are set to Z_NULL, deflateInit updates them to use default allocation functions."
            },
            {
              "start_index": 8579,
              "end_index": 8679,
              "text": "If zalloc and zfree are set to Z_NULL, inflateInit updates them to use default allocation functions."
            }
          ]
        },
        ...
      ]
    }
    """
    # default required settings for fuzzy groups
    clones.write_reformatted_sources = False
    clones.checkmarkup = False
    clones.only_generate_for_ui = args.only_ui == "yes"
    inputfile = clones.InputFile(args.source_xml)
    # NOTE(review): initdata is called twice — first with no groups, then
    # again below with the parsed groups; presumably the first call sets up
    # file-level state needed while parsing — confirm against initdata.
    clones.initdata([inputfile], [])
    with open(args.neardup_json, encoding='utf-8') as ndj:
        fuzzyclonedata = json.load(ndj)
    fgrps = []
    for fgrp in fuzzyclonedata['groups']:
        # here is group
        group_id = fgrp['group_id']
        fclns = []
        fclntexts = []
        fclnwords = []
        for fcln in fgrp['duplicates']:
            si = fcln['start_index']
            ei = fcln['end_index']
            tx = fcln['text']
            # spans are (file_index, begin, end) triples with file_index 0
            fclns.append((0, int(si), int(ei)))
            fclntexts.append(tx)
            fclnwords.append(util.ctokens(tx))
        fgrps.append(
            clones.FuzzyCloneGroup(
                group_id, fclns
                #, fclntexts, fclnwords
            ))
    clones.initdata([inputfile], fgrps)