def load_fuzzy_groups_xml(logger):
    """Load fuzzy clone groups from an XML file and initialise ``clones``.

    Reads the module-level ``args`` namespace: ``args.source_xml`` is wrapped
    in a ``clones.InputFile`` and ``args.fuzzy_xml`` is parsed with lxml.
    Every ``/fuzzygroups/fuzzygroup`` element becomes one
    ``clones.FuzzyCloneGroup`` built from its ``fuzzyclone`` children
    (``offset``/``length`` attributes plus the text of the first
    ``sourcetext`` and ``sourcewords`` child), and the resulting list is
    passed to ``clones.initdata``.

    Side effects: sets ``clones.write_reformatted_sources``,
    ``clones.checkmarkup``, ``clones.only_generate_for_ui`` and the
    module-level ``only_generate_for_ui``.

    Args:
        logger: Accepted for interface compatibility; not used here.
    """
    from lxml import etree
    global args, only_generate_for_ui
    import clones

    # Settings required when working with fuzzy groups.
    clones.write_reformatted_sources = False
    clones.checkmarkup = False
    ui_only = args.only_ui == "yes"
    only_generate_for_ui = ui_only
    clones.only_generate_for_ui = ui_only

    source = clones.InputFile(args.source_xml)
    tree = etree.parse(args.fuzzy_xml)

    groups = []
    for group_el in tree.xpath('/fuzzygroups/fuzzygroup'):
        spans, texts, word_lists = [], [], []
        for clone_el in group_el.xpath('./fuzzyclone'):
            start = int(clone_el.attrib['offset'])
            stop = int(clone_el.attrib['length']) + start
            # Leading 0 is presumably the index of the input file -- confirm.
            spans.append((0, start, stop))
            texts.append(clone_el.xpath('./sourcetext')[0].text)
            word_lists.append(clone_el.xpath('./sourcewords')[0].text)

        group = clones.FuzzyCloneGroup(
            group_el.attrib['id'], spans, texts, word_lists)
        groups.append(group)

    clones.initdata([source], groups)
def extract_near_duplicates(src, logger):
    """Build fuzzy clone groups from the ACCEPT-marked intervals of ``src``.

    Every interval reported by ``sourcemarkers.find_marked_intervals(src)``
    whose type is ``'ACCEPT'`` is trimmed by ``sourcemarkers.markerlen``
    characters at both ends and filed under the id that
    ``sourcemarkers.open_marker_id`` returns for its opening marker text.
    Each distinct id then yields one ``clones.FuzzyCloneGroup``.

    Args:
        src: Source text containing the markers.
        logger: Accepted for interface compatibility; not used here.

    Returns:
        A list of ``clones.FuzzyCloneGroup`` objects whose ids are ``"1"``,
        ``"2"``, ... in order of first appearance of each marker id.
    """
    # marker id -> [(text_begin, text_end), ...] with the markers stripped.
    id2clones = defaultdict(list)

    for ob, ce, mt in sourcemarkers.find_marked_intervals(src):
        if mt != 'ACCEPT':
            continue
        oe = ob + sourcemarkers.markerlen
        cb = ce - sourcemarkers.markerlen
        mi = sourcemarkers.open_marker_id(src[ob:oe])
        id2clones[mi].append((oe, cb))

    fgrps = []
    # Group ids are sequence numbers, not the marker ids themselves.
    for cnt, spans in enumerate(id2clones.values(), start=1):
        # Leading 0 is presumably the index of the input file -- confirm.
        fclns = [(0, b, e) for b, e in spans]
        fclntexts = [src[b:e] for b, e in spans]
        fclnwords = [words(text) for text in fclntexts]
        fgrps.append(
            clones.FuzzyCloneGroup(str(cnt), fclns, fclntexts, fclnwords))

    return fgrps
# Example #3
# 0
def organize_search(logger, args):
    """Search the input document for a pattern and initialise ``clones``.

    Wraps ``args.input_document`` in a ``clones.InputFile``, runs
    ``find_like_pattern`` on it with ``args.pattern`` and
    ``args.minimal_similarity``, and turns every reported match into a
    single-clone ``clones.FuzzyCloneGroup`` numbered from ``"1"``. The groups
    are then passed to ``clones.initdata``.

    Side effects: sets ``clones.write_reformatted_sources``,
    ``clones.checkmarkup``, ``clones.only_generate_for_ui`` and
    ``clones.inputfiles``.

    Args:
        logger: Accepted for interface compatibility; not used here.
        args: Namespace providing ``only_ui``, ``input_document``,
            ``pattern`` and ``minimal_similarity``.
    """
    import clones

    # Settings required when working with fuzzy groups.
    clones.write_reformatted_sources = False
    clones.checkmarkup = False
    clones.only_generate_for_ui = args.only_ui == "yes"

    document = clones.InputFile(args.input_document)
    matches = find_like_pattern(
        document, args.pattern, args.minimal_similarity)

    # Set before the groups are built, to get texts and ratios properly.
    clones.inputfiles = [document]

    groups = []
    for number, match in enumerate(matches, start=1):
        # Each match is a 5-tuple; only the two bounds are used for now.
        begin, end, _clr, _ctxt, _cwrds = match
        # TODO: pass the matched text and words to FuzzyCloneGroup as well
        # instead of relying on clones.inputfiles ("Don't hack, implement!").
        groups.append(
            clones.FuzzyCloneGroup(str(number), [(0, begin, end)]))

    clones.initdata([document], groups)
def load_dups_benchmark_json(logger):
    """Load benchmark duplicate groups from JSON and initialise ``clones``.

    Reads the module-level ``args`` namespace: ``args.source_xml`` is wrapped
    in a ``clones.InputFile`` and ``args.neardup_json`` is parsed as UTF-8
    JSON. Every entry of the top-level ``"Benchmarks"`` list becomes one
    ``clones.FuzzyCloneGroup`` named after the entry's ``"name"``; each item
    of its ``"group_ids"`` list contributes one clone whose
    ``"position"`` is a ``[start, end]`` pair. Clone text is sliced from
    ``inputfile.text`` and tokenised with ``util.ctokens``.

    Side effects: sets ``clones.write_reformatted_sources``,
    ``clones.checkmarkup`` and ``clones.only_generate_for_ui``, then calls
    ``clones.initdata``.

    Args:
        logger: Accepted for interface compatibility; not used here.
    """
    global args
    import clones
    import util
    import json

    clones.write_reformatted_sources = False
    clones.checkmarkup = False
    clones.only_generate_for_ui = args.only_ui == "yes"

    inputfile = clones.InputFile(args.source_xml)

    with open(args.neardup_json, encoding='utf-8') as ndj:
        fuzzyclonedata = json.load(ndj)

    # NOTE(review): the original author flagged this mapping as unreliable
    # ("It is all wrong now...") -- confirm the benchmark schema before
    # depending on the result.
    fgrps = []
    for fgrp in fuzzyclonedata['Benchmarks']:
        group_id = fgrp['name']
        fclns = []
        fclntexts = []
        fclnwords = []
        for fcln in fgrp['group_ids']:
            raw_si, raw_ei = fcln['position']
            # Convert once, before slicing: JSON may deliver the bounds as
            # floats or strings, which are not valid slice indices.
            si, ei = int(raw_si), int(raw_ei)
            # fcln['name2'] is not filled everywhere, so take the text from
            # the document itself.
            tx = inputfile.text[si:ei]
            fclns.append((0, si, ei))
            fclntexts.append(tx)
            fclnwords.append(util.ctokens(tx))

        fgrps.append(
            clones.FuzzyCloneGroup(group_id, fclns, fclntexts, fclnwords))

    clones.initdata([inputfile], fgrps)
def load_near_duplicates_json(logger):
    """Load near-duplicate groups from JSON and initialise ``clones``.

    Reads the module-level ``args`` namespace: ``args.source_xml`` is wrapped
    in a ``clones.InputFile`` and ``args.neardup_json`` is parsed as UTF-8
    JSON. Every entry of the top-level ``"groups"`` list becomes one
    ``clones.FuzzyCloneGroup`` identified by the entry's ``"group_id"``, with
    one clone per item of ``"duplicates"`` spanning ``start_index`` to
    ``end_index``. The ``"text"`` field of a duplicate is not read.

    Side effects: sets ``clones.write_reformatted_sources``,
    ``clones.checkmarkup`` and ``clones.only_generate_for_ui``, and calls
    ``clones.initdata`` twice (first with no groups, then with the loaded
    ones).

    Args:
        logger: Accepted for interface compatibility; not used here.

    JSON example:
    {
        "groups": [
        {
          "group_id": 1,
          "duplicates": [
            {
              "start_index": 404,
              "end_index": 604,
              "text": "ZEXTERN int ZEXPORT deflateInit OF((z_streamp strm, int level)); Initializes the internal stream state for compression. The fields zalloc, zfree and opaque must be initialized before by the caller."
            },
            {
              "start_index": 8148,
              "end_index": 8358,
              "text": "ZEXTERN int ZEXPORT inflateInit OF((z_streamp strm)); Initializes the internal stream state for decompression. The fields next_in, avail_in, zalloc, zfree and opaque must be initialized before by the caller."
            }
          ]
        },
        {
          "group_id": 2,
          "duplicates": [
            {
              "start_index": 605,
              "end_index": 705,
              "text": "If zalloc and zfree are set to Z_NULL, deflateInit updates them to use default allocation functions."
            },
            {
              "start_index": 8579,
              "end_index": 8679,
              "text": "If zalloc and zfree are set to Z_NULL, inflateInit updates them to use default allocation functions."
            }
          ]
        },
        ...
      ]
    }
    """
    global args
    import clones
    import json

    # Settings required when working with fuzzy groups.
    clones.write_reformatted_sources = False
    clones.checkmarkup = False
    clones.only_generate_for_ui = args.only_ui == "yes"

    inputfile = clones.InputFile(args.source_xml)
    # Initialise with no groups before any FuzzyCloneGroup is constructed;
    # presumably the groups look up their text in the registered input
    # files because no texts are passed below -- confirm.
    clones.initdata([inputfile], [])

    with open(args.neardup_json, encoding='utf-8') as ndj:
        fuzzyclonedata = json.load(ndj)

    fgrps = []
    for fgrp in fuzzyclonedata['groups']:
        # NOTE(review): group_id is passed as found in the JSON (an integer
        # in the example above), while the other loaders in this file pass
        # strings -- confirm FuzzyCloneGroup accepts both.
        group_id = fgrp['group_id']
        fclns = [
            (0, int(fcln['start_index']), int(fcln['end_index']))
            for fcln in fgrp['duplicates']
        ]
        fgrps.append(clones.FuzzyCloneGroup(group_id, fclns))

    clones.initdata([inputfile], fgrps)