def load_fuzzy_groups_xml(logger):
    """Load fuzzy clone groups from an XML description into ``clones``.

    The source document path (``args.source_xml``) and the group description
    path (``args.fuzzy_xml``) are read from the module-level ``args``. One
    ``clones.FuzzyCloneGroup`` is built per ``/fuzzygroups/fuzzygroup``
    element, and the result is handed to ``clones.initdata`` together with
    the input file.

    Side effects: overwrites ``clones.write_reformatted_sources``,
    ``clones.checkmarkup`` and ``clones.only_generate_for_ui`` as well as the
    module-level ``only_generate_for_ui``.

    Args:
        logger: Accepted for interface compatibility; not used in this body.
    """
    from lxml import etree
    global args, only_generate_for_ui
    import clones

    # Settings that fuzzy-group processing requires.
    clones.write_reformatted_sources = False
    clones.checkmarkup = False
    only_generate_for_ui = clones.only_generate_for_ui = args.only_ui == "yes"

    document = clones.InputFile(args.source_xml)
    tree = etree.parse(args.fuzzy_xml)

    groups = []
    for group_node in tree.xpath('/fuzzygroups/fuzzygroup'):
        intervals, texts, words = [], [], []
        for clone_node in group_node.xpath('./fuzzyclone'):
            attributes = clone_node.attrib
            start = int(attributes['offset'])
            # The XML stores a length; the group wants an end offset.
            end = int(attributes['length']) + start
            # Leading 0 is the file slot; only one input file is registered.
            intervals.append((0, start, end))
            texts.append(clone_node.xpath('./sourcetext')[0].text)
            words.append(clone_node.xpath('./sourcewords')[0].text)

        group = clones.FuzzyCloneGroup(
            group_node.attrib['id'], intervals, texts, words)
        groups.append(group)

    clones.initdata([document], groups)
def loadfuzzyinputs(logger):
    """Extract near-duplicate groups from the source document and register them.

    Reads ``args.source_xml`` (module-level ``args``) through
    ``clones.InputFile``, writes the file's ``text`` next to the source as
    ``<source_xml>.reformatted`` (UTF-8), runs ``extract_near_duplicates`` on
    that text and passes the resulting groups to ``clones.initdata``.

    Args:
        logger: Forwarded to ``extract_near_duplicates``.
    """
    global args

    # Settings that fuzzy-group processing requires.
    clones.write_reformatted_sources = False
    clones.checkmarkup = False
    clones.only_generate_for_ui = args.only_ui == "yes"

    document = clones.InputFile(args.source_xml)

    # Dump the text as loaded, alongside the original file.
    reformatted_path = args.source_xml + ".reformatted"
    with open(reformatted_path, "w", encoding='utf-8') as out:
        out.write(document.text)

    groups = extract_near_duplicates(document.text, logger)
    clones.initdata([document], groups)
Example #3
0
def organize_search(logger, args):
    """Search the input document for fragments similar to a pattern.

    Runs ``find_like_pattern`` over ``args.input_document`` with
    ``args.pattern`` and ``args.minimal_similarity``, wraps every match in its
    own single-clone ``clones.FuzzyCloneGroup`` (IDs are "1", "2", ...) and
    passes the groups to ``clones.initdata``.

    Args:
        logger: Accepted for interface compatibility; not used in this body.
        args: Parsed options providing ``only_ui``, ``input_document``,
            ``pattern`` and ``minimal_similarity``.
    """
    import clones

    # Settings that fuzzy-group processing requires.
    clones.write_reformatted_sources = False
    clones.checkmarkup = False
    clones.only_generate_for_ui = args.only_ui == "yes"

    document = clones.InputFile(args.input_document)
    matches = find_like_pattern(
        document, args.pattern, args.minimal_similarity)

    # Registered before the groups are built, to get texts and ratios properly.
    clones.inputfiles = [document]

    # Each match is a 5-tuple; only the interval is passed to the group here.
    # Supplying the match text and words explicitly is still a TODO.
    groups = [
        clones.FuzzyCloneGroup(str(number), [(0, begin, end)])
        for number, (begin, end, _ratio, _text, _words)
        in enumerate(matches, 1)
    ]

    clones.initdata([document], groups)
def load_dups_benchmark_json(logger):
    """Load benchmark duplicate groups from a JSON file into ``clones``.

    The JSON file named by ``args.neardup_json`` (module-level ``args``) must
    carry a top-level ``"Benchmarks"`` list. Each entry supplies a ``"name"``
    (used as the group ID) and a ``"group_ids"`` list whose items hold a
    two-element ``"position"`` (start, end). Clone text is sliced out of the
    input document rather than taken from the JSON, and tokenised with
    ``util.ctokens``.

    Args:
        logger: Accepted for interface compatibility; not used in this body.
    """
    global args
    import clones
    import util
    import json

    clones.write_reformatted_sources = False
    clones.checkmarkup = False
    clones.only_generate_for_ui = args.only_ui == "yes"

    document = clones.InputFile(args.source_xml)

    with open(args.neardup_json, encoding='utf-8') as handle:
        benchmark = json.load(handle)

    groups = []
    for entry in benchmark['Benchmarks']:
        name = entry['name']
        intervals, texts, words = [], [], []
        for member in entry['group_ids']:
            start, end = member['position']
            # The JSON text field is not filled everywhere, so the text is
            # taken from the document itself.
            snippet = document.text[start:end]
            intervals.append((0, int(start), int(end)))
            texts.append(snippet)
            words.append(util.ctokens(snippet))

        group = clones.FuzzyCloneGroup(name, intervals, texts, words)
        groups.append(group)

    clones.initdata([document], groups)
def organize_search(logger):
    """Search the input document for fragments similar to a pattern.

    Uses the module-level ``args`` (``input_document``, ``pattern``,
    ``minimal_similarity``, ``only_ui``). Every match returned by
    ``find_like_pattern`` becomes its own single-clone
    ``clones.FuzzyCloneGroup`` with IDs "1", "2", ...; the match ratio is
    passed through as ``ratio``. The groups are then handed to
    ``clones.initdata``.

    Args:
        logger: Accepted for interface compatibility; not used in this body.
    """
    global args
    import clones

    # Settings that fuzzy-group processing requires.
    clones.write_reformatted_sources = False
    clones.checkmarkup = False
    clones.only_generate_for_ui = args.only_ui == "yes"

    document = clones.InputFile(args.input_document)
    matches = find_like_pattern(
        document, args.pattern, args.minimal_similarity)

    # Each match is (begin, end, ratio, text, words). The clone text passed
    # on is the words re-joined with spaces rather than the raw match text;
    # the original marks this as a hack still to be implemented properly.
    groups = [
        clones.FuzzyCloneGroup(
            str(number),
            [(0, begin, end)],
            [' '.join(words)],
            [words],
            ratio=ratio,
        )
        for number, (begin, end, ratio, _text, words)
        in enumerate(matches, 1)
    ]

    clones.initdata([document], groups)
def load_near_duplicates_json(logger):
    """Load near-duplicate groups from a JSON file into ``clones``.

    The file named by ``args.neardup_json`` (module-level ``args``) is read as
    UTF-8 JSON. Only ``group_id``, ``start_index`` and ``end_index`` are used;
    the ``text`` field of each duplicate is informational and is not read.

    JSON example:
    {
        "groups": [
        {
          "group_id": 1,
          "duplicates": [
            {
              "start_index": 404,
              "end_index": 604,
              "text": "ZEXTERN int ZEXPORT deflateInit OF((z_streamp strm, int level)); Initializes the internal stream state for compression. The fields zalloc, zfree and opaque must be initialized before by the caller."
            },
            {
              "start_index": 8148,
              "end_index": 8358,
              "text": "ZEXTERN int ZEXPORT inflateInit OF((z_streamp strm)); Initializes the internal stream state for decompression. The fields next_in, avail_in, zalloc, zfree and opaque must be initialized before by the caller."
            }
          ]
        },
        {
          "group_id": 2,
          "duplicates": [
            {
              "start_index": 605,
              "end_index": 705,
              "text": "If zalloc and zfree are set to Z_NULL, deflateInit updates them to use default allocation functions."
            },
            {
              "start_index": 8579,
              "end_index": 8679,
              "text": "If zalloc and zfree are set to Z_NULL, inflateInit updates them to use default allocation functions."
            }
          ]
        },
        ...
      ]
    }

    Args:
        logger: Accepted for interface compatibility; not used in this body.

    Raises:
        KeyError: if ``groups``, ``group_id``, ``duplicates``, ``start_index``
            or ``end_index`` is missing from the JSON data.
    """
    global args
    import clones
    import json

    # default required settings for fuzzy groups
    clones.write_reformatted_sources = False
    clones.checkmarkup = False
    clones.only_generate_for_ui = args.only_ui == "yes"

    inputfile = clones.InputFile(args.source_xml)
    # Register the input file before any group is constructed. Groups are
    # built from intervals only, so presumably FuzzyCloneGroup looks the clone
    # texts up in the registered file -- TODO confirm against clones.
    clones.initdata([inputfile], [])

    with open(args.neardup_json, encoding='utf-8') as ndj:
        fuzzyclonedata = json.load(ndj)

    fgrps = [
        clones.FuzzyCloneGroup(
            fgrp['group_id'],
            # Leading 0 is the file slot; only one input file is registered.
            [(0, int(fcln['start_index']), int(fcln['end_index']))
             for fcln in fgrp['duplicates']],
        )
        for fgrp in fuzzyclonedata['groups']
    ]

    clones.initdata([inputfile], fgrps)
Example #7
0
import textwrap
import os
import sys
import cgi
import string
import collections
import argparse
import shutil
import time
import errno
import csv

import intertree
import clones

# Runs at import time: initdata() is called with no arguments, before any
# command-line options have been parsed.
clones.initdata()


def initargs():
    argpar = argparse.ArgumentParser()
    argpar.add_argument("-nb", "--findnearby",
                        help="Find clones nearby each other, specify maximal distance (if clones theirselves are shorter)")
    argpar.add_argument("-wv", "--writevariations", help="Detect and write clone variations")
    argpar.add_argument("-sd", "--subdir", help="Subdir for output")
    argpar.add_argument("-bl", "--blacklist", help="Group ID's (as Clone Miner prints) to throw away")
    argpar.add_argument("-wl", "--whitelist",
                        help="Group ID's (as Clone Miner prints) to keep (consider others blacklisted)")
    argpar.add_argument("-minl", "--minimalclonelength", help="Minimal clone length in symbols. Default = 0")
    argpar.add_argument("-mino", "--minimalgrouppower", help="Minimal count of clones in group. Default = 2")
    argpar.add_argument("-cmup", "--checkmarkup",
                        help="Allow (no) Filter (yes) and Shrink-fix (shrink) groups, containing broken markup",
Example #8
0
import sys
import html
import string
import collections
import argparse
import shutil
import time
import errno
import csv
import util

import intertree
import clones
import extra_report

# Runs at import time: initdata() is called with no arguments, before any
# command-line options have been parsed.
clones.initdata()


def initargs():
    def str2bool(v):
        """Convert a yes/no style command-line string to a bool.

        Matching is case-insensitive. Note that '2' is accepted as a true
        value alongside '1'.

        Raises:
            argparse.ArgumentTypeError: if the value is not recognised.
        """
        truthy = ('yes', 'true', 't', 'y', '1', '2')
        falsy = ('no', 'false', 'f', 'n', '0')

        lowered = v.lower()
        if lowered in truthy:
            return True
        if lowered in falsy:
            return False
        raise argparse.ArgumentTypeError('Boolean value expected.')

    argpar = argparse.ArgumentParser()
    argpar.add_argument(
        "-nb",
        "--findnearby",