def report(logger):
    global args
    import clones

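    # When args.unfuzzy is enabled, exact clone groups are reduced to their
    # variative elements (archetypes); otherwise every group is reported as a
    # fuzzy clone as-is.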
    if args.unfuzzy in ["yes", "True", "1"]:
        import archetype_extraction
        clones.cm_inclusiveend = True
        ves = [
            archetype_extraction.get_variative_element(clones, g)
            for g in clones.clonegroups
        ]
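        # drop groups for which no variative element could be extracted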
        ves = list(filter(None, ves))
        cohtml = clones.VariativeElement.summaryhtml(
            ves, clones.ReportMode.variative)
    else:
        fuzzygroups = [
            clones.VariativeElement([cg]) for cg in clones.clonegroups
        ]
        cohtml = clones.VariativeElement.summaryhtml(
            fuzzygroups, clones.ReportMode.fuzzyclones)

    outdir = args.output_directory
    with open(os.path.join(outdir, "acceptedduplicates.html"),
              'w',
              encoding='utf-8') as htmlfile:
        htmlfile.write(cohtml)

    shutil.copyfile(
        os.path.join(os.path.dirname(os.path.abspath(__file__)), 'js',
                     'interactivity.js'),
        os.path.join(outdir, "interactivity.js"))
    shutil.copyfile(
        os.path.join(os.path.dirname(os.path.abspath(__file__)), 'js',
                     'jquery-2.0.3.min.js'),
        os.path.join(outdir, "jquery-2.0.3.min.js"))
Example No. 2
def report(logger):
    global args
    import webbrowser
    import clones
    import pathlib

    fuzzygroups = [clones.VariativeElement([cg]) for cg in clones.clonegroups]
    cohtml = clones.VariativeElement.summaryhtml(fuzzygroups,
                                                 clones.ReportMode.fuzzyclones)

    outdir = args.output_directory
    with open(os.path.join(outdir, "pyvarelements.html"),
              'w',
              encoding='utf-8') as htmlfile:
        htmlfile.write(cohtml)

    shutil.copyfile(
        os.path.join(os.path.dirname(os.path.abspath(__file__)), 'js',
                     'interactivity.js'),
        os.path.join(outdir, "interactivity.js"))
    shutil.copyfile(
        os.path.join(os.path.dirname(os.path.abspath(__file__)), 'js',
                     'jquery-2.0.3.min.js'),
        os.path.join(outdir, "jquery-2.0.3.min.js"))

    if args.open_browser:
        report_url = pathlib.Path(
            os.path.join(os.path.abspath(outdir),
                         "pyvarelements.html")).as_uri()
        webbrowser.open(report_url)
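
# Hypothetical driver sketch (module name and wiring are assumptions, not part
# of the snippets above): report() reads the module-level `args`, so a caller
# only needs to provide an object with the expected attributes before invoking it:
#
#     import logging, types
#     import report_module                # hypothetical module holding report()
#
#     report_module.args = types.SimpleNamespace(
#         output_directory="out",         # receives pyvarelements.html and the JS assets
#         open_browser=False)             # set True to open the report via webbrowser
#     report_module.report(logging.getLogger("cloneminer.report"))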
Example No. 3
def combine_gruops_par_20140819(available_groups):
    # import multiprocessing
    # pool = multiprocessing.Pool()
    import clones
    import sys
    import time

    # participated_groups = set() not used
    combinations = []

    print("Combining groups, %d total..." % len(available_groups))
    t1 = time.process_time()

    pcounter = 0

    available_groups = set(available_groups)
    ptotal = len(available_groups)
    current_available_groups = set(available_groups)

    ttyn = '\r' if sys.stdout.isatty() else '\n'

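    # "full-square" strategy (dispatched under that name in combine_groups below):
    # for every group, scan every other remaining group for the closest partner by
    # ExactCloneGroup.distance, i.e. O(n^2) distance computations in the worst case.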
    for g1 in available_groups:
        if g1 in current_available_groups:
            best_g2 = None
            best_dist = clones.infty
            for g2 in current_available_groups:
                if g2 != g1:
                    d = clones.ExactCloneGroup.distance(g1, g2)
                    if d < best_dist:
                        best_dist = d
                        best_g2 = g2
            if best_dist != clones.infty:
                combinations.append(clones.VariativeElement([g1, best_g2]))
                current_available_groups.discard(g1)
                current_available_groups.discard(best_g2)

        pcounter += 1
        print("~ %d / %d = %03.1f%%" %
              (pcounter, ptotal, 100.0 * pcounter / ptotal),
              end=ttyn,
              flush=True)

    t2 = time.process_time()

    # print stats
    def pstats():
        import logging
        log = logging.getLogger("cloneminer.combine.square")
        log.info("Source single groups: %d" % (len(available_groups),))
        log.info("Single groups: %d" % (len(current_available_groups),))
        log.info("Variative groups: %d" % (len(combinations),))
        log.info("Spent time: %f s." % (t2 - t1,))

    pstats()

    return combinations, current_available_groups
Example No. 4
def combine_groups():
    import combine_grp

    # sort by clone text length, longest first
    available_groups = sorted(clones.clonegroups,
                              key=lambda gr: len(gr.text()),
                              reverse=True)

    group_combinators = {
        "full-square": combine_grp.combine_gruops_par_20140819,
        "interval-n-ext":
        combine_grp.combine_groups_n_ext_with_int_tree,  # default
        # TODO: "interval-2-ext": findnearby201312 # first try, 2013 -- port or delete it
    }

    group_combinator = group_combinators[group_combining_algorithm_name]

    if group_combining_algorithm_name == "full-square":
        clones.VariativeElement.postfiltering = False
    elif group_combining_algorithm_name == "interval-n-ext":
        clones.VariativeElement.postfiltering = True
    else:
        raise Exception("WAT?..")

    if nearby:
        combinations, remaining_groups = group_combinator(available_groups)
    else:
        combinations, remaining_groups = [], available_groups

    # print("Offered clones -- total: %d, single: %d, variative: %d" % (len(remaining_groups) + len(combinations), len(remaining_groups), len(combinations)))

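    # wrap the remaining single groups as one-element VariativeElements so the
    # filtering and reporting below treat exact and near duplicates uniformly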
    combinations += [clones.VariativeElement([gr]) for gr in remaining_groups]

    combinations = list(filter(lambda ve: ve.passes_filter(), combinations))

    combinations.sort(key=lambda ve: ve.size, reverse=True)

    log = logging.getLogger("cloneminer.combine.summary")
    log.info("After final filtering:")
    log.info(
        "Exact dup groups: %d" %
        len(list(filter(lambda ve: len(ve.clone_groups) == 1, combinations))))
    log.info(
        "Near  dup groups: %d" %
        len(list(filter(lambda ve: len(ve.clone_groups) > 1, combinations))))

    return combinations
Example No. 5
def report(logger, args):
    import clones

    clones.FuzzyCloneGroup.reference_text = args.pattern
    fuzzygroups = [clones.VariativeElement([cg]) for cg in clones.clonegroups]
    cohtml = clones.VariativeElement.summaryhtml(fuzzygroups, clones.ReportMode.fuzzymatches)

    outdir = args.output_directory
    with open(os.path.join(outdir, "pyvarelements.html"), 'w', encoding='utf-8') as htmlfile:
        htmlfile.write(cohtml)

    shutil.copyfile(
        os.path.join(os.path.dirname(os.path.abspath(__file__)), 'js', 'interactivity.js'),
        os.path.join(outdir, "interactivity.js")
    )
    shutil.copyfile(
        os.path.join(os.path.dirname(os.path.abspath(__file__)), 'js', 'jquery-2.0.3.min.js'),
        os.path.join(outdir, "jquery-2.0.3.min.js")
    )

    return fuzzygroups
Example No. 6
def report(logger):
    global args
    import clones

    fuzzygroups = [clones.VariativeElement([cg]) for cg in clones.clonegroups]
    cohtml = clones.VariativeElement.summaryhtml(fuzzygroups,
                                                 clones.ReportMode.fuzzyclones)

    outdir = args.output_directory
    with open(os.path.join(outdir, "acceptedduplicates.html"),
              'w',
              encoding='utf-8') as htmlfile:
        htmlfile.write(cohtml)

    shutil.copyfile(
        os.path.join(os.path.dirname(os.path.abspath(__file__)), 'js',
                     'interactivity.js'),
        os.path.join(outdir, "interactivity.js"))
    shutil.copyfile(
        os.path.join(os.path.dirname(os.path.abspath(__file__)), 'js',
                     'jquery-2.0.3.min.js'),
        os.path.join(outdir, "jquery-2.0.3.min.js"))
Example No. 7
def combine_groups_n_ext_with_int_tree(available_groups: "list[clones.CloneGroup]"):
    """
    Algorithm:
    AG = available_groups
    AVG = available_variative_groups
    0. AVG += AG, AG := []; interval tree is built
    1. for each G1 in AVG find G2 in AVG which it best combines with (closest non-intersecting)
    1.1. during the 2nd and further iterations, only consider G1 consisting of 2 or more groups. Reason:
         all combinable single groups were already combined during the 1st iteration, so no new variative
         elements can appear, only existing ones can be extended
    1.2. a new VG is only considered successful when G1 and G2 do not intersect themselves,
         but their expanded masks do (i.e. the groups are close enough to each other)
    2. each successful (G1, G2) combination is processed as follows:
    2.1. AVG -= [G1, G2]
    2.2. VG1 <- [G1, G2]
    2.3. AVG += [VG1]
    2.4. interval tree is modified
    3. if there were any successful combinations in (2), repeat from (1); otherwise proceed
    4. split AVG into variative groups (AVG) and remaining single clone groups (AG)
    5. return AVG, AG

    :param available_groups: exact clone groups to combine
    :return: (variative groups, remaining single clone groups)
    """
    import sys
    import itertools
    import logging
    import clones
    import time
    from intervaltree import IntervalTree

    ttyn = '\r' if sys.stdout.isatty() else '\n'

    print("Combining groups, %d total..." % len(available_groups))
    t1 = time.process_time()

    # (0)
    avg = set([clones.VariativeElement([cg]) for cg in available_groups])

    def build_interval_tree():
        vg_interval_list = []
        for ve in avg:
            vg_interval_list += ve.get_tree_intervals(expanded=True, archetype_consolidated=True)
        logging.debug("(re)building interval tree...")
        itree = IntervalTree(vg_interval_list)  # to search who intersects with clone_intervals[i.begin:i.end]
        logging.debug("(re)built interval tree of %d intervals." % (len(vg_interval_list),))
        return itree

    def pprogress(stepsready, stepprogress):
        ptotal = len(available_groups)
        spready = 1.0 - 2**(-stepsready)  # optimistic: assume each iteration takes half as long as the previous one
        aready = 2**(-1 - stepsready) * stepprogress
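        # e.g. after one finished iteration spready = 0.5, and the running
        # iteration can add at most 2**-2 = 0.25 more, so the estimate
        # approaches but never overshoots 100%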
        tready = int((spready + aready) * ptotal)
        print("~ %d / %d = %03.1f%%" % (tready, ptotal, 100.0 * tready / ptotal), end=ttyn, flush=True)

    # (1)
    cycle = True
    iterations_passed = 0
    while cycle:
        cycle = False
        vg_intervals = build_interval_tree()  # TODO: why does it crash when used incrementally as in (2)?..

        pprogress(iterations_passed, 0)

        skip = set()
        tojoin = set()
        for g1i, g1 in enumerate(avg):
            if g1 in skip:
                continue

            # on second and further iterations no new single groups
            # will be combined with each other, so only consider variative ones
            # TODO: test better
            if iterations_passed and g1.g_power == 1:
                continue

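            # candidate partners are all elements whose stored (expanded,
            # archetype-consolidated) intervals overlap one of g1's intervals;
            # the interval tree keeps this lookup cheap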
            probable_g2_intervals = [
                vg_intervals.overlap(interval.begin, interval.end) for interval in
                g1.get_tree_intervals(expanded=True, archetype_consolidated=True)
            ]
            probable_g2s = [
                clones.VariativeElement.from_tree_interval(i)
                for i in itertools.chain.from_iterable(probable_g2_intervals)
            ]
            probable_g2_dists = [
                clones.VariativeElement.distance(g1, g2, expanded=False, archetype_consolidated=True)
                for g2 in probable_g2s
            ]
            probable_g2_xdists = [
                clones.VariativeElement.distance(g1, g2, expanded=True, archetype_consolidated=True)
                for g2 in probable_g2s
            ]
            g2sdists = [
                (g, d)
                for g, d, xd in zip(probable_g2s, probable_g2_dists, probable_g2_xdists)
                if 0 < d < clones.infty and xd == -clones.infty and g not in skip
                # groups do not intersect, but expanded masks do intersect
                # This condition is very important: it only combines groups that are
                # close enough to keep the variative part below 15% of the archetype
            ]
            if len(g2sdists) > 0:
                best_g2, best_d = min(g2sdists, key=lambda gd: gd[1])
                g1_g2 = g1 + best_g2
                # Interval expansion is a bit more tolerant than it should be, so correct for that here
                if g1_g2.obeys_basset_constraint():
                    tojoin.add((g1, best_g2, g1_g2))
                    skip.add(g1)
                    skip.add(best_g2)

            if not g1i % 100:
                pprogress(iterations_passed, g1i / len(avg))

        # (2)
        for g1, g2, new_ve in tojoin:
            logging.debug("AVG %d ->" % (len(avg),))

            cycle = True  # check for (3)

            # (2.1)
            avg.remove(g1)
            avg.remove(g2)
            # (2.3)
            avg.add(new_ve)

            logging.debug("AVG %d <-" % (len(avg),))

            # (2.4)
            # for itvl in g1.get_tree_intervals(expanded=True):
            #     vg_intervals.remove(itvl)
            # for itvl in g2.get_tree_intervals(expanded=True):
            #     vg_intervals.remove(itvl)
            # for itvl in new_ve.get_tree_intervals(expanded=True):
            #     vg_intervals.add(itvl)

        iterations_passed += 1

    # then split unary groups away
    var_groups = []
    uni_groups = []
    for ve in avg:
        if len(ve.clone_groups) > 1:
            var_groups.append(ve)
        else:
            uni_groups.append(ve.clone_groups[0])

    t2 = time.process_time()

    # print stats
    def pstats():
        import collections
        log = logging.getLogger("cloneminer.combine.n_ext_points")
        log.info("Source single groups: %d" % (len(available_groups),))
        log.info("Single groups: 1 -> %d" % (len(uni_groups),))
        # histogram: how many variative groups consist of gc clone groups
        vgs = collections.defaultdict(int)
        for vg in var_groups:
            vgs[len(vg.clone_groups)] += 1
        log.info("Variative groups:")
        for gc in sorted(vgs.keys()):
            log.info(" - %d -> %d" % (gc, vgs[gc]))
        if vgs:
            # mean number of clone groups per variative group
            mean_size = sum(gc * cnt for gc, cnt in vgs.items()) / sum(vgs.values())
            log.info(" - AVG -> %f" % (mean_size,))
            log.info(" - Total VGs -> %d" % (sum(vgs.values()),))
        log.info("Spent time: %f s." % (t2 - t1,))

    pstats()

    return var_groups, uni_groups
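
# Standalone sketch (not project code) of the intervaltree API that
# build_interval_tree() and the overlap() queries above rely on: every element
# is stored once per occurrence interval, and overlap(begin, end) returns all
# stored intervals touching that range.

from intervaltree import Interval, IntervalTree

# three "groups" identified by their payload, each occupying one text interval
tree = IntervalTree([
    Interval(0, 100, "g1"),
    Interval(90, 150, "g2"),   # overlaps the tail of g1
    Interval(400, 450, "g3"),  # far away from both
])

hits = tree.overlap(80, 120)             # intervals intersecting [80, 120)
print(sorted(iv.data for iv in hits))    # ['g1', 'g2']

tree.remove(Interval(90, 150, "g2"))     # analogous to dropping a combined group
print(sorted(iv.data for iv in tree.overlap(80, 120)))  # ['g1']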