Ejemplo n.º 1
0
def initMain(*args, **kwargs):
    """
    Set up and dispatch this module's database operations.

    Reads FLAGS to select one of two modes:
      * "merge_active": merge the input active-feed databases into a single
        ActiveFeedDatabase at FLAGS.output_active_db.
      * "active_to_samples": convert the input active-feed databases into a
        SamplesDatabase at FLAGS.output_samples_db.

    Raises:
      ValueError: If a required flag is missing, an output path does not end
        in '.db', or FLAGS.active_feed_mode is unrecognized.
      FileNotFoundError: If any input database path does not exist.
    """
    if not FLAGS.active_mergeable_databases:
        raise ValueError(
            "Please input active feed databases to merge as a comma separated list."
        )
    # Normalize the comma-separated flag value into absolute paths.
    db_paths = [
        pathlib.Path(p).absolute()
        for p in FLAGS.active_mergeable_databases.replace(" ", "").split(",")
    ]
    for p in db_paths:
        if not p.exists():
            raise FileNotFoundError(p)
    dbs = [
        ActiveFeedDatabase(url="sqlite:///{}".format(str(p)), must_exist=True)
        for p in db_paths
    ]

    if FLAGS.active_feed_mode == "merge_active":
        if not FLAGS.output_active_db:
            raise ValueError("Specify out path for merged database")

        out_path = pathlib.Path(FLAGS.output_active_db).absolute()
        if out_path.suffix != '.db':
            raise ValueError(
                "output_active_db must end in a valid database name (.db extension): {}"
                .format(out_path))
        out_path.parent.mkdir(exist_ok=True, parents=True)
        out_db = ActiveFeedDatabase(url="sqlite:///{}".format(str(out_path)),
                                    must_exist=False)

        merge_databases(dbs, out_db)
    elif FLAGS.active_feed_mode == "active_to_samples":
        # Bug fix: an unset output_samples_db previously crashed with a
        # TypeError inside pathlib.Path(None); validate it explicitly,
        # mirroring the merge_active branch above.
        if not FLAGS.output_samples_db:
            raise ValueError("Specify out path for converted samples database")

        out_path = pathlib.Path(FLAGS.output_samples_db).absolute()
        if out_path.suffix != '.db':
            raise ValueError(
                "output_samples_db must end in a valid database name (.db extension)"
            )
        out_path.parent.mkdir(exist_ok=True, parents=True)
        out_db = samples_database.SamplesDatabase(url="sqlite:///{}".format(
            str(out_path)),
                                                  must_exist=False)

        active_convert_samples(dbs, out_db)
    else:
        raise ValueError("Invalid value for FLAGS.active_feed_mode: {}".format(
            FLAGS.active_feed_mode))

    return
Ejemplo n.º 2
0
  def __init__(
    self,
    path: pathlib.Path,
    must_exist: bool = False,
    flush_secs: int = 30,
    plot_sample_status: bool = False,
    commit_sample_frequency: int = 1024,
  ):
    """
    Open (or create) the backing samples database and initialize bookkeeping.

    Args:
      path: Filesystem path of the sqlite samples database file.
      must_exist: If True, fail unless the database file already exists.
      flush_secs: Flush interval in seconds. NOTE(review): not stored here;
        presumably consumed elsewhere in the enclosing class — TODO confirm.
      plot_sample_status: If True, attach a cumulative histogram monitor
        next to the database file.
      commit_sample_frequency: NOTE(review): not stored here; verify it is
        used by the enclosing class.
    """
    # Guard database creation with the distributed lock so only one worker
    # creates/initializes the sqlite file at a time.
    distrib.lock()
    self.db = samples_database.SamplesDatabase("sqlite:///{}".format(str(path)), must_exist = must_exist)
    distrib.unlock()
    # Resume sample numbering from the current database size.
    self.sample_id   = self.db.count
    # Hash entries already present — presumably used to skip duplicates;
    # verify against callers.
    self.visited     = set(self.db.get_hash_entries)
    # Pending samples not yet written to the database.
    self.flush_queue = []
    self.plot_sample_status = plot_sample_status

    if self.plot_sample_status:
      self.saturation_monitor = monitors.CumulativeHistMonitor(path.parent, "cumulative_sample_count")
Ejemplo n.º 3
0
def SRCIRORVsBenchPress(**kwargs) -> None:
    """
  Compare the SRCIROR mutation tool (src- or IR-level) against BenchPress.
  Comparison is similar to the KAverageScore comparison.

  Expected kwargs (all presumably supplied by a driver — verify at caller):
    seed: Seed-code database (EncodedContentFiles or CLSmithDatabase).
    benchpress: SamplesDatabase holding BenchPress samples.
    srciror_cache: Path to a SamplesDatabase used to cache/checkpoint results.
    mutation_level: 'src' for source-level mutations; otherwise IR-level.
    targets: Benchmark suite providing target feature vectors.
    feature_space: Feature space name used for distance computations.
    top_k: Number of closest candidates averaged per benchmark.
    beam_width: Beam size for the SRCIROR beam search.
    unique_code: If True, use deduplicated code when fetching features.
    plot_config: Optional plotter keyword overrides.
    workspace_path: Output directory for the resulting plot.
  """
    seed = kwargs.get('seed')
    benchpress = kwargs.get('benchpress')
    srciror_cache = kwargs.get('srciror_cache', '')
    mutation_level = kwargs.get('mutation_level')
    target = kwargs.get('targets')
    feature_space = kwargs.get('feature_space')
    top_k = kwargs.get('top_k')
    beam_width = kwargs.get('beam_width')
    unique_code = kwargs.get('unique_code', False)
    plot_config = kwargs.get('plot_config')
    workspace_path = kwargs.get('workspace_path')

    # Fail fast if the required SRCIROR executable for the chosen mutation
    # level is not installed.
    if mutation_level == 'src':
        if not pathlib.Path(SRCIROR_SRC).exists():
            raise FileNotFoundError(
                "SRCIROR_src executable not found: {}".format(SRCIROR_SRC))
    else:
        if not pathlib.Path(SRCIROR_IR).exists():
            raise FileNotFoundError(
                "SRCIROR_IR executable not found: {}".format(SRCIROR_IR))
    if seed.db_type != encoded.EncodedContentFiles and seed.db_type != clsmith.CLSmithDatabase:
        raise ValueError(
            "Scores require EncodedContentFiles or CLSmithDatabase but received",
            seed.db_type)
    if benchpress.db_type != samples_database.SamplesDatabase:
        raise ValueError(
            "BenchPress scores require SamplesDatabase but received",
            benchpress.db_type)
    if seed.db_type == clsmith.CLSmithDatabase:
        if not pathlib.Path(CLSMITH_INCLUDE).exists():
            raise FileNotFoundError(
                "CLSMITH_INCLUDE folder does not exist: {}".format(
                    CLSMITH_INCLUDE))

    ## Load database and checkpoint of targets.
    mutec_db = samples_database.SamplesDatabase(url="sqlite:///{}".format(
        pathlib.Path(srciror_cache).resolve()),
                                                must_exist=False)
    # Benchmarks already processed in a previous run, recorded as a
    # newline-separated list under the feature_space key.
    done = set()
    with mutec_db.Session(commit=True) as s:
        res = s.query(samples_database.SampleResults).filter_by(
            key=feature_space).first()
        if res is not None:
            done.update([str(x) for x in res.results.split('\n')])
        s.commit()

    ## Initialize dictionary.
    # Each group maps to a pair of parallel lists: (benchmark names, scores).
    groups = {}
    groups["SRCIROR_{}".format(mutation_level)] = ([], [])
    groups[seed.group_name] = ([], [])
    groups[benchpress.group_name] = ([], [])

    ## Fix fetching data functions.
    if unique_code:
        git_get_data = lambda x: seed.get_unique_data_features(x)
        bp_get_data = lambda x: benchpress.get_unique_data_features(x)
    else:
        git_get_data = lambda x: seed.get_data_features(x)
        bp_get_data = lambda x: benchpress.get_data_features(x)

    ## Run engine on mutec.
    benchmarks = target.get_benchmarks(feature_space)
    for benchmark in tqdm.tqdm(benchmarks,
                               total=len(benchmarks),
                               desc="Benchmarks"):

        ## This has already been searched for.
        if benchmark.name in done:
            continue

        ## Tuples of (closest src, include, distance) from target benchmark.
        closest = workers.SortedSrcDistances(git_get_data(feature_space),
                                             benchmark.features, feature_space)

        ## IF CLsmith takes too long here, collect only features, then for the beam size go and fetch
        ## the code.

        # Split source and distances lists.
        git_dist = [x for _, _, x in closest]

        ## If distances are already minimized, nothing to do.
        if sum(git_dist[:top_k]) == 0:
            continue

        l.logger().info(benchmark.name)

        # Beam-search SRCIROR mutations starting from the closest non-exact
        # seeds; results are cached into mutec_db.
        closest_mutec_src = beam_srciror(
            [(src, inc, dist)
             for src, inc, dist in closest[:beam_width] if dist > 0],
            benchmark.features, feature_space, beam_width,
            mutec_db)[:top_k]  # tuple of (src, distance)
        closest_mutec_dist = [x for _, _, x in closest_mutec_src]

        assert len(closest_mutec_dist) == len(git_dist[:top_k])
        ## If mutec has provided a better score
        if sum(closest_mutec_dist) < sum(git_dist[:top_k]):

            l.logger().info("Score reduced from {} to {}".format(
                sum(git_dist[:top_k]), sum(closest_mutec_dist)))
            l.logger().info("Best score from {} to {}".format(
                git_dist[0], closest_mutec_dist[0]))

            # Checkpoint this benchmark as done so a re-run skips it.
            with mutec_db.Session(commit=True) as s:
                res = s.query(samples_database.SampleResults).filter_by(
                    key=feature_space).first()
                if res is not None:
                    res.results = res.results + "\n" + benchmark.name
                else:
                    s.add(
                        samples_database.SampleResults(key=feature_space,
                                                       results=benchmark.name))
                s.commit()

            # Compute target's distance from O(0,0)
            target_origin_dist = math.sqrt(
                sum([x**2 for x in benchmark.features.values()]))
            mutec_avg_dist = sum(closest_mutec_dist) / top_k

            # Score = percentage of the target's distance-from-origin covered.
            groups["SRCIROR_{}".format(mutation_level)][0].append(
                benchmark.name)
            groups["SRCIROR_{}".format(mutation_level)][1].append(
                100 *
                ((target_origin_dist - mutec_avg_dist) / target_origin_dist))

            # Compute target's distance from O(0,0)
            git_avg_dist = sum(git_dist[:top_k]) / top_k
            groups[seed.group_name][0].append(benchmark.name)
            groups[seed.group_name][1].append(
                100 *
                ((target_origin_dist - git_avg_dist) / target_origin_dist))

    ## Run engine on benchpress.
    benchmarks = target.get_benchmarks(feature_space)
    for benchmark in tqdm.tqdm(benchmarks,
                               total=len(benchmarks),
                               desc="Benchpress"):
        ## Run only for benchmarks mutec has improved.
        if benchmark.name in groups["SRCIROR_{}".format(mutation_level)][0]:

            l.logger().info(benchmark.name)
            distances = workers.SortedDistances(bp_get_data(feature_space),
                                                benchmark.features,
                                                feature_space)

            # Compute target's distance from O(0,0)
            target_origin_dist = math.sqrt(
                sum([x**2 for x in benchmark.features.values()]))
            avg_dist = sum(distances[:top_k]) / len(distances[:top_k])

            groups[benchpress.group_name][0].append(benchmark.name)
            groups[benchpress.group_name][1].append(
                100 * ((target_origin_dist - avg_dist) / target_origin_dist))

    # Plot all three groups side by side.
    plotter.GrouppedBars(
        groups=groups,
        plot_name="srciror_src_avg_{}_{}_{}".format(
            top_k, seed.group_name,
            feature_space.replace("Features", " Features")),
        path=workspace_path,
        **plot_config if plot_config else {},
    )
    return
Ejemplo n.º 4
0
def sample_files(workspace: str, model_sha: str, sampler_sha: str,
                 sample_db: str):
    """
    Render the sample_files.html template for one sampler's samples database.

    Loads every Sample from the sampler's sqlite database and converts each
    sample's feed and indices into lists of colored text segments (plain /
    mask / prediction) for template rendering.

    Args:
      workspace: Workspace name the model lives in.
      model_sha: Model identifier hash.
      sampler_sha: Sampler identifier hash within the model.
      sample_db: Base name (without extension) of the samples database file.

    Returns:
      The rendered flask template response.
    """
    global data
    global cached_models
    if data == {}:
        data = parseData()

    current_sampler = {}
    target_sha = crypto.sha256_str(str(workspace) + model_sha)

    # Locate the requested sampler under the cached model entry.
    for sampler in cached_models[target_sha]['samplers']:
        if sampler['sha'] == sampler_sha:
            current_sampler = sampler
            break

    db_file = current_sampler['path'] / "{}.db".format(sample_db)
    samples_db = samples_database.SamplesDatabase(
        "sqlite:///{}".format(db_file), must_exist=True)

    with samples_db.Session() as session:
        sample_files = session.query(samples_database.Sample).all()

    for sample in sample_files:
        processed_feed = []
        processed_indices = []
        if '[HOLE]' in sample.sample_feed:
            mask_type = '[HOLE]'
        elif '[MASK]' in sample.sample_feed:
            mask_type = '[MASK]'
        else:
            mask_type = ''

        # Bug fix: str.split('') raises ValueError, so a feed with no mask
        # token must be treated as one plain segment with no predictions.
        if mask_type:
            sample_feed = sample.sample_feed.split(mask_type)
            sample_indices = sample.sample_indices.split('\n')
            assert len(sample_feed) - 1 == len(sample_indices), (
                "sample hole length/generation mismatch: {}, {}".format(
                    len(sample_feed),
                    len(sample_indices),
                ))
        else:
            sample_feed = [sample.sample_feed]
            sample_indices = []

        # One (segment, mask) pair per mask occurrence; the indices view also
        # interleaves the per-mask prediction text.
        for i in range(len(sample_feed) - 1):
            processed_feed += [
                {
                    'text': sample_feed[i],
                    'color': 'plain',
                },
                {
                    'text': mask_type,
                    'color': 'mask',
                },
            ]
            processed_indices += [
                {
                    'text': sample_feed[i],
                    'color': 'plain',
                },
                {
                    'text': mask_type,
                    'color': 'mask',
                },
                {
                    'text': sample_indices[i].replace("\\n", "\n"),
                    'color': 'prediction',
                },
            ]
        # Bug fix: the original trailing `while` reused the for-loop index and
        # raised NameError when the loop ran zero times. Only the final plain
        # segment remains to be appended.
        tail = sample_feed[-1]
        processed_indices.append({'text': tail, 'color': 'plain'})
        processed_feed.append({'text': tail, 'color': 'plain'})
        sample.sample_indices = processed_indices
        sample.sample_feed = processed_feed

    sample_specs = {
        'summary': cached_models[target_sha]['summary'],
        'workspace': workspace,
        'model_sha': model_sha,
        'samples': sample_files,
    }
    return flask.render_template("sample_files.html",
                                 data=sample_specs,
                                 **GetBaseTemplateArgs())