Ejemplo n.º 1
0
def SortedDistances(data: typing.List[typing.Tuple[str, str,
                                                   typing.Dict[str, float]]],
                    target_features: typing.Dict[str, float],
                    feature_space: str) -> typing.List[float]:
    """
  Return list of euclidean distances from target features in ascending order.

  Args:
    data: Triples of (source, include, feature dict); only the feature
          dict of each entry is used here.
    target_features: Feature vector distances are measured against.
    feature_space: Name of the feature space the distance metric operates in.

  Returns:
    Ascending list of distances, one per entry of data.
  """
    # Feed sorted() a generator — no need to materialize an intermediate list.
    return sorted(
        feature_sampler.calculate_distance(dp, target_features, feature_space)
        for _, _, dp in data)
Ejemplo n.º 2
0
def SortedSrcFeatsDistances(
    data: typing.List[typing.Tuple[str, str, typing.Dict[str, float]]],
    target_features: typing.Dict[str, float], feature_space: str
) -> typing.List[typing.Tuple[str, str, typing.Dict[str, float], float]]:
    """
  Return (source, include, features, distance) tuples sorted by euclidean
  distance from target features in ascending order.

  Args:
    data: Triples of (source, include, feature dict). NOTE: annotation fixed —
          the body unpacks three elements, so 2-tuples were never accepted.
    target_features: Feature vector distances are measured against.
    feature_space: Name of the feature space the distance metric operates in.

  Returns:
    List of 4-tuples, ascending by the distance in position 3.
  """
    return sorted(
        [(src, include, dp,
          feature_sampler.calculate_distance(dp, target_features,
                                             feature_space))
         for src, include, dp in data],
        key=lambda entry: entry[3])
Ejemplo n.º 3
0
def IRExtractAndCalculate(
    bytecode: str,
    target_features: typing.Dict[str, float],
    feature_space: str
) -> typing.Optional[typing.Tuple[str, str, float]]:
    """
  Extract features for IR bytecode and calculate distance from target.

  Args:
    bytecode: IR source to extract features from.
    target_features: Feature vector distances are measured against.
    feature_space: The single feature space to extract and compare in.

  Returns:
    Tuple of (bytecode, "", distance), or None when extraction yields no
    features for the requested feature space. (Annotation fixed to Optional:
    the original claimed a bare tuple despite the None fall-through.)
  """
    f = extractor.ExtractIRFeatures(bytecode, [feature_space])
    # Guard against both a missing key and an empty feature dict.
    if feature_space in f and f[feature_space]:
        return bytecode, "", feature_sampler.calculate_distance(
            f[feature_space], target_features, feature_space)
    return None
Ejemplo n.º 4
0
def ExtractAndCalculate(
    src_incl: typing.Tuple[str, str],
    target_features: typing.Dict[str, float],
    feature_space: str
) -> typing.Optional[typing.Tuple[str, str, float]]:
    """
  Extract features for source code and calculate distance from target.

  Args:
    src_incl: Pair of (source code, include/header file).
    target_features: Feature vector distances are measured against.
    feature_space: The single feature space to extract and compare in.

  Returns:
    Tuple of (source, include, distance), or None when extraction yields no
    features for the requested feature space. (Annotation fixed to Optional:
    the original claimed a bare tuple despite the None fall-through.)
  """
    src, incl = src_incl
    # When an include file is present, CLSmith's header must be on the
    # compiler search path for feature extraction to succeed.
    f = extractor.ExtractFeatures(
        src, [feature_space],
        header_file=incl,
        extra_args=[
            "-include{}".format(
                pathlib.Path(environment.CLSMITH_INCLUDE) / "CLSmith.h")
        ] if incl else [""])
    # Guard against both a missing key and an empty feature dict.
    if feature_space in f and f[feature_space]:
        return src, incl, feature_sampler.calculate_distance(
            f[feature_space], target_features, feature_space)
    return None
Ejemplo n.º 5
0
  def registerOutputData(self,
                         outputs    : typing.Dict[str, typing.List[np.array]],
                         feeds      : ActiveSampleFeed,
                         candidates : typing.List[ActiveSample],
                         rejected_candidates: typing.List[ActiveSample],
                         bar: tqdm.tqdm,
                         ) -> typing.List[int]:
    """
    Gets workload output from model.
    In parallel, every sample is checked for compilability and features are extracted.
    If sample compiles, it is stored as an active learning candidate.

    Args:
      outputs: Dictionary output of workload
      feeds: Active sampling feeds zipped against the workload outputs.
      candidates: Passed by reference and filled within this function
      rejected_candidates: Passed by reference; non-compiling samples are
                           appended here when FLAGS.evaluate_candidates is set.
      bar: tqdm bar for status checking

    Returns:
      cm_rate: List of two elements that express compilation rate of workload.
               0th el: Total compiling.
               1st el: Total samples.
      better_found: Best-scoring candidate that improved on its feed's input
                    score, or None if no candidate improved.
    """
    cm_rate = [0, 0]
    pool = multiprocessing.Pool()
    cm_rate[1] += len(outputs['generated_samples'])
    better_found = None
    try:
      it = zip(
        outputs['generated_samples'], outputs['sample_indices'],
        outputs['input_ids'], outputs['masked_lm_lengths'],
        feeds
      )
      # GreweFeatures are extracted from source text; every other feature
      # space goes through the IR-based worker.
      if self.feat_sampler.feature_space != "GreweFeatures":
        candidate_worker = functools.partial(
          IR_candidate_worker, tokenizer = self.tokenizer, feat_sampler = self.feat_sampler,
        )
      else:
        candidate_worker = functools.partial(
          text_candidate_worker, tokenizer = self.tokenizer, feat_sampler = self.feat_sampler,
        )
      t = 0
      for idx, batch in bar(enumerate(pool.map(candidate_worker, it))):
        t = idx
        # batch[0] is the worker's "compiles" flag; batch[1] the ActiveSample.
        if batch[0]:
          cm_rate[0] += 1
          candidates.append(batch[1])
          # Track the single best candidate that beats its feed's input score.
          if 0 < batch[1].score < batch[1].sample_feed.input_score:
            if better_found is None or batch[1].score < better_found.score:
              better_found = batch[1]
        else:
          if FLAGS.evaluate_candidates:
            rejected_candidates.append(batch[1])

      if FLAGS.features_standard_scaler:
        # Rescore candidates in a standardized feature space: fit the scaler
        # on candidates plus the target benchmark, then recompute distances.
        scaler = sklearn.preprocessing.StandardScaler()
        scaler.fit([[float(y) for y in x.features.values()] for x in candidates + [self.feat_sampler.target_benchmark]])
        target_feats = {k: v for k, v in zip(self.feat_sampler.target_benchmark.features.keys(), scaler.transform([[float(x) for x in self.feat_sampler.target_benchmark.features.values()]])[0])}
        for idx, cd in enumerate(candidates):
          outfeats = {k: v for k, v in zip(cd.features.keys(), scaler.transform([[float(x) for x in cd.features.values()]])[0])}
          # BUG FIX: _replace returns a new namedtuple instead of mutating in
          # place; the original discarded the result, so rescoring was a no-op.
          candidates[idx] = candidates[idx]._replace(score = feature_sampler.calculate_distance(outfeats, target_feats, self.feat_sampler.feature_space))

      pool.close()
      pool.terminate()
    except KeyboardInterrupt as e:
      pool.close()
      pool.terminate()
      raise e
    return cm_rate, better_found