Example #1
  def Run(self):
    if self.write_header:
      self.writer.writerow(
        ("analysis", "graph_name", "i", "label_count", "step_count")
      )

    for self.ctx.i, path in enumerate(self.files):
      # Recover the graph name by stripping the features-list suffix.
      graph_name = path.name[: -len(".ProgramGraphFeaturesList.pb")]
      features = pbutil.FromFile(
        path, program_graph_features_pb2.ProgramGraphFeaturesList()
      )
      for i, graph in enumerate(features.graph):
        # The step count is a repeated int64 feature; treat a missing
        # value as zero steps.
        step_count_feature = graph.features.feature[
          "data_flow_step_count"
        ].int64_list.value
        step_count = step_count_feature[0] if len(step_count_feature) else 0
        self.writer.writerow(
          (
            self.analysis,
            graph_name,
            i,
            len(graph.node_features.feature_list["data_flow_value"].feature),
            step_count,
          )
        )
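The examples below all repeat the same idiom for reading a scalar out of a repeated int64 feature with a zero default. A minimal sketch of a shared helper that could replace those call sites (the name GetInt64Feature is hypothetical, not part of the source above):

def GetInt64Feature(features, name: str, default: int = 0) -> int:
  # A repeated int64 feature may be empty; fall back to the default.
  # (Hypothetical helper, assuming `features` is a Features proto such
  # as `graph.features` above.)
  values = features.feature[name].int64_list.value
  return values[0] if len(values) else default

# Usage, matching the loop above:
#   step_count = GetInt64Feature(graph.features, "data_flow_step_count")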
Example #2
def Main():
    """Main entry point."""
    path = pathlib.Path(FLAGS.path)

    graphs_path = path / "test"
    labels_path = path / "labels" / FLAGS.analysis

    for graph_path in graphs_path.iterdir():
        stem = graph_path.name[:-len("ProgramGraph.pb")]
        name = f"{stem}ProgramGraphFeaturesList.pb"
        features_path = labels_path / name
        # There is no guarantee that we have generated features for this
        # program graph, so we check for its existence. As a *very* defensive
        # measure, we also check for the existence of the graph file that we
        # enumerated at the start of this function. This check can be removed
        # later, it is only useful during development when you might be
        # modifying the dataset at the same time as having test jobs running.
        if not graph_path.is_file() or not features_path.is_file():
            continue

        graph = pbutil.FromFile(graph_path, program_graph_pb2.ProgramGraph())
        if not len(graph.node) or len(graph.node) > FLAGS.max_graph_node_count:
            continue

        features_list = pbutil.FromFile(
            features_path,
            program_graph_features_pb2.ProgramGraphFeaturesList())

        for j, features in enumerate(features_list.graph):
            step_count_feature = features.features.feature[
                "data_flow_step_count"].int64_list.value
        step_count = step_count_feature[0] if len(step_count_feature) else 0
            print(features_path.name, j, step_count)
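The suffix arithmetic above pairs each graph file with its features file by rewriting the filename. The mapping can be read as a small standalone function (a sketch with hypothetical names, assuming only the naming scheme shown above):

import pathlib

def FeaturesPathForGraph(
    graph_path: pathlib.Path, labels_path: pathlib.Path
) -> pathlib.Path:
    # "foo.ProgramGraph.pb" -> stem "foo." -> "foo.ProgramGraphFeaturesList.pb".
    stem = graph_path.name[:-len("ProgramGraph.pb")]
    return labels_path / f"{stem}ProgramGraphFeaturesList.pb"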
Example #3
def TestOne(
    features_list_path: pathlib.Path,
    features_list_index: int,
    checkpoint_path: pathlib.Path,
) -> BatchResults:
    """Restore a model from a checkpoint and test it on a single graph."""
    path = pathlib.Path(pathflag.path())

    features_list = pbutil.FromFile(
        features_list_path,
        program_graph_features_pb2.ProgramGraphFeaturesList(),
    )
    features = features_list.graph[features_list_index]

    graph_name = features_list_path.name[: -len(".ProgramGraphFeaturesList.pb")]
    graph = pbutil.FromFile(
        path / "graphs" / f"{graph_name}.ProgramGraph.pb",
        program_graph_pb2.ProgramGraph(),
    )

    # Instantiate and restore the model.
    vocab = vocabulary.LoadVocabulary(
        path,
        model_name="cdfg" if FLAGS.cdfg else "programl",
        max_items=FLAGS.max_vocab_size,
        target_cumfreq=FLAGS.target_vocab_cumfreq,
    )

    if FLAGS.cdfg:
        FLAGS.use_position_embeddings = False

    model = Ggnn(
        vocabulary=vocab,
        test_only=True,
        node_y_dimensionality=2,
        graph_y_dimensionality=0,
        graph_x_dimensionality=0,
        use_selector_embeddings=True,
    )
    checkpoint = pbutil.FromFile(checkpoint_path, checkpoint_pb2.Checkpoint())
    model.RestoreCheckpoint(checkpoint)

    batch = list(
        DataflowGgnnBatchBuilder(
            graph_loader=SingleGraphLoader(graph=graph, features=features),
            vocabulary=vocab,
            max_node_size=int(1e9),
            use_cdfg=FLAGS.cdfg,
            max_batch_count=1,
        )
    )[0]

    results = model.RunBatch(epoch_pb2.TEST, batch)

    return AnnotateGraphWithBatchResults(graph, features, results)
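A sketch of how TestOne might be driven, reusing the path layout from Example #2 (path / "labels" / analysis). The analysis name and the checkpoint filename here are illustrative assumptions, not taken from the source:

path = pathlib.Path(pathflag.path())
results = TestOne(
    features_list_path=(
        path / "labels" / "reachability" / "foo.ProgramGraphFeaturesList.pb"
    ),
    features_list_index=0,
    checkpoint_path=path / "model.Checkpoint.pb",  # illustrative filename
)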
Example #4
def RunAnalysis(
    analysis: str, graph: program_graph_pb2.ProgramGraph
) -> program_graph_features_pb2.ProgramGraphFeaturesList:
    """Run the given analysis.

  Args:
    analysis: The name of the analysis to run.
    graph: The program graph to analyze.

  Returns:
    A program graph features list.

  Raises:
    ValueError: In case analysis fails.
  """
    graph_features = program_graph_features_pb2.ProgramGraphFeaturesList()
    serialized_graph_features = analysis_pybind.RunAnalysis(
        analysis, graph.SerializeToString())
    graph_features.ParseFromString(serialized_graph_features)
    return graph_features
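For context, a sketch of calling RunAnalysis on a graph loaded with the same pbutil pattern as the other examples ("reachability" is used as an illustrative analysis name, and the path is illustrative too):

graph = pbutil.FromFile(
    pathlib.Path("foo.ProgramGraph.pb"),  # illustrative path
    program_graph_pb2.ProgramGraph(),
)
features_list = RunAnalysis("reachability", graph)
for i, features in enumerate(features_list.graph):
    # Count the per-node labels, as Example #1 does.
    label_count = len(
        features.node_features.feature_list["data_flow_value"].feature
    )
    print(i, label_count)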
Example #5
  def _Worker(self):
    """Threaded graph reader."""
    graph_files = list(self.graph_path.iterdir())
    app.Log(
      2, "Enumerated %s graph files to load", humanize.Commas(len(graph_files))
    )

    graph_count = 0
    while not self.min_graph_count or graph_count < self.min_graph_count:
      # Strip any graph files that we have earmarked for ignoring.
      graph_files = [
        f for f in graph_files if f not in self._excluded_graph_files
      ]

      # We may have run out of files.
      if not graph_files:
        self._Done(graph_count)
        return

      if self.seed:
        # If we are setting a reproducible seed, first sort the list of files
        # since iterdir() order is undefined, then seed the RNG for the
        # shuffle.
        graph_files = sorted(graph_files, key=lambda x: x.name)
        # Change the seed so that on the next execution of this loop we will
        # choose a different random ordering.
        self.seed += 1
      random.Random(self.seed).shuffle(graph_files)

      for graph_path in graph_files:
        if self._stopped:
          break

        stem = graph_path.name[: -len("ProgramGraph.pb")]
        name = f"{stem}ProgramGraphFeaturesList.pb"
        features_path = self.labels_path / name
        # There is no guarantee that we have generated features for this
        # program graph, so we check for its existence. As a *very* defensive
        # measure, we also check for the existence of the graph file that we
        # enumerated at the start of this function. This check can be removed
        # later, it is only useful during development when you might be
        # modifying the dataset at the same time as having test jobs running.
        if not graph_path.is_file() or not features_path.is_file():
          self.skip_count += 1
          continue

        # Read the graph from disk, maybe performing a cheeky wee conversion
        # to CDFG format.
        app.Log(3, "Read %s", features_path)
        if self.use_cdfg:
          graph = cdfg.FromProgramGraphFile(graph_path)
        else:
          graph = pbutil.FromFile(graph_path, program_graph_pb2.ProgramGraph())

        if not graph:
          app.Log(2, "Failed to load graph %s", graph_path)
          self._excluded_graph_files.add(graph_path)
          continue

        # Skip empty graphs.
        if not len(graph.node) or len(graph.node) > FLAGS.max_graph_node_count:
          app.Log(
            2,
            "Graph node count %s is not in range (1,%s]",
            len(graph.node),
            FLAGS.max_graph_node_count,
          )
          self._excluded_graph_files.add(graph_path)
          continue

        # Skip graphs without inst2vec annotations.
        if self.require_inst2vec and not len(
          graph.features.feature["inst2vec_annotated"].int64_list.value
        ):
          app.Log(2, "Skipping graph without inst2vec annotations")
          continue

        features_list = pbutil.FromFile(
          features_path, program_graph_features_pb2.ProgramGraphFeaturesList()
        )

        # Iterate over the features list to yield <graph, features> pairs.
        skipped_all_features = True
        for j, features in enumerate(features_list.graph):
          step_count_feature = features.features.feature[
            "data_flow_step_count"
          ].int64_list.value
          step_count = step_count_feature[0] if len(step_count_feature) else 0
          if self.data_flow_step_max and step_count > self.data_flow_step_max:
            self.skip_count += 1
            app.Log(
              3,
              "Skipped graph with data_flow_step_count %d > %d "
              "(skipped %d / %d, %.2f%%)",
              step_count,
              self.data_flow_step_max,
              self.skip_count,
              (graph_count + self.skip_count),
              (self.skip_count / (graph_count + self.skip_count)) * 100,
            )
            continue
          graph_count += 1
          if self.logfile:
            self.logfile.write(f"{features_path} {j}\n")
          self._outq.put((graph, features), block=True)
          skipped_all_features = False
          if self.max_graph_count and graph_count >= self.max_graph_count:
            app.Log(2, "Stopping after reading %d graphs", graph_count)
            self._Done(graph_count)
            return

        if skipped_all_features:
          self._excluded_graph_files.add(graph_path)

    self._Done(graph_count)
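_Worker hands results to its consumer only through self._outq, so a reader thread can drain the <graph, features> pairs like this (a minimal sketch; the real class presumably coordinates shutdown via _Done(), which is not shown in the source above, so this sketch simply times out when the producer goes quiet):

import queue

def DrainPairs(outq: queue.Queue, timeout: float = 60.0):
  # Hypothetical consumer: yield <graph, features> pairs until no new
  # pair arrives within `timeout` seconds.
  while True:
    try:
      yield outq.get(block=True, timeout=timeout)
    except queue.Empty:
      return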