def main(argv):
    """Main entry point."""
    init_app(argv)
    path = pathlib.Path(FLAGS.path)

    graphs_path = path / "test"
    labels_path = path / "labels" / FLAGS.analysis

    for graph_path in graphs_path.iterdir():
        stem = graph_path.name[:-len("ProgramGraph.pb")]
        name = f"{stem}ProgramGraphFeaturesList.pb"
        features_path = labels_path / name
        # There is no guarantee that we have generated features for this
        # program graph, so we check for its existence. As a *very* defensive
        # measure, we also check for the existence of the graph file that we
        # enumerated at the start of this function. This check can be removed
        # later; it is only useful during development, when you might be
        # modifying the dataset while test jobs are running.
        if not graph_path.is_file() or not features_path.is_file():
            continue

        graph = pbutil.FromFile(graph_path, program_graph_pb2.ProgramGraph())
        if not graph.node or len(graph.node) > FLAGS.max_graph_node_count:
            continue

        features_list = pbutil.FromFile(
            features_path,
            program_graph_features_pb2.ProgramGraphFeaturesList())

        for j, features in enumerate(features_list.graph):
            step_count_feature = features.features.feature[
                "data_flow_step_count"
            ].int64_list.value
            step_count = step_count_feature[0] if step_count_feature else 0
            print(features_path.name, j, step_count)
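
For reference, here is a self-contained sketch of the feature-map lookup used in the loop above, building a small ProgramGraphFeaturesList in memory. The proto module path is an assumption.

from programl.proto import program_graph_features_pb2  # assumed module path

features_list = program_graph_features_pb2.ProgramGraphFeaturesList()
features = features_list.graph.add()
features.features.feature["data_flow_step_count"].int64_list.value.append(7)

for j, features in enumerate(features_list.graph):
    values = features.features.feature["data_flow_step_count"].int64_list.value
    print(j, values[0] if values else 0)  # prints: 0 7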
Example #2
def Encode(encoder, graph, graph_path, ir_path):
    """Encode a program graph and write it back to graph_path.

    The IR file is optional: if it is missing or cannot be decoded, the
    graph is encoded without IR text.
    """
    if ir_path.is_file():
        try:
            ir = pbutil.FromFile(ir_path, ir_pb2.Ir()).text
        except pbutil.DecodeError:
            ir = None
    else:
        ir = None

    encoder.Encode(graph, ir=ir)
    pbutil.ToFile(graph, graph_path)
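
A minimal usage sketch for Encode, assuming the pbutil and proto modules used above live at the import paths shown and that Inst2vecEncoder (the encoder type named in _ProcessRows below) takes no constructor arguments; the file names are hypothetical.

import pathlib

from labm8.py import pbutil  # assumed import path
from programl.proto import program_graph_pb2  # assumed import path
from programl.ir.llvm.inst2vec_encoder import Inst2vecEncoder  # assumed import path

graph_path = pathlib.Path("example.ProgramGraph.pb")  # hypothetical files
ir_path = pathlib.Path("example.Ir.pb")

graph = pbutil.FromFile(graph_path, program_graph_pb2.ProgramGraph())
encoder = Inst2vecEncoder()  # assumed no-argument constructor
Encode(encoder, graph, graph_path, ir_path)  # re-writes graph_path with the encoded graph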
Example #3
    def Run(self):
        """Measure how many test-set nodes each vocabulary covers.

        Writes one CSV of match counts for the inst2vec, CDFG, and ProGraML
        vocabularies respectively.
        """
        inst2vec = defaultdict(int)
        cdfg = defaultdict(int)
        programl = defaultdict(int)
        node_count = 0

        for self.ctx.i, path in enumerate(self.graphs, start=1):
            graph = pbutil.FromFile(path, program_graph_pb2.ProgramGraph())

            for node in graph.node:
                node_count += 1

                try:
                    n = (
                        node.features.feature["inst2vec_preprocessed"]
                        .bytes_list.value[0]
                        .decode("utf-8")
                    )
                    if n in self.inst2vec:
                        inst2vec[n] += 1
                except IndexError:
                    pass

                if node.text in self.cdfg:
                    cdfg[node.text] += 1

                if node.text in self.programl:
                    programl[node.text] += 1

        ToCsv(
            self.path / "vocab" / "inst2vec_test_coverage.csv",
            inst2vec,
            node_count,
        )
        ToCsv(
            self.path / "vocab" / "cdfg_test_coverage.csv",
            cdfg,
            node_count,
        )
        ToCsv(
            self.path / "vocab" / "programl_test_coverage.csv",
            programl,
            node_count,
        )
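
ToCsv is not part of this excerpt. The helper below is only a plausible sketch of it, assuming it writes one row per matched vocabulary entry with its count and its share of all test-set nodes.

import csv
import pathlib
from typing import Dict


def ToCsv(path: pathlib.Path, counts: Dict[str, int], node_count: int) -> None:
    # Hypothetical helper: one row per matched vocabulary entry, most frequent first.
    path.parent.mkdir(parents=True, exist_ok=True)
    with open(path, "w", newline="") as f:
        writer = csv.writer(f)
        writer.writerow(["text", "count", "node_coverage"])
        for text, count in sorted(counts.items(), key=lambda kv: -kv[1]):
            writer.writerow([text, count, count / max(node_count, 1)])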
Example #4
def _ProcessRows(job) -> Tuple[int, int, float]:
    """Encode a batch of program graphs, skipping already-annotated ones.

    Returns a (row count, encoded count, elapsed seconds) tuple.
    """
    start_time = time.time()
    encoded_count = 0

    encoder: Inst2vecEncoder = job[0]
    paths: List[Tuple[pathlib.Path, pathlib.Path]] = job[1]
    for graph_path, ir_path in paths:
        graph = pbutil.FromFile(graph_path, program_graph_pb2.ProgramGraph())
        # Check to see if we have already processed this file.
        if len(graph.features.feature["inst2vec_annotated"].int64_list.value):
            continue

        encoded_count += 1
        try:
            Encode(encoder, graph, graph_path, ir_path)
        except AssertionError:
            # NCC codebase uses assertions to check for errors.
            pass
        except TimeoutError:
            pass
    return len(paths), encoded_count, time.time() - start_time
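
Each job consumed above is an (encoder, paths) pair. Below is a minimal dispatch sketch over a process pool; the EncodeInParallel name, chunk size, and pool setup are illustrative assumptions rather than the original driver code.

import multiprocessing


def EncodeInParallel(encoder, pairs, chunk_size=128):
    # Hypothetical driver: fan (graph_path, ir_path) pairs out to worker processes
    # in chunks shaped the way _ProcessRows expects.
    jobs = [
        (encoder, pairs[i:i + chunk_size]) for i in range(0, len(pairs), chunk_size)
    ]
    with multiprocessing.Pool() as pool:
        for row_count, encoded_count, seconds in pool.imap_unordered(_ProcessRows, jobs):
            print(f"encoded {encoded_count} of {row_count} graphs in {seconds:.1f}s")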