def main(argv):
    """Print the data-flow step count of every labelled test graph.

    Walks ``<FLAGS.path>/test``, pairs each program graph file with the
    features list of the same stem under
    ``<FLAGS.path>/labels/<FLAGS.analysis>``, and prints one line per entry
    of that features list: the features file name, the entry index, and the
    first ``data_flow_step_count`` value (0 when that feature is empty).

    Args:
        argv: Command line arguments, forwarded to ``init_app``.
    """
    init_app(argv)

    dataset_root = pathlib.Path(FLAGS.path)
    labels_dir = dataset_root / "labels" / FLAGS.analysis
    graph_suffix_len = len("ProgramGraph.pb")

    for graph_file in (dataset_root / "test").iterdir():
        # Swap the graph suffix for the features-list suffix.
        prefix = graph_file.name[:-graph_suffix_len]
        features_file = labels_dir / f"{prefix}ProgramGraphFeaturesList.pb"

        # Features are not guaranteed to have been generated for every
        # program graph, so the features file may legitimately be missing.
        # The graph file itself is re-checked as a very defensive measure:
        # the dataset may be modified during development while this job is
        # running. That second check can be removed later.
        if not (graph_file.is_file() and features_file.is_file()):
            continue

        graph = pbutil.FromFile(graph_file, program_graph_pb2.ProgramGraph())
        node_total = len(graph.node)
        # Skip empty graphs and graphs above the configured size limit.
        if node_total == 0 or node_total > FLAGS.max_graph_node_count:
            continue

        features_list = pbutil.FromFile(
            features_file,
            program_graph_features_pb2.ProgramGraphFeaturesList(),
        )
        for index, entry in enumerate(features_list.graph):
            values = entry.features.feature[
                "data_flow_step_count"
            ].int64_list.value
            if len(values):
                step_count = values[0]
            else:
                step_count = 0
            print(features_file.name, index, step_count)
def Encode(encoder, graph, graph_path, ir_path):
    """Encode a graph, optionally with its IR text, and write it to disk.

    Args:
        encoder: Object whose ``Encode(graph, ir=...)`` method is invoked.
        graph: The program graph passed to the encoder and then written out.
            Presumably the encoder annotates it in place, since the same
            object is saved afterwards -- confirm against the encoder.
        graph_path: Destination path the graph is written to.
        ir_path: Path of the IR proto. When it is not a file, or cannot be
            decoded, the encoder is called with ``ir=None``.
    """
    # Start from "no IR" and only override on a successful read.
    ir = None
    if ir_path.is_file():
        try:
            ir = pbutil.FromFile(ir_path, ir_pb2.Ir()).text
        except pbutil.DecodeError:
            # An undecodable IR file is treated the same as a missing one.
            pass

    encoder.Encode(graph, ir=ir)
    pbutil.ToFile(graph, graph_path)
def Run(self):
    """Count vocabulary hits over all graphs and write coverage CSVs.

    Reads every graph in ``self.graphs`` and tallies, per vocabulary
    (``self.inst2vec``, ``self.cdfg``, ``self.programl``), how often each
    in-vocabulary token occurs among the nodes. The tallies and the total
    node count are handed to ``ToCsv`` for three files under
    ``self.path / "vocab"``.
    """
    inst2vec_hits = defaultdict(int)
    cdfg_hits = defaultdict(int)
    programl_hits = defaultdict(int)
    node_count = 0

    # The 1-based loop index is bound straight onto self.ctx.i on each
    # iteration.
    for self.ctx.i, graph_file in enumerate(self.graphs, start=1):
        graph = pbutil.FromFile(graph_file, program_graph_pb2.ProgramGraph())
        for node in graph.node:
            node_count += 1

            feature = node.features.feature["inst2vec_preprocessed"]
            try:
                token = feature.bytes_list.value[0].decode("utf-8")
            except IndexError:
                # The node has no inst2vec_preprocessed value; skip it for
                # the inst2vec tally only.
                pass
            else:
                if token in self.inst2vec:
                    inst2vec_hits[token] += 1

            text = node.text
            if text in self.cdfg:
                cdfg_hits[text] += 1
            if text in self.programl:
                programl_hits[text] += 1

    vocab_dir = self.path / "vocab"
    outputs = (
        ("inst2vec_test_coverage.csv", inst2vec_hits),
        ("cdfg_test_coverage.csv", cdfg_hits),
        ("programl_test_coverage.csv", programl_hits),
    )
    for filename, hits in outputs:
        ToCsv(vocab_dir / filename, hits, node_count)
def _ProcessRows(job) -> Tuple[int, int, float]: start_time = time.time() encoded_count = 0 encoder: Inst2vecEncoder = job[0] paths: List[Tuple[pathlib.Path, pathlib.Path]] = job[1] for graph_path, ir_path in paths: graph = pbutil.FromFile(graph_path, program_graph_pb2.ProgramGraph()) # Check to see if we have already processed this file. if len(graph.features.feature["inst2vec_annotated"].int64_list.value): continue encoded_count += 1 try: Encode(encoder, graph, graph_path, ir_path) except AssertionError: # NCC codebase uses assertions to check for errors. pass except TimeoutError: pass return len(paths), encoded_count, time.time() - start_time