def test_ProgramGraphToNetworkX_empty_graph():
    """Converting an empty ProgramGraph yields an empty nx.MultiDiGraph."""
    empty_proto = program_graph_pb2.ProgramGraph()
    converted = nx_format.ProgramGraphToNetworkX(empty_proto)

    assert isinstance(converted, nx.MultiDiGraph)
    # An empty message must not produce any nodes or edges.
    assert converted.number_of_nodes() == 0
    assert converted.number_of_edges() == 0
def test_SerializeInstructionsInProgramGraph_root_node_only():
    """A graph with a single INSTRUCTION node and no edges serializes to []."""
    only_node = node_pb2.Node(type=node_pb2.Node.INSTRUCTION)
    proto = program_graph_pb2.ProgramGraph(node=[only_node])

    serialized = graph_serializer.SerializeInstructionsInProgramGraph(
        proto, max_nodes=1000
    )

    assert serialized == []
def FromProgramGraphFile(path) -> Optional[program_graph_pb2.ProgramGraph]:
    """Convert a binary ProgramGraph message file to a CDFG.

    The file contents are piped to the GRAPH2CDFG binary on stdin (binary
    protocol buffer format in, binary protocol buffer format out) and the
    bytes it writes to stdout are parsed as a ProgramGraph message.

    Conversion failures are reported through the return value, not through an
    exception.

    Args:
      path: The path of a binary ProgramGraph protocol buffer file.

    Returns:
      A ProgramGraph instance, or None if the converter exited with a non-zero
      status or produced output that cannot be decoded as a ProgramGraph.

    Raises:
      OSError: If the file cannot be read or the converter cannot be launched.
    """
    # Read the input up front so the file handle is released before the
    # (potentially slow) converter runs.
    with open(path, "rb") as f:
        serialized_input = f.read()

    completed = subprocess.run(
        [str(GRAPH2CDFG), "--stdin_fmt", "pb", "--stdout_fmt", "pb"],
        input=serialized_input,
        stdout=subprocess.PIPE,
        check=False,  # A non-zero exit status is mapped to None below.
    )
    if completed.returncode:
        return None

    graph = program_graph_pb2.ProgramGraph()
    try:
        graph.ParseFromString(completed.stdout)
    except google.protobuf.message.DecodeError:
        return None
    return graph
def Run(self):
    """Write one CSV row of size statistics for every program graph file.

    When self.write_header is set, a header row is emitted first. Each file in
    self.files is then loaded as a ProgramGraph and a row holding the split
    name, the graph name and the node / edge / function / module counts is
    written to self.writer. self.ctx.i is updated with the index of the file
    currently being processed.
    """
    if self.write_header:
        header = (
            "split",
            "graph_name",
            "node_count",
            "edge_count",
            "function_count",
            "module_count",
        )
        self.writer.writerow(header)

    # The graph name is the file name with this fixed-length suffix cut off.
    suffix_length = len(".ProgramGraph.pb")
    for index, graph_path in enumerate(self.files):
        self.ctx.i = index
        name = graph_path.name[:-suffix_length]
        proto = pbutil.FromFile(graph_path, program_graph_pb2.ProgramGraph())
        row = (
            self.split,
            name,
            len(proto.node),
            len(proto.edge),
            len(proto.function),
            len(proto.module),
        )
        self.writer.writerow(row)
def Main():
    """Print the data flow step count of each labelled graph in the test set.

    For every file in <FLAGS.path>/test that has a matching features list in
    <FLAGS.path>/labels/<FLAGS.analysis>, one line is printed per feature
    entry: the features file name, the entry index and its
    "data_flow_step_count" value (0 when the feature is absent).
    """
    dataset_root = pathlib.Path(FLAGS.path)
    test_graphs_dir = dataset_root / "test"
    labels_dir = dataset_root / "labels" / FLAGS.analysis
    suffix_length = len("ProgramGraph.pb")

    for graph_path in test_graphs_dir.iterdir():
        prefix = graph_path.name[:-suffix_length]
        features_path = labels_dir / f"{prefix}ProgramGraphFeaturesList.pb"

        # Features are not guaranteed to exist for every program graph, so
        # check for them. As a very defensive measure the graph file itself is
        # re-checked too: this only matters during development, when the
        # dataset may be modified while test jobs are running, and can be
        # removed later.
        if not (graph_path.is_file() and features_path.is_file()):
            continue

        graph = pbutil.FromFile(graph_path, program_graph_pb2.ProgramGraph())
        node_count = len(graph.node)
        # Ignore empty graphs and graphs above the configured size limit.
        if node_count == 0 or node_count > FLAGS.max_graph_node_count:
            continue

        features_list = pbutil.FromFile(
            features_path,
            program_graph_features_pb2.ProgramGraphFeaturesList(),
        )
        for j, features in enumerate(features_list.graph):
            step_values = features.features.feature[
                "data_flow_step_count"
            ].int64_list.value
            if len(step_values):
                step_count = step_values[0]
            else:
                step_count = 0
            print(features_path.name, j, step_count)
def TestOne(
    features_list_path: pathlib.Path,
    features_list_index: int,
    checkpoint_path: pathlib.Path,
) -> BatchResults:
    """Run a restored GGNN model on a single <graph, features> pair.

    Args:
      features_list_path: Path of a ProgramGraphFeaturesList file. The graph
        is looked up under "<dataset path>/graphs" using the same name prefix.
      features_list_index: Index of the features entry to evaluate.
      checkpoint_path: Path of the Checkpoint message to restore the model
        from.

    Returns:
      The result of AnnotateGraphWithBatchResults() for the graph, the
      selected features and the results of running the single test batch.
    """
    dataset_root = pathlib.Path(pathflag.path())

    # Load the selected features entry and the graph that it belongs to.
    all_features = pbutil.FromFile(
        features_list_path,
        program_graph_features_pb2.ProgramGraphFeaturesList(),
    )
    features = all_features.graph[features_list_index]
    suffix_length = len(".ProgramGraphFeaturesList.pb")
    graph_name = features_list_path.name[:-suffix_length]
    graph_path = dataset_root / "graphs" / f"{graph_name}.ProgramGraph.pb"
    graph = pbutil.FromFile(graph_path, program_graph_pb2.ProgramGraph())

    # Instantiate and restore the model.
    model_name = "cdfg" if FLAGS.cdfg else "programl"
    vocab = vocabulary.LoadVocabulary(
        dataset_root,
        model_name=model_name,
        max_items=FLAGS.max_vocab_size,
        target_cumfreq=FLAGS.target_vocab_cumfreq,
    )
    if FLAGS.cdfg:
        # Note that this overrides the flag value for the rest of the process.
        FLAGS.use_position_embeddings = False
    model = Ggnn(
        vocabulary=vocab,
        test_only=True,
        node_y_dimensionality=2,
        graph_y_dimensionality=0,
        graph_x_dimensionality=0,
        use_selector_embeddings=True,
    )
    restored = pbutil.FromFile(checkpoint_path, checkpoint_pb2.Checkpoint())
    model.RestoreCheckpoint(restored)

    # Build the batches for the single graph and take the first one.
    loader = SingleGraphLoader(graph=graph, features=features)
    batch_builder = DataflowGgnnBatchBuilder(
        graph_loader=loader,
        vocabulary=vocab,
        max_node_size=int(1e9),
        use_cdfg=FLAGS.cdfg,
        max_batch_count=1,
    )
    batches = list(batch_builder)
    batch = batches[0]

    results = model.RunBatch(epoch_pb2.TEST, batch)
    return AnnotateGraphWithBatchResults(graph, features, results)
def test_SerializeInstructionsInProgramGraph_single_function():
    """Three instructions chained 0 -CALL-> 1 -CONTROL-> 2 serialize to [1, 2]."""
    instructions = [
        node_pb2.Node(type=node_pb2.Node.INSTRUCTION) for _ in range(3)
    ]
    call_edge = edge_pb2.Edge(flow=edge_pb2.Edge.CALL, source=0, target=1)
    control_edge = edge_pb2.Edge(flow=edge_pb2.Edge.CONTROL, source=1, target=2)
    proto = program_graph_pb2.ProgramGraph(
        node=instructions,
        edge=[call_edge, control_edge],
    )

    serialized = graph_serializer.SerializeInstructionsInProgramGraph(
        proto, max_nodes=1000
    )

    assert serialized == [1, 2]
def Main():
    """Entry point of the inst2vec encoder tool.

    Depending on the flags, the encoder is run over FLAGS.dataset, over
    FLAGS.directory, or over a single ProgramGraph obtained from
    ParseStdinOrDie(). In the single-graph case the optional IR file named by
    FLAGS.ir is passed to the encoder and the encoded graph is handed to
    WriteStdout().
    """
    encoder = inst2vec_encoder.Inst2vecEncoder()

    if FLAGS.dataset:
        encoder.RunOnDataset(FLAGS.dataset)
    elif FLAGS.directory:
        encoder.RunOnDirectory(FLAGS.directory)
    else:
        graph = ParseStdinOrDie(program_graph_pb2.ProgramGraph())
        if FLAGS.ir:
            ir_text = fs.Read(FLAGS.ir)
        else:
            ir_text = None
        encoder.Encode(graph, ir_text)
        WriteStdout(graph)
def BuildProgramGraph(
    ir: str,
    options: program_graph_options_pb2.ProgramGraphOptions = DefaultOptions,
    timeout: int = 60,
) -> program_graph_pb2.ProgramGraph:
    """Construct a program graph from an LLVM-IR.

    Args:
      ir: The text of an LLVM-IR Module.
      options: The graph construction options. The message is not modified.
      timeout: The number of seconds to permit before timing out.

    Returns:
      A ProgramGraph instance.

    Raises:
      ValueError: In case graph construction fails (worker exit status 2).
      TimeoutError: If timeout is reached.
      OSError: In case of other error.
    """
    # Work on a private copy of the options: the path of the temporary IR file
    # must not leak into the caller's message, and in particular not into the
    # shared default options object, where it would persist across calls.
    worker_options = program_graph_options_pb2.ProgramGraphOptions()
    worker_options.CopyFrom(options)

    # Write the IR to a temporary file, record its path in the options, and
    # pass the serialized options on stdin to a worker subprocess which
    # generates the graph and produces a ProgramGraph message on stdout.
    with tempfile.NamedTemporaryFile("w") as f:
        f.write(ir)
        f.flush()
        worker_options.ir_path = f.name
        process = subprocess.Popen(
            ["timeout", "-s9", str(timeout), str(GRAPH_BUILDER_BIN)],
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            stdin=subprocess.PIPE,
        )
        # The temporary file must outlive the worker, so wait for it here.
        stdout, stderr = process.communicate(worker_options.SerializeToString())

    returncode = process.returncode
    if returncode == 2:
        raise ValueError(stderr.decode("utf-8").rstrip())
    # A worker killed by signal 9 is treated as a timeout. 9 / -9 are the
    # statuses that were already recognised; 137 (128 + 9) and 124 are the
    # statuses that GNU timeout(1) itself documents for a command that was
    # killed / timed out. TimeoutError is a subclass of OSError, so callers
    # that handle OSError still observe these cases.
    if returncode in (9, -9, 124, 137):
        raise TimeoutError(
            f"Program graph construction exceeded {timeout} seconds")
    if returncode:
        raise OSError(stderr.decode("utf-8").rstrip())

    proto = program_graph_pb2.ProgramGraph()
    proto.ParseFromString(stdout)
    return proto
def Run(self):
    """Measure how often test graph nodes are covered by each vocabulary.

    Every graph in self.graphs is loaded and, for each node, a per-token hit
    counter is incremented for each of the inst2vec, cdfg and programl
    vocabularies that contains the node's token. The inst2vec token is the
    first "inst2vec_preprocessed" feature value (nodes without that feature
    are not counted for inst2vec); the other two vocabularies are keyed on
    node.text. The counters are then passed to ToCsv() together with the
    total node count, one CSV file per vocabulary under <self.path>/vocab.
    self.ctx.i is updated with the 1-based index of the current graph.
    """
    inst2vec_hits = defaultdict(int)
    cdfg_hits = defaultdict(int)
    programl_hits = defaultdict(int)
    total_nodes = 0

    for self.ctx.i, graph_path in enumerate(self.graphs, start=1):
        graph = pbutil.FromFile(graph_path, program_graph_pb2.ProgramGraph())
        for node in graph.node:
            total_nodes += 1

            preprocessed = node.features.feature[
                "inst2vec_preprocessed"
            ].bytes_list.value
            # Nodes without a preprocessed inst2vec statement are skipped.
            if len(preprocessed):
                token = preprocessed[0].decode("utf-8")
                if token in self.inst2vec:
                    inst2vec_hits[token] += 1

            text = node.text
            if text in self.cdfg:
                cdfg_hits[text] += 1
            if text in self.programl:
                programl_hits[text] += 1

    vocab_dir = self.path / "vocab"
    reports = (
        ("inst2vec_test_coverage.csv", inst2vec_hits),
        ("cdfg_test_coverage.csv", cdfg_hits),
        ("programl_test_coverage.csv", programl_hits),
    )
    for filename, hits in reports:
        ToCsv(vocab_dir / filename, hits, total_nodes)
def test_Encode_llvm_program_graph(
    llvm_program_graph: program_graph_pb2.ProgramGraph,
    encoder: inst2vec_encoder.Inst2vecEncoder,
):
    """Black-box test that encoding annotates the nodes of LLVM program graphs."""
    # Encode a copy of the fixture graph rather than the fixture itself.
    graph = program_graph_pb2.ProgramGraph()
    graph.CopyFrom(llvm_program_graph)
    encoder.Encode(graph)

    # This assumes that all of the test graphs have at least one instruction.
    instruction_total = sum(
        node.type == node_pb2.Node.INSTRUCTION for node in graph.node
    )
    assert instruction_total >= 1

    # Every node must carry an embedding; instruction nodes must additionally
    # carry their preprocessed text.
    for node in graph.node:
        node_features = node.features.feature
        assert "inst2vec_embedding" in node_features
        if node.type == node_pb2.Node.INSTRUCTION:
            assert "inst2vec_preprocessed" in node_features
def main(argv):
    """Entry point of the inst2vec encoder tool.

    After init_app(argv), the encoder is run over FLAGS.dataset or
    FLAGS.directory when one of them is set. Otherwise a single ProgramGraph
    is obtained from ParseStdinOrDie(), encoded (using the contents of the
    file named by FLAGS.ir, if given) and handed to WriteStdout().
    """
    init_app(argv)
    encoder = inst2vec_encoder.Inst2vecEncoder()

    if FLAGS.dataset:
        encoder.RunOnDataset(Path(FLAGS.dataset))
        return
    if FLAGS.directory:
        encoder.RunOnDirectory(Path(FLAGS.directory))
        return

    graph = ParseStdinOrDie(program_graph_pb2.ProgramGraph())

    # The IR text is optional; the encoder receives None when it is absent.
    ir_text = None
    if FLAGS.ir:
        with open(FLAGS.ir) as ir_file:
            ir_text = ir_file.read()

    encoder.Encode(graph, ir_text)
    WriteStdout(graph)
def _ProcessRows(job) -> Tuple[int, int, float]:
    """Encode a batch of program graphs that are not yet annotated.

    Args:
      job: A sequence whose first element is the Inst2vecEncoder to use and
        whose second element is a list of (graph path, IR path) pairs.

    Returns:
      A tuple of the number of path pairs in the job, the number of graphs
      for which encoding was attempted, and the elapsed time in seconds.
    """
    began = time.time()
    encoder = job[0]
    path_pairs = job[1]
    attempted = 0

    for graph_path, ir_path in path_pairs:
        graph = pbutil.FromFile(graph_path, program_graph_pb2.ProgramGraph())

        # Check to see if we have already processed this file.
        annotated = graph.features.feature["inst2vec_annotated"].int64_list.value
        if len(annotated):
            continue

        attempted += 1
        try:
            Encode(encoder, graph, graph_path, ir_path)
        except (AssertionError, TimeoutError):
            # The NCC codebase uses assertions to check for errors. Both
            # failed assertions and timeouts are deliberately ignored so that
            # the remaining graphs are still processed.
            pass

    elapsed = time.time() - began
    return len(path_pairs), attempted, elapsed
def BuildProgramGraphProto(
    hlo_proto: xla_pb2.HloProto,
) -> program_graph_pb2.ProgramGraph:
    """Construct a program graph from an HloProto message.

    Args:
      hlo_proto: The HloProto message to build a graph for.

    Returns:
      A ProgramGraph message instance.

    Raises:
      ValueError: If graph construction fails. NOTE(review): this would have
        to originate from the xla_pybind binding; confirm against it.
    """
    # This requires a round trip serialized to / from strings, since I can't
    # figure out a way to get pybind11 to auto-generate bindings for protocol
    # buffers.
    serialized_hlo = hlo_proto.SerializeToString()
    serialized_graph = xla_pybind.BuildProgramGraphProto(serialized_hlo)

    result = program_graph_pb2.ProgramGraph()
    result.ParseFromString(serialized_graph)
    return result
def test_GraphTuple_one_graph():
    """Build a graph tuple from one graph and check edges grouped by flow."""
    # The first two edges do not set a flow; the assertions below expect them
    # to be grouped under CONTROL.
    edges = [
        edge_pb2.Edge(source=0, target=1),
        edge_pb2.Edge(source=0, target=2, position=1),
        edge_pb2.Edge(source=1, target=0, position=10, flow=edge_pb2.Edge.CALL),
    ]
    graph = program_graph_pb2.ProgramGraph(node=[node_pb2.Node()], edge=edges)

    builder = GraphTupleBuilder()
    builder.AddProgramGraph(graph)
    graph_tuple = builder.Build()

    # Adjacency lists, one per flow type.
    adjacencies = graph_tuple.adjacencies
    assert np.array_equal(adjacencies[edge_pb2.Edge.CONTROL], [(0, 1), (0, 2)])
    assert np.array_equal(
        adjacencies[edge_pb2.Edge.DATA], np.zeros((0, 2), dtype=np.int32)
    )
    assert np.array_equal(adjacencies[edge_pb2.Edge.CALL], [(1, 0)])

    # Edge positions, one list per flow type.
    positions = graph_tuple.edge_positions
    assert np.array_equal(positions[edge_pb2.Edge.CONTROL], [0, 1])
    assert np.array_equal(positions[edge_pb2.Edge.DATA], [])
    assert np.array_equal(positions[edge_pb2.Edge.CALL], [10])
def EnumerateLlvmProgramGraphs(
) -> Iterable[Tuple[str, program_graph_pb2.ProgramGraph]]:
    """Enumerate a test set of LLVM program graphs.

    Yields:
      A <file name, ProgramGraph> pair for every entry of the LLVM_IR_GRAPHS
      directory, in the order produced by iterdir().
    """
    for graph_file in LLVM_IR_GRAPHS.iterdir():
        graph = pbutil.FromFile(graph_file, program_graph_pb2.ProgramGraph())
        yield graph_file.name, graph
def _Worker(self) -> None:
    """Threaded graph reader.

    Repeatedly walks the files of self.graph_path, pairs each graph with its
    features list under self.labels_path, and puts <graph, features> tuples
    on self._outq. Passes over the file list continue until
    self.min_graph_count graphs have been produced (indefinitely if that
    value is falsy), the candidate file list becomes empty, or
    self.max_graph_count is reached. self._Done(graph_count) is called on
    each of these exits.

    Files that fail to load, that have a node count outside
    (0, FLAGS.max_graph_node_count], or that yield no usable features are
    recorded in self._excluded_graph_files and dropped from later passes.
    """
    graph_files = list(self.graph_path.iterdir())
    app.Log(
        2, "Enumerated %s graph files to load", humanize.Commas(len(graph_files))
    )

    # The number of <graph, features> pairs put on the output queue so far.
    graph_count = 0
    while not self.min_graph_count or graph_count < self.min_graph_count:
        # Strip any graph files that we have earmarked for ignoring.
        graph_files = [
            f for f in graph_files if f not in self._excluded_graph_files
        ]

        # We may have run out of files.
        if not graph_files:
            self._Done(graph_count)
            return

        if self.seed:
            # If we are setting a reproducible seed, first sort the list of files
            # since iterdir() order is undefined, then seed the RNG for the
            # shuffle.
            graph_files = sorted(graph_files, key=lambda x: x.name)
            # Change the seed so that on the next execution of this loop we will
            # choose a different random ordering.
            self.seed += 1
            random.Random(self.seed).shuffle(graph_files)

        for graph_path in graph_files:
            # NOTE(review): this only leaves the inner loop. The enclosing
            # while loop then starts another pass, which breaks here again,
            # for as long as self._stopped stays set and the while condition
            # holds. Confirm that this is the intended stop behaviour.
            if self._stopped:
                break

            # The features list shares the graph file's name prefix.
            stem = graph_path.name[: -len("ProgramGraph.pb")]
            name = f"{stem}ProgramGraphFeaturesList.pb"
            features_path = self.labels_path / name
            # There is no guarantee that we have generated features for this
            # program graph, so we check for its existence. As a *very* defensive
            # measure, we also check for the existence of the graph file that we
            # enumerated at the start of this function. This check can be removed
            # later, it is only useful during development when you might be
            # modifying the dataset at the same time as having test jobs running.
            # NOTE(review): unlike the other skip paths, the file is not added
            # to the excluded set here, so it is re-checked on every pass.
            if not graph_path.is_file() or not features_path.is_file():
                self.skip_count += 1
                continue

            # Read the graph from disk, maybe performing a cheeky wee conversion
            # to CDFG format.
            app.Log(3, "Read %s", features_path)
            if self.use_cdfg:
                graph = cdfg.FromProgramGraphFile(graph_path)
            else:
                graph = pbutil.FromFile(graph_path, program_graph_pb2.ProgramGraph())

            if not graph:
                app.Log(2, "Failed to load graph %s", graph_path)
                self._excluded_graph_files.add(graph_path)
                continue

            # Skip empty graphs, and graphs that exceed the node count limit.
            if not len(graph.node) or len(graph.node) > FLAGS.max_graph_node_count:
                app.Log(
                    2,
                    "Graph node count %s is not in range (1,%s]",
                    len(graph.node),
                    FLAGS.max_graph_node_count,
                )
                self._excluded_graph_files.add(graph_path)
                continue

            # Skip a graph without inst2vec annotations, when they are required.
            # NOTE(review): the file is not added to the excluded set, so it is
            # loaded again on every pass.
            if self.require_inst2vec and not len(
                graph.features.feature["inst2vec_annotated"].int64_list.value
            ):
                app.Log(2, "Skipping graph without inst2vec annotations")
                continue

            features_list = pbutil.FromFile(
                features_path, program_graph_features_pb2.ProgramGraphFeaturesList()
            )

            # Iterate over the features list to yield <graph, features> pairs.
            skipped_all_features = True
            for j, features in enumerate(features_list.graph):
                # A missing step count feature is treated as zero steps.
                step_count_feature = features.features.feature[
                    "data_flow_step_count"
                ].int64_list.value
                step_count = step_count_feature[0] if len(step_count_feature) else 0
                if self.data_flow_step_max and step_count > self.data_flow_step_max:
                    self.skip_count += 1
                    app.Log(
                        3,
                        "Skipped graph with data_flow_step_count %d > %d "
                        "(skipped %d / %d, %.2f%%)",
                        step_count,
                        self.data_flow_step_max,
                        self.skip_count,
                        (graph_count + self.skip_count),
                        (self.skip_count / (graph_count + self.skip_count)) * 100,
                    )
                    continue
                graph_count += 1
                # Optionally record which features entry was emitted.
                if self.logfile:
                    self.logfile.write(f"{features_path} {j}\n")
                self._outq.put((graph, features), block=True)
                skipped_all_features = False
                if self.max_graph_count and graph_count >= self.max_graph_count:
                    app.Log(2, "Stopping after reading %d graphs", graph_count)
                    self._Done(graph_count)
                    return

            # None of the features of this graph were usable, so there is no
            # point in visiting the file again on a later pass.
            if skipped_all_features:
                self._excluded_graph_files.add(graph_path)

    self._Done(graph_count)
def Build(self) -> program_graph_pb2.ProgramGraph:
    """Return the result of self._Build() parsed as a ProgramGraph message."""
    serialized = self._Build()
    graph = program_graph_pb2.ProgramGraph()
    graph.ParseFromString(serialized)
    return graph
def Main():
    """Pickle the NetworkX form of a ProgramGraph to standard output.

    The ProgramGraph is obtained from ParseStdinOrDie().
    """
    program_graph = ParseStdinOrDie(program_graph_pb2.ProgramGraph())
    nx_graph = nx_format.ProgramGraphToNetworkX(program_graph)
    # Pickle data is binary, so it is written to the underlying byte stream.
    pickle.dump(nx_graph, sys.stdout.buffer)
def main(argv):
    """Pickle the NetworkX form of a ProgramGraph to standard output.

    The application is initialised with init_app(argv) and the ProgramGraph
    is obtained from ParseStdinOrDie().
    """
    init_app(argv)
    program_graph = ParseStdinOrDie(program_graph_pb2.ProgramGraph())
    nx_graph = nx_format.ProgramGraphToNetworkX(program_graph)
    # Pickle data is binary, so it is written to the underlying byte stream.
    pickle.dump(nx_graph, sys.stdout.buffer)
def test_SerializeInstructionsInProgramGraph_empty_graph():
    """An empty ProgramGraph serializes to an empty list."""
    empty_proto = program_graph_pb2.ProgramGraph()

    serialized = graph_serializer.SerializeInstructionsInProgramGraph(
        empty_proto, max_nodes=1000
    )

    assert serialized == []