def Run(self):
    """Write one CSV row per program graph found in the feature files.

    Optionally emits a header row, then for every ProgramGraphFeaturesList
    file in self.files writes: analysis name, graph name, the graph's index
    within the file, the number of per-node "data_flow_value" labels, and
    the "data_flow_step_count" value (0 when absent).
    """
    if self.write_header:
        self.writer.writerow(
            ("analysis", "graph_name", "i", "label_count", "step_count")
        )
    # Assigning the enumerate counter to self.ctx.i keeps an external
    # progress context updated as files are consumed.
    for self.ctx.i, features_path in enumerate(self.files):
        graph_name = features_path.name[: -len(".ProgramGraphFeaturesList.pb")]
        features_list = pbutil.FromFile(
            features_path, program_graph_features_pb2.ProgramGraphFeaturesList()
        )
        for index, graph_features in enumerate(features_list.graph):
            step_counts = graph_features.features.feature[
                "data_flow_step_count"
            ].int64_list.value
            step_count = step_counts[0] if step_counts else 0
            label_count = len(
                graph_features.node_features.feature_list["data_flow_value"].feature
            )
            self.writer.writerow(
                (self.analysis, graph_name, index, label_count, step_count)
            )
def Main():
    """Main entry point."""
    root = pathlib.Path(FLAGS.path)
    graphs_dir = root / "test"
    labels_dir = root / "labels" / FLAGS.analysis
    for graph_path in graphs_dir.iterdir():
        prefix = graph_path.name[: -len("ProgramGraph.pb")]
        features_path = labels_dir / f"{prefix}ProgramGraphFeaturesList.pb"
        # Features may not have been generated for every program graph, and
        # (very defensively) the graph file itself may have vanished since we
        # enumerated the directory; skip either case. The second check mainly
        # matters during development, when the dataset may be modified while
        # test jobs are running.
        if not (graph_path.is_file() and features_path.is_file()):
            continue
        graph = pbutil.FromFile(graph_path, program_graph_pb2.ProgramGraph())
        node_count = len(graph.node)
        if not node_count or node_count > FLAGS.max_graph_node_count:
            continue
        features_list = pbutil.FromFile(
            features_path, program_graph_features_pb2.ProgramGraphFeaturesList()
        )
        for j, features in enumerate(features_list.graph):
            step_counts = features.features.feature[
                "data_flow_step_count"
            ].int64_list.value
            print(features_path.name, j, step_counts[0] if step_counts else 0)
def TestOne(
    features_list_path: pathlib.Path,
    features_list_index: int,
    checkpoint_path: pathlib.Path,
) -> BatchResults:
    """Run a restored GGNN on a single <graph, features> pair.

    Args:
      features_list_path: Path to a ProgramGraphFeaturesList proto file.
      features_list_index: Index of the features entry to test within it.
      checkpoint_path: Path to the model checkpoint proto to restore.

    Returns:
      The graph annotated with the batch results.
    """
    dataset_root = pathlib.Path(pathflag.path())

    # Load the requested features and the program graph they belong to. The
    # graph file shares a name stem with the features-list file.
    features_list = pbutil.FromFile(
        features_list_path,
        program_graph_features_pb2.ProgramGraphFeaturesList(),
    )
    features = features_list.graph[features_list_index]
    graph_name = features_list_path.name[: -len(".ProgramGraphFeaturesList.pb")]
    graph = pbutil.FromFile(
        dataset_root / "graphs" / f"{graph_name}.ProgramGraph.pb",
        program_graph_pb2.ProgramGraph(),
    )

    # Instantiate and restore the model.
    vocab = vocabulary.LoadVocabulary(
        dataset_root,
        model_name="cdfg" if FLAGS.cdfg else "programl",
        max_items=FLAGS.max_vocab_size,
        target_cumfreq=FLAGS.target_vocab_cumfreq,
    )
    # NOTE(review): position embeddings appear to be deliberately disabled
    # for CDFG inputs — confirm against the model's training configuration.
    if FLAGS.cdfg:
        FLAGS.use_position_embeddings = False

    model = Ggnn(
        vocabulary=vocab,
        test_only=True,
        node_y_dimensionality=2,
        graph_y_dimensionality=0,
        graph_x_dimensionality=0,
        use_selector_embeddings=True,
    )
    model.RestoreCheckpoint(
        pbutil.FromFile(checkpoint_path, checkpoint_pb2.Checkpoint())
    )

    # Build exactly one batch containing the single pair; max_node_size is
    # effectively unbounded so the graph is never split.
    batches = list(
        DataflowGgnnBatchBuilder(
            graph_loader=SingleGraphLoader(graph=graph, features=features),
            vocabulary=vocab,
            max_node_size=int(1e9),
            use_cdfg=FLAGS.cdfg,
            max_batch_count=1,
        )
    )
    results = model.RunBatch(epoch_pb2.TEST, batches[0])
    return AnnotateGraphWithBatchResults(graph, features, results)
def RunAnalysis(
    analysis: str, graph: program_graph_pb2.ProgramGraph
) -> program_graph_features_pb2.ProgramGraphFeaturesList:
    """Run the given analysis.

    Args:
      analysis: The name of the analysis to run.
      graph: The program graph to analyze.

    Returns:
      A program graph features list.

    Raises:
      ValueError: In case analysis fails.
    """
    # The pybind bridge exchanges serialized protos, so round-trip through
    # the wire format on both sides of the call.
    serialized = analysis_pybind.RunAnalysis(analysis, graph.SerializeToString())
    features = program_graph_features_pb2.ProgramGraphFeaturesList()
    features.ParseFromString(serialized)
    return features
def _Worker(self) -> None:
    """Threaded graph reader.

    Repeatedly scans the graph directory and pushes <graph, features> pairs
    onto self._outq until min_graph_count is satisfied, max_graph_count is
    reached, the file list is exhausted, or self._stopped is set. Files that
    can never yield a pair are added to self._excluded_graph_files so later
    passes skip them.
    """
    graph_files = list(self.graph_path.iterdir())
    app.Log(
        2, "Enumerated %s graph files to load", humanize.Commas(len(graph_files))
    )
    graph_count = 0
    # min_graph_count == 0/None means a single pass; otherwise keep
    # re-scanning until enough graphs have been produced.
    while not self.min_graph_count or graph_count < self.min_graph_count:
        # Strip any graph files that we have earmarked for ignoring.
        graph_files = [
            f for f in graph_files if f not in self._excluded_graph_files
        ]
        # We may have run out of files.
        if not graph_files:
            self._Done(graph_count)
            return
        if self.seed:
            # If we are setting a reproducible seed, first sort the list of files
            # since iterdir() order is undefined, then seed the RNG for the
            # shuffle.
            graph_files = sorted(graph_files, key=lambda x: x.name)
            # Change the seed so that on the next execution of this loop we will
            # chose a different random ordering.
            self.seed += 1
            random.Random(self.seed).shuffle(graph_files)
        for graph_path in graph_files:
            # Cooperative cancellation: abandon the current pass.
            if self._stopped:
                break
            # Map "foo.ProgramGraph.pb" to its features-list file
            # "foo.ProgramGraphFeaturesList.pb" in the labels directory.
            stem = graph_path.name[: -len("ProgramGraph.pb")]
            name = f"{stem}ProgramGraphFeaturesList.pb"
            features_path = self.labels_path / name
            # There is no guarantee that we have generated features for this
            # program graph, so we check for its existence. As a *very* defensive
            # measure, we also check for the existence of the graph file that we
            # enumerated at the start of this function. This check can be removed
            # later, it is only useful during development when you might be
            # modifying the dataset at the same time as having test jobs running.
            if not graph_path.is_file() or not features_path.is_file():
                self.skip_count += 1
                continue
            # Read the graph from disk, maybe performing a cheeky wee conversion
            # to CDFG format.
            app.Log(3, "Read %s", features_path)
            if self.use_cdfg:
                graph = cdfg.FromProgramGraphFile(graph_path)
            else:
                graph = pbutil.FromFile(graph_path, program_graph_pb2.ProgramGraph())
            # A falsy result means the load/conversion failed; never retry
            # this file.
            if not graph:
                app.Log(2, "Failed to load graph %s", graph_path)
                self._excluded_graph_files.add(graph_path)
                continue
            # Skip empty graphs.
            if not len(graph.node) or len(graph.node) > FLAGS.max_graph_node_count:
                app.Log(
                    2,
                    "Graph node count %s is not in range (1,%s]",
                    len(graph.node),
                    FLAGS.max_graph_node_count,
                )
                self._excluded_graph_files.add(graph_path)
                continue
            # Skip a graph without inst2vec
            if self.require_inst2vec and not len(
                graph.features.feature["inst2vec_annotated"].int64_list.value
            ):
                app.Log(2, "Skipping graph without inst2vec annotations")
                continue
            features_list = pbutil.FromFile(
                features_path, program_graph_features_pb2.ProgramGraphFeaturesList()
            )
            # Iterate over the features list to yield <graph, features> pairs.
            skipped_all_features = True
            for j, features in enumerate(features_list.graph):
                # Missing "data_flow_step_count" is treated as 0 steps.
                step_count_feature = features.features.feature[
                    "data_flow_step_count"
                ].int64_list.value
                step_count = step_count_feature[0] if len(step_count_feature) else 0
                if self.data_flow_step_max and step_count > self.data_flow_step_max:
                    self.skip_count += 1
                    app.Log(
                        3,
                        "Skipped graph with data_flow_step_count %d > %d "
                        "(skipped %d / %d, %.2f%%)",
                        step_count,
                        self.data_flow_step_max,
                        self.skip_count,
                        (graph_count + self.skip_count),
                        (self.skip_count / (graph_count + self.skip_count)) * 100,
                    )
                    continue
                graph_count += 1
                # Optional audit trail of exactly which pairs were emitted.
                if self.logfile:
                    self.logfile.write(f"{features_path} {j}\n")
                # Blocking put provides backpressure when the consumer is slow.
                self._outq.put((graph, features), block=True)
                skipped_all_features = False
                if self.max_graph_count and graph_count >= self.max_graph_count:
                    app.Log(2, "Stopping after reading %d graphs", graph_count)
                    self._Done(graph_count)
                    return
            # If every features entry was over the step limit, this file can
            # never produce a pair; exclude it from future passes.
            if skipped_all_features:
                self._excluded_graph_files.add(graph_path)
    self._Done(graph_count)