def LoadPositiveNegativeProtos(path: pathlib.Path) -> PositiveNegativeDataset:
  """Load positive and negative training protos from a directory."""
  positive_protos = [
      pbutil.FromFile(p, fish_pb2.CompilerCrashDiscriminatorTrainingExample())
      for p in path.iterdir() if p.name.startswith('positive-')
  ]
  logging.info('Loaded %s positive protos',
               humanize.intcomma(len(positive_protos)))
  negative_protos = [
      pbutil.FromFile(p, fish_pb2.CompilerCrashDiscriminatorTrainingExample())
      for p in path.iterdir() if p.name.startswith('negative-')
  ]
  logging.info('Loaded %s negative protos',
               humanize.intcomma(len(negative_protos)))
  return PositiveNegativeDataset(positive_protos, negative_protos)
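# NOTE: PositiveNegativeDataset is referenced above but not defined in this
# snippet. A minimal sketch of the assumed container, modeled as a named
# tuple; the field names are illustrative assumptions, not taken from the
# source.
import collections

PositiveNegativeDataset = collections.namedtuple(
    'PositiveNegativeDataset', ['positive', 'negative'])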
def PostprocessSampleCorpus(instance: clgen.Instance):
  """Create a corpus from the model samples and pre-process."""
  sample_dir = instance.model.SamplerCache(instance.sampler)
  # Read the sample protos and write them to a directory of content files.
  contentfiles_dir = pathlib.Path(str(sample_dir) + '.contentfiles')
  contentfiles_dir.mkdir(exist_ok=True)
  logging.info('Writing output contentfiles to %s', contentfiles_dir)
  if len(list(contentfiles_dir.iterdir())) != len(list(sample_dir.iterdir())):
    for proto_path in sample_dir.iterdir():
      sample = pbutil.FromFile(proto_path, model_pb2.Sample())
      with open(contentfiles_dir / proto_path.name, 'w') as f:
        f.write(sample.text)
  logging.info('Creating output corpus')
  output_corpus_config = corpus_pb2.Corpus()
  output_corpus_config.CopyFrom(instance.model.corpus.config)
  output_corpus_config.local_directory = str(contentfiles_dir)
  # We derive the programming language name from the input corpus directory.
  # This depends on corpuses being in directories named after their language,
  # e.g. ~/corpuses/opencl, or ~/corpuses/java.
  preprocessed_dir = instance.model.corpus.preprocessed.database_path.parent
  language = (preprocessed_dir / 'contentfiles').resolve().name
  output_corpus_config.preprocessor[:] = POSTPROCESSORS[language]
  output_corpus = corpuses.Corpus(output_corpus_config)
  try:
    output_corpus.Create()
  except errors.EmptyCorpusException:
    pass
  return output_corpus
def main(argv):
  """Main entry point."""
  if len(argv) > 1:
    raise app.UsageError("Unknown arguments: '{}'.".format(' '.join(argv[1:])))
  start_time = time.time()
  instances = [
      clgen.Instance(p) for p in pbutil.FromFile(
          pathlib.Path(FLAGS.instances), clgen_pb2.Instances()).instance
  ]
  random.shuffle(instances)
  candidate_instances = collections.deque(instances)
  logging.info('Loaded %d instances in %s ms', len(candidate_instances),
               humanize.intcomma(int((time.time() - start_time) * 1000)))
  while candidate_instances:
    instance = candidate_instances.popleft()
    with instance.Session():
      if IsEligible(instance):
        logging.info('Found an eligible candidate to work on')
        SampleModel(instance)
        PostprocessSampleCorpus(instance)
      else:
        logging.info('Candidate is ineligible')
        candidate_instances.append(instance)
        time.sleep(1)
  logging.info('Done.')
def _ReadTestDataStoreFiles() -> datastore_pb2.DataStoreTestSet:
  """Read the config protos for testing.

  The datastore names are derived from the file names.

  Returns:
    A DataStoreTestSet instance.

  Raises:
    AssertionError: In case of error reading datastore configs.
  """
  paths = list(
      pathlib.Path('deeplearning/deepsmith/tests/data/datastores').iterdir())
  assert paths
  names = [p.stem for p in paths]
  protos = [pbutil.FromFile(path, datastore_pb2.DataStore()) for path in paths]
  datastore_set = datastore_pb2.DataStoreTestSet()
  for name, proto in zip(names, protos):
    # There's no graceful error handling here, but it's important that we don't
    # run tests on a datastore unless it's specifically marked as testonly.
    assert proto.testonly
    dst_proto = datastore_set.values[name]
    dst_proto.MergeFrom(proto)
  assert len(datastore_set.values) == len(protos) == len(names) == len(paths)
  return datastore_set
def main(argv):
  """Main entry point."""
  if len(argv) > 1:
    raise app.UsageError('Unrecognized arguments')
  # Parse flags and instantiate testing objects.
  if not FLAGS.interesting_results_dir:
    raise app.UsageError('--interesting_results_dir must be set')
  interesting_results_dir = pathlib.Path(FLAGS.interesting_results_dir)
  if interesting_results_dir.exists() and not interesting_results_dir.is_dir():
    raise app.UsageError('--interesting_results_dir must be a directory')
  logging.info('Recording interesting results in %s.',
               interesting_results_dir)
  for path in interesting_results_dir.iterdir():
    result = pbutil.FromFile(path, deepsmith_pb2.Result())
    print(f'=== BEGIN INTERESTING RESULT {path.stem} ===')
    print('Outcome:', deepsmith_pb2.Result.Outcome.Name(result.outcome))
    print()
    print('OpenCL kernel')
    print('-------------')
    print(fmt.Indent(2, result.testcase.inputs['src']))
    print()
    print('Stdout')
    print('------')
    print(fmt.Indent(2, result.outputs['stderr']))
    print()
def ContentFiles(self) -> typing.Iterable[scrape_repos_pb2.ContentFile]:
  """Return an iterator over all contentfiles in the repo."""
  if self.IsIndexed():
    return (pbutil.FromFile(f, scrape_repos_pb2.ContentFile())
            for f in self.index_dir.iterdir() if f.name != 'DONE.txt')
  else:
    return []
def main(argv):
  """Main entry point."""
  if len(argv) > 1:
    raise app.UsageError("Unknown arguments '{}'".format(', '.join(argv[1:])))
  clone_list_path = pathlib.Path(FLAGS.clone_list or '')
  if not clone_list_path.is_file():
    raise app.UsageError('--clone_list is not a file.')
  clone_list = pbutil.FromFile(clone_list_path,
                               scrape_repos_pb2.LanguageCloneList())
  if not FLAGS.export_path:
    raise app.UsageError('--export_path not set.')
  export_path = pathlib.Path(FLAGS.export_path)
  export_path.mkdir(parents=True, exist_ok=True)

  # To export from contentfiles database.
  # for language in clone_list.language:
  #   d = pathlib.Path(language.destination_directory)
  #   d = d.parent / (str(d.name) + '.db')
  #   db = contentfiles.ContentFiles(d)
  #   with db.Session() as session:
  #     (export_path / language.language).mkdir(exist_ok=True)
  #     ExportDatabase(session, export_path / language.language)

  # To export from index directory.
  for language in clone_list.language:
    index_path = pathlib.Path(language.destination_directory + '.index')
    if index_path.is_dir():
      (export_path / language.language).mkdir(exist_ok=True)
      ExportIndex(index_path, export_path / language.language)
def main(argv) -> None:
  """Main entry point."""
  if len(argv) > 1:
    raise app.UsageError('Too many command-line arguments.')
  tiers = pbutil.FromFile(pathlib.Path(FLAGS.data_tiers),
                          data_tiers_pb2.DataTiers())
  for tier in tiers.directory:
    logging.info('Processing %s', tier.path)
    _SetDirectorySize(tier)
  if FLAGS.summary:
    # Print the size per directory.
    df = pd.DataFrame([{
        'Path': d.path,
        'Tier': d.tier,
        'Size': humanize.naturalsize(d.size_bytes),
        'Size (bytes)': d.size_bytes,
    } for d in tiers.directory if d.size_bytes])
    df = df.sort_values(['Tier', 'Size (bytes)'], ascending=[True, False])
    print(df[['Path', 'Tier', 'Size']].to_string(index=False))
    # Print the total size per tier.
    df2 = df.groupby('Tier').sum()
    df2['Size'] = [
        humanize.naturalsize(d['Size (bytes)']) for _, d in df2.iterrows()
    ]
    df2 = df2.reset_index()
    df2 = df2.sort_values('Tier')
    print()
    print('Totals:')
    print(df2[['Tier', 'Size']].to_string(index=False))
  else:
    print(tiers)
def main(argv):
  """Main entry point."""
  if len(argv) > 1:
    raise app.UsageError("Unknown arguments '{}'".format(', '.join(argv[1:])))
  clone_list_path = pathlib.Path(FLAGS.clone_list or '')
  if not clone_list_path.is_file():
    raise app.UsageError('--clone_list is not a file.')
  clone_list = pbutil.FromFile(clone_list_path,
                               scrape_repos_pb2.LanguageCloneList())
  # Error early if the config contains invalid preprocessors.
  for language in clone_list.language:
    for importer in language.importer:
      [preprocessors.GetPreprocessorFunction(p) for p in importer.preprocessor]
  pool = multiprocessing.Pool(FLAGS.processes)
  for language in clone_list.language:
    d = pathlib.Path(language.destination_directory)
    d = d.parent / (str(d.name) + '.db')
    db = contentfiles.ContentFiles(d)
    if pathlib.Path(language.destination_directory).is_dir():
      ImportFromLanguage(db, language, pool)
def test_FromFile_FileNotFoundError(suffix):
  """Test that FileNotFoundError raised if file doesn't exist."""
  with tempfile.TemporaryDirectory(prefix='labm8_proto_') as d:
    with pytest.raises(FileNotFoundError):
      pbutil.FromFile(
          pathlib.Path(d) / f'proto{suffix}', test_protos_pb2.TestMessage())
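# NOTE: The `suffix` argument used by the pbutil tests in this file is
# presumably a parametrized pytest fixture that sweeps the file extensions
# pbutil can read and write. A minimal sketch, assuming the parameter list
# (the exact extensions are an assumption, not taken from the source):
import pytest


@pytest.fixture(scope='function', params=['.pbtxt', '.json', '.pb'])
def suffix(request) -> str:
  """A test fixture which yields a proto file suffix."""
  return request.param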
def main(argv) -> None:
  """Main entry point."""
  if len(argv) > 1:
    raise app.UsageError('Too many command-line arguments.')
  clone_list_path = pathlib.Path(FLAGS.clone_list or '')
  if not clone_list_path.is_file():
    raise app.UsageError('--clone_list is not a file.')
  clone_list = pbutil.FromFile(clone_list_path,
                               scrape_repos_pb2.LanguageCloneList())
  meta_files = []
  for language in clone_list.language:
    directory = pathlib.Path(language.destination_directory)
    if directory.is_dir():
      meta_files += [
          pathlib.Path(directory / f) for f in directory.iterdir()
          if IsRepoMetaFile(f)
      ]
  random.shuffle(meta_files)
  worker = AsyncWorker(meta_files)
  logging.info('Cloning %s repos from GitHub ...',
               humanize.intcomma(worker.max))
  bar = progressbar.ProgressBar(max_value=worker.max, redirect_stderr=True)
  worker.start()
  while worker.is_alive():
    bar.update(worker.i)
    worker.join(.5)
  bar.update(worker.i)
def EpochTelemetry(self) -> typing.List[telemetry_pb2.ModelEpochTelemetry]:
  """Return the epoch telemetry files."""
  return [
      pbutil.FromFile(self.logdir / p, telemetry_pb2.ModelEpochTelemetry())
      for p in sorted(self.logdir.iterdir())
      if re.match(r'epoch_\d\d+_telemetry\.pbtxt', str(p.name))
  ]
def GeneratorFromFlag(config_class,
                      generator_class) -> base_generator.GeneratorServiceBase:
  """Instantiate a generator from the --generator_config flag."""
  if not pbutil.ProtoIsReadable(FLAGS.generator_config, config_class()):
    raise app.UsageError(
        f'--generator_config is not a {config_class.__name__} proto')
  config = pbutil.FromFile(pathlib.Path(FLAGS.generator_config),
                           config_class())
  return generator_class(config)
def test_FromFile_required_fields_not_set(suffix):
  """Test that DecodeError raised if required fields not set."""
  with tempfile.NamedTemporaryFile(prefix='labm8_proto_', suffix=suffix) as f:
    pbutil.ToFile(test_protos_pb2.AnotherTestMessage(number=1),
                  pathlib.Path(f.name))
    with pytest.raises(pbutil.DecodeError) as e_info:
      pbutil.FromFile(pathlib.Path(f.name), test_protos_pb2.TestMessage())
    assert f"Required fields not set: '{f.name}'" == str(e_info.value)
def ProtoFromFile(cls, path: pathlib.Path) -> deepsmith_pb2.Testcase:
  """Instantiate a protocol buffer testcase from file.

  Args:
    path: Path to the testcase proto file.

  Returns:
    Testcase message instance.
  """
  return pbutil.FromFile(path, deepsmith_pb2.Testcase())
def test_FromFile_required_fields_not_set_uninitialized_okay(suffix):
  """Test that DecodeError not raised if required fields not set."""
  with tempfile.NamedTemporaryFile(prefix='labm8_proto_', suffix=suffix) as f:
    proto_in = test_protos_pb2.AnotherTestMessage(number=1)
    pbutil.ToFile(test_protos_pb2.AnotherTestMessage(number=1),
                  pathlib.Path(f.name))
    pbutil.FromFile(pathlib.Path(f.name), test_protos_pb2.TestMessage(),
                    uninitialized_okay=True)
def ProtoFromFile(cls, path: pathlib.Path) -> deepsmith_pb2.Result:
  """Instantiate a protocol buffer result from file.

  Args:
    path: Path to the result proto file.

  Returns:
    Result message instance.
  """
  return pbutil.FromFile(path, deepsmith_pb2.Result())
def __init__(self, path: pathlib.Path):
  self.path = path.absolute()
  self.cache = cache.FSCache(self.path)
  self.corpus = NullCorpus()
  self.config = pbutil.FromFile(self.path / 'META.pbtxt',
                                internal_pb2.ModelMeta()).config
  self.atomizer = atomizers.AtomizerBase.FromFile(self.path / 'atomizer')
  self.backend = {
      model_pb2.NetworkArchitecture.TENSORFLOW:
          tensorflow_backend.TensorFlowBackend,
      model_pb2.NetworkArchitecture.KERAS: keras_backend.KerasBackend,
  }[self.config.architecture.backend](self.config, self.cache, self.atomizer)
def PackDataPackage(package_dir: pathlib.Path) -> None:
  """Create an archive and sidecar of a package."""
  manifest = pbutil.FromFile(package_dir / 'MANIFEST.pbtxt',
                             dpack_pb2.DataPackage())
  PackageManifestIsValid(package_dir, manifest)
  archive_path = (
      package_dir / f'../{package_dir.name}.dpack.tar.bz2').resolve()
  sidecar_path = (package_dir / f'../{package_dir.name}.dpack.pbtxt').resolve()
  CreatePackageArchive(package_dir, manifest, archive_path)
  CreatePackageArchiveSidecar(archive_path, manifest, sidecar_path)
def DoFlagsAction():
  """Do the action requested by the command line flags."""
  if not FLAGS.config:
    raise app.UsageError("Missing required argument: '--config'")
  config_path = pathlib.Path(FLAGS.config)
  if not config_path.is_file():
    raise app.UsageError(f"File not found: '{config_path}'")
  config = pbutil.FromFile(config_path, clgen_pb2.Instance())
  os.environ['PWD'] = str(config_path.parent)
  if FLAGS.clgen_profiling:
    prof.enable()
  instance = Instance(config)
  with instance.Session():
    if FLAGS.print_cache_path == 'corpus':
      print(instance.model.corpus.cache.path)
      return
    elif FLAGS.print_cache_path == 'model':
      print(instance.model.cache.path)
      return
    elif FLAGS.print_cache_path == 'sampler':
      print(instance.model.SamplerCache(instance.sampler))
      return
    elif FLAGS.print_cache_path:
      raise app.UsageError(
          f"Invalid --print_cache_path argument: '{FLAGS.print_cache_path}'")
    if FLAGS.print_preprocessed:
      print(instance.model.corpus.GetTextCorpus(shuffle=False))
      return
    # The default action is to sample the model.
    if FLAGS.stop_after == 'corpus':
      instance.model.corpus.Create()
    elif FLAGS.stop_after == 'train':
      instance.model.Train()
      logging.info('Model: %s', instance.model.cache.path)
    elif FLAGS.stop_after:
      raise app.UsageError(
          f"Invalid --stop_after argument: '{FLAGS.stop_after}'")
    elif FLAGS.export_model:
      instance.model.Train()
      export_dir = pathlib.Path(FLAGS.export_model)
      for path in instance.model.InferenceManifest():
        relpath = pathlib.Path(
            os.path.relpath(path, instance.model.cache.path))
        (export_dir / relpath.parent).mkdir(parents=True, exist_ok=True)
        shutil.copyfile(path, export_dir / relpath)
        print(export_dir / relpath)
    else:
      instance.model.Sample(instance.sampler, FLAGS.min_samples)
def test_ToFile_FromFile_equivalence(suffix):
  """Test that ToFile() and FromFile() are symmetrical."""
  with tempfile.TemporaryDirectory(prefix='labm8_proto_') as d:
    path = pathlib.Path(d) / f'proto{suffix}'
    proto_in = test_protos_pb2.TestMessage(string='abc', number=1)
    pbutil.ToFile(proto_in, path)
    assert path.is_file()
    proto_out = test_protos_pb2.TestMessage()
    pbutil.FromFile(path, proto_out)
    assert proto_out.string == 'abc'
    assert proto_out.number == 1
    assert proto_in == proto_out
def ServiceConfigFromFlag(
    flag_name: str,
    service_config: pbutil.ProtocolBuffer) -> pbutil.ProtocolBuffer:
  """Read a service config proto from the path set by the named flag."""
  if not getattr(FLAGS, flag_name):
    raise app.UsageError(f'--{flag_name} not set.')
  config_path = pathlib.Path(getattr(FLAGS, flag_name))
  if not config_path.is_file():
    cls_name = type(service_config).__name__
    raise app.UsageError(f"{cls_name} file not found: '{config_path}'.")
  return pbutil.FromFile(config_path, service_config)
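# Hypothetical usage sketch of ServiceConfigFromFlag: read a CldriveHarness
# config from the path held by a flag. The flag name 'harness_config' is an
# illustrative assumption, not taken from the source.
config = ServiceConfigFromFlag('harness_config', harness_pb2.CldriveHarness())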
def GetProtos(export_path: pathlib.Path, outcomes: typing.List[str],
              max_src_len: int) -> typing.List[TrainingProto]:
  """Read training protos for the given outcomes, skipping long sources."""
  paths = sorted(labtypes.flatten(
      [list((export_path / outcome).iterdir()) for outcome in outcomes]))
  protos = []
  for path in paths:
    proto = pbutil.FromFile(path, TrainingProto())
    if len(proto.src) > max_src_len:
      continue
    protos.append(proto)
  return protos
def test_config_is_valid():
  """Test that config proto is valid."""
  with tempfile.TemporaryDirectory() as d:
    config = pbutil.FromFile(
        bazelutil.DataPath(
            'phd/deeplearning/clgen/tests/data/tiny/config.pbtxt'),
        clgen_pb2.Instance())
    # Change the working directory and corpus path to our bazel run dir.
    config.working_dir = d
    config.model.corpus.local_directory = str(bazelutil.DataPath(
        'phd/deeplearning/clgen/tests/data/tiny/corpus.tar.bz2'))
    clgen.Instance(config)
def ImportRepo(session: orm.session.Session,
               language: scrape_repos_pb2.LanguageToClone,
               metafile: pathlib.Path,
               pool: multiprocessing.Pool) -> None:
  """Import contentfiles from repository.

  Args:
    session: A database session to import to.
    language: The language specification for the repo.
    metafile: The repo metafile.
    pool: A multiprocessing pool.
  """
  meta = pbutil.FromFile(metafile, scrape_repos_pb2.GitHubRepoMetadata())
  clone_dir = metafile.parent / f'{meta.owner}_{meta.name}'
  repo = contentfiles.GitHubRepository.GetOrAdd(session, meta)
  repo.language = language.language
  for importer in language.importer:
    if not importer.source_code_pattern:
      logging.error('No source_code_pattern specified! Stopping now.')
      return
    pat = importer.source_code_pattern
    pat = f'{clone_dir}/{pat[1:]}' if pat[0] == '^' else f'{clone_dir}/{pat}'
    cmd = [
        'find', str(clone_dir), '-type', 'f', '-regex', pat, '-not', '-path',
        '*/.git/*'
    ]
    logging.debug('$ %s', ' '.join(cmd))
    paths = subprocess.check_output(
        cmd, universal_newlines=True).rstrip().split('\n')
    if len(paths) == 1 and not paths[0]:
      logging.debug('No files to import from %s', clone_dir)
      return
    logging.info("Importing %s '%s' files from %s ...",
                 humanize.intcomma(len(paths)),
                 importer.source_code_pattern, clone_dir)
    all_files_relpaths = public.GetAllFilesRelativePaths(clone_dir)
    jobs = [
        scrape_repos_pb2.ImportWorker(
            clone_from_url=meta.clone_from_url,
            clone_dir=str(clone_dir),
            abspath=p,
            all_files_relpaths=all_files_relpaths,
            preprocessors=importer.preprocessor,
        ) for p in paths
    ]
    bar = progressbar.ProgressBar(max_value=len(jobs))
    for outputs in bar(pool.imap_unordered(ImportWorker, jobs)):
      for output in outputs:
        session.add(output)
def main(argv):
  """Main entry point."""
  if len(argv) > 1:
    unknown_args = ', '.join(argv[1:])
    raise app.UsageError(f"Unknown arguments {unknown_args}")

  logging.info('Preparing OpenCL testbed.')
  config = harness_pb2.CldriveHarness()
  config.opencl_env.extend([env.OclgrindOpenCLEnvironment().name])
  config.opencl_opt.extend([FLAGS.opencl_opt])
  harness = cldrive.CldriveHarness(config)
  assert len(harness.testbeds) >= 1

  input_directories = FLAGS.input_directories
  logging.info('Reading testcases from: %s', ' '.join(input_directories))

  output_directory = pathlib.Path(FLAGS.output_directory)
  logging.info('Writing results to %s', output_directory)
  output_directory.mkdir(parents=True, exist_ok=True)

  # Load testcases.
  testcase_dirs = [
      pathlib.Path(x) for x in input_directories if pathlib.Path(x).is_dir()
  ]
  if not testcase_dirs:
    raise app.UsageError('No --input_directories found.')
  testcase_paths = labtypes.flatten(
      [[pathlib.Path(y) for y in fs.ls(x, abspaths=True)]
       for x in testcase_dirs])
  testcases = [
      pbutil.FromFile(path, deepsmith_pb2.Testcase())
      for path in testcase_paths
  ]
  logging.info('Read %d testcases.', len(testcases))
  if not len(testcases):
    raise app.UsageError("No testcases found: '{}'".format(
        ' '.join(input_directories)))

  # Execute testcases.
  req = harness_pb2.RunTestcasesRequest()
  req.testbed.CopyFrom(harness.testbeds[0])
  req.testcases.extend(testcases)
  res = harness.RunTestcases(req, None)

  # Write results to file.
  for testcase, result in zip(testcases, res.results):
    result_id = crypto.md5_str(str(testcase))
    pbutil.ToFile(result, output_directory / f'{result_id}.pbtxt')

  logging.info('Executed %d testcases and wrote results to %s',
               len(res.results), output_directory)
  execution_times = [
      result.profiling_events[0].duration_ms for result in res.results
  ]
  logging.info('Average time to evaluate testcase: %.2f ms',
               sum(execution_times) / len(execution_times))
def VerifyManifest(package_dir: pathlib.Path) -> bool:
  """Verify that the MANIFEST.pbtxt file matches the contents."""
  if not (package_dir / 'MANIFEST.pbtxt').is_file():
    logging.info('%s/MANIFEST.pbtxt missing, nothing to do.', package_dir)
    return False
  manifest = pbutil.FromFile(package_dir / 'MANIFEST.pbtxt',
                             dpack_pb2.DataPackage())
  if not PackageManifestIsValid(package_dir, manifest):
    logging.error('Package %s contains errors.', package_dir)
    return False
  logging.info('%s verified. No changes to files in the manifest.',
               package_dir)
  return True
def InitManifest(package_dir: pathlib.Path,
                 contents: typing.List[pathlib.Path], update: bool) -> None:
  """Write the MANIFEST.pbtxt file for a package."""
  manifest = CreatePackageManifest(package_dir, contents)
  manifest_path = package_dir / 'MANIFEST.pbtxt'
  if update and pbutil.ProtoIsReadable(manifest_path, dpack_pb2.DataPackage()):
    old = pbutil.FromFile(manifest_path, dpack_pb2.DataPackage())
    MergeManifests(manifest, old)
  elif manifest_path.is_file():
    raise OSError('Refusing to overwrite MANIFEST.pbtxt file.')
  pbutil.ToFile(manifest, manifest_path)
  logging.info('Wrote %s', manifest_path.absolute())
def main(argv):
  """Main entry point."""
  if len(argv) > 1:
    raise app.UsageError("Unknown arguments: '{}'.".format(' '.join(argv[1:])))
  config = pathlib.Path(FLAGS.generator)
  if not pbutil.ProtoIsReadable(config, generator_pb2.ClgenGenerator()):
    raise app.UsageError(
        '--generator is not a deepsmith.ClgenGenerator proto')
  generator_config = pbutil.FromFile(config, generator_pb2.ClgenGenerator())
  output_directory = pathlib.Path(FLAGS.output_directory)
  GenerateTestcases(generator_config, output_directory, FLAGS.num_testcases)
def EnumerateLanguageInstanceConfigs(
    language: typing.Dict[str, typing.List[str]]
) -> typing.List[clgen_pb2.Instance]:
  """Enumerate the options for a language."""
  configs = []
  for corpus, model, sampler in itertools.product(
      language['corpuses'], EnumerateModels(), language['samplers']):
    instance_config = clgen_pb2.Instance()
    instance_config.working_dir = FLAGS.working_dir
    instance_config.model.CopyFrom(model)
    instance_config.model.corpus.CopyFrom(
        pbutil.FromFile(
            bazelutil.DataPath(
                f'phd/experimental/deeplearning/polyglot/corpuses/{corpus}.pbtxt'
            ), corpus_pb2.Corpus()))
    instance_config.sampler.CopyFrom(
        pbutil.FromFile(
            bazelutil.DataPath(
                f'phd/experimental/deeplearning/polyglot/samplers/{sampler}.pbtxt'
            ), sampler_pb2.Sampler()))
    configs.append(instance_config)
  return configs