def test_ResolveContentId_pre_encoded_corpus_url_mismatch():
  """Corpuses whose pre-encoded URLs differ must resolve to distinct IDs."""
  mysql_config = corpus_pb2.Corpus(
    pre_encoded_corpus_url="mysql://*****:*****@foo:3306/clgen?charset=utf-8"
  )
  sqlite_config = corpus_pb2.Corpus(
    pre_encoded_corpus_url="sqlite:////tmp/encoded.db"
  )
  mysql_id = corpuses.ResolveContentId(mysql_config)
  sqlite_id = corpuses.ResolveContentId(sqlite_config)
  assert mysql_id != sqlite_id
def EnumerateLanguageInstanceConfigs(
  language: typing.Dict[str, typing.List[str]]
) -> typing.List[clgen_pb2.Instance]:
  """Build one instance config per (corpus, model, sampler) combination.

  Args:
    language: A dictionary with "corpuses" and "samplers" keys, each mapping
      to a list of names. Each name selects a `.pbtxt` data file.

  Returns:
    A list of Instance protos, one for every element of the cartesian product
    of the language's corpuses, the models yielded by EnumerateModels(), and
    the language's samplers.
  """
  combinations = itertools.product(
    language["corpuses"], EnumerateModels(), language["samplers"]
  )
  instance_configs = []
  for corpus_name, model_proto, sampler_name in combinations:
    instance = clgen_pb2.Instance()
    instance.working_dir = FLAGS.working_dir
    instance.model.CopyFrom(model_proto)

    # The corpus is read from its text-format proto data file.
    corpus_path = bazelutil.DataPath(
      f"phd/experimental/deeplearning/polyglot/corpuses/{corpus_name}.pbtxt"
    )
    corpus_proto = pbutil.FromFile(corpus_path, corpus_pb2.Corpus())
    instance.model.corpus.CopyFrom(corpus_proto)

    # As is the sampler.
    sampler_path = bazelutil.DataPath(
      f"phd/experimental/deeplearning/polyglot/samplers/{sampler_name}.pbtxt"
    )
    sampler_proto = pbutil.FromFile(sampler_path, sampler_pb2.Sampler())
    instance.sampler.CopyFrom(sampler_proto)

    instance_configs.append(instance)
  return instance_configs
def test_ResolveContentId_pre_encoded_corpus_url():
  """A pre_encoded_corpus_url config resolves to the checksum of its URL."""
  expected_id = "1fb56a3a74a939ee5be79172b3510a498abe7f3c"
  url = "mysql://*****:*****@foo:3306/clgen?charset=utf-8"
  actual_id = corpuses.ResolveContentId(
    corpus_pb2.Corpus(pre_encoded_corpus_url=url)
  )
  assert actual_id == expected_id
def test_Corpus_pre_encoded_corpus_url_GetTrainingData(abc_pre_encoded):
  """Check the training data returned for a pre-encoded corpus."""
  config = corpus_pb2.Corpus(pre_encoded_corpus_url=abc_pre_encoded)
  corpus = corpuses.Corpus(config)
  corpus.Create()
  training_data = corpus.GetTrainingData(shuffle=True)
  # abc_pre_encoded holds two contentfiles with 8 tokens in total.
  assert len(training_data) == 8
def abc_corpus_config(abc_corpus):
  """Return the proto config for a simple corpus rooted at `abc_corpus`."""
  fields = {
    "local_directory": abc_corpus,
    "ascii_character_atomizer": True,
    "contentfile_separator": "\n\n",
  }
  return corpus_pb2.Corpus(**fields)
def PostprocessSampleCorpus(instance: clgen.Instance):
  """Create a corpus from the model samples and pre-process it.

  The sample protos in the instance's sampler cache are written out as plain
  text content files, and a new corpus is created over them using the
  post-processors registered for the input corpus's language.

  Args:
    instance: The CLgen instance whose samples should be post-processed.

  Returns:
    The output corpus. If post-processing leaves no content files, the corpus
    is still returned (the EmptyCorpusException is logged, not propagated).
  """
  sample_dir = instance.model.SamplerCache(instance.sampler)

  # Read the sample protos and write them to a directory of content files.
  contentfiles_dir = pathlib.Path(str(sample_dir) + '.contentfiles')
  contentfiles_dir.mkdir(exist_ok=True)
  logging.info('Writing output contentfiles to %s', contentfiles_dir)
  # Only (re-)export when the file counts disagree. The sample directory is
  # listed once and the listing reused for the export loop.
  sample_paths = list(sample_dir.iterdir())
  num_exported = sum(1 for _ in contentfiles_dir.iterdir())
  if num_exported != len(sample_paths):
    for proto_path in sample_paths:
      sample = pbutil.FromFile(proto_path, model_pb2.Sample())
      with open(contentfiles_dir / proto_path.name, 'w') as f:
        f.write(sample.text)

  logging.info('Creating output corpus')
  output_corpus_config = corpus_pb2.Corpus()
  output_corpus_config.CopyFrom(instance.model.corpus.config)
  output_corpus_config.local_directory = str(contentfiles_dir)
  # We derive the programming language name from the input corpus directory.
  # This depends on corpuses being in directories named after their language,
  # e.g. ~/corpuses/opencl, or ~/corpuses/java.
  #
  # The URL is a string of the form 'sqlite:///<path>'. The path component
  # must be wrapped in pathlib.Path before taking `.parent`, since a plain
  # str has no such attribute.
  preprocessed_db_path = pathlib.Path(
    instance.model.corpus.preprocessed.url[len('sqlite:///'):])
  preprocessed_dir = preprocessed_db_path.parent
  language = (preprocessed_dir / 'contentfiles').resolve().name
  output_corpus_config.preprocessor[:] = POSTPROCESSORS[language]
  output_corpus = corpuses.Corpus(output_corpus_config)
  try:
    output_corpus.Create()
  except errors.EmptyCorpusException:
    # Best effort: an empty post-processed corpus is not fatal, but say so
    # rather than hiding it.
    logging.warning('Post-processed corpus in %s is empty', contentfiles_dir)
  return output_corpus
def MakeClgenInstanceConfig(
  working_dir: pathlib.Path,
  encoded_db: encoded.EncodedContentFiles,
  num_training_epochs: int,
  seed_text: str,
  neurons_per_layer: int,
  num_layers: int,
) -> clgen_pb2.Instance:
  """Assemble the proto config for a CLgen instance.

  Args:
    working_dir: The directory to cache CLgen working files in.
    encoded_db: The encoded content files; its `url` is used as the
      pre-encoded corpus URL.
    num_training_epochs: The number of epochs to train for.
    seed_text: The text to initiate sampling with.
    neurons_per_layer: Number of neurons in a layer.
    num_layers: Number of layers in the network.

  Returns:
    An Instance proto describing a TensorFlow LSTM model trained on the
    pre-encoded corpus, and a sampler seeded with `seed_text`.
  """
  working_dir_str = str(working_dir)

  corpus = corpus_pb2.Corpus(pre_encoded_corpus_url=encoded_db.url)

  architecture = model_pb2.NetworkArchitecture(
    backend=model_pb2.NetworkArchitecture.TENSORFLOW,
    neuron_type=model_pb2.NetworkArchitecture.LSTM,
    neurons_per_layer=neurons_per_layer,
    num_layers=num_layers,
    post_layer_dropout_micros=0,
  )

  optimizer = model_pb2.AdamOptimizer(
    initial_learning_rate_micros=2000,
    learning_rate_decay_per_epoch_micros=50000,
    beta_1_micros=900000,
    beta_2_micros=999000,
    normalized_gradient_clip_micros=5000000,
  )
  training = model_pb2.TrainingOptions(
    num_epochs=num_training_epochs,
    sequence_length=64,
    batch_size=64,
    shuffle_corpus_contentfiles_between_epochs=True,
    adam_optimizer=optimizer,
  )

  model = model_pb2.Model(
    corpus=corpus,
    architecture=architecture,
    training=training,
  )

  # Sampling stops when the braces balance, or at the token limit.
  balanced_braces = sampler_pb2.SampleTerminationCriterion(
    symtok=sampler_pb2.SymmetricalTokenDepth(
      depth_increase_token="{",
      depth_decrease_token="}",
    )
  )
  token_limit = sampler_pb2.SampleTerminationCriterion(
    maxlen=sampler_pb2.MaxTokenLength(maximum_tokens_in_sample=20000)
  )
  sampler = sampler_pb2.Sampler(
    start_text=seed_text,
    batch_size=64,
    sequence_length=1024,
    temperature_micros=1000000,  # = 1.0 real value
    termination_criteria=[balanced_braces, token_limit],
  )

  return clgen_pb2.Instance(
    working_dir=working_dir_str,
    model=model,
    sampler=sampler,
  )
def test_Corpus_GetTextCorpus_separator(clgen_cache_dir, abc_corpus):
  """Check that a custom separator joins the abc corpus contentfiles."""
  del clgen_cache_dir
  config = corpus_pb2.Corpus(
    local_directory=abc_corpus,
    ascii_character_atomizer=True,
    contentfile_separator='\n!!\n',
  )
  corpus = corpuses.Corpus(config)
  corpus.Create()
  # The order of the contentfiles within the text corpus is not known, so
  # only check that each one is present.
  expected_texts = (
    'The cat sat on the mat.',
    'Such corpus.\nVery wow.',
    'Hello, world!',
  )
  for expected_text in expected_texts:
    assert expected_text in corpus.GetTextCorpus(shuffle=False)
  assert corpus.GetTextCorpus(shuffle=False).count('!!') == 2
def CreateCorpusProtoFromFlags() -> corpus_pb2.Corpus:
  """Build a Corpus proto from the clgen_* command line flags.

  Returns:
    A Corpus proto over FLAGS.clgen_corpus_dir which uses the greedy
    multichar atomizer with the OpenCL token list if
    FLAGS.clgen_multichar_tokenizer is set, else the ASCII character atomizer.
  """
  corpus_proto = corpus_pb2.Corpus(
    local_directory=FLAGS.clgen_corpus_dir,
    preprocessor=FLAGS.clgen_preprocessor,
    contentfile_separator="\n\n",
  )
  if not FLAGS.clgen_multichar_tokenizer:
    corpus_proto.ascii_character_atomizer = True
    return corpus_proto

  multichar_atomizer = corpus_pb2.GreedyMulticharAtomizer(
    tokens=TOKEN_LISTS["opencl"]["tokens"]
  )
  corpus_proto.greedy_multichar_atomizer.CopyFrom(multichar_atomizer)
  return corpus_proto
def ResolveEncodedId(content_id: str, config: corpus_pb2.Corpus) -> str:
  """Compute the hash of a corpus of preprocessed and encoded contentfiles.

  The hash combines the ID of the input files with the serialized config
  proto, minus its contentfiles field.

  Args:
    content_id: The ID of the input content files.
    config: The corpus config. It is not modified.

  Returns:
    The result of crypto.sha1_list() over the content ID and the serialized,
    contentfiles-free config.
  """
  # Work on a copy so that the caller's proto is left untouched.
  stripped_config = corpus_pb2.Corpus()
  stripped_config.CopyFrom(config)
  # The content_id already uniquely identifies the input files, so drop the
  # contentfiles field. Corpuses with the same content files delivered
  # through different means (e.g. two separate but identical directories)
  # then have the same hash.
  stripped_config.ClearField("contentfiles")
  serialized_config = stripped_config.SerializeToString()
  return crypto.sha1_list(content_id, serialized_config)
def test_Corpus_GetTrainingData_decode(clgen_cache_dir, abc_corpus):
  """Check the text obtained by decoding the output of GetTrainingData()."""
  del clgen_cache_dir
  config = corpus_pb2.Corpus(
    local_directory=abc_corpus,
    ascii_character_atomizer=True,
    contentfile_separator='\n!!\n',
  )
  corpus = corpuses.Corpus(config)
  corpus.Create()
  indices = corpus.GetTrainingData(shuffle=False)
  decoded = corpus.atomizer.DeatomizeIndices(indices)
  # Every content file, followed by the contentfile separator, must appear.
  expected_chunks = [
    '\nSuch corpus.\nVery wow.\n!!\n',
    'Hello, world!\n!!\n',
    'The cat sat on the mat.\n!!\n',
  ]
  for chunk in expected_chunks:
    assert chunk in decoded
  # The corpus must contain nothing besides those chunks.
  assert len(''.join(expected_chunks)) == len(decoded)
def Create(self) -> None:
  """Create the corpus files and record the corpus in the dashboard database.

  For a config with a pre_encoded_corpus_url, only the dashboard entry is
  written. Otherwise the pre-processed and then the encoded databases are
  created, each under its own lock file, before the dashboard entry is
  written.

  Raises:
    EmptyCorpusException: If there are no content files, or no successfully
      pre-processed files.
  """
  self._created = True
  app.Log(1, "Content ID: %s", self.content_id)

  # Nothing to do for already-encoded databases.
  # TODO(github.com/ChrisCummins/clgen/issues/130): Refactor this after
  # splitting out Corpus class.
  if self.config.HasField("pre_encoded_corpus_url"):
    with self.dashboard_db.Session(commit=True) as session:
      config_to_store = corpus_pb2.Corpus()
      config_to_store.CopyFrom(self.config)
      # Clear the contentfiles field, since we use the content_id to uniquely
      # identify the input files. This means that corpuses with the same content
      # files delivered through different means (e.g. two separate but identical
      # directories) have the same hash.
      config_to_store.ClearField("contentfiles")
      # There is no pre-processed database for a pre-encoded corpus, so the
      # preprocessed URL is stored as an empty string.
      corpus = session.GetOrAdd(
        dashboard_db.Corpus,
        config_proto_sha1=crypto.sha1(
          config_to_store.SerializeToString()),
        config_proto=str(config_to_store),
        preprocessed_url="",
        encoded_url=self.encoded.url,
        summary=self.GetShortSummary(),
      )
      # Flush before reading corpus.id (presumably so that a newly added row
      # has been assigned its ID -- verify against the session implementation).
      session.flush()
      self._dashboard_db_id = corpus.id
    return

  # The lock file lives next to the pre-processed database file.
  preprocessed_lock_path = (
    pathlib.Path(self.preprocessed.url[len("sqlite:///"):]).parent /
    "LOCK")
  with lockfile.LockFile(preprocessed_lock_path):
    self.preprocessed.Create(self.config)
  if not self.preprocessed.size:
    raise errors.EmptyCorpusException(
      f"Pre-processed corpus contains no files: '{self.preprocessed.url}'"
    )
  # Likewise, the encoded database has a lock file in its own directory.
  encoded_lock_path = (
    pathlib.Path(self.encoded.url[len("sqlite:///"):]).parent / "LOCK")
  with lockfile.LockFile(encoded_lock_path):
    # The logged duration covers only the access of self.atomizer.
    start_time = time.time()
    atomizer = self.atomizer
    app.Log(
      1,
      "%s: %s tokens in %s ms",
      type(atomizer).__name__,
      humanize.Commas(atomizer.vocab_size),
      humanize.Commas(int((time.time() - start_time) * 1000)),
    )
    self.encoded.Create(self.preprocessed, atomizer,
                        self.config.contentfile_separator)

  # Add entry to dashboard database
  with self.dashboard_db.Session(commit=True) as session:
    config_to_store = corpus_pb2.Corpus()
    config_to_store.CopyFrom(self.config)
    # Clear the contentfiles field, since we use the content_id to uniquely
    # identify the input files. This means that corpuses with the same content
    # files delivered through different means (e.g. two separate but identical
    # directories) have the same hash.
    config_to_store.ClearField("contentfiles")
    corpus = session.GetOrAdd(
      dashboard_db.Corpus,
      config_proto_sha1=crypto.sha1(
        config_to_store.SerializeToString()),
      config_proto=str(config_to_store),
      preprocessed_url=self.preprocessed.url,
      encoded_url=self.encoded.url,
      summary=self.GetShortSummary(),
    )
    session.flush()
    self._dashboard_db_id = corpus.id
def __init__(self, config: corpus_pb2.Corpus):
  """Instantiate a corpus from a proto config.

  Construction resolves the content, pre-processed and encoded IDs of the
  corpus, creates the matching cache directories and symlinks, and opens the
  pre-processed and encoded databases. The corpus data itself is produced by
  Create().

  Args:
    config: A Corpus message. It is validated and copied, not stored directly.

  Raises:
    TypeError: If the config argument is not a Corpus proto.
    UserError: If the config sets content_id but no pre-processed database
      exists for it. NOTE(review): AssertConfigIsValid() presumably also
      raises this for invalid options -- confirm.
  """
  if not isinstance(config, corpus_pb2.Corpus):
    t = type(config).__name__
    raise TypeError(f"Config must be a Corpus proto. Received: '{t}'")

  # Make a local copy of the configuration.
  self.config = corpus_pb2.Corpus()
  self.config.CopyFrom(AssertConfigIsValid(config))
  self._atomizer = None
  self._created = False
  self.dashboard_db = dashboard_db.GetDatabase()
  self._dashboard_db_id: typing.Optional[int] = None  # Set in Create()

  # An in-memory cache of the encoded contentfiles indices arrays.
  # Set and used in GetTrainingData().
  self._indices_arrays: typing.Optional[typing.List[np.array]] = None

  cache.cachepath("corpus").mkdir(parents=True, exist_ok=True)
  hc = hashcache.HashCache(cache.cachepath("hashcache.db"), "sha1")
  self.content_id = ResolveContentId(self.config, hc)

  # Database of pre-processed files.
  preprocessed_id = ResolvePreprocessedId(self.content_id, self.config)
  cache.cachepath("corpus", "preprocessed",
                  preprocessed_id).mkdir(exist_ok=True, parents=True)
  preprocessed_db_path = cache.cachepath("corpus", "preprocessed",
                                         preprocessed_id, "preprocessed.db")
  # A corpus specified by content_id must already have a pre-processed
  # database in the cache.
  if (self.config.HasField("content_id") and
      not preprocessed_db_path.is_file()):
    raise errors.UserError(
      f"Content ID not found: '{self.content_id}'")
  self.preprocessed = preprocessed.PreprocessedContentFiles(
    f"sqlite:///{preprocessed_db_path}")

  # Create symlink to contentfiles.
  symlink = (
    pathlib.Path(self.preprocessed.url[len("sqlite:///"):]).parent /
    "contentfiles")
  if not symlink.is_symlink():
    # Only configs with a local directory or tar archive get a symlink.
    if config.HasField("local_directory"):
      os.symlink(
        str(
          ExpandConfigPath(
            config.local_directory,
            path_prefix=FLAGS.clgen_local_path_prefix)),
        symlink,
      )
    elif config.HasField("local_tar_archive"):
      os.symlink(
        str(
          ExpandConfigPath(
            config.local_tar_archive,
            path_prefix=FLAGS.clgen_local_path_prefix,
          )),
        symlink,
      )

  # Data of encoded pre-preprocessed files.
  encoded_id = ResolveEncodedId(self.content_id, self.config)
  cache.cachepath("corpus", "encoded", encoded_id).mkdir(exist_ok=True,
                                                         parents=True)
  db_path = cache.cachepath("corpus", "encoded", encoded_id, "encoded.db")
  # TODO(github.com/ChrisCummins/clgen/issues/130): Refactor this conditional
  # logic by making Corpus an abstract class and creating concrete subclasses
  # for the different types of corpus.
  if self.config.HasField("pre_encoded_corpus_url"):
    self.encoded = encoded.EncodedContentFiles(
      config.pre_encoded_corpus_url)
  else:
    self.encoded = encoded.EncodedContentFiles(f"sqlite:///{db_path}")
  self.atomizer_path = cache.cachepath("corpus", "encoded", encoded_id,
                                       "atomizer.pkl")

  # Create symlink to preprocessed files.
  # TODO(github.com/ChrisCummins/clgen/issues/130): Refactor this conditional
  # logic after splitting Corpus class.
  if not self.config.HasField("pre_encoded_corpus_url"):
    symlink = (
      pathlib.Path(self.encoded.url[len("sqlite:///"):]).parent /
      "preprocessed")
    if not symlink.is_symlink():
      # The link target is relative to the encoded directory.
      os.symlink(
        os.path.relpath(
          pathlib.Path(
            self.preprocessed.url[len("sqlite:///"):]).parent,
          pathlib.Path(
            self.encoded.url[len("sqlite:///"):]).parent,
        ),
        symlink,
      )
  self.hash = encoded_id
  self.cache = cache.mkcache("corpus", "encoded", encoded_id)
def __init__(self, config: corpus_pb2.Corpus):
  """Instantiate a corpus from a proto config.

  Construction resolves the content, pre-processed and encoded IDs of the
  corpus, creates the matching cache directories and symlinks, and opens the
  pre-processed and encoded databases.

  Args:
    config: A Corpus message. It is validated and copied, not stored directly.

  Raises:
    TypeError: If the config argument is not a Corpus proto.
    UserError: If the config sets content_id but no pre-processed database
      exists for it. NOTE(review): AssertConfigIsValid() presumably also
      raises this for invalid options -- confirm.
  """
  if not isinstance(config, corpus_pb2.Corpus):
    t = type(config).__name__
    raise TypeError(f"Config must be a Corpus proto. Received: '{t}'")

  # Make a local copy of the configuration.
  self.config = corpus_pb2.Corpus()
  self.config.CopyFrom(AssertConfigIsValid(config))
  self._atomizer = None
  self._created = False

  cache.cachepath('corpus').mkdir(parents=True, exist_ok=True)
  hc = hashcache.HashCache(cache.cachepath('hashcache.db'), 'sha1')
  self.content_id = ResolveContentId(self.config, hc)

  # Database of pre-processed files.
  preprocessed_id = ResolvePreprocessedId(self.content_id, self.config)
  cache.cachepath('corpus', 'preprocessed',
                  preprocessed_id).mkdir(exist_ok=True, parents=True)
  preprocessed_db_path = cache.cachepath('corpus', 'preprocessed',
                                         preprocessed_id, 'preprocessed.db')
  # A corpus specified by content_id must already have a pre-processed
  # database in the cache.
  if (self.config.HasField('content_id') and
      not preprocessed_db_path.is_file()):
    raise errors.UserError(
      f"Content ID not found: '{self.content_id}'")
  self.preprocessed = preprocessed.PreprocessedContentFiles(
    preprocessed_db_path)

  # Create symlink to contentfiles.
  symlink = self.preprocessed.database_path.parent / 'contentfiles'
  if not symlink.is_symlink():
    # Only configs with a local directory or tar archive get a symlink.
    if config.HasField('local_directory'):
      os.symlink(
        str(
          ExpandConfigPath(
            config.local_directory,
            path_prefix=FLAGS.clgen_local_path_prefix)), symlink)
    elif config.HasField('local_tar_archive'):
      os.symlink(
        str(
          ExpandConfigPath(
            config.local_tar_archive,
            path_prefix=FLAGS.clgen_local_path_prefix)), symlink)

  # Data of encoded pre-preprocessed files.
  encoded_id = ResolveEncodedId(self.content_id, self.config)
  cache.cachepath('corpus', 'encoded', encoded_id).mkdir(exist_ok=True,
                                                         parents=True)
  self.encoded = encoded.EncodedContentFiles(
    cache.cachepath('corpus', 'encoded', encoded_id, 'encoded.db'))
  self.atomizer_path = cache.cachepath('corpus', 'encoded', encoded_id,
                                       'atomizer.pkl')

  # Create symlink to preprocessed files.
  symlink = self.encoded.database_path.parent / 'preprocessed'
  if not symlink.is_symlink():
    # The link target is relative to the encoded directory.
    os.symlink(
      os.path.relpath(self.preprocessed.database_path.parent,
                      self.encoded.database_path.parent), symlink)
  self.hash = encoded_id
  self.cache = cache.mkcache('corpus', 'encoded', encoded_id)
def CreateCorpusProtoFromFlags() -> corpus_pb2.Corpus:
  """Build an OpenCL Corpus proto from the clgen_* command line flags.

  Returns:
    A Corpus proto over FLAGS.clgen_corpus_dir with a fixed OpenCL
    preprocessor pipeline. It uses the greedy multichar atomizer with a
    built-in token list if FLAGS.clgen_multichar_tokenizer is set, else the
    ASCII character atomizer.
  """
  # The pipeline is applied in the listed order; repeated entries are
  # intentional and kept exactly as configured.
  preprocessors = [
    "deeplearning.clgen.preprocessors.opencl:ClangPreprocessWithShim",
    "deeplearning.clgen.preprocessors.opencl:Compile",
    "deeplearning.clgen.preprocessors.opencl:NormalizeIdentifiers",
    "deeplearning.clgen.preprocessors.opencl:StripDoubleUnderscorePrefixes",
    "deeplearning.clgen.preprocessors.common:StripDuplicateEmptyLines",
    "deeplearning.clgen.preprocessors.opencl:SanitizeKernelPrototype",
    "deeplearning.clgen.preprocessors.common:StripTrailingWhitespace",
    "deeplearning.clgen.preprocessors.opencl:ClangFormat",
    "deeplearning.clgen.preprocessors.common:MinimumLineCount3",
    "deeplearning.clgen.preprocessors.opencl:StripDoubleUnderscorePrefixes",
    "deeplearning.clgen.preprocessors.opencl:Compile",
  ]
  corpus_proto = corpus_pb2.Corpus(
    local_directory=FLAGS.clgen_corpus_dir,
    preprocessor=preprocessors,
    contentfile_separator='\n\n',
  )

  if not FLAGS.clgen_multichar_tokenizer:
    corpus_proto.ascii_character_atomizer = True
    return corpus_proto

  multichar_tokens = [
    " ",
    "__assert",
    "__attribute",
    "__builtin_astype",
    "__clc_fabs",
    "__clc_fma",
    "__inline",
    "abs",
    "alignas",
    "alignof",
    "atomic_add",
    "auto",
    "barrier",
    "bool",
    "break",
    "case",
    "char",
    "clamp",
    "complex",
    "const",
    "constant",
    "continue",
    "default",
    "defined",
    "do",
    "double",
    "else",
    "enum",
    "error",
    "event_t",
    "extern",
    "fabs",
    "false",
    "float",
    "for",
    "get_global_id",
    "get_global_size",
    "get_local_id",
    "get_local_size",
    "get_num_groups",
    "global",
    "goto",
    "half",
    "if",
    "image1d_array_t",
    "image1d_buffer_t",
    "image1d_t",
    "image2d_array_t",
    "image2d_t",
    "image3d_t",
    "imaginary",
    "include",
    "inline",
    "int",
    "into",
    "kernel",
    "line",
    "local",
    "long",
    "noreturn",
    "pragma",
    "private",
    "quad",
    "read_only",
    "read_write",
    "register",
    "restrict",
    "return",
    "sampler_t",
    "short",
    "shuffle",
    "signed",
    "size_t",
    "sizeof",
    "sqrt",
    "static",
    "struct",
    "switch",
    "true",
    "typedef",
    "u32",
    "uchar",
    "uint",
    "ulong",
    "undef",
    "union",
    "unsigned",
    "void",
    "volatile",
    "while",
    "wide",
    "write_only",
  ]
  corpus_proto.greedy_multichar_atomizer.CopyFrom(
    corpus_pb2.GreedyMulticharAtomizer(tokens=multichar_tokens)
  )
  return corpus_proto
def main(argv: typing.List[str]):
  """Main entry point: sample from a CLgen instance indefinitely.

  Builds a CLgen instance from the command line flags, then repeatedly calls
  Sample() on it with the features database, a CSV profiler and a process
  pool. The loop has no exit condition.

  Args:
    argv: The command line arguments. Anything beyond the program name is
      rejected.

  Raises:
    app.UsageError: If any unknown arguments are given.
  """
  if len(argv) > 1:
    unknown_args = " ".join(argv[1:])
    raise app.UsageError("Unknown arguments: '{}'.".format(unknown_args))

  corpus_config = corpus_pb2.Corpus(
    local_directory=FLAGS.clgen_corpus_dir,
    ascii_character_atomizer=True,
    preprocessor=[
      "deeplearning.clgen.preprocessors.opencl:ClangPreprocessWithShim",
      "deeplearning.clgen.preprocessors.opencl:Compile",
      "deeplearning.clgen.preprocessors.opencl:NormalizeIdentifiers",
      "deeplearning.clgen.preprocessors.opencl:StripDoubleUnderscorePrefixes",
      "deeplearning.clgen.preprocessors.common:StripDuplicateEmptyLines",
      "deeplearning.clgen.preprocessors.opencl:SanitizeKernelPrototype",
      "deeplearning.clgen.preprocessors.common:StripTrailingWhitespace",
      "deeplearning.clgen.preprocessors.opencl:ClangFormat",
      "deeplearning.clgen.preprocessors.common:MinimumLineCount3",
      "deeplearning.clgen.preprocessors.opencl:Compile",
    ],
    contentfile_separator="\n\n",
  )
  architecture_config = model_pb2.NetworkArchitecture(
    backend=model_pb2.NetworkArchitecture.TENSORFLOW,
    neuron_type=model_pb2.NetworkArchitecture.LSTM,
    neurons_per_layer=512,
    num_layers=2,
    post_layer_dropout_micros=0,
  )
  training_config = model_pb2.TrainingOptions(
    num_epochs=50,
    sequence_length=64,
    batch_size=64,
    shuffle_corpus_contentfiles_between_epochs=True,
    adam_optimizer=model_pb2.AdamOptimizer(
      initial_learning_rate_micros=2000,
      learning_rate_decay_per_epoch_micros=50000,
      beta_1_micros=900000,
      beta_2_micros=999000,
      normalized_gradient_clip_micros=5000000,
    ),
  )
  model_config = model_pb2.Model(
    corpus=corpus_config,
    architecture=architecture_config,
    training=training_config,
  )

  # Sampling stops when the braces balance, or at the token limit.
  termination_criteria = [
    sampler_pb2.SampleTerminationCriterion(
      symtok=sampler_pb2.SymmetricalTokenDepth(
        depth_increase_token="{",
        depth_decrease_token="}",
      )
    ),
    sampler_pb2.SampleTerminationCriterion(
      maxlen=sampler_pb2.MaxTokenLength(maximum_tokens_in_sample=20000)
    ),
  ]
  sampler_config = sampler_pb2.Sampler(
    start_text="kernel void ",
    batch_size=64,
    sequence_length=1024,
    temperature_micros=1000000,  # = 1.0 real value
    termination_criteria=termination_criteria,
  )

  instance = clgen.Instance(
    clgen_pb2.Instance(
      working_dir=FLAGS.clgen_dir,
      model=model_config,
      sampler=sampler_config,
    )
  )

  db = grewe_features_db.Database(FLAGS.db)
  profile_dir = pathlib.Path(FLAGS.profile_dir)
  profile_dir.mkdir(parents=True, exist_ok=True)
  profiler = prof.AutoCsvProfiler(profile_dir)

  with instance.Session():
    with multiprocessing.Pool() as pool:
      while True:
        Sample(instance, db, profiler, pool)
def main(argv: typing.List[str]):
  """Main entry point: drive the kernels of an OpenCL corpus through cldrive.

  Creates a pre-processed OpenCL corpus from FLAGS.github_kernels_dir, then
  evaluates the successfully pre-processed kernels in batches of 8. The
  outcomes of each batch are cached as a pickle file under
  FLAGS.result_cache_dir, and a summary table of outcome counts is printed.

  Args:
    argv: The command line arguments. Anything beyond the program name is
      rejected.

  Raises:
    app.UsageError: If any unknown arguments are given.
  """
  if len(argv) > 1:
    raise app.UsageError("Unknown arguments: '{}'.".format(" ".join(
      argv[1:])))

  # Set the CLGEN_CACHE environment variable to a directory under the
  # result cache.
  os.environ["CLGEN_CACHE"] = f"{FLAGS.result_cache_dir}/clgen"
  # An OpenCL corpus, configured as described in CGO'17.
  corpus = corpuses.Corpus(
    corpus_pb2.Corpus(
      local_directory=FLAGS.github_kernels_dir,
      ascii_character_atomizer=True,
      contentfile_separator="\n\n",
      preprocessor=[
        "deeplearning.clgen.preprocessors.opencl:ClangPreprocessWithShim",
        "deeplearning.clgen.preprocessors.opencl:Compile",
        "deeplearning.clgen.preprocessors.opencl:NormalizeIdentifiers",
        "deeplearning.clgen.preprocessors.opencl:StripDoubleUnderscorePrefixes",
        "deeplearning.clgen.preprocessors.common:StripDuplicateEmptyLines",
        "deeplearning.clgen.preprocessors.opencl:SanitizeKernelPrototype",
        "deeplearning.clgen.preprocessors.common:StripTrailingWhitespace",
        "deeplearning.clgen.preprocessors.opencl:ClangFormat",
        "deeplearning.clgen.preprocessors.common:MinimumLineCount3",
        "deeplearning.clgen.preprocessors.opencl:Compile",
      ],
    ))
  corpus.Create()

  # Cached results are keyed by the corpus hash.
  cache_dir = pathlib.Path(FLAGS.result_cache_dir) / corpus.hash
  cache_dir.mkdir(parents=True, exist_ok=True)

  driver = cldrive.CldriveHarness(
    harness_pb2.CldriveHarness(
      opencl_env=[FLAGS.opencl_env],
      opencl_opt=[FLAGS.opencl_opt],
    ))

  with corpus.preprocessed.Session() as session:
    # Query to return all successfully preprocessed OpenCL kernels in a stable
    # order.
    # NOTE: `== True` is part of the query expression being built and must
    # not be rewritten as a plain truthiness test.
    q = (session.query(preprocessed.PreprocessedContentFile.text).filter(
      preprocessed.PreprocessedContentFile.preprocessing_succeeded ==
      True).order_by(preprocessed.PreprocessedContentFile.id))

    num_good_files = q.count()
    num_files = session.query(preprocessed.PreprocessedContentFile).count()
    app.Log(
      1,
      "Corpus of %s files (%.1f%% of %s)",
      humanize.Commas(num_good_files),
      (num_good_files / num_files) * 100,
      humanize.Commas(num_files),
    )

    # Each query row is a 1-tuple holding the kernel text.
    srcs = [x[0] for x in q]
    batch_size = 8
    max_batch = math.ceil(len(srcs) / batch_size)

    all_outcomes = []
    for i, start_idx in enumerate(range(0, len(srcs), batch_size)):
      cached_results_path = cache_dir / f"{i}.pkl"

      if cached_results_path.is_file():
        app.Log(1, "batch %d of %d", i + 1, max_batch)
        # Read cached results.
        # NOTE(review): pickle.load() is only safe on trusted files; this
        # assumes the cache directory is written solely by this script.
        with open(cached_results_path, "rb") as f:
          outcomes = pickle.load(f)
      elif FLAGS.summarize_only:
        # In summarize-only mode, uncached batches are skipped rather than
        # evaluated.
        continue
      else:
        app.Log(1, "batch %d of %d", i + 1, max_batch)
        # Evaluate OpenCL kernels and cache results.
        batch = srcs[start_idx:start_idx + batch_size]
        testcases = labtypes.flatten(
          [OpenClSourceToTestCases(src) for src in batch])
        results = RunTestCasesOrDie(driver, testcases)

        outcomes = [
          GetOutcomeWithDynamicChecks(result, driver)
          for result in results
        ]
        with open(cached_results_path, "wb") as f:
          pickle.dump(outcomes, f)

      all_outcomes += outcomes
      # One row per outcome with a count of 1, plus a "Total" row holding the
      # number of outcomes so far.
      df = pd.DataFrame(
        list(zip(all_outcomes, np.ones(len(all_outcomes)))) +
        [("Total", len(all_outcomes))],
        columns=["outcome", "count"],
      )
      summary = df.groupby("outcome").sum().reset_index()
      summary["ratio"] = [
        f"{x:.2%}" for x in
        # Double the "ratio" values because the 'count' column contains a
        # grand total row.
        2 * summary["count"].values / summary["count"].sum()
      ]
      summary["count"] = [
        humanize.Commas(int(x)) for x in summary["count"]
      ]
      print(summary)
      del df
      del summary
def __init__(self, config: typing.Union[corpus_pb2.Corpus, corpus_pb2.PreTrainCorpus]):
  """Instantiate a corpus from a proto config.

  Construction resolves the content, pre-processed and encoded IDs of the
  corpus and opens the pre-processed and encoded databases. Cache
  directories, symlinks and commit records are only written when
  environment.WORLD_RANK is 0; each such step is followed by
  distrib.barrier().

  Args:
    config: A Corpus or PreTrainCorpus message. It is validated and copied,
      not stored directly. A PreTrainCorpus sets self.pre_train to True.

  Raises:
    TypeError: If the config argument is neither a Corpus nor a
      PreTrainCorpus proto.
    ValueError: If the config sets content_id but no pre-processed database
      exists for it.
  """
  if not isinstance(config, corpus_pb2.Corpus) and not isinstance(config, corpus_pb2.PreTrainCorpus):
    raise TypeError(f"Config must be a Corpus proto. Received: '{type(config).__name__}'")

  # Make a local copy of the configuration.
  if isinstance(config, corpus_pb2.Corpus):
    self.config    = corpus_pb2.Corpus()
    self.pre_train = False
  else:
    self.config    = corpus_pb2.PreTrainCorpus()
    self.pre_train = True

  self.config.CopyFrom(AssertConfigIsValid(config))
  self._tokenizer = None
  self._created = False

  # An in-memory cache of the encoded contentfiles indices arrays.
  # Set and used in GetTrainingData().
  self._indices_arrays: typing.Optional[typing.List[np.array]] = None

  # Only rank 0 touches the filesystem.
  # NOTE(review): distrib.barrier() presumably makes the other ranks wait
  # until rank 0 is done -- confirm against the distrib module.
  if environment.WORLD_RANK == 0:
    cache.cachepath("corpus").mkdir(parents=True, exist_ok=True)
  distrib.barrier()
  self.content_id = ResolveContentId(self.config)
  # Database of pre-processed files.
  preprocessed_id = ResolvePreprocessedId(self.content_id, self.config)
  if environment.WORLD_RANK == 0:
    cache.cachepath("corpus", "preprocessed", preprocessed_id).mkdir(exist_ok=True, parents=True)
  distrib.barrier()
  preprocessed_db_path = cache.cachepath("corpus", "preprocessed",
                                         preprocessed_id, "preprocessed.db")

  # A corpus specified by content_id must already have a pre-processed
  # database in the cache.
  if self.config.HasField("content_id") and not preprocessed_db_path.is_file():
    raise ValueError(f"Content ID not found: '{self.content_id}'")
  self.preprocessed = preprocessed.PreprocessedContentFiles(
    f"sqlite:///{preprocessed_db_path}"
  )
  # Create symlink to contentfiles.
  if environment.WORLD_RANK == 0:
    symlink = (pathlib.Path(self.preprocessed.url[len("sqlite:///") :]).parent / "contentfiles")
    if not symlink.is_symlink():
      # The link points at whichever local content source the config names.
      if config.HasField("local_directory"):
        os.symlink(
          str(ExpandConfigPath(config.local_directory,   path_prefix=FLAGS.clgen_local_path_prefix)),
          symlink,
        )
      elif config.HasField("local_tar_archive"):
        os.symlink(
          str(ExpandConfigPath(config.local_tar_archive, path_prefix=FLAGS.clgen_local_path_prefix)),
          symlink,
        )
      elif config.HasField("bq_database"):
        os.symlink(
          str(ExpandConfigPath(config.bq_database, path_prefix=FLAGS.clgen_local_path_prefix)),
          symlink,
        )
      # elif config.HasField("fetch_github"):
      #   os.symlink(
      #     str(ExpandConfigPath(config.fetch_github, path_prefix=FLAGS.clgen_local_path_prefix)),
      #     symlink,
      #   )
  distrib.barrier()
  # Data of encoded pre-preprocessed files.
  encoded_id = ResolveEncodedId(self.content_id, self.config)
  if environment.WORLD_RANK == 0:
    cache.cachepath("corpus", "encoded", encoded_id).mkdir(exist_ok=True, parents=True)
  distrib.barrier()
  db_path = cache.cachepath("corpus", "encoded", encoded_id, "encoded.db")
  # A pre-encoded corpus uses the URL from the config instead of the cache
  # database.
  if self.config.HasField("pre_encoded_corpus_url"):
    self.encoded = encoded.EncodedContentFiles(config.pre_encoded_corpus_url, self.pre_train)
  else:
    self.encoded = encoded.EncodedContentFiles(f"sqlite:///{db_path}", self.pre_train)
  self.tokenizer_path = cache.cachepath(
    "corpus", "encoded", encoded_id, "tokenizer.pkl"
  )
  # Link the encoded directory back to its pre-processed directory. The link
  # target is relative to the encoded directory.
  if environment.WORLD_RANK == 0 and not self.config.HasField("pre_encoded_corpus_url"):
    symlink = (pathlib.Path(self.encoded.url[len("sqlite:///") :]).parent / "preprocessed")
    if not symlink.is_symlink():
      os.symlink(
        os.path.relpath(
          pathlib.Path(self.preprocessed.url[len("sqlite:///") :]).parent,
          pathlib.Path(self.encoded.url[len("sqlite:///") :]).parent,
          ),
        symlink,
      )
  self.hash = encoded_id
  self.cache = cache.mkcache("corpus", "encoded", encoded_id)
  # Call commit.saveCommit() on both the encoded and the pre-processed cache
  # directories.
  if environment.WORLD_RANK == 0:
    commit.saveCommit(self.cache.path)
    commit.saveCommit(self.cache.path.parent.parent / "preprocessed" / preprocessed_id)
  distrib.barrier()
  l.logger().info("Initialized {}train corpus in {}".format("pre_" if self.pre_train else "", self.cache.path))
  return