def __init__(
    self,
    client: bigquery.Client,
    dataset_id: typing.Optional[str] = None,
    extensions: typing.Optional[typing.List[str]] = None,
):
    """Generic Dataset class constructor. Not to be used directly.

    Args:
      client: BigQuery client used for every query of this dataset.
      dataset_id: Identifier embedded in the dataset name; "generic" is
        used when it is None or empty.
      extensions: File extensions used to build the path filter of the
        queries. None leaves the filter empty.
    """
    self.client = client
    self.dataset, self.tables = self._setupDataset(
        "{}.clgen_{}_github".format(self.client.project, dataset_id or "generic"))

    def _query_config(qt, qr=None, dr=False):
        """Build a job config that truncates and writes into table `qt`."""
        return bigquery.QueryJobConfig(
            destination=self.tables[qt],
            write_disposition='WRITE_TRUNCATE',
            # A fresh list per call: the previous lambda shared one mutable
            # default list across every job config it produced.
            query_parameters=[] if qr is None else qr,
            dry_run=dr,
        )

    self.queryConfig = _query_config

    self.extensions = extensions
    self.query_file_id = ""
    if self.extensions is not None:
        # One substr() comparison per extension, OR-ed together.
        self.query_file_id = " OR ".join([
            "substr(file.path, {}, {}) = '{}'".format(-len(ext), 1 + len(ext), ext)
            for ext in self.extensions
        ])
    self.file_count = None
    # NOTE(review): self.language is not set here; presumably provided by
    # the concrete subclass - confirm.
    l.logger().info("{} dataset initialized.".format(self.language))
    return
def _initTensors(self):
    """Restore previously logged tensors from the JSON log file.

    Does nothing when `self.current_step` is not positive. Otherwise the
    JSON log is read, trailing entries whose 'step' lies beyond
    `self.current_step` are discarded (they were written by a previous
    session and will be overwritten), and every remaining non-'step' key is
    appended to `self.plot_tensors` together with its step.

    If the log file does not exist an error is logged and tracking starts
    from the current point.
    """
    if self.current_step <= 0:
        return
    if not self.jsonfile.exists():
        l.logger().error(
            "Training json log-file not found. Will keep track from this point on."
        )
        return

    with open(self.jsonfile, 'r') as js:
        loaded_tensors = json.load(js)

    # Walk back from the end past every entry written beyond the current
    # step. Bounding the walk at 0 avoids the IndexError the unbounded
    # negative index raised for an empty log or when all entries are newer.
    keep = len(loaded_tensors)
    while keep > 0 and loaded_tensors[keep - 1]['step'] > self.current_step:
        keep -= 1
    self.tensors = loaded_tensors[:keep]

    for entry in self.tensors:
        for key, value in entry.items():
            if key == 'step':
                continue
            series = self.plot_tensors.setdefault(key, {'value': [], 'step': []})
            series['value'].append(value)
            series['step'].append(entry['step'])
    return
def filecount_query(self) -> typing.Tuple[int, int]:
    """
    Count the files matched by this dataset's path filter.

    A dry run is issued first so that the amount of data the query will
    consume can be logged. When FLAGS.bq_wait_permission is set the user is
    asked to confirm; interrupting the prompt returns (0, 0).

    Returns:
      (file count, 0), which is also stored in self.file_count.
    """
    where_clause = ("WHERE " + self.query_file_id) if self.query_file_id else ""
    query = """ SELECT COUNT(*) FROM `bigquery-public-data.github_repos.files` as file {} """.format(where_clause)

    dry_run_config = self.queryConfig('main_files', dr=True)
    dry_run_job = self.client.query(query, job_config=dry_run_config)
    consumption = humanize.naturalsize(dry_run_job.total_bytes_processed)
    l.logger().warn("This query is going to consume {}".format(consumption))
    l.logger().info(query)

    if FLAGS.bq_wait_permission:
        l.logger().warn("Hit any button to continue...")
        try:
            input()
        except KeyboardInterrupt:
            return (0, 0)

    l.logger().info("Running file count query...")
    try:
        job = self.client.query(query)
        # Only the first returned row is used.
        for row in job:
            self.file_count = (row[0], 0)
            return self.file_count
    except google.api_core.exceptions.Forbidden as e:
        l.logger().error(e)
        exit()
def remove_identical_files(self) -> None:
    """Write a de-duplicated copy of the mined corpus to 'distinct_corpus'.

    Reads 'record.json' from self.cache_path, hashes the contents of files
    '0.cl' .. '<total_files - 1>.cl' and keeps the first file seen for each
    distinct hash. The kept files are written under
    self.cache_path / 'distinct_corpus' as '<hash>.cl', next to a copy of
    the record whose 'total_files' is the number of distinct files.

    Nothing is written when 'record.json' does not exist.
    """
    l.logger().info("Removing duplicate files from mined corpus...")
    record_path = self.cache_path / "record.json"
    if not os.path.isfile(str(record_path)):
        return

    with open(record_path, 'r') as f:
        data = json.load(f)
    length = data[1]['total_files']

    # hash -> contents; first occurrence wins.
    cache_map = {}
    for i in range(length):
        with open(self.cache_path / "{}.cl".format(i), 'r') as f:
            cf = f.read()
        cf_hash = crypto.sha256_str(cf)
        if cf_hash not in cache_map:
            cache_map[cf_hash] = cf

    new_path = self.cache_path / "distinct_corpus"
    new_path.mkdir(exist_ok=True, parents=True)
    for cf_hash, cf in cache_map.items():
        with open(new_path / "{}.cl".format(cf_hash), 'w') as f:
            f.write(cf)

    data[1]['total_files'] = len(cache_map)
    with open(new_path / "record.json", 'w') as f:
        json.dump(data, f, indent=2)
    return
def getOpenCLPlatforms() -> None:
    """
    Identify compatible OpenCL platforms for current system.

    Runs `CLDRIVE --clinfo` and stores in the global CL_PLATFORMS the first
    output line starting with "CPU" and the first one starting with "GPU".
    Entries stay None when no such line is found or the command could not
    be run. Failures are logged, never raised.
    """
    global CL_PLATFORMS
    CL_PLATFORMS = {
        'CPU': None,
        'GPU': None,
    }
    command = "{} --clinfo".format(CLDRIVE).split()
    # Pre-bound so that a failure to launch the process does not leave the
    # name undefined for the parsing below.
    stdout = ""
    try:
        proc = subprocess.Popen(
            command,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            universal_newlines=True,
        )
        stdout, stderr = proc.communicate()
        if stderr:
            raise ValueError(stderr)
    except Exception as e:
        # Log the command line rather than the process object, which does
        # not exist when Popen itself fails.
        l.logger().error(command)
        l.logger().error(e)

    for line in stdout.split('\n'):
        for device in ("GPU", "CPU"):
            if line.startswith(device) and not CL_PLATFORMS[device]:
                CL_PLATFORMS[device] = line
    return
def initOrGetQueue(self) -> np.array:
    """
    Return the starting input feed, seeding the feed queue if it is empty.

    When the queue is empty a datapoint is drawn from the loader (the
    loader is restarted from self.dataloader if exhausted), wrapped into an
    ActiveSampleFeed with infinite score and generation 0, and appended to
    the queue. On world rank 0 the datapoint is also stored through
    self.addToDB. A non-empty queue is left untouched.

    Returns:
      Starting input feed of sampling.
    """
    if not self.feed_queue:
        try:
            batch = next(self.loader)
        except StopIteration:
            self.loader = iter(self.dataloader)
            batch = next(self.loader)
        cf = [int(x) for x in batch.squeeze(0)]

        space = self.feat_sampler.feature_space
        features = extractor.ExtractFeatures(
            self.tokenizer.ArrayToCode(cf), [space]
        )[space]
        self.feed_queue.append(
            ActiveSampleFeed(
                input_feed=cf,
                input_features=features,
                input_score=math.inf,
                gen_id=0,
            )
        )
        if environment.WORLD_RANK == 0:
            db_entry = active_feed_database.ActiveInput.FromArgs(
                tokenizer=self.tokenizer,
                id=self.active_db.input_count,
                input_feed=cf,
                input_features=self.feed_queue[-1].input_features,
            )
            self.addToDB(db_entry)

    scores = [str(round(c.input_score, 3)) for c in self.feed_queue]
    l.logger().info("Feed queue input scores: {}".format(', '.join(scores)))
    return self.feed_queue[0].input_feed
def argmax(self, t):
    """Sample argmax from a tensor.

    When self.use_categorical is set, a sample is drawn from a
    RelaxedOneHotCategorical over the logits `t` (temperature
    self.temperature, or 1.0 when it is None) and the argmax of that sample
    over the last dimension is returned. Otherwise the plain argmax of `t`
    over the last dimension is returned.

    Raises:
      ValueError: Propagated from the distribution. Before re-raising, the
        tensor is dumped to ./dump_argmax_error.log unless that file
        already exists.
    """
    if not self.use_categorical:
        # Previously this path hit an unbound local; fall back to a
        # deterministic argmax over the raw logits.
        return torch.argmax(t, dim=-1)

    try:
        ct = torch.distributions.relaxed_categorical.RelaxedOneHotCategorical(
            temperature=self.temperature if self.temperature is not None else 1.0,
            logits=t,
            validate_args=False if "1.9." in torch.__version__ else None,
        ).sample()
    except ValueError:
        p = pathlib.Path("./dump_argmax_error.log").absolute()
        if not p.exists():
            l.logger().error(t.shape)
            l.logger().error(p)
            values, types = [], []
            for d0 in t:
                for d1 in d0:
                    values.append(str(d1) + ", ")
                    if isinstance(d1, torch.Tensor):
                        types.append(str(d1.type()) + ", ")
                    else:
                        types.append(str(type(d1)) + ", ")
            with open(p, 'w') as outf:
                outf.write(
                    str(t.shape) + "\n\n\n" + "".join(values) + "\n\n\n" + "".join(types))
        raise
    return torch.argmax(ct, dim=-1)
def __init__(self, config: active_learning_pb2.ActiveLearner):
    """Instantiate an active learner model.

    Args:
      config: An ActiveLearner message.

    Raises:
      TypeError: If the config argument is not an ActiveLearner proto.

    Any error raised by AssertConfigIsValid for an invalid config
    propagates to the caller.
    """
    # Error early, so that a cache isn't created.
    if not isinstance(config, active_learning_pb2.ActiveLearner):
        t = type(config).__name__
        raise TypeError(
            f"Config must be an ActiveLearner proto. Received: '{t}'")

    self.config = active_learning_pb2.ActiveLearner()
    # Validate config options.
    self.config.CopyFrom(AssertConfigIsValid(config))

    distrib.lock()
    try:
        self.cache = cache.mkcache("active_model")
    finally:
        # Release the lock even when cache creation fails.
        distrib.unlock()

    self.downstream_task = downstream_tasks.DownstreamTask.FromTask(
        self.config.downstream_task, self.config.training_corpus)

    if environment.WORLD_RANK == 0:
        ## Store current commit
        commit.saveCommit(self.cache.path)
    self.backend = active_committee.QueryByCommittee(
        self.config, self.cache, self.downstream_task)
    l.logger().info("Initialized {} in {}".format(self.backend, self.cache.path))
    return
def Specialize(self, tokenizer: tokenizers.TokenizerBase) -> None:
    """Specialize a sampler to a vocabulary.

    This enables the sampler to set state specialized to a specific encoding
    vocabulary. This is guaranteed to be called before SampleIsComplete(),
    and ensures that the vocabulary used for all sample arguments to
    SampleIsComplete() is from this vocabulary.

    Sets self.encoded_start_text and self.tokenized_start_text, then
    specializes every terminator in self.terminators with the same
    tokenizer.

    Args:
      tokenizer: A tokenizer to specialize to.

    Raises:
      ValueError: If the start text cannot be encoded using the vocabulary,
        or if the encoded start text is longer than the sampler sequence
        length.
    """
    try:
        self.encoded_start_text = tokenizer.TokenizeString(self.start_text)
        self.tokenized_start_text = tokenizer.AtomizeString(self.start_text)
    except ValueError as e:
        # Chain the tokenizer's error so that its details are not lost.
        raise ValueError(
            "Sampler start text cannot be encoded using the corpus vocabulary: "
            f"'{self.start_text}'"
        ) from e

    if len(self.encoded_start_text) > self.sequence_length:
        raise ValueError(
            "Encoded sampler start text must be less than sampler sequence "
            f"length. Sampler sequence length={self.sequence_length}, encoded "
            f"start text length={len(self.encoded_start_text)}"
        )
    l.logger().info("Sampling: '{}'\n".format(self.start_text))

    for terminator in self.terminators:
        terminator.Specialize(tokenizer)
def Train(self, **kwargs) -> "Model":
    """Train the model.

    Creates the model, delegates training on self.corpus to the backend
    (forwarding all keyword arguments) and logs a summary built from the
    backend's epoch telemetry.

    Returns:
      The model instance.

    Raises:
      UnableToAcquireLockError: If the model is locked (i.e. there is
        another process currently modifying the model).
    """
    self.Create()
    self.backend.Train(self.corpus, **kwargs)

    telemetry_logs = self.backend.telemetry.EpochTelemetry()
    final_epoch_num = telemetry_logs[-1].epoch_num
    # The BERT backends report progress in steps, the rest in epochs.
    if isinstance(self.backend, tf_bert.tfBert) or isinstance(self.backend, torch_bert.torchBert):
        unit = "steps"
    else:
        unit = "epochs"
    total_wall_time = humanize.intcomma(
        sum(t.epoch_wall_time_ms for t in telemetry_logs))
    final_loss = telemetry_logs[-1].loss

    l.logger().info("Trained model for {} {} in {} ms. "
                    "Training loss: {}.".format(
                        final_epoch_num, unit, total_wall_time, final_loss))
    return self
def repository_query(self) -> typing.Tuple[bigquery.table.RowIterator]:
    """
    Query the distinct repository name / ref pairs of the files matched by
    this dataset's path filter (e.g. OpenCL files).

    A dry run is issued first so that the amount of data the query will
    consume can be logged. When FLAGS.bq_wait_permission is set the user is
    asked to confirm; interrupting the prompt returns (None, None).

    Returns:
      (rows, None) where rows is the iterable result of the query.
    """
    where_clause = ("WHERE " + self.query_file_id) if self.query_file_id else ""
    query = """ SELECT DISTINCT file.repo_name, file.ref FROM `bigquery-public-data.github_repos.files` as file {} """.format(where_clause)

    dry_run_config = self.queryConfig('repositories', dr=True)
    dry_run_job = self.client.query(query, job_config=dry_run_config)
    consumption = humanize.naturalsize(dry_run_job.total_bytes_processed)
    l.logger().warn("This query is going to consume {}".format(consumption))
    l.logger().info(query)

    if FLAGS.bq_wait_permission:
        l.logger().warn("Hit any button to continue...")
        try:
            input()
        except KeyboardInterrupt:
            return (None, None)

    l.logger().info("Retrieving repository list of specs...")
    try:
        job = self.client.query(
            query, job_config=self.queryConfig('repositories'))
        rows = job.result()
    except google.api_core.exceptions.Forbidden as e:
        l.logger().error(e)
        exit()
    return (rows, None)
def FromText(config, contentfile_separator: str, corpus_txt: str):
    """Build the tokenizer selected by config.token_type from corpus text.

    Args:
      config: Object exposing token_type, token_list, mask_tokens and
        wordpiece_tokenization.
      contentfile_separator: Separator forwarded to the AST tokenizer.
      corpus_txt: The corpus text to derive the tokenizer from.

    Returns:
      The result of the FromText constructor of the character, word or AST
      tokenizer.

    Raises:
      NotImplementedError: If config.token_type is none of "character",
        "word" or "ast".
    """
    def _read_opencl_tokens(path):
        # The token list file is JSON with the tokens under opencl/tokens.
        with open(path, 'r') as f:
            return set(json.load(f)['opencl']['tokens'])

    mask_tokens = config.mask_tokens if config.mask_tokens is not None else False
    token_type = config.token_type

    if token_type == "character":
        if config.token_list:
            l.logger().warning(
                "token list in character-based tokenization is going to be ignored."
            )
        return AsciiCharacterTokenizer.FromText(corpus_txt, mask_tokens)

    if token_type == "word":
        token_set = _read_opencl_tokens(config.token_list)
        if config.wordpiece_tokenization is None:
            wpc_tok = False
        else:
            wpc_tok = config.wordpiece_tokenization
        return WordTokenizer.FromText(corpus_txt, token_set, mask_tokens, wpc_tok)

    if token_type == "ast":
        # The token list is optional for the AST tokenizer.
        if config.token_list:
            token_set = _read_opencl_tokens(config.token_list)
        else:
            token_set = set()
        return ASTokenizer.FromText(corpus_txt, token_set, contentfile_separator, mask_tokens)

    raise NotImplementedError
def get_data_features(
    db, tokenizer, size_limit=None
) -> typing.List[typing.Dict[str, typing.Dict[str, float]]]:
    """
    Collect the feature vectors of the datapoints stored in a database.

    Every entry returned by db.get_data_features(tokenizer, size_limit) is
    passed to workers.ContentFeat. Entries whose result lacks one of the
    expected feature spaces are skipped with a warning.

    Args:
      db: Database object exposing get_data_features().
      tokenizer: Forwarded to db.get_data_features().
      size_limit: Forwarded to db.get_data_features().

    Returns:
      One dictionary per datapoint, keyed by "GreweFeatures",
      "AutophaseFeatures" and "InstCountFeatures".
    """
    feature_spaces = ("GreweFeatures", "AutophaseFeatures", "InstCountFeatures")
    datapoints = []
    db_feats = db.get_data_features(tokenizer, size_limit)
    for inp in tqdm.tqdm(db_feats, total=len(db_feats), desc="Fetch data"):
        feats = workers.ContentFeat(inp)
        try:
            # Built in full before appending, so a missing feature space
            # drops the whole datapoint.
            datapoints.append({space: feats[space] for space in feature_spaces})
        except KeyError as e:
            l.logger().warn(e)
    return datapoints
def __init__(
    self,
    workspace: pathlib.Path,
    feature_space: str,
    target: str,
    git_corpus: corpuses.Corpus = None,
):
    """Set up the sampler state for a target benchmark suite.

    Args:
      workspace: Workspace path stored on the instance.
      feature_space: Name of the feature space to select features from.
      target: Key into `targets`; "grid_walk" has no path to resolve.
      git_corpus: Optional corpus whose (contentfile, features) pairs are
        reduced to the entries that have non-empty features in
        `feature_space`. When omitted the reduced corpus is empty.
    """
    self.target = target
    self.benchmarks = []
    if self.target != "grid_walk":
        self.path = pathlib.Path(targets[target]).resolve()
    self.workspace = workspace
    self.feature_space = feature_space

    if git_corpus is None:
        # The parameter defaults to None; previously that default crashed
        # on the attribute access below.
        self.reduced_git_corpus = []
    else:
        self.reduced_git_corpus = [
            (cf, feats[self.feature_space])
            for cf, feats in git_corpus.getFeaturesContents(sequence_length=768)
            if self.feature_space in feats and feats[self.feature_space]
        ]

    self.loadCheckpoint()
    try:
        self.target_benchmark = self.benchmarks.pop(0)
    except IndexError:
        # No benchmarks available.
        self.target_benchmark = None
    else:
        l.logger().info("Target benchmark: {}\nTarget fetures: {}".format(
            self.target_benchmark.name, self.target_benchmark.features))
    return
def __init__(self, *args, **kwargs):
    """Initialize the committee backend.

    Forwards all arguments to the parent constructor, initializes PyTorch
    if needed, seeds its random number generators from the committee
    config and derives the checkpoint, sample and log paths from the cache
    path.
    """
    super(QueryByCommittee, self).__init__(*args, **kwargs)

    # Imported here rather than at module level, as in the original.
    from deeplearning.clgen.util import pytorch
    if not pytorch.initialized:
        pytorch.initPytorch()

    self.pytorch = pytorch
    self.torch = pytorch.torch
    self.torch_tpu_available = pytorch.torch_tpu_available

    seed = self.config.committee.random_seed
    self.torch.manual_seed(seed)
    self.torch.cuda.manual_seed_all(seed)

    cache_root = self.cache.path
    self.ckpt_path = cache_root / "checkpoints"
    self.sample_path = cache_root / "samples"
    self.logfile_path = cache_root / "logs"

    self.validation_results_file = "val_results.txt"
    self.validation_results_path = self.logfile_path / self.validation_results_file

    self.committee = []

    self.is_validated = False
    self.trained = False
    l.logger().info("Active Committee config initialized in {}".format(
        self.cache.path))
    return
def __init__(self, *args, **kwargs):
    """Initialize the TensorFlow BERT backend.

    Forwards all arguments to the parent constructor, initializes
    TensorFlow, resets every model / training attribute to None and sets
    up the checkpoint, log and sample paths plus the training telemetry
    logger.
    """
    super(tfBert, self).__init__(*args, **kwargs)

    # Imported here rather than at module level, as in the original.
    from deeplearning.clgen.util import tf
    tf.initTensorflow()
    self.tf = tf.tf

    # Filled in later; start unset.
    for attribute in (
        "bertAttrs",
        "bert_config",
        "train",
        "sample",
        "predict_generator",
        "sampler",
        "train_batch_size",
        "eval_batch_size",
        "learning_rate",
        "num_train_steps",
        "num_warmup_steps",
    ):
        setattr(self, attribute, None)

    self.ckpt_path = self._ConfigCheckpointParams()
    self.logfile_path = self.cache.path / "logs"
    self.sample_path = self.cache.path / "samples"
    self.telemetry = telemetry.TrainingLogger(self.logfile_path)

    self.is_validated = False
    l.logger().info("BERT Model config initialized in {}".format(
        self.cache.path))
    return
def yield_cl_kernels(
    path: pathlib.Path
) -> typing.List[typing.Tuple[pathlib.Path, str, str]]:
    """
    Fetch all cl files from base path and atomize, preprocess
    kernels to single instances.

    Original benchmarks extracted from suites, go through a series of
    pre-processors:
      1. Include statements are removed.
      2. Code is preprocessed with shim (macro expansion).
      3. Double underscores are removed.
      4. void kernel -> kernel void
      5. Translation units are split to tuples of
         (kernel, utility/global space)

    Args:
      path: Base path handed to iter_cl_files().

    Returns:
      The concatenation of the kernel batches returned by
      preprocessor_worker for every content file.
    """
    contentfiles = iter_cl_files(path)
    kernels = []
    pool = multiprocessing.Pool()
    try:
        # pool.map blocks until every batch is ready, so the progress bar
        # only tracks the concatenation below.
        for kernel_batch in tqdm.tqdm(
                pool.map(preprocessor_worker, contentfiles),
                total=len(contentfiles),
                desc="Yield {} benchmarks".format(path.stem)):
            kernels += kernel_batch
        pool.close()
    except Exception:
        # Do not leave worker processes behind when a worker fails.
        pool.terminate()
        raise
    finally:
        pool.join()
    l.logger().info("Pre-processed {} OpenCL benchmarks".format(len(kernels)))
    return kernels
def _ConfigCheckpointParams(self):
    """Resolve the checkpoint directory according to --select_checkpoint_step.

    With -1 the model's own "checkpoints" directory under the cache path is
    used. With a non-negative step, the files of that checkpoint (plus
    "checkpoint" and "graph.pbtxt") are copied into a directory under /tmp
    that mirrors the cache path relative to the parent of $CLGEN_CACHE, and
    that directory is used instead.

    Returns:
      Path of the checkpoint directory to use.

    Raises:
      FileNotFoundError: If the index file of the selected step is missing.
      ValueError: If the flag is neither -1 nor non-negative.
    """
    step = FLAGS.select_checkpoint_step

    if step == -1:
        ckpt_path = self.cache.path / "checkpoints"
    elif step >= 0:
        ckpt_current = self.cache.path / "checkpoints"
        index_file = ckpt_current / "model.ckpt-{}.index".format(step)
        if not index_file.exists():
            raise FileNotFoundError(index_file)

        cache_parent = pathlib.Path(os.environ.get("CLGEN_CACHE")).parent
        workspace_rel_path = self.cache.path.relative_to(cache_parent)
        ckpt_path = pathlib.Path("/tmp" / workspace_rel_path / "checkpoints")
        ckpt_path.mkdir(exist_ok=True, parents=True)

        for fixed_name in ("checkpoint", "graph.pbtxt"):
            shutil.copy2(ckpt_current / fixed_name, ckpt_path)
        pattern = str(ckpt_current / "model.ckpt-{}.*".format(step))
        for ckpt_file in glob.glob(pattern):
            shutil.copy2(ckpt_file, ckpt_path)
        l.logger().warn(
            "Explicit checkpoint selected. Explicit checkpoints can only be used for validation or sampling."
        )
    else:
        raise ValueError(
            "Invalid value {} for --select_checkpoint_step".format(step))

    l.logger().info("Configured model checkpoints in {}".format(ckpt_path))
    return ckpt_path
def to_unique_samples(db: SamplesDatabase, out_db: SamplesDatabase) -> None:
    """
    Read input database, pass through deterministic re-writer and keep only unique samples.

    Every sample of `db` is hashed in a process pool by ContentHash_worker.
    The first sample seen for each distinct hash is kept, and the kept
    samples are written to `out_db` with ids 0, 1, 2, ...

    Samples for which the worker returns None (it failed and logged a
    warning) are skipped.

    Args:
      db: Input samples database.
      out_db: Database that receives the unique samples.
    """
    # Load the input before creating the pool, so that a failing read does
    # not leave worker processes behind.
    inp_data = list(db.get_data)
    visited = set()
    data = []
    pool = multiprocessing.Pool()
    try:
        for result in tqdm.tqdm(pool.imap_unordered(ContentHash_worker, inp_data),
                                total=len(inp_data),
                                desc="Unique-fy samples database"):
            if result is None:
                # The worker could not hash this sample; unpacking None
                # would abort the whole run.
                continue
            sha, sample = result
            if sha not in visited:
                visited.add(sha)
                data.append(sample)
        pool.close()
    except Exception as e:
        l.logger().error(e)
        pool.terminate()
        raise
    finally:
        pool.join()

    with out_db.Session() as s:
        for idx, dp in enumerate(tqdm.tqdm(data, total=len(data), desc="Adding to DB")):
            new_dp = get_sample(dp)
            new_dp.id = idx
            s.add(new_dp)
        s.commit()
    return
def save_pretrained(self, save_directory):
    """
    Save a model and its configuration file to a directory, so that it can be re-loaded using the
    `:func:`~transformers.PreTrainedModel.from_pretrained`` class method.

    If `save_directory` is an existing file, an error is logged and nothing
    is saved.

    Arguments:
        save_directory (:obj:`str`):
            Directory to which to save. Will be created if it doesn't exist.
    """
    if os.path.isfile(save_directory):
        l.logger().error("Provided path ({}) should be a directory, not a file".format(save_directory))
        return
    os.makedirs(save_directory, exist_ok=True)

    # Only save the model itself if we are using distributed training
    if hasattr(self, "module"):
        model_to_save = self.module
    else:
        model_to_save = self

    # Attach architecture to the config
    model_to_save.config.architectures = [model_to_save.__class__.__name__]

    # If we save using the predefined names, we can load using `from_pretrained`
    output_model_file = os.path.join(save_directory, WEIGHTS_NAME)

    on_xla = getattr(self.config, "xla_device", False)
    if not on_xla:
        model_to_save.config.save_pretrained(save_directory)
        torch.save(model_to_save.state_dict(), output_model_file)
    else:
        if pytorch.xla_model.is_master_ordinal():
            # Save configuration file
            model_to_save.config.save_pretrained(save_directory)
        # pytorch.xla_model.save takes care of saving only from master
        pytorch.xla_model.save(model_to_save.state_dict(), output_model_file)

    l.logger().info("Model weights saved in {}".format(output_model_file))
def to_unique_samples(db: EncodedContentFiles, out_db: EncodedContentFiles, tokenizer) -> None:
    """
    Read input database, pass through deterministic re-writer and keep only unique samples.

    Every encoded content file of `db` is hashed in a process pool by
    ContentHash_worker. The first file seen for each distinct hash is kept,
    and the kept files are written to `out_db` with indices 0, 1, 2, ...

    Files for which the worker returns None (it failed and logged a
    warning) are skipped.

    Args:
      db: Input encoded database.
      out_db: Database that receives the unique content files.
      tokenizer: Forwarded to ContentHash_worker.
    """
    # Load the input before creating the pool, so that a failing read does
    # not leave worker processes behind.
    with db.Session() as s:
        inp_data = list(s.query(EncodedContentFile).all())

    visited = set()
    data = []
    f = functools.partial(ContentHash_worker, tokenizer = tokenizer)
    pool = multiprocessing.Pool()
    try:
        for result in tqdm.tqdm(pool.imap_unordered(f, inp_data),
                                total = len(inp_data),
                                desc = "Unique-fy encoded database"):
            if result is None:
                # The worker could not hash this file; unpacking None would
                # abort the whole run.
                continue
            sha, cfile = result
            if sha not in visited:
                visited.add(sha)
                data.append(cfile)
        pool.close()
    except Exception as e:
        l.logger().error(e)
        pool.terminate()
        raise
    finally:
        pool.join()

    with out_db.Session() as s:
        for idx, dp in enumerate(tqdm.tqdm(data, total = len(data), desc = "Adding to DB")):
            new_dp = EncodedContentFile.FromEncodedContentFile(dp, idx = idx)
            s.add(new_dp)
        s.commit()
    return
def GetDatabase() -> DashboardDatabase:
    """Open the dashboard database of the workspace.

    The database is the sqlite file 'dashboard.db' inside the absolute path
    of FLAGS.workspace_dir; it is not required to exist beforehand.

    Returns:
      The DashboardDatabase instance.
    """
    workspace = os.path.abspath(FLAGS.workspace_dir)
    db_url = "sqlite:///{}/dashboard.db".format(workspace)
    db: DashboardDatabase = DashboardDatabase(url=db_url, must_exist=False)
    l.logger().info("Created dashboard database {}".format(db.url))
    return db
def InitSampling(self,
                 sampler : samplers.Sampler,
                 seed    : typing.Optional[int] = None,
                 corpus = None,
                 ) -> None:
    """This is called only once. Performs basic initialization of sampling.

    Builds the sampling batch generator, configures the sampling
    parameters, loads the checkpoint of the sampling model and resets the
    sampling iterators.

    Args:
      sampler: Sampler whose batch size is used for the generator.
      seed: Optional seed forwarded to the batch generator.
      corpus: Optional corpus forwarded to the batch generator.
    """
    # The argument order below is positional and must be kept.
    generator_args = (
        self.config.training,
        sampler,
        self.tokenizer,
        seed,
        sampler.batch_size,
        self.config.architecture.max_position_embeddings,
        self.cache.path,
        corpus,
        self.feature_encoder,
        self.feature_tokenizer,
        self.feature_sequence_length,
    )
    data_generator = torchLMDataGenerator.SampleMaskLMBatchGenerator(*generator_args)
    self._ConfigSampleParams(data_generator, sampler)

    ckpt_step = self.loadCheckpoint(self.sample)
    if self.pytorch.num_gpus > 0:
        self.torch.cuda.empty_cache()
    if ckpt_step >= 0:
        l.logger().info("Loaded checkpoint step {}".format(ckpt_step))

    self.step_inputs = None
    self.loader = None
    self.pred_iterator = None

    samples_dir = self.sample_path / self.sampler.hash
    l.logger().info("Initialized model samples in {}".format(samples_dir))
    return
def IsDone(self, session: sqlutil.Session):
    """Tell whether the database is marked as completely pre-processed.

    Args:
      session: Database session used to look up the "done" Meta entry.

    Returns:
      True if a Meta row with key "done" exists, or if
      FLAGS.override_preprocessing is set (a warning is logged in that
      case). False otherwise.
    """
    done_entry = session.query(Meta).filter(Meta.key == "done").first()
    if done_entry:
        return True
    if FLAGS.override_preprocessing:
        l.logger().warn("Overriding incomplete pre-processed DB.")
        return True
    return False
def __init__(self,
             path: pathlib.Path,
             name: str,
             extension: str):
    """Set up ZIP storage.

    Args:
      path: Forwarded to the parent storage constructor.
      name: Forwarded to the parent storage constructor.
      extension: Forwarded to the parent storage constructor.
    """
    super(zipStorage, self).__init__(path, name, extension)

    self.cached_content = []
    self.flush_counter = 20000
    self.file_count = 0
    # NOTE(review): loadRepos is referenced without being called;
    # presumably a property - confirm.
    self.repos = self.loadRepos
    self.data_file = ""

    message = "Set up ZIP storage in {}".format(self.cache_path)
    l.logger().info(message)
def _ConfigTrainParams(self, data_generator: tfLMDataGenerator) -> None:
    """
    Model parameter initialization for training and validation.

    Configures the model parameters if they are not set yet, derives the
    training hyper-parameters from the config and the data generator,
    builds the TPU run configuration and stores the resulting estimator,
    paired with the data generator, in self.train.

    Args:
      data_generator: Provides steps_per_epoch and num_epochs and is
        attached to the estimator.
    """
    if self.bert_config is None:
        self._ConfigModelParams()

    training = self.config.training
    self.train_batch_size = training.batch_size
    self.eval_batch_size = training.batch_size
    self.learning_rate = training.adam_optimizer.initial_learning_rate_micros / 1e6
    self.num_warmup_steps = training.num_warmup_steps

    self.steps_per_epoch = data_generator.steps_per_epoch
    self.num_epochs = data_generator.num_epochs
    self.num_train_steps = self.steps_per_epoch * self.num_epochs
    self.max_eval_steps = FLAGS.max_eval_steps

    self.validation_results_file = "val_results.txt"
    self.validation_results_path = os.path.join(
        str(self.logfile_path), self.validation_results_file)

    tpu_cluster_resolver = None
    if FLAGS.use_tpu and FLAGS.tpu_name:
        tpu_cluster_resolver = self.tf.distribute.cluster_resolver.TPUClusterResolver(
            FLAGS.tpu_name, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project)

    # NOTE(review): mirroring is only enabled when use_tpu is also set -
    # confirm this condition is intended.
    if FLAGS.use_tpu and FLAGS.mirror_gpus:
        train_distribute = self.tf.distribute.MirroredStrategy(
            num_gpus=gpu.numGPUs())
    else:
        train_distribute = None

    is_per_host = self.tf.compat.v1.estimator.tpu.InputPipelineConfig.PER_HOST_V2
    # Checkpoints, summaries and step logs are all emitted once per epoch.
    run_config = self.tf.compat.v1.estimator.tpu.RunConfig(
        cluster=tpu_cluster_resolver,
        master=FLAGS.master,
        model_dir=str(self.ckpt_path),
        save_checkpoints_steps=self.steps_per_epoch,
        save_summary_steps=self.steps_per_epoch,
        keep_checkpoint_max=0,
        log_step_count_steps=self.steps_per_epoch,
        train_distribute=train_distribute,
        tpu_config=self.tf.compat.v1.estimator.tpu.TPUConfig(
            iterations_per_loop=self.steps_per_epoch,
            num_shards=FLAGS.num_tpu_cores,
            per_host_input_for_training=is_per_host))

    model_fn = self._model_fn_builder(bert_config=self.bert_config)

    # If TPU is not available, this will fall back to normal Estimator on CPU
    # or GPU.
    estimator = self.tf.compat.v1.estimator.tpu.TPUEstimator(
        use_tpu=FLAGS.use_tpu,
        model_fn=model_fn,
        config=run_config,
        params=None,
        train_batch_size=self.train_batch_size,
        eval_batch_size=self.eval_batch_size,
    )
    self.train = tfBert.BertEstimator(estimator, data_generator)

    l.logger().info(self.GetShortSummary())
    return
def ContentHash_worker(sample: Sample) -> typing.Optional[typing.Tuple[str, Sample]]:
    """
    Return new sample along with content hash of code.

    Args:
      sample: Sample whose `text` is hashed with opencl.ContentHash.

    Returns:
      (content hash, sample), or None when hashing raised; the exception
      is logged as a warning and not propagated, so callers must handle a
      None result.
    """
    try:
        content_hash = opencl.ContentHash(sample.text)
    except Exception as e:
        # Best effort: a sample that cannot be hashed is reported and
        # dropped.
        l.logger().warn(e)
        return None
    return content_hash, sample
def __init__(self, *args, **kwargs):
    """Initialize the PyTorch BERT backend.

    Forwards all arguments to the parent constructor, initializes PyTorch
    if needed, configures the optional feature encoder from the
    architecture config, seeds the random number generators and sets up
    the checkpoint, sample and log paths plus the telemetry loggers.
    """
    super(torchBert, self).__init__(*args, **kwargs)

    # Imported here rather than at module level, as in the original.
    from deeplearning.clgen.util import pytorch
    if not pytorch.initialized:
        pytorch.initPytorch()

    architecture = self.config.architecture
    uses_feature_encoder = architecture.HasField("feature_encoder") and architecture.feature_encoder
    if not uses_feature_encoder:
        self.feature_encoder = False
        self.feature_tokenizer = None
        self.feature_sequence_length = None
    else:
        self.feature_encoder = True
        self.feature_tokenizer = tokenizers.FeatureTokenizer.FromArgs(
            self.config.architecture.feature_singular_token_thr,
            self.config.architecture.feature_max_value_token,
            self.config.architecture.feature_token_range
        )
        self.feature_sequence_length = self.config.architecture.feature_sequence_length

    self.pytorch = pytorch
    self.torch = pytorch.torch
    self.torch_tpu_available = pytorch.torch_tpu_available

    seed = self.config.training.random_seed
    self.torch.manual_seed(seed)
    self.torch.cuda.manual_seed_all(seed)

    self.bertAttrs = {}
    self.featureAttrs = {}
    # Filled in later; start unset.
    for attribute in (
        "bert_config",
        "train",
        "sample",
        "predict_generator",
        "sampler",
        "train_batch_size",
        "eval_batch_size",
        "learning_rate",
        "num_train_steps",
    ):
        setattr(self, attribute, None)

    cache_root = self.cache.path
    self.ckpt_path = cache_root / "checkpoints"
    self.sample_path = cache_root / "samples"
    self.logfile_path = cache_root / "logs"

    has_pre_train = self.config.HasField("pre_train_corpus")
    if has_pre_train:
        self.pre_logfile_path = self.logfile_path / "pre_train"
    self.telemetry = telemetry.TrainingLogger(self.logfile_path)
    if has_pre_train:
        self.pre_telemetry = telemetry.TrainingLogger(self.logfile_path / "pre_train")

    self.is_validated = False
    self.trained = False
    l.logger().info("BERT Model config initialized in {}".format(self.cache.path))
    return
def after_run(self, run_context, run_values):
    """
    Requested tensors are evaluated and their values are available.

    Reshapes the prediction tensors to one row per batch element, checks
    that every per-example tensor agrees on the batch size and stores one
    BERTValFile per batch element in the validation database. A trace is
    only added when no stored entry has the same sha256.
    """
    super(writeValidationDB, self).after_run(run_context, run_values)

    results = run_values.results
    batch_size = results[self.input_ids].shape[0]

    masked_lm_predictions = np.reshape(
        results[self.masked_lm_predictions],
        (batch_size, int(len(results[self.masked_lm_predictions]) / batch_size))
    )
    next_sentence_predictions = np.reshape(
        results[self.next_sentence_predictions],
        (batch_size, int(len(results[self.next_sentence_predictions]) / batch_size))
    )

    # Every per-example tensor must have the batch as leading dimension.
    for tensor in (
        self.original_input,
        self.input_ids,
        self.input_mask,
        self.masked_lm_positions,
        self.masked_lm_ids,
        self.masked_lm_weights,
        self.masked_lm_lengths,
        self.next_sentence_labels,
    ):
        assert results[tensor].shape[0] == batch_size
    assert masked_lm_predictions.shape[0] == batch_size
    assert next_sentence_predictions.shape[0] == batch_size

    with self.val_db.Session(commit = True) as session:
        for b in range(batch_size):
            trace_args = validation_database.BERTValFile.FromArgs(
                tokenizer = self.tokenizer,
                id = self.val_id,
                train_step = results[self.global_step],
                seen_in_training = results[self.seen_in_training][b],
                original_input = results[self.original_input][b],
                input_ids = results[self.input_ids][b],
                input_mask = results[self.input_mask][b],
                masked_lm_positions = results[self.masked_lm_positions][b],
                masked_lm_ids = results[self.masked_lm_ids][b],
                masked_lm_weights = results[self.masked_lm_weights][b],
                masked_lm_lengths = results[self.masked_lm_lengths][b],
                next_sentence_labels = results[self.next_sentence_labels][b],
                masked_lm_predictions = masked_lm_predictions[b],
                next_sentence_predictions = next_sentence_predictions[b],
            )
            val_trace = validation_database.BERTValFile(**trace_args)
            try:
                stored = session.query(
                    validation_database.BERTValFile.sha256
                ).filter_by(sha256 = val_trace.sha256).scalar()
                exists = stored is not None
            except sqlalchemy.orm.exc.MultipleResultsFound as e:
                l.logger().error("Selected sha256 has been already found more than once.")
                raise e
            if not exists:
                session.add(val_trace)
                self.val_id += 1
    return
def ContentHash_worker(contentfile: EncodedContentFile,
                       tokenizer,
                       ) -> typing.Optional[typing.Tuple[str, EncodedContentFile]]:
    """
    Return new contentfile along with content hash of code.

    Args:
      contentfile: Encoded content file whose `indices_array` is decoded
        to code (without formatting) before hashing.
      tokenizer: Tokenizer used to decode the indices back to code.

    Returns:
      (content hash, contentfile), or None when decoding or hashing
      raised; the exception is logged as a warning and not propagated, so
      callers must handle a None result.
    """
    try:
        code = tokenizer.ArrayToCode(contentfile.indices_array, with_formatting = False)
        content_hash = opencl.ContentHash(code)
    except Exception as e:
        # Best effort: a file that cannot be hashed is reported and dropped.
        l.logger().warn(e)
        return None
    return content_hash, contentfile