def test_Compile_small_program(): """Test that Compile accepts a small program.""" assert opencl.Compile(""" kernel void A(global int*a ) { a[get_global_id(0)] = 0; } """) == """
def text_candidate_worker(
    sample: np.array,
    feat_sampler: feature_sampler.EuclideanSampler,
    tokenizer: typing.TypeVar('corpuses.tokenizers.TokenizerBase'),
) -> typing.Tuple[bool, ActiveSample]:
    """
    Compile one generated sample, extract its features and score it.

    Args:
      sample: A 5-element sequence that is unpacked into
        (sample, sample_indices, input_ids, mlm_lengths, feed).
      feat_sampler: Supplies the feature space to extract and the distance
        function used for scoring.
      tokenizer: Converts the token array to source text and provides the
        pad token that is stripped from the stored indices and input ids.

    Returns:
      A (success, ActiveSample) pair. `success` is True only when the code
      compiled and a non-empty feature mapping was extracted. Otherwise the
      ActiveSample carries empty features and a score of `math.inf`.

    Raises:
      Any exception other than ValueError propagates unchanged.
    """
    sample, sample_indices, input_ids, mlm_lengths, feed = sample

    def _as_active_sample(features, score) -> ActiveSample:
        # Single place that strips padding and assembles the result, shared by
        # the success and the failure path.
        kept_indices = [x for x in sample_indices if x != tokenizer.padToken]
        return ActiveSample(
            sample_feed=feed,
            sample=sample,
            sample_indices=kept_indices,
            input_ids=[x for x in input_ids if x != tokenizer.padToken],
            hole_lengths=mlm_lengths,
            sample_indices_size=len(kept_indices),
            features=features,
            score=score,
        )

    try:
        code = tokenizer.ArrayToCode(sample, with_formatting=False)
        # Only the side effect matters here: Compile raises on rejection.
        _ = opencl.Compile(code)
        features = extractor.ExtractFeatures(
            code, [feat_sampler.feature_space]
        )[feat_sampler.feature_space]
        if features:
            return (
                True,
                _as_active_sample(features, feat_sampler.calculate_distance(features)),
            )
    except ValueError:
        # A ValueError anywhere above marks the candidate as unusable.
        pass
    return (False, _as_active_sample({}, math.inf))
def checkIfBatchCompiles(self, sample: np.array) -> int:
    """
    Detokenize `sample` and hand it to the OpenCL compiler.

    Returns:
      1 when the call completes, 0 when a ValueError is raised while
      detokenizing or compiling.
    """
    try:
        opencl.Compile(self.tokenizer.ArrayToCode(sample))
    except ValueError:
        return 0
    return 1
def test_Compile_missing_shim_define():
    """Compile must raise ClangException for a kernel that uses the undefined FLOAT_T type."""
    kernel_src = """ kernel void A(global FLOAT_T* a) {} """
    with test.Raises(errors.ClangException):
        opencl.Compile(kernel_src)
def test_Compile_undefined_variable():
    """Compile must raise ClangException, mentioning the undeclared identifier."""
    kernel_src = """ kernel void A(global int* a) { undefined_variable; } """
    with pytest.raises(errors.ClangException) as e_info:
        opencl.Compile(kernel_src)
    # Inspect the message only after the context manager has captured it.
    message = str(e_info.value)
    assert 'use of undeclared identifier' in message
def test_Compile_undefined_function():
    """Compile must raise ClangException, mentioning the implicit function declaration."""
    kernel_src = """ kernel void A(global int* a) { undefined_function(a); } """
    with pytest.raises(errors.ClangException) as e_info:
        opencl.Compile(kernel_src)
    # Inspect the message only after the context manager has captured it.
    message = str(e_info.value)
    assert 'implicit declaration of function' in message
def test_Compile_user_define():
    """Compile must return a program that carries its own #define unchanged."""
    # NOTE(review): this literal appears whitespace-collapsed in the reviewed
    # view; with everything on one line the #define swallows the kernel.
    # Confirm the line breaks against the original file.
    source = """ #define FLOAT_T float kernel void A(global FLOAT_T* a) {} """
    compiled = opencl.Compile(source)
    assert compiled == source
def test_Compile_syntax_error():
    """Compile must raise ClangException with an 'error: ' diagnostic for garbage input."""
    broken_src = "kernel void A2@@1!!!#"
    with pytest.raises(errors.ClangException) as e_info:
        opencl.Compile(broken_src)
    message = str(e_info.value)
    assert 'error: ' in message
def test_Compile_empty_input():
    """Compile must return an empty string for an empty program."""
    compiled = opencl.Compile('')
    assert compiled == ''
def _Train(
    self,
    corpus,
    test_sampler: typing.Optional[samplers.Sampler],
) -> None:
    """
    Core training function.

    When the model is not yet trained, builds the training dataset and runs
    the estimator. With `FLAGS.sample_per_epoch == 0` training runs in one
    call up to `num_train_steps`. Otherwise it runs epoch by epoch and, after
    each epoch, draws `FLAGS.sample_per_epoch` sample batches, compiles each
    sample, extracts its features and reports it to the test sampler's
    observers. A KeyboardInterrupt stops training without raising.

    Args:
      corpus: Not read by this method.
      test_sampler: Forwarded to `_getTestSampler` when per-epoch sampling is
        enabled.
    """
    if not self.is_trained:
        train_input_fn = self.train.data_generator.generateTfDataset(
            sequence_length=self.config.training.sequence_length,
            num_cpu_threads=os.cpu_count(),
            use_tpu=FLAGS.use_tpu,
            is_training=True,
        )
        l.logger().info(
            "Splitting {} steps into {} equivalent epochs, {} steps each. Rejected {} redundant step(s)"
            .format(
                self.num_train_steps,
                self.num_epochs,
                self.steps_per_epoch,
                self.config.training.num_train_steps - self.num_train_steps,
            )
        )
        try:
            if FLAGS.sample_per_epoch == 0:
                self.train.estimator.train(
                    input_fn=train_input_fn, max_steps=self.num_train_steps
                )
            else:
                sampler, observers = self._getTestSampler(
                    test_sampler, self.config.training.sequence_length
                )
                self.InitSampling(sampler, self.config.training.random_seed)
                for ep in range(self.num_epochs):
                    self.train.estimator.train(
                        input_fn=train_input_fn, steps=self.steps_per_epoch
                    )
                    for _ in range(FLAGS.sample_per_epoch):
                        start_time = datetime.datetime.utcnow()
                        self.InitSampleBatch()
                        sample_batch, sample_indices = self.SampleNextIndices()
                        end_time = datetime.datetime.utcnow()
                        for sample, sind in zip(sample_batch, sample_indices):
                            # Detokenize once; the same text is compiled and
                            # passed to the feature extractor.
                            code = self.tokenizer.ArrayToCode(sample)
                            try:
                                opencl.Compile(code)
                                compile_flag = 1
                            except ValueError:
                                compile_flag = 0

                            feature_vector = extractor.ExtractFeatures(code)
                            sample_proto = model_pb2.Sample(
                                train_step=(ep + 1) * self.steps_per_epoch,
                                sample_feed=sampler.start_text,
                                text=self.tokenizer.tokensToString(
                                    sample, ignore_token=self.tokenizer.padToken
                                ).replace("\\n", "\n"),
                                encoded_text=",".join([str(t) for t in sample]),
                                sample_indices='\n'.join([
                                    self.tokenizer.tokensToString(mind).replace('\n', '\\n')
                                    for mind in sind
                                ]),
                                encoded_sample_indices='\n'.join([
                                    ','.join([str(x) for x in mind]) for mind in sind
                                ]),
                                # Batch wall time divided evenly across the batch.
                                sample_time_ms=int(
                                    round(
                                        1000
                                        * ((end_time - start_time) / sampler.batch_size).total_seconds()
                                    )
                                ),
                                feature_vector="\n".join([
                                    "{}:{}".format(k, v) for (k, v) in feature_vector.items()
                                ]),
                                num_tokens=len(sample),
                                compile_status=compile_flag,
                                categorical_sampling=self.samplesWithCategorical(),
                                date_added=datetime.datetime.utcnow().strftime("%m/%d/%Y, %H:%M:%S"),
                            )
                            for obs in observers:
                                obs.OnSample(sample_proto)
        except KeyboardInterrupt:
            # Deliberate: lets the user stop training and still reach validation.
            pass
        # NOTE(review): nesting reconstructed from a whitespace-collapsed view;
        # this validation is assumed to belong to the "not yet trained" branch.
        # Confirm against the original file.
        if not FLAGS.force_eval:
            self.Validate()

    if FLAGS.force_eval and not self.is_validated:
        self.Validate()
    # self.telemetry.TfRecordEpochs()
    return
def ShouldProceed(self,
                  sample_in_progress: typing.List[str],
                  force: bool = False) -> bool:
    """Decide whether a partial sample becomes the new rollback state.

    The partial sample is closed into a candidate program, written to
    `kernel.cl` in the working directory for feature extraction, and then
    compiled. When target features are set, the candidate is also compared
    with the previous feature distance.

    Args:
      sample_in_progress: A list of strings, where each string is a token. The
        last token must be ';'.
      force: When True, skip both distance-based rejections.

    Returns:
      True if sampling should proceed with the current partial sample, else
      False.
    """
    candidate_src = self.TryToCloseProgram(sample_in_progress)
    if not candidate_src:
        app.Log(
            4,
            "Failed to produce syntactically valid program from partial sample",
        )
        return False

    # The feature extractor works on files, so dump the candidate first.
    kernel_path = self.working_dir / "kernel.cl"
    fs.Write(kernel_path, candidate_src.encode("utf-8"))
    features = self.TryToExtractFeatures(kernel_path)
    if features is None:
        app.Log(4, "Failed to extract features from partial sample")
        return False

    # The Grewe feature extractor tolerates code that does not compile (e.g.
    # implicit declarations), so clang is the real gate. It is the more
    # expensive check, hence it runs second.
    try:
        opencl.Compile(candidate_src)
    except clgen_errors.ClangException:
        app.Log(4, "Failed to compile partial sample")
        return False

    # Without a target there is nothing to climb towards: accept.
    if self._target_features is None:
        return True

    # Pure hill climbing: candidates that move away from the target feature
    # vector are rejected.
    distance = scipy.spatial.distance.euclidean(features, self._target_features)
    app.Log(
        2,
        "Features: %s, distance=%f, norm=%f, delta=%f",
        features,
        distance,
        distance / self._init_feature_distance,
        distance - self._previous_feature_distance,
    )
    if not force:
        if distance > self._previous_feature_distance:
            # Only happens once feature values exceed the target's values.
            app.Log(2, "Rejecting candidate because of positive feature delta")
            return False
        stalled = distance == self._previous_feature_distance
        if (stalled and random.random() >
                FLAGS.experimental_clgen_backtracking_reject_no_progress_probability):
            app.Log(2, "Randomly rejecting candidate with no progress")
            return False

    self._previous_features = features
    self._previous_src = candidate_src
    self._previous_feature_distance = distance
    return True
def _SampleSeqBatch(
    self,
    sampler: 'samplers.Sampler',
    tokenizer: tokenizers.TokenizerBase,
    sample_observers: typing.List[sample_observers_lib.SampleObserver],
    epoch: int,
) -> typing.Tuple[bool, int]:
    """
    Run a single iteration of the batched sample inner-loop for sequential
    models.

    Tokens are appended to every unfinished sample until the sampler reports
    it complete. Each completed sample is feature-extracted, compiled,
    wrapped in a `model_pb2.Sample` and passed to all observers.

    Args:
      sampler: Provides the start text, batch size and completion test.
      tokenizer: Supplies `decoder` (index -> token) and `vocab`
        (token -> index).
      sample_observers: Each observer's `OnSample` is called per sample.
      epoch: Stored as the `train_step` of every produced sample.

    Returns:
      (continue_sampling, seq_count): whether every observer returned a truthy
      value for every sample, and how many samples were completed.
    """
    start_time = datetime.datetime.utcnow()
    self.backend.InitSampleBatch(sampler)
    samples_in_progress = [
        sampler.tokenized_start_text.copy() for _ in range(sampler.batch_size)
    ]
    # The `np.bool` alias was removed from NumPy; the builtin is the dtype.
    done = np.zeros(sampler.batch_size, dtype=bool)
    seq_count = 0

    # Flipped to False as soon as any observer returns False.
    continue_sampling = True

    # Sampling loop. Continues until all samples in the batch are done.
    while not done.all():
        indices = self.backend.SampleNextIndices(sampler, done)
        for i, step_indices in enumerate(indices):
            if done[i]:
                continue
            tokens = samples_in_progress[i]
            for index in step_indices:
                tokens.append(tokenizer.decoder[index])
                if not sampler.SampleIsComplete(tokens):
                    continue

                end_time = datetime.datetime.utcnow()
                src = ''.join(tokens)
                features = extractor.ExtractRawFeatures(src)
                done[i] = True
                try:
                    opencl.Compile(src)
                    compile_flag = True
                except ValueError:
                    compile_flag = False

                # Batch wall time divided evenly across the batch.
                per_sample_ms = int(
                    round(
                        1000
                        * ((end_time - start_time) / sampler.batch_size).total_seconds()
                    )
                )
                sample = model_pb2.Sample(
                    train_step=epoch,
                    # Joined source text. A token list was passed here before,
                    # whereas this function's own Compile and feature-extraction
                    # calls operate on the joined string.
                    text=src,
                    sample_indices="",
                    encoded_sample_indices="",
                    sample_feed=sampler.start_text,
                    encoded_text=",".join([str(tokenizer.vocab[x]) for x in tokens]),
                    sample_start_epoch_ms_utc=int(start_time.strftime("%s%f")),
                    sample_time_ms=per_sample_ms,
                    wall_time_ms=per_sample_ms,
                    feature_vector=features,
                    num_tokens=len(tokens),
                    compile_status=compile_flag,
                    categorical_sampling=self.backend.samplesWithCategorical(),
                    date_added=datetime.datetime.utcnow().strftime("%m/%d/%Y, %H:%M:%S"),
                )
                # Notify sample observers.
                continue_sampling &= all(
                    [obs.OnSample(sample) for obs in sample_observers]
                )
                seq_count += 1
                # This sample is finished; drop the rest of its step indices.
                break
    return continue_sampling, seq_count
def _SampleLMBatch(
    self,
    sampler: 'samplers.Sampler',
    tokenizer: tokenizers.TokenizerBase,
    sample_observers: typing.List[sample_observers_lib.SampleObserver],
    epoch: int,
) -> typing.Tuple[bool, int]:
    """
    Run a sampling iteration over BERT models.

    World rank 0 compiles every returned sample, extracts features for those
    that compile, and reports a `model_pb2.Sample` to each observer. When the
    world size is above one it then publishes the continue flag through
    `distrib.write`. Every other rank only reads that flag back.

    Args:
      sampler: Forwarded to the backend.
      tokenizer: Not read here; `self.tokenizer` is used throughout.
      sample_observers: Each observer's `OnSample` is called per sample.
      epoch: Stored as the `train_step` of every produced sample.

    Returns:
      (continue_sampling, seq_count). `(False, 0)` when the backend raises
      StopIteration, `(True, 0)` when it returns no samples.

    Raises:
      OSError: A non-zero rank read something other than "True" or "False".
    """
    start_time = datetime.datetime.utcnow()
    seq_count = 0
    self.backend.InitSampleBatch(sampler, workload_size=FLAGS.sample_workload_size)
    try:
        org_inputs, input_ids, samples, indices = self.backend.SampleNextIndices(sampler)
    except StopIteration:
        return False, seq_count

    if not samples:
        # Return empty means model has not produced something that can be stored.
        # This 'if' accommodates active sampling, which is very selective.
        return True, seq_count

    continue_sampling = True

    if environment.WORLD_RANK == 0:
        assert len(org_inputs) == len(input_ids) == len(samples) == len(indices), \
            "Length mismatch, {}-{}-{}-{}".format(
                len(org_inputs), len(input_ids), len(samples), len(indices))
        for org, inp, sample, idxs in zip(org_inputs, input_ids, samples, indices):
            src = self.tokenizer.ArrayToCode(sample, with_formatting=True)
            try:
                opencl.Compile(src)
                compile_flag = True
                features = extractor.ExtractRawFeatures(src)
            except ValueError:
                compile_flag = False
                features = ""

            end_time = datetime.datetime.utcnow()
            # Elapsed time since batch start, divided evenly across the batch.
            per_sample_ms = int(
                round(1000 * ((end_time - start_time) / len(samples)).total_seconds())
            )
            # Named distinctly so the loop variable `sample` is not rebound.
            sample_proto = model_pb2.Sample(
                train_step=epoch,
                text=src,
                sample_indices=','.join([
                    self.tokenizer.decoder[idx].replace('\n', '\\n') for idx in idxs
                ]).replace('\n', '\\n'),
                encoded_sample_indices=','.join([str(idx) for idx in idxs]),
                original_input=self.tokenizer.tokensToString(
                    org, with_formatting=False, ignore_token=self.tokenizer.padToken),
                sample_feed=self.tokenizer.tokensToString(
                    inp, with_formatting=False, ignore_token=self.tokenizer.padToken),
                encoded_text=",".join([str(x) for x in sample]),
                sample_start_epoch_ms_utc=int(start_time.strftime("%s%f")),
                sample_time_ms=per_sample_ms,
                wall_time_ms=per_sample_ms,
                feature_vector=features,
                # Token count up to the first pad token, or the full length.
                num_tokens=np.where(sample == self.tokenizer.padToken)[0][0]
                if self.tokenizer.padToken in sample else len(sample),
                compile_status=compile_flag,
                categorical_sampling=self.backend.samplesWithCategorical(),
                date_added=datetime.datetime.utcnow().strftime("%m/%d/%Y, %H:%M:%S"),
            )
            # Notify sample observers.
            continue_sampling &= all(
                [obs.OnSample(sample_proto) for obs in sample_observers]
            )
            seq_count += 1
        if environment.WORLD_SIZE > 1:
            distrib.write(str(continue_sampling))
    else:
        status = distrib.read()
        if status == "True":
            continue_sampling = True
        elif status == "False":
            continue_sampling = False
        else:
            raise OSError("Broken distributed message: '{}'".format(status))
    return continue_sampling, seq_count
def write_eval_db(
    eval_db: evaluate_cand_database.SearchCandidateDatabase,
    tokenizer: "tokenizers.TokenizerBase",
    samples: typing.List[ActiveSample],
    target_benchmark: typing.Tuple[str, str],
    target_features: typing.Dict[str, float],
    gen_id: int,
) -> None:
    """
    Store evaluated search candidates in the evaluation database.

    Every sample is compiled to record its compile status and converted to a
    `SearchCandidate`. Candidates are de-duplicated by sha256 within this
    call; a candidate already in the database only has its frequency
    increased.

    Args:
      eval_db: Target database; `eval_db.count` seeds the candidate ids.
      tokenizer: Used to detokenize samples and build candidates.
      samples: Active samples to store.
      target_benchmark: Stored on each candidate.
      target_features: Stored on each candidate.
      gen_id: Generation id stored on each candidate.
    """
    # sha256 -> [candidate, number of occurrences in `samples`]
    objs = {}
    for sample in samples:
        try:
            _ = opencl.Compile(tokenizer.ArrayToCode(sample.sample))
            compile_status = True
        except ValueError:
            compile_status = False

        sobj = evaluate_cand_database.SearchCandidate.FromArgs(
            tokenizer=tokenizer,
            id=eval_db.count,
            input_feed=sample.sample_feed.input_feed,
            input_ids=sample.input_ids,
            input_features=sample.sample_feed.input_features,
            input_score=sample.sample_feed.input_score,
            hole_lengths=sample.hole_lengths,
            sample=sample.sample,
            sample_indices=sample.sample_indices,
            output_features=sample.features,
            sample_score=sample.score,
            target_benchmark=target_benchmark,
            target_features=target_features,
            compile_status=compile_status,
            generation_id=gen_id,
        )
        if sobj.sha256 in objs:
            objs[sobj.sha256][1] += 1
        else:
            objs[sobj.sha256] = [sobj, 1]

    with eval_db.Session(commit=True) as session:
        # All candidates were built with the same base id; shift each newly
        # inserted one so ids stay distinct.
        offset_idx = 0
        for sha, obj in objs.items():
            # Reset per iteration so the error handler never reads an unbound
            # name or the previous iteration's row.
            entry = None
            try:
                entry = session.query(
                    evaluate_cand_database.SearchCandidate
                ).filter_by(sha256=sha).first()
                if entry is not None:
                    entry.frequency += obj[1]
                else:
                    obj[0].frequency = obj[1]
                    obj[0].id += offset_idx
                    offset_idx += 1
                    session.add(obj[0])
                session.commit()
            except Exception as e:
                # Best effort: report the failing row and keep storing the rest.
                l.logger().error(entry)
                if entry is not None:
                    l.logger().error(entry.id)
                    l.logger().error(entry.sha256)
                l.logger().error(sha)
                l.logger().error("count: {}".format(eval_db.count))
                l.logger().error("offset_idx: {}".format(offset_idx))
                l.logger().error(e)
    return
def execute_clsmith(
    idx: int,
    tokenizer,
    timeout_seconds: int = 15,
) -> typing.Optional[typing.List[CLSmithSample]]:
    """
    Execute clsmith and return sample.

    CLSmith writes a program to a temporary `.cl` file. The program is split
    into (kernel, header) pairs, and each kernel is compiled and
    feature-extracted.

    Args:
      idx: Id stored on every returned sample.
      tokenizer: Its `AtomizeString` encodes each kernel.
      timeout_seconds: Passed to the external `timeout -s9` wrapper that runs
        CLSmith.

    Returns:
      One CLSmithSample per extracted kernel. `sample` holds the compiler
      output, or the error text when compilation raised ValueError. Returns
      None if waiting for the process raises TimeoutError.

    Raises:
      ValueError: Kernel extraction failed; the generated file content is
        logged first.
    """
    try:
        tdir = pathlib.Path(FLAGS.local_filesystem).resolve()
    except Exception:
        # Fall back to the default temporary directory.
        tdir = None

    extra_args = ["-include{}".format(pathlib.Path(CLSMITH_INCLUDE) / "CLSmith.h")]
    with tempfile.NamedTemporaryFile("w", prefix="clsmith_", suffix=".cl", dir=tdir) as f:
        cmd = [
            "timeout",
            "-s9",
            str(timeout_seconds),
            CLSMITH,
            "-o",
            str(f.name),
        ]
        process = subprocess.Popen(
            cmd,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            universal_newlines=True,
        )
        try:
            # No timeout is passed here; the time limit comes from the
            # external `timeout` command above.
            process.communicate()
        except TimeoutError:
            return None

        # Read before the temporary file is removed on leaving the `with`,
        # and close the handle deterministically.
        with open(str(f.name), 'r') as generated:
            contentfile = generated.read()

    try:
        ks = opencl.ExtractSingleKernelsHeaders(
            opencl.StripDoubleUnderscorePrefixes(
                c.StripIncludes(contentfile),
            )
        )
    except ValueError:
        l.logger().error(contentfile)
        raise

    samples = []
    for kernel, include in ks:
        encoded_sample = tokenizer.AtomizeString(kernel)
        try:
            sample_text = opencl.Compile(kernel, header_file=include, extra_args=extra_args)
            compile_status = True
        except ValueError as e:
            # Keep the compiler's complaint in place of the compiled text.
            sample_text = str(e)
            compile_status = False
        samples.append(
            CLSmithSample.FromArgs(
                id=idx,
                sample=sample_text,
                include=include,
                encoded_sample=','.join(encoded_sample),
                compile_status=compile_status,
                feature_vector=extractor.ExtractRawFeatures(
                    kernel, header_file=include, extra_args=extra_args),
                num_tokens=len(encoded_sample),
            )
        )
    return samples
def Train(self,
          corpus,
          test_sampler : typing.Optional[samplers.Sampler] = None,
          pre_train : bool = False,
          **unused_kwargs
          ) -> None:
    """
    Main training entry point.

    Configures the training parameters around a masked-LM batch generator,
    restores the latest checkpoint, then runs the epoch/step loop: forward
    pass, backward pass, gradient clipping, optimizer and scheduler step,
    and logging through the training hook on world process zero. After every
    epoch a checkpoint is saved, followed by optional validation,
    notification and sampling. A KeyboardInterrupt ends training without
    raising.

    Args:
      corpus: Forwarded to the batch generator.
      test_sampler: Forwarded to `_getTestSampler` when per-epoch sampling is
        enabled.
      pre_train: Selects the pre-training step budget, log path and logging
        branch, and is forwarded to checkpoint and validation calls.
      **unused_kwargs: Accepted and ignored.

    NOTE(review): nesting was reconstructed from a whitespace-collapsed view
    of this method; confirm block boundaries against the original file.
    """
    self._ConfigTrainParams(
        torchLMDataGenerator.TrainMaskLMBatchGenerator(
            corpus, self.config.training,
            self.cache.path,
            self.config.training.num_pretrain_steps if pre_train else None,
            pre_train,
            self.feature_encoder,
            self.feature_tokenizer,
            self.feature_sequence_length,
        ), pre_train
    )

    # Sampling-only runs stop once the parameters are configured.
    if FLAGS.only_sample:
        return

    self.current_step = self.loadCheckpoint(self.train, pre_train = pre_train)
    if self.pytorch.num_gpus > 0:
        self.torch.cuda.empty_cache()
    if self.current_step >= 0:
        l.logger().info("Loaded checkpoint step {}".format(self.current_step))
    # A negative step from loadCheckpoint is clamped to zero.
    self.current_step = max(0, self.current_step)

    if self.current_step < self.num_train_steps:
        self.train.model.zero_grad()

        ## Set batch size in case of TPU training or distributed training.
        # NOTE(review): total_train_batch_size is not read again in this method.
        if self.torch_tpu_available:
            total_train_batch_size = self.train_batch_size * self.pytorch.torch_xla.xrt_world_size()
        else:
            total_train_batch_size = (
                self.train_batch_size
                * (self.torch.distributed.get_world_size() if self.pytorch.num_nodes > 1 else 1)
            )

        # Set dataloader in case of TPU training.
        if self.torch_tpu_available:
            loader = self.pytorch.torch_ploader.ParallelLoader(
                self.train.data_generator.dataloader, [self.pytorch.device]
            ).per_device_loader(self.pytorch.device)
        else:
            loader = self.train.data_generator.dataloader

        # Get dataloader iterator and setup hooks.
        batch_iterator = iter(loader)
        # train_hook is only bound on world process zero.
        if self.is_world_process_zero():
            train_hook = hooks.tensorMonitorHook(
                self.logfile_path if not pre_train else self.pre_logfile_path, self.current_step, min(self.steps_per_epoch, FLAGS.monitor_frequency)
            )
        if FLAGS.reward_compilation >= 0 and not pre_train:
            correct_sample_obs = sample_observers.SamplesDatabaseObserver(
                self.logfile_path / "correct_samples.db"
            )
        else:
            correct_sample_obs = None

        total_steps = self.config.training.num_pretrain_steps if pre_train else self.config.training.num_train_steps
        l.logger().info(
            "Splitting {} steps into {} equivalent epochs, {} steps each. Rejected {} redundant step(s)".format(
                self.num_train_steps, self.num_epochs,
                self.steps_per_epoch, total_steps - self.num_train_steps
            )
        )
        try:
            self.train.model.train()
            # Progress bars are shown on world process zero only.
            epoch_iter = tqdm.auto.trange(self.num_epochs, desc="Epoch", leave = False) if self.is_world_process_zero() else range(self.num_epochs)
            for epoch in epoch_iter:

                # In distributed mode, calling the set_epoch() method at
                # the beginning of each epoch before creating the DataLoader iterator
                # is necessary to make shuffling work properly across multiple epochs.
                # Otherwise, the same ordering will be always used.
                if self.pytorch.num_nodes > 1:
                    loader.sampler.set_epoch(epoch)

                # Skip epochs already covered by the restored checkpoint.
                if epoch < self.current_step // self.steps_per_epoch:
                    continue # Stupid bar won't resume.

                batch_iter = tqdm.auto.trange(self.steps_per_epoch, desc="Batch", leave = False) if self.is_world_process_zero() else range(self.steps_per_epoch)
                for step in batch_iter:
                    # `start` is only bound on world process zero; it is read
                    # again below under the same condition.
                    if self.is_world_process_zero():
                        start = datetime.datetime.utcnow()
                    try:
                        inputs = next(batch_iterator)
                    except StopIteration:
                        # dataloader has different len() than steps_per_epoch.
                        # This is the easiest way to infinite-loop dataloaders in pytorch.
                        batch_iterator = iter(loader)
                        inputs = next(batch_iterator)

                    self.current_step += 1
                    # Move inputs to torch device.
                    inputs = self.to_device(inputs)
                    # Run model step on batch
                    step_out = self.model_step(self.train.model, inputs, step = epoch * self.steps_per_epoch + step)
                    # Collect losses and backpropagate
                    total_loss = step_out['total_loss'].mean()
                    total_loss.backward()

                    self.torch.nn.utils.clip_grad_norm_(self.train.model.parameters(), self.max_grad_norm)
                    if self.torch_tpu_available:
                        self.pytorch.torch_xla.optimizer_step(self.train.optimizer)
                    else:
                        self.train.optimizer.step()
                    self.train.scheduler.step()

                    ## Collect tensors for logging.
                    # With several nodes, one buffer per world member is
                    # allocated and filled through all_gather; otherwise the
                    # local tensors are moved to the CPU. `total_loss` is
                    # rebound here from the scalar used for backward().
                    if self.pytorch.num_nodes > 1:
                        total_loss = [self.torch.zeros(tuple(step_out['total_loss' ].shape), dtype = self.torch.float32).to(self.pytorch.device) for _ in range(self.torch.distributed.get_world_size())]
                        masked_lm_loss = [self.torch.zeros(tuple(step_out['masked_lm_loss' ].shape), dtype = self.torch.float32).to(self.pytorch.device) for _ in range(self.torch.distributed.get_world_size())]
                        # next_sentence_loss = [self.torch.zeros(tuple(step_out['next_sentence_loss'].shape), dtype = self.torch.float32).to(self.pytorch.device) for _ in range(self.torch.distributed.get_world_size())]
                        masked_lm_lengths = [self.torch.zeros(tuple(inputs ['masked_lm_lengths' ].shape), dtype = self.torch.int64 ).to(self.pytorch.device) for _ in range(self.torch.distributed.get_world_size())]

                        self.torch.distributed.all_gather(masked_lm_loss, step_out["masked_lm_loss"])
                        # self.torch.distributed.all_gather(next_sentence_loss, step_out["next_sentence_loss"])
                        self.torch.distributed.all_gather(masked_lm_lengths, inputs['masked_lm_lengths'].to(self.pytorch.device))
                        self.torch.distributed.all_gather(total_loss, step_out['total_loss'])
                    else:
                        total_loss = step_out['total_loss' ].unsqueeze(0).cpu()
                        masked_lm_loss = step_out['masked_lm_loss' ].unsqueeze(0).cpu()
                        # next_sentence_loss = step_out['next_sentence_loss'].unsqueeze(0).cpu()
                        masked_lm_lengths = inputs['masked_lm_lengths' ].cpu()

                    if self.is_world_process_zero():
                        exec_time_ms = int(round((datetime.datetime.utcnow() - start).total_seconds() * 1000))
                        if FLAGS.reward_compilation >= 0 and FLAGS.reward_compilation <= epoch * self.steps_per_epoch + step and not pre_train:
                            ## Logging when compiler reward is enabled in training.
                            ## This is not compatible with using DDP, and basically compiler-rewarded training is deprecated and proven to be wrong and inefficient.
                            # Keep the (input, generated) pairs whose compile status is 1.
                            correct_samples = [(x, y) for en, (x, y) in enumerate(zip(inputs['input_ids'].cpu().numpy(), step_out['generated_samples'].cpu().numpy())) if step_out['compile_status'][en] == 1]
                            for s in correct_samples:
                                feature_vector = extractor.ExtractFeatures(self.tokenizer.ArrayToCode(s[1]))
                                correct_sample_obs.OnSample(model_pb2.Sample(
                                        train_step = self.current_step,
                                        sample_feed = self.tokenizer.tokensToString(s[0], ignore_token = self.tokenizer.padToken).replace("\\n", "\n"),
                                        text = self.tokenizer.tokensToString(s[1], ignore_token = self.tokenizer.padToken).replace("\\n", "\n"),
                                        encoded_text = ",".join([str(t) for t in s[1]]),
                                        sample_indices = '',
                                        encoded_sample_indices = '',
                                        sample_time_ms = int(round(exec_time_ms / self.train_batch_size)),
                                        feature_vector = "\n".join(["{}:{}".format(k, v) for (k, v) in feature_vector.items()]),
                                        num_tokens = len([x for x in s[1] if x != self.tokenizer.padToken]),
                                        categorical_sampling = False,
                                        compile_status = True,
                                        date_added = datetime.datetime.utcnow().strftime("%m/%d/%Y, %H:%M:%S"),
                                    )
                                )
                        # Both branches report per-buffer mean losses averaged
                        # over the buffers; entries equal to -1 in
                        # masked_lm_lengths are excluded from the hole-length
                        # average.
                        if not pre_train:
                            ## Fine-tuning logging.
                            train_hook.step(
                                masked_lm_loss = sum([ml.mean().item() for ml in masked_lm_loss]) / len(masked_lm_loss),
                                # next_sentence_loss = sum([nsl.mean().item() for nsl in next_sentence_loss]) / len(next_sentence_loss),
                                total_loss = sum([tl.mean().item() for tl in total_loss]) / len(total_loss),
                                learning_rate = self.train.scheduler.get_last_lr()[0],
                                num_correct_samples = (correct_sample_obs.sample_id if correct_sample_obs is not None else None),
                                batch_avg_hole_len = sum([sum([int(l) for l in b if l != -1]) / len([int(l) for l in b if l != -1]) for b in masked_lm_lengths]) / len(masked_lm_lengths),
                                batch_execution_time_ms = exec_time_ms,
                                time_per_sample_ms = exec_time_ms / self.train_batch_size,
                            )
                        else:
                            ## Pre-training logging.
                            train_hook.step(
                                masked_lm_loss = sum([ml.mean().item() for ml in masked_lm_loss]) / len(masked_lm_loss),
                                # next_sentence_loss = sum([nsl.mean().item() for nsl in next_sentence_loss]) / len(next_sentence_loss),
                                total_loss = sum([tl.mean().item() for tl in total_loss]) / len(total_loss),
                                learning_rate = self.train.scheduler.get_last_lr()[0],
                                batch_avg_hole_len = sum([sum([int(l) for l in b if l != -1]) / len([int(l) for l in b if l != -1]) for b in masked_lm_lengths]) / len(masked_lm_lengths),
                                batch_execution_time_ms = exec_time_ms,
                                time_per_sample_ms = exec_time_ms / self.train_batch_size,
                            )
                    self.train.model.zero_grad()
                    # NOTE(review): current_step was incremented earlier in this
                    # iteration, so this condition looks unreachable — confirm.
                    if self.current_step == 0:
                        l.logger().info("Starting Loss: {}".format(sum([tl.mean().item() for tl in total_loss]) / len(total_loss)))

                # End of Epoch
                self.saveCheckpoint(self.train, pre_train)
                # set_mail is only bound on world process zero.
                if self.is_world_process_zero():
                    set_mail = "Epoch {} Loss: {}\n".format(self.current_step // self.steps_per_epoch, train_hook.epoch_loss)
                    l.logger().info("Epoch {} Loss: {}".format(self.current_step // self.steps_per_epoch, train_hook.epoch_loss))

                if self.pytorch.num_nodes > 1:
                    loader.sampler.set_epoch(epoch)

                if FLAGS.validate_per_epoch and self.train.data_generator.config.validation_split > 0:
                    val_ml_loss = self.Validate(per_epoch = True, pre_train = pre_train)
                    if self.is_world_process_zero():
                        train_hook.end_epoch(
                            val_masked_lm_loss = val_ml_loss,
                            # val_next_sentence_loss = val_nsp_loss,
                            val_total_loss = val_ml_loss # + val_nsp_loss,
                        )
                    # NOTE(review): set_mail is read here and below without a
                    # rank check although it is only assigned on process zero.
                    set_mail += "Validation Loss: {}\n".format(val_ml_loss)
                elif self.is_world_process_zero():
                    train_hook.end_epoch()

                if FLAGS.notify_me:
                    client.getClient().send_message("clgen:torch_bert", set_mail)

                if self.torch_tpu_available:
                    self.pytorch.torch_xla.master_print(self.pytorch.torch_xla_met.metrics_report())

                # Optional per-epoch sampling: every drawn sample is compiled,
                # feature-extracted and handed to the test sampler's observers.
                if FLAGS.sample_per_epoch > 0:
                    sampler, observers = self._getTestSampler(test_sampler, self.config.training.sequence_length)
                    self.InitSampling(sampler, self.config.training.random_seed)
                    for _ in range(FLAGS.sample_per_epoch):
                        start_time = datetime.datetime.utcnow()
                        self.InitSampleBatch(sampler)
                        org_inputs, input_ids, samples, indices = self.SampleNextIndices()
                        end_time = datetime.datetime.utcnow()
                        for org, inp, sample, idxs in zip(org_inputs, input_ids, samples, indices):
                            try:
                                stdout = opencl.Compile(self.tokenizer.ArrayToCode(sample))
                                compile_flag = 1
                            except ValueError:
                                compile_flag = 0

                            feature_vector = extractor.ExtractFeatures(self.tokenizer.ArrayToCode(sample))
                            sample_proto = model_pb2.Sample(
                                train_step = self.current_step,
                                sample_feed = sampler.start_text,
                                original_input = self.tokenizer.tokensToString(org, with_formatting = True, ignore_token = self.tokenizer.padToken),
                                text = self.tokenizer.tokensToString(sample, with_formatting = True, ignore_token = self.tokenizer.padToken).replace("\\n", "\n"),
                                encoded_text = ",".join([str(t) for t in sample]),
                                sample_indices = ','.join([self.tokenizer.decoder[idx].replace('\n', '\\n') for idx in idxs]).replace('\n', '\\n'),
                                encoded_sample_indices = ','.join([str(idx) for idx in idxs]),
                                sample_time_ms = int(round(1000 * ((end_time - start_time) / sampler.batch_size).total_seconds())),
                                feature_vector = "\n".join(["{}:{}".format(k, v) for (k, v) in feature_vector.items()]),
                                num_tokens = len(sample),
                                compile_status = compile_flag,
                                categorical_sampling = self.samplesWithCategorical(),
                                date_added = datetime.datetime.utcnow().strftime("%m/%d/%Y, %H:%M:%S"),
                            )
                            for obs in observers:
                                obs.OnSample(sample_proto)
        except KeyboardInterrupt:
            # Deliberate: an interrupt stops training but still reaches validation.
            pass

        if not FLAGS.force_eval:
            _ = self.Validate(pre_train = pre_train)

    if FLAGS.force_eval and not self.is_validated:
        _ = self.Validate(pre_train = pre_train)
    return
def beam_mutec(
    srcs: typing.List[typing.Tuple[str, str, float]],
    target_features: typing.Dict[str, float],
    feat_space: str,
    beam_width: int,
    mutec_cache: samples_database.SamplesDatabase,
) -> typing.List[typing.Tuple[str, str, float]]:
    """
    Run generational beam search over starting github kernels
    to minimize distance from target features.

    Each generation mutates every current source, scores the mutants in a
    process pool and keeps the `beam_width` entries with the smallest distance
    among mutants and parents. The search continues while the summed distance
    of the kept entries improves and `SEARCH_DEPTH_HARD_LIMIT` is not reached.
    Every kept (source, include) pair is then feature-extracted, compiled and
    added to `mutec_cache` unless a row with the same sha256 already exists.

    Args:
      srcs: Starting (source, include, distance) triples.
      target_features: Feature vector to approach.
      feat_space: Name of the feature space used for scoring.
      beam_width: Number of entries kept per generation.
      mutec_cache: Database receiving the kept mutants.

    Returns:
      The entries kept in the last generation; they are sorted on their third
      element, so they are (source, include, distance) triples like `srcs`.
    """
    better_score = True
    total_beams, beam, closest = set(), [], []
    gen_id = 0

    while better_score:
        cands = set()
        ## Generate mutants for current generation.
        for src, incl, _ in tqdm.tqdm(
                srcs, total=len(srcs),
                desc="Mutec candidates {}".format(gen_id), leave=False):
            ### This should collect all mutants and return them, out of a single source.
            cands.update(generate_mutants(src, incl))

        ## Extract their features and calculate distances.
        pool = multiprocessing.Pool()
        f = functools.partial(
            workers.ExtractAndCalculate,
            target_features=target_features,
            feature_space=feat_space,
        )
        try:
            for cand in tqdm.tqdm(
                    pool.imap_unordered(f, cands), total=len(cands),
                    desc="Extract Features {}".format(gen_id), leave=False):
                if cand:
                    beam.append(cand)
        except Exception as e:
            l.logger().error(e)
            pool.terminate()
            raise
        pool.close()

        ## Sort by distance in ascending order. If score is better, keep doing beam search
        ## srcs are included to the outputs, in order to keep them if the offsprings are worse.
        closest = sorted(beam + srcs, key=lambda x: x[2])[:beam_width]
        total_beams.update([(x, y) for x, y, _ in closest])

        # Compare equally many entries from the new beam and from the parents.
        min_length = min(len(closest), len(srcs))
        new_total = sum(x for _, _, x in closest[:min_length])
        old_total = sum(x for _, _, x in srcs[:min_length])
        if new_total < old_total and gen_id < SEARCH_DEPTH_HARD_LIMIT:
            srcs = closest
            beam = []
        else:
            better_score = False
        gen_id += 1

    ## Store all mutants in database.
    with mutec_cache.Session(commit=True) as s:
        pool = multiprocessing.Pool()
        try:
            idx = mutec_cache.count
            for dp in tqdm.tqdm(
                    pool.imap_unordered(workers.FeatureExtractor, total_beams),
                    total=len(total_beams), desc="Add mutants to DB", leave=False):
                if dp:
                    src, incl, feats = dp
                    try:
                        _ = opencl.Compile(
                            src,
                            header_file=incl,
                            # The CLSmith header is force-included only when
                            # the kernel comes with an include.
                            extra_args=[
                                "-include{}".format(
                                    pathlib.Path(environment.CLSMITH_INCLUDE) / "CLSmith.h")
                            ] if incl else [""],
                        )
                        compiles = True
                    except ValueError:
                        compiles = False
                    sample = samples_database.Sample.FromArgsLite(
                        idx, incl + src, feats, compiles)
                    exists = s.query(samples_database.Sample.sha256).filter_by(
                        sha256=sample.sha256).scalar() is not None
                    if not exists:
                        s.add(sample)
                        idx += 1
        except Exception as e:
            l.logger().error(e)
            pool.terminate()
            raise
        pool.close()
        s.commit()
    return closest