def FromPreprocessed(
  cls,
  preprocessed_cf: preprocessed.PreprocessedContentFile,
  tokenizer: tokenizers.TokenizerBase,
  eof: str,
  pre_train: bool,
) -> "EncodedContentFile":
  """Instantiate an EncodedContentFile from a preprocessed file.

  Args:
    preprocessed_cf: A PreprocessedContentFile instance.
    tokenizer: The tokenizer to encode using.
    eof: An end-of-file marker which is concatenated to the encoded sequence.
    pre_train: If True, skip feature extraction; pre-training corpora get an
      empty feature vector.

  Returns:
    An EncodedContentFile instance, or None if tokenization fails.
  """
  start_time = time.time()
  try:
    data = tokenizer.TokenizeString(preprocessed_cf.text)
  except ValueError as e:
    l.logger().warn(e)
    return None
  # TODO: kernel analytics (encoded length, sorted token-value histogram).
  encoding_time_ms = int((time.time() - start_time) * 1000)
  # Feature extraction is only needed for fine-tuning corpora. Any exception
  # raised by the extractor propagates unchanged to the caller.
  feature_vector = "" if pre_train else extractor.ExtractRawFeatures(preprocessed_cf.text)
  return EncodedContentFile(
    id=preprocessed_cf.id,
    # Encode the end-of-file marker separately to ensure that it resolves to
    # the correct token. For example if the vocabulary contains 'a', 'b',
    # and 'ab', then a content file 'a' with EOF marker 'b' would be encoded
    # as 'ab', instead of 'a'+'b'.
    data=cls.NumpyArrayToDataString(
      np.concatenate((data, tokenizer.TokenizeString(eof)))
    ),
    tokencount=len(data),
    feature_vector=feature_vector,
    encoding_time_ms=encoding_time_ms,
    wall_time_ms=encoding_time_ms,  # The outer-loop may change this.
    date_added=datetime.datetime.utcnow(),
  )
def FeatureExtractor(src_incl: typing.Tuple[str, str]) -> typing.Tuple[str, str, str]:
  """Extract raw features for all feature spaces.

  Args:
    src_incl: A (source, include) pair.

  Returns:
    A (source, include, features) tuple. The features element is the empty
    string when extraction raises ValueError.
  """
  source, include = src_incl
  # Only pass the CLSmith header include flag when a header is present.
  if include:
    args = ["-include{}".format(pathlib.Path(environment.CLSMITH_INCLUDE) / "CLSmith.h")]
  else:
    args = [""]
  try:
    features = extractor.ExtractRawFeatures(source, header_file=include, extra_args=args)
  except ValueError:
    features = ""
  return source, include, features
def ToProto(dp: ActiveFeed) -> samples_database.Sample:
  """Convert an ActiveFeed datapoint into a samples_database.Sample row.

  Raw features are (re-)extracted from the datapoint's sample text.
  """
  proto = model_pb2.Sample(
    train_step=-1,
    text=dp.sample,
    sample_indices="",
    encoded_sample_indices="",
    original_input="",
    sample_feed=dp.input_feed,
    encoded_text="",
    sample_time_ms=0,
    feature_vector=extractor.ExtractRawFeatures(dp.sample),
    num_tokens=dp.num_tokens,
    compile_status=dp.compile_status,
    categorical_sampling=1,
    date_added=dp.date_added.strftime("%m/%d/%Y, %H:%M:%S"),
  )
  return samples_database.Sample(**samples_database.Sample.FromProto(0, proto))
def run_extractors(sample: Sample) -> Sample:
  """Return a copy of `sample` with its feature vector populated.

  Features are extracted only for samples that compiled successfully;
  otherwise the feature vector is left empty. All other fields are copied
  through unchanged.

  Args:
    sample: The Sample row to re-materialize.

  Returns:
    A new Sample instance.
  """
  # The two original branches duplicated the whole constructor and differed
  # only in this one field; compute it once instead.
  feature_vector = (
    extractor.ExtractRawFeatures(sample.text) if sample.compile_status else ""
  )
  return Sample(**Sample.FromProto(
    0,
    model_pb2.Sample(
      train_step=sample.train_step,
      text=sample.text,
      sample_indices=sample.sample_indices,
      encoded_sample_indices=sample.encoded_sample_indices,
      original_input=sample.original_input,
      sample_feed=sample.sample_feed,
      encoded_text=sample.encoded_text,
      sample_time_ms=sample.sample_time_ms,
      feature_vector=feature_vector,
      num_tokens=sample.num_tokens,
      compile_status=sample.compile_status,
      categorical_sampling=int(sample.categorical_sampling),
      date_added=sample.date_added.strftime("%m/%d/%Y, %H:%M:%S"),
    )))
def _SampleSeqBatch(
  self,
  sampler: 'samplers.Sampler',
  tokenizer: tokenizers.TokenizerBase,
  sample_observers: typing.List[sample_observers_lib.SampleObserver],
  epoch: int,
) -> bool:
  """Run a single iteration of the batched sample inner-loop for sequential models.

  Args:
    sampler: The sampler driving generation.
    tokenizer: Tokenizer used to decode indices and encode the final kernel.
    sample_observers: Observers notified for each completed sample.
    epoch: Training step recorded in each emitted sample.

  Returns:
    A (continue_sampling, seq_count) pair: whether all observers want more
    samples, and how many samples were produced this iteration.
    (NOTE(review): the annotation says `bool` but a tuple is returned —
    callers appear to rely on the tuple.)
  """
  start_time = datetime.datetime.utcnow()
  self.backend.InitSampleBatch(sampler)
  samples_in_progress = [
    sampler.tokenized_start_text.copy() for _ in range(sampler.batch_size)
  ]
  # FIX: `np.bool` was deprecated in NumPy 1.20 and removed in 1.24; it was
  # always an alias for the builtin bool, which is the supported spelling.
  done = np.zeros(sampler.batch_size, dtype=bool)
  wall_time_start = start_time
  seq_count = 0
  # The return value of this method. If any of the sample_observers return
  # False, this value is set to False.
  continue_sampling = True
  # Sampling loop. Continues until all samples in the batch are done.
  while not done.all():
    indices = self.backend.SampleNextIndices(sampler, done)
    # Iterate over all samples in batch to determine whether they're done.
    for i in range(len(indices)):
      if done[i]:
        continue
      for index in indices[i]:
        samples_in_progress[i].append(tokenizer.decoder[index])
        step_ind = ""
        encoded_step_indices = ""
        if sampler.SampleIsComplete(samples_in_progress[i]):
          end_time = datetime.datetime.utcnow()
          sample_kernel = [x for x in samples_in_progress[i]]
          features = extractor.ExtractRawFeatures(''.join(samples_in_progress[i]))
          done[i] = 1
          # Best-effort compile check: a ValueError marks the sample as
          # non-compiling but does not abort sampling.
          try:
            stdout = opencl.Compile(''.join(samples_in_progress[i]))
            compile_flag = True
          except ValueError:
            compile_flag = False
          sample = model_pb2.Sample(
            train_step=epoch,
            text=samples_in_progress[i],
            sample_indices="",
            encoded_sample_indices="",
            sample_feed=sampler.start_text,
            encoded_text=",".join([
              str(tokenizer.vocab[x]) for x in sample_kernel
            ]),
            sample_start_epoch_ms_utc=int(start_time.strftime("%s%f")),
            # Per-sample time is the batch elapsed time amortized over the
            # batch size.
            sample_time_ms=int(
              round(1000 * ((end_time - start_time) / sampler.batch_size).total_seconds())),
            wall_time_ms=int(
              round(1000 * ((end_time - start_time) / sampler.batch_size).total_seconds())),
            feature_vector=features,
            num_tokens=len(samples_in_progress[i]),
            compile_status=compile_flag,
            categorical_sampling=self.backend.samplesWithCategorical(),
            date_added=datetime.datetime.utcnow().strftime("%m/%d/%Y, %H:%M:%S"),
          )
          # Notify sample observers.
          continue_sampling &= all(
            [obs.OnSample(sample) for obs in sample_observers])
          seq_count += 1
          # Wall sample time is the difference between the end of the previous
          # sample and the end of the current sample.
          wall_time_start = datetime.datetime.utcnow()
          break
  return continue_sampling, seq_count
def _SampleLMBatch(
  self,
  sampler: 'samplers.Sampler',
  tokenizer: tokenizers.TokenizerBase,
  sample_observers: typing.List[sample_observers_lib.SampleObserver],
  epoch: int,
) -> bool:
  """Run a sampling iteration over BERT models.

  Pulls one workload of samples from the backend, converts each to a
  model_pb2.Sample, and notifies the observers. In distributed runs only
  rank 0 processes samples; other ranks receive the continue/stop decision
  via `distrib`.

  Args:
    sampler: The sampler driving generation.
    tokenizer: Unused here; decoding goes through `self.tokenizer`.
    sample_observers: Observers notified for each completed sample.
    epoch: Training step recorded in each emitted sample.

  Returns:
    A (continue_sampling, seq_count) pair.
    NOTE(review): annotated `-> bool` but a tuple is returned — callers
    appear to unpack the tuple; confirm and fix the annotation upstream.
  """
  start_time = datetime.datetime.utcnow()
  seq_count = 0
  self.backend.InitSampleBatch(sampler, workload_size=FLAGS.sample_workload_size)
  try:
    org_inputs, input_ids, samples, indices = self.backend.SampleNextIndices(sampler)
  except StopIteration:
    # Backend exhausted: stop sampling.
    return False, seq_count
  if not samples:
    # Return empty means model has not produced something that can be stored.
    # This 'if' accommodates active sampling, which is very selective.
    return True, seq_count
  continue_sampling = True
  if environment.WORLD_RANK == 0:
    assert len(org_inputs) == len(input_ids) == len(samples) == len(
      indices), "Length mismatch, {}-{}-{}-{}".format(
        len(org_inputs), len(input_ids), len(samples), len(indices))
    for org, inp, sample, idxs in zip(org_inputs, input_ids, samples, indices):
      src = self.tokenizer.ArrayToCode(sample, with_formatting=True)
      # Best-effort compile + feature extraction; a ValueError marks the
      # sample as non-compiling with an empty feature vector.
      try:
        stdout = opencl.Compile(src)
        compile_flag = True
        features = extractor.ExtractRawFeatures(src)
      except ValueError:
        compile_flag = False
        features = ""
      end_time = datetime.datetime.utcnow()
      # NOTE: `sample` is rebound here from the token array to the proto.
      sample = model_pb2.Sample(
        train_step=epoch,
        text=src,
        sample_indices=','.join([
          self.tokenizer.decoder[idx].replace('\n', '\\n') for idx in idxs
        ]).replace('\n', '\\n'),
        encoded_sample_indices=','.join([str(idx) for idx in idxs]),
        original_input=self.tokenizer.tokensToString(
          org, with_formatting=False, ignore_token=self.tokenizer.padToken),
        sample_feed=self.tokenizer.tokensToString(
          inp, with_formatting=False, ignore_token=self.tokenizer.padToken),
        encoded_text=",".join([str(x) for x in sample]),
        sample_start_epoch_ms_utc=int(start_time.strftime("%s%f")),
        # Elapsed time amortized over the number of samples in the workload.
        sample_time_ms=int(
          round(1000 * ((end_time - start_time) / len(samples)).total_seconds())),
        wall_time_ms=int(
          round(1000 * ((end_time - start_time) / len(samples)).total_seconds())),
        feature_vector=features,
        # Token count: position of the first pad token, or full length when
        # the sample is unpadded.
        num_tokens=np.where(
          sample == self.tokenizer.padToken)[0][0]
          if self.tokenizer.padToken in sample else len(sample),
        compile_status=compile_flag,
        categorical_sampling=self.backend.samplesWithCategorical(),
        date_added=datetime.datetime.utcnow().strftime("%m/%d/%Y, %H:%M:%S"),
      )
      # Notify sample observers.
      continue_sampling &= all(
        [obs.OnSample(sample) for obs in sample_observers])
      seq_count += 1
    # Rank 0 broadcasts the continue/stop decision to the other ranks.
    if environment.WORLD_SIZE > 1:
      distrib.write(str(continue_sampling))
  else:
    # Non-zero ranks block until rank 0 publishes the decision.
    status = distrib.read()
    if status == "True":
      continue_sampling = True
    elif status == "False":
      continue_sampling = False
    else:
      raise OSError("Broken distributed message: '{}'".format(status))
  return continue_sampling, seq_count
def execute_clsmith(idx: int, tokenizer, timeout_seconds: int = 15) -> typing.List[CLSmithSample]:
  """Execute CLSmith once and return the generated kernels as samples.

  Args:
    idx: ID assigned to every produced CLSmithSample.
    tokenizer: Tokenizer used to encode each extracted kernel.
    timeout_seconds: Wall-clock limit enforced by the external `timeout`
      wrapper around the CLSmith process.

  Returns:
    A list of CLSmithSample instances, or None on timeout.
  """
  try:
    tdir = pathlib.Path(FLAGS.local_filesystem).resolve()
  except Exception:
    # Fall back to the system default temp directory.
    tdir = None
  extra_args = ["-include{}".format(pathlib.Path(CLSMITH_INCLUDE) / "CLSmith.h")]
  with tempfile.NamedTemporaryFile("w", prefix="clsmith_", suffix=".cl", dir=tdir) as f:
    # `timeout -s9` hard-kills CLSmith if it exceeds the limit.
    cmd = ["timeout", "-s9", str(timeout_seconds), CLSMITH, "-o", str(f.name)]
    process = subprocess.Popen(
      cmd,
      stdout=subprocess.PIPE,
      stderr=subprocess.PIPE,
      universal_newlines=True,
    )
    try:
      stdout, stderr = process.communicate()
    except TimeoutError:
      return None
    # FIX: read via a context manager instead of leaking the file handle.
    with open(str(f.name), 'r') as generated:
      contentfile = generated.read()
    try:
      ks = opencl.ExtractSingleKernelsHeaders(
        opencl.StripDoubleUnderscorePrefixes(
          c.StripIncludes(contentfile),
        )
      )
    except ValueError as e:
      l.logger().error(contentfile)
      raise e
    samples = []
    for kernel, include in ks:
      encoded_sample = tokenizer.AtomizeString(kernel)
      # Best-effort compile check; failures record the error text as the
      # sample's stdout.
      try:
        stdout = opencl.Compile(kernel, header_file=include, extra_args=extra_args)
        compile_status = True
      except ValueError as e:
        stdout = str(e)
        compile_status = False
      samples.append(
        CLSmithSample.FromArgs(
          id=idx,
          sample=stdout,
          include=include,
          encoded_sample=','.join(encoded_sample),
          compile_status=compile_status,
          feature_vector=extractor.ExtractRawFeatures(
            kernel, header_file=include, extra_args=extra_args),
          num_tokens=len(encoded_sample),
        )
      )
  return samples