def get_execution_times_ms(
  self, src: str, dataset: str, global_size: int, local_size: int
) -> typing.Optional[typing.Tuple[typing.List[int], typing.List[int], typing.List[int], typing.List[int]]]:
  """
  Search code by hash and return lists of all recorded execution times.
  """
  sha = crypto.sha256_str(src + dataset + str(global_size) + str(local_size))
  ctt, ckt, gtt, gkt = [], [], [], []
  with self.Session() as session:
    entry = session.query(CLDriveSample).filter_by(sha256=sha).first()
    if entry is None:
      return None
    else:
      # Times are stored as newline-joined nanosecond strings. Note that
      # `// 1000` converts nanoseconds to microseconds, despite the `_ms`
      # suffix in this function's name.
      ctt = [int(x) // 1000 for x in entry.cpu_transfer_time_ns.split('\n')]
      ckt = [int(x) // 1000 for x in entry.cpu_kernel_time_ns.split('\n')]
      gtt = [int(x) // 1000 for x in entry.gpu_transfer_time_ns.split('\n')]
      gkt = [int(x) // 1000 for x in entry.gpu_kernel_time_ns.split('\n')]
      return ctt, ckt, gtt, gkt
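# A minimal usage sketch of the storage convention assumed above (values made
# up): rows are keyed by sha256(src + dataset + global_size + local_size), and
# each *_time_ns column holds newline-joined nanosecond strings that the
# getter converts to integer microseconds with `int(x) // 1000`.
stored_ns = "1500\n2500\n3500"            # hypothetical cpu_kernel_time_ns cell
as_us = [int(x) // 1000 for x in stored_ns.split('\n')]
assert as_us == [1, 2, 3]                 # ns // 1000 yields microseconds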
def dataset(workspace: str, model_sha: str):
  global data
  global cached_models
  if data == {}:
    data = parseData()
  target_sha = crypto.sha256_str(str(workspace) + model_sha)
  current_model = cached_models[target_sha]
  datasets = []
  for d in glob.glob(str(current_model['path'] / "dataset" / "*.png")):
    png_path = pathlib.Path(d)
    dest_file = MEDIA_PATH / workspace / model_sha / "dataset" / png_path.name
    dest_file.parent.mkdir(exist_ok=True, parents=True)
    shutil.copyfile(png_path, str(dest_file))
    datasets.append({
      'name': png_path.stem,
      'plot': "/" + str(dest_file.relative_to(pathlib.Path(flask_app.static_folder).parent)),
    })
  spec_data = {
    'summary': current_model['summary'],
    'workspace': workspace,
    'model_sha': model_sha,
    'datasets': datasets,
  }
  return flask.render_template("dataset.html", data=spec_data, **GetBaseTemplateArgs())
def parseCorpus(workspace_path):
  global cached_corpuses
  corpuses = []
  if (workspace_path / "corpus" / "encoded").exists():
    corpus_path = workspace_path / "corpus" / "encoded"
    for corpus_sha in corpus_path.iterdir():
      encoded_db = encoded.EncodedContentFiles(
        "sqlite:///{}".format(corpus_sha / "encoded.db"), must_exist=True)
      corpus = {
        'path': corpus_path / corpus_sha,
        'sha': str(corpus_sha.stem),
        'datapoint_count': encoded_db.size,
        'summary': "{} datapoint corpus, {}".format(encoded_db.size, str(corpus_sha.stem)),
        'models': parseModels(workspace_path, str(corpus_sha.stem)),
      }
      cached_corpuses[crypto.sha256_str(str(workspace_path.name) + str(corpus_sha.name))] = corpus
      corpuses.append(corpus)
  return corpuses
def training(workspace: str, model_sha: str):
  global data
  global cached_models
  if data == {}:
    data = parseData()
  data['plots'] = []
  target_sha = crypto.sha256_str(str(workspace) + model_sha)
  for d in glob.glob(str(cached_models[target_sha]['path'] / "logs" / "*.png")):
    png_file = pathlib.Path(d)
    dest_file = MEDIA_PATH / workspace / model_sha / "logs" / png_file.name
    dest_file.parent.mkdir(exist_ok=True, parents=True)
    shutil.copyfile(png_file, dest_file)
    data['plots'].append(
      "/" + str(dest_file.relative_to(pathlib.Path(flask_app.static_folder).parent)))
  data['summary'] = cached_models[target_sha]['summary']
  data['workspace'] = workspace
  data['model_sha'] = model_sha
  return flask.render_template("training.html", data=data, **GetBaseTemplateArgs())
def remove_identical_files(self) -> None:
  l.logger().info("Removing duplicate files from mined corpus...")
  if os.path.isfile(str(self.cache_path / "record.json")):
    with open(self.cache_path / "record.json", 'r') as f:
      data = json.load(f)
      repos = data[0]
      length = data[1]['total_files']
    cache_map = {}
    for i in range(length):
      with open(self.cache_path / "{}.cl".format(i), 'r') as f:
        cf = f.read()
        cf_hash = crypto.sha256_str(cf)
        if cf_hash not in cache_map:
          cache_map[cf_hash] = cf
    new_path = self.cache_path / "distinct_corpus"
    new_path.mkdir(exist_ok=True, parents=True)
    for k, v in cache_map.items():
      with open(new_path / "{}.cl".format(k), 'w') as f:
        f.write(v)
    with open(new_path / "record.json", 'w') as f:
      data[1]['total_files'] = len(cache_map)
      json.dump(data, f, indent=2)
  return
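# Illustration of the content-hash dedup above as a self-contained sketch:
# identical file contents map to one sha256 key, so only the first occurrence
# of each distinct file survives. hashlib stands in for the project's
# crypto.sha256_str helper; the file contents are made up.
import hashlib

files = ["int a;", "int b;", "int a;"]    # hypothetical mined files
cache_map = {}
for cf in files:
  cf_hash = hashlib.sha256(cf.encode("utf-8")).hexdigest()
  if cf_hash not in cache_map:
    cache_map[cf_hash] = cf
assert len(cache_map) == 2                # the duplicate "int a;" collapsed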
def FromProto(cls, id: int, proto: model_pb2.Sample) -> typing.Dict[str, typing.Any]:
  return {
    "id": id,
    "sha256": crypto.sha256_str(proto.text),
    "train_step": proto.train_step,
    "encoded_text": proto.encoded_text,
    "sample_feed": proto.sample_feed,
    "text": proto.text,
    "sample_indices": proto.sample_indices,
    "encoded_sample_indices": proto.encoded_sample_indices,
    "feature_vector": proto.feature_vector,
    "num_tokens": proto.num_tokens,
    "compile_status": proto.compile_status,
    "categorical_sampling": proto.categorical_sampling,
    "sample_time_ms": proto.sample_time_ms,
    "date_added": datetime.datetime.strptime(proto.date_added, "%m/%d/%Y, %H:%M:%S"),
  }
def FromArgs(
  cls,
  tokenizer,
  id: int,
  input_feed: np.array,
  input_features: typing.Dict[str, float],
) -> typing.TypeVar("ActiveInput"):
  """Construct ActiveInput table entry from arguments."""
  str_input_feed = tokenizer.tokensToString(input_feed, ignore_token=tokenizer.padToken)
  if tokenizer.padToken in input_feed:
    num_tokens = np.where(input_feed == tokenizer.padToken)[0][0]
  else:
    num_tokens = len(input_feed)
  return ActiveInput(
    id=id,
    sha256=crypto.sha256_str(str_input_feed),
    input_feed=str_input_feed,
    encoded_feed=','.join([str(x) for x in input_feed]),
    input_features='\n'.join(["{}:{}".format(k, v) for k, v in input_features.items()]),
    num_tokens=int(num_tokens),
    date_added=datetime.datetime.utcnow(),
  )
def samples_distribution(data) -> None:
  freqd = {}
  # Count how many times each distinct sample (by sha256) appears per generation.
  for dp in data:
    gen, sam = dp.generation_id, dp.sample
    hsm = crypto.sha256_str(sam)
    if gen in freqd:
      if hsm in freqd[gen]:
        freqd[gen][hsm] += 1
      else:
        freqd[gen][hsm] = 1
    else:
      freqd[gen] = {}
      freqd[gen][hsm] = 1
  # Invert per-generation counts into a histogram: repetition count -> #samples.
  for k, v in freqd.items():
    gdict = {}
    for samp, freq in v.items():
      if freq in gdict:
        gdict[freq] += 1
      else:
        gdict[freq] = 1
    freqd[k] = (list(gdict.keys()), list(gdict.values()))
  plt.GrouppedBars(
    groups=freqd,  # Dict[int, Tuple[List[int], List[int]]]
    plot_name="freq_samples_per_gen",
    path=pathlib.Path(FLAGS.eval_cand_db).absolute().parent,
    title="Repetition of samples per generation",
    x_name="# of repetitions",
  )
  return
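# Self-contained sketch of the histogram inversion above: per-generation
# sample counts {hash: frequency} become (repetition counts, #samples with
# that repetition count), which is the shape plt.GrouppedBars consumes. The
# hash keys and counts below are made up.
counts = {"h1": 3, "h2": 1, "h3": 1}      # hypothetical generation
gdict = {}
for freq in counts.values():
  gdict[freq] = gdict.get(freq, 0) + 1
assert (list(gdict.keys()), list(gdict.values())) == ([3, 1], [1, 2])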
def corpus(workspace: str, corpus_sha: str):
  global data
  global cached_corpuses
  if data == {}:
    data = parseData()
  target_sha = crypto.sha256_str(str(workspace) + corpus_sha)
  corpus = cached_corpuses[target_sha]
  corpus_stats = []
  for d in glob.glob(str(corpus['path'] / "*.png")):
    png_path = pathlib.Path(d)
    dest_file = MEDIA_PATH / workspace / corpus_sha / png_path.name
    dest_file.parent.mkdir(exist_ok=True, parents=True)
    shutil.copyfile(png_path, str(dest_file))
    corpus_stats.append({
      'name': png_path.stem,
      'plot': "/" + str(dest_file.relative_to(pathlib.Path(flask_app.static_folder).parent)),
    })
  corpus['stats'] = corpus_stats
  return flask.render_template("corpus.html", data=corpus, **GetBaseTemplateArgs())
def FromArgs(cls,
             act_l_pf: int,
             act_s_dep: int,
             act_s_wid: int,
             feat_space: str) -> typing.TypeVar("ActiveSamplingSpecs"):
  return ActiveSamplingSpecs(
    sha256=crypto.sha256_str(str(act_l_pf) + str(act_s_dep) + str(act_s_wid) + feat_space),
    active_limit_per_feed=act_l_pf,
    active_search_depth=act_s_dep,
    active_search_width=act_s_wid,
    feature_space=feat_space,
  )
def model_specs(workspace: str, model_sha: str):
  global data
  global cached_models
  if data == {}:
    data = parseData()
  target_sha = crypto.sha256_str(str(workspace) + model_sha)
  current_model = cached_models[target_sha]
  spec_data = {'config': current_model['config']}
  return flask.render_template("model_specs.html", data=spec_data, **GetBaseTemplateArgs())
def ContentHash(src: str) -> str:
  """
  Rewrite code with the deterministic, sequential identifier rewriter, strip
  all whitespace and newlines, and hash the resulting string.

  Args:
    src: The source code to hash.

  Returns:
    256-bit hash of the normalized source code string.
  """
  rw = SequentialNormalizeIdentifiers(src)
  return crypto.sha256_str(rw.replace(" ", "").replace("\n", ""))
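# Sketch of why ContentHash is layout-invariant. hashlib stands in for
# crypto.sha256_str, and the SequentialNormalizeIdentifiers step is elided,
# so this only demonstrates the whitespace/newline stripping:
import hashlib

def _strip_hash(s: str) -> str:
  # Remove spaces and newlines before hashing, as ContentHash does.
  return hashlib.sha256(s.replace(" ", "").replace("\n", "").encode()).hexdigest()

assert _strip_hash("int f(){return 0;}") == _strip_hash("int f() {\n  return 0;\n}")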
def __init__(
  self,
  config: typing.Union[active_learning_pb2.MLP],
  downstream_task: downstream_tasks.DownstreamTask,
) -> "ModelConfig":
  if isinstance(config, active_learning_pb2.MLP):
    self.name = "MLP"
  self.config = config
  self.downstream_task = downstream_task
  self.sha256 = crypto.sha256_str(str(config))
  self.num_train_steps = config.num_train_steps
  self.num_warmup_steps = config.num_warmup_steps
  self.num_epochs = 1
  self.steps_per_epoch = config.num_train_steps
  self.batch_size = config.batch_size
  self.learning_rate = config.initial_learning_rate_micros / 1e6
  self.max_grad_norm = 1.0

  if len(self.config.layer) == 0:
    raise ValueError("Layer list is empty for committee model")
  if self.config.layer[0].HasField("linear"):
    if self.config.layer[0].linear.in_features != self.downstream_task.input_size:
      raise ValueError(
        "Mismatch between committee member's input size {} and downstream task's input size {}"
        .format(self.config.layer[0].linear.in_features, self.downstream_task.input_size))

  self.layer_config = []
  for l in self.config.layer:
    if l.HasField("linear"):
      self.layer_config.append(('Linear', {
        'in_features': l.linear.in_features,
        'out_features': l.linear.out_features,
      }))
    elif l.HasField("dropout"):
      self.layer_config.append(('Dropout', {'dropout_prob': l.dropout.dropout_prob}))
    elif l.HasField("layer_norm"):
      # Read eps from the layer_norm message, not the dropout message.
      self.layer_config.append(('LayerNorm', {'layer_norm_eps': l.layer_norm.layer_norm_eps}))
    elif l.HasField("act_fn"):
      self.layer_config.append((l.act_fn, {}))
  return
def FromArgs(
  cls,
  id: int,
  global_size: int,
  local_size: int,
  source: str,
  dataset: str,
  cpu_transfer_time_ns: typing.List[int],
  cpu_kernel_time_ns: typing.List[int],
  gpu_transfer_time_ns: typing.List[int],
  gpu_kernel_time_ns: typing.List[int],
  transferred_bytes: int,
  status: str,
) -> typing.Dict[str, typing.Any]:
  return CLDriveSample(**{
    "id": id,
    "sha256": crypto.sha256_str(source + dataset + str(global_size) + str(local_size)),
    "global_size": global_size,
    "local_size": local_size,
    "source": source,
    "dataset": dataset,
    # Measurements are serialized as newline-joined integer strings; 'nan'
    # entries are dropped.
    "cpu_transfer_time_ns": '\n'.join([str(int(x)) for x in cpu_transfer_time_ns if x != 'nan']),
    "cpu_kernel_time_ns": '\n'.join([str(int(x)) for x in cpu_kernel_time_ns if x != 'nan']),
    "gpu_transfer_time_ns": '\n'.join([str(int(x)) for x in gpu_transfer_time_ns if x != 'nan']),
    "gpu_kernel_time_ns": '\n'.join([str(int(x)) for x in gpu_kernel_time_ns if x != 'nan']),
    "transferred_bytes": transferred_bytes,
    "status": status,
    "date_added": datetime.datetime.utcnow(),
  })
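# Sketch of the serialization above, assuming (as the `x != 'nan'` filter
# implies) that measurements can arrive as strings: literal 'nan' entries
# are dropped and the rest are newline-joined as integers.
vals = ["1200", "nan", "1300"]            # hypothetical measurement list
cell = '\n'.join([str(int(x)) for x in vals if x != 'nan'])
assert cell == "1200\n1300"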
def IRContentHash(src: str, header_file=None, use_aux_headers: bool = True) -> str:
  """
  Collect the optimized LLVM-IR of the source code and compute its hash,
  skipping the first two lines of the textual IR (the per-module header).

  Args:
    src: The source code to hash.

  Returns:
    256-bit hash of the optimized LLVM-IR string.
  """
  bc = CompileLlvmBytecode(src, header_file=header_file, use_aux_headers=use_aux_headers)
  return crypto.sha256_str(''.join(bc.split('\n')[2:]))
def FromArgs(
  cls,
  tokenizer,
  id: int,
  input_feed: np.array,
  input_features: typing.Dict[str, float],
  sample: np.array,
  output_features: typing.Dict[str, float],
  sample_quality: float,
  target_benchmark: typing.Tuple[str, str],
  target_features: typing.Dict[str, float],
  compile_status: bool,
  generation_id: int,
) -> typing.TypeVar("ActiveFeed"):
  """Construct ActiveFeed table entry from arguments."""
  str_input_feed = tokenizer.tokensToString(
    input_feed, ignore_token=tokenizer.padToken, with_formatting=True)
  str_sample = tokenizer.ArrayToCode(sample, with_formatting=True)
  num_tokens = len(sample)
  if tokenizer.padToken in sample:
    num_tokens = np.where(sample == tokenizer.padToken)[0][0]
  return ActiveFeed(
    id=id,
    sha256=crypto.sha256_str(str_input_feed + str_sample),
    input_feed=str_input_feed,
    encoded_feed=','.join([str(x) for x in input_feed]),
    input_features='\n'.join(["{}:{}".format(k, v) for k, v in input_features.items()]),
    sample=str_sample,
    num_tokens=int(num_tokens),
    output_features='\n'.join(
      ["{}:{}".format(k, v) for k, v in output_features.items()]) if output_features else "None",
    target_benchmark="// {}\n{}".format(target_benchmark[0], target_benchmark[1]),
    target_features='\n'.join(
      ["{}:{}".format(k, v) for k, v in target_features.items()]) if target_features else "None",
    sample_quality=sample_quality,
    compile_status=compile_status,
    generation_id=generation_id,
    date_added=datetime.datetime.utcnow(),
  )
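# Sketch of the num_tokens computation used above: the effective length is
# the index of the first pad token, or the full length when there is no
# padding. The pad id and token array are made up.
import numpy as np

pad = 0                                   # hypothetical padToken id
sample = np.array([5, 7, 9, pad, pad])
num_tokens = np.where(sample == pad)[0][0] if pad in sample else len(sample)
assert num_tokens == 3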
def get_entry(self, src: str, dataset: str, global_size: int, local_size: int) -> "CLDriveSample":
  """
  Fetch row from DB if it exists; return None otherwise.
  """
  sha = crypto.sha256_str(src + dataset + str(global_size) + str(local_size))
  try:
    with self.Session() as session:
      entry = session.query(CLDriveSample).filter_by(sha256=sha).first()
      if entry is not None:
        return entry
      else:
        return None
  except Exception as e:
    l.logger().error(e)
    return None
def sampling(workspace: str, model_sha: str):
  global data
  global cached_models
  if data == {}:
    data = parseData()
  target_sha = crypto.sha256_str(str(workspace) + model_sha)
  current_model = cached_models[target_sha]
  samplers = current_model['samplers']
  # Use a view-local dict (as the other views do) instead of rebinding the
  # shared global `data` cache.
  spec_data = {
    'summary': current_model['summary'],
    'workspace': workspace,
    'model_sha': model_sha,
    'samplers': samplers,
  }
  return flask.render_template("sampling.html", data=spec_data, **GetBaseTemplateArgs())
def parseModels(workspace_path, corpus_sha: str):
  global cached_models
  models = []
  if (workspace_path / "model").exists():
    for model_sha in (workspace_path / "model").iterdir():
      model_path = workspace_path / "model" / model_sha
      if (model_path / "tokenizer").exists() and pathlib.Path(
          os.readlink(model_path / "tokenizer")).parent.name == corpus_sha:
        if (model_path / "META.pbtxt").exists():
          meta = parseMeta(model_path / "META.pbtxt")
          model = {
            'path': model_path,
            'sha': str(model_sha.name),
            'config': meta,
            'tokenizer': tokenizers.TokenizerBase.FromFile(
              model_path / pathlib.Path(os.readlink(model_path / "tokenizer"))),
            'training_log': parseTrainLogs(model_path / "logs"),  # TODO
            'validation': parseValidationDB(model_path / "logs" / "validation_samples.db"),
            'samplers': parseSamplers(workspace_path, model_path / "samples", str(model_sha.name)),  # TODO sample_db ?
            'summary': parseModelSummary(meta),
          }
          cached_models[crypto.sha256_str(str(workspace_path.name) + str(model_sha.name))] = model
          models.append(model)
  return models
def FromArgs(cls,
             id: int,
             sample: str,
             include: str,
             encoded_sample: str,
             compile_status: bool,
             feature_vector: str,
             num_tokens: int,
             ) -> "CLSmithSample":
  """
  Do you want to use CLSmithDatabase as a means to store only code
  without much fuss? This function is for you!
  """
  return CLSmithSample(**{
    "id": id,
    "sha256": crypto.sha256_str(sample),
    "sample": sample,
    "include": include,
    "encoded_sample": encoded_sample,
    "compile_status": compile_status,
    "feature_vector": feature_vector,
    "num_tokens": num_tokens,
    "date_added": datetime.datetime.utcnow(),
  })
def FromArgsLite(cls, id: int, text: str, feature_vector: str, compiles: bool) -> "Sample":
  """
  Do you want to use SamplesDatabase as a means to store only code
  without much fuss? This function is for you!
  """
  return Sample(**{
    "id": id,
    "sha256": crypto.sha256_str(text),
    "train_step": -1,
    "encoded_text": "",
    "original_input": "",
    "sample_feed": "",
    "text": text,
    "sample_indices": "",
    "encoded_sample_indices": "",
    "compile_status": compiles,
    "feature_vector": feature_vector,
    "num_tokens": 0,
    "categorical_sampling": "False",
    "sample_time_ms": 0,
    "date_added": datetime.datetime.utcnow(),
  })
def OnSample(self, sample: model_pb2.Sample) -> bool:
  """Sample receive callback. Returns True if sampling should continue."""
  sample_id = crypto.sha256_str(sample.text)
  sample_path = self.cache_path / f"{sample_id}.pbtxt"
  pbutil.ToFile(sample, sample_path)
  return True
def FromArgs(
  cls,
  tokenizer,
  id: int,
  train_step: int,
  seen_in_training,
  original_input: typing.List[int],
  input_ids: typing.List[int],
  input_mask: typing.List[int],
  masked_lm_ids: typing.List[int],
  masked_lm_positions: typing.List[int],
  masked_lm_weights: typing.List[float],
  masked_lm_lengths: typing.List[int],
  next_sentence_labels: typing.List[int],
  masked_lm_predictions: typing.List[int],
  next_sentence_predictions: typing.List[int],
) -> typing.Dict[str, typing.Any]:
  str_original_input = tokenizer.tokensToString(
    original_input, ignore_token=tokenizer.padToken, with_formatting=True)
  str_input_ids = tokenizer.tokensToString(
    input_ids, ignore_token=tokenizer.padToken, with_formatting=True)
  # Decode target and predicted tokens one per line, escaping literal newlines.
  str_masked_lm_ids = '\n'.join([
    tokenizer.decoder[x]
    if ('\n' not in tokenizer.vocab or ('\n' in tokenizer.vocab and x != tokenizer.vocab['\n']))
    else '\\n'
    for x in masked_lm_ids
  ])
  str_masked_lm_predictions = '\n'.join([
    tokenizer.decoder[x]
    if ('\n' not in tokenizer.vocab or ('\n' in tokenizer.vocab and x != tokenizer.vocab['\n']))
    else '\\n'
    for x in masked_lm_predictions
  ])
  return {
    "id": id,
    "sha256": crypto.sha256_str(
      str(int(train_step)) + str_original_input + str_input_ids +
      str_masked_lm_ids + str_masked_lm_predictions),
    "train_step": int(train_step),
    "original_input": str_original_input,
    "encoded_original_input": ','.join([str(x) for x in original_input]),
    "input_ids": str_input_ids,
    "encoded_input_ids": ','.join([str(x) for x in input_ids]),
    "input_mask": ','.join([str(x) for x in input_mask]),
    "masked_lm_positions": ','.join([str(x) for x in masked_lm_positions]),
    "masked_lm_ids": str_masked_lm_ids,
    "encoded_mask_lm_ids": ','.join([str(x) for x in masked_lm_ids]),
    "masked_lm_weights": ','.join([str(int(x)) for x in masked_lm_weights]),
    "masked_lm_lengths": ','.join([str(int(x)) for x in masked_lm_lengths if x >= 0]),
    "next_sentence_labels": int(next_sentence_labels),
    "masked_lm_predictions": str_masked_lm_predictions,
    "encoded_masked_lm_predictions": ','.join([str(x) for x in masked_lm_predictions]),
    "next_sentence_predictions": int(next_sentence_predictions),
    "num_targets": list(masked_lm_ids).index(tokenizer.padToken)
                   if tokenizer.padToken in list(masked_lm_ids)
                   else len(list(masked_lm_ids)),
    "seen_in_training": int(seen_in_training),
    "date_added": datetime.datetime.utcnow(),
  }
def add_entry(self,
              src: str,
              dataset: str,
              status: str,
              global_size: int,
              local_size: int,
              df: pd.DataFrame) -> None:
  """
  Add execution entries from a pandas DataFrame.
  """
  sha = crypto.sha256_str(src + dataset + str(global_size) + str(local_size))
  try:
    with self.Session(commit=True) as session:
      entry = session.query(CLDriveSample).filter_by(sha256=sha).first()
      if entry is None:
        if status in {"CPU", "GPU"}:
          # Find the first non-NaN transferred_bytes value.
          idx = 0
          transferred_bytes = float('NaN')
          while idx < len(df.transferred_bytes) and math.isnan(transferred_bytes):
            try:
              transferred_bytes = int(df.transferred_bytes[idx])
            except ValueError:
              idx += 1
          session.add(
            CLDriveSample.FromArgs(
              id=self.count,
              global_size=global_size,
              local_size=local_size,
              source=src,
              dataset=dataset,
              cpu_transfer_time_ns=list(df[df['device'].str.contains("CPU")].transfer_time_ns),
              cpu_kernel_time_ns=list(df[df['device'].str.contains("CPU")].kernel_time_ns),
              gpu_transfer_time_ns=list(df[df['device'].str.contains("GPU")].transfer_time_ns),
              gpu_kernel_time_ns=list(df[df['device'].str.contains("GPU")].kernel_time_ns),
              transferred_bytes=transferred_bytes,
              status=status,
            ))
        else:
          session.add(
            CLDriveSample.FromArgs(
              id=self.count,
              global_size=global_size,
              local_size=local_size,
              source=src,
              dataset=dataset,
              cpu_transfer_time_ns=[],
              cpu_kernel_time_ns=[],
              gpu_transfer_time_ns=[],
              gpu_kernel_time_ns=[],
              transferred_bytes=-1,
              status=status,
            ))
        if self._status_cache is not None:
          assert sha not in self._status_cache, "{} should not be in DB".format(sha)
          self._status_cache[sha] = status
      elif status in {"CPU", "GPU"}:
        # Duplicate entries are not expected; while this assertion is enabled,
        # the merge logic below is unreachable.
        assert False, "This shouldn't happen"
        entry.cpu_transfer_time_ns = entry.cpu_transfer_time_ns + "\n" + '\n'.join(
          [str(x) for x in df[df['device'].str.contains("CPU")].transfer_time_ns])
        entry.cpu_kernel_time_ns = entry.cpu_kernel_time_ns + "\n" + '\n'.join(
          [str(x) for x in df[df['device'].str.contains("CPU")].kernel_time_ns])
        entry.gpu_transfer_time_ns = entry.gpu_transfer_time_ns + "\n" + '\n'.join(
          [str(x) for x in df[df['device'].str.contains("GPU")].transfer_time_ns])
        entry.gpu_kernel_time_ns = entry.gpu_kernel_time_ns + "\n" + '\n'.join(
          [str(x) for x in df[df['device'].str.contains("GPU")].kernel_time_ns])
      session.commit()
  except Exception as e:
    raise e
  return
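# Sketch of the device-row split used above: CLDrive reports one row per
# execution per device, and CPU/GPU measurements are separated with a
# substring match on the 'device' column. The frame below is made up.
import pandas as pd

df = pd.DataFrame({
  'device': ["Intel CPU", "NVIDIA GPU", "Intel CPU"],
  'kernel_time_ns': [1000, 400, 1100],
})
cpu_times = list(df[df['device'].str.contains("CPU")].kernel_time_ns)
assert cpu_times == [1000, 1100]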
def validation_samples(workspace: str, model_sha: str):
  global data
  global cached_models
  if data == {}:
    data = parseData()
  target_sha = crypto.sha256_str(str(workspace) + model_sha)
  current_model = cached_models[target_sha]
  validation = current_model['validation']
  if validation['path']:
    val_db = validation_database.ValidationDatabase(str(validation['path']), must_exist=True)
    with val_db.Session() as session:
      validation['val_samples'] = session.query(validation_database.BERTValFile).all()
      validation['val_metrics'] = session.query(validation_database.ValResults).all()
    # random.shuffle(validation['val_samples'])
    for sample in validation['val_samples']:
      processed_input_ids = []
      if '[HOLE]' in sample.input_ids:
        mask_type = '[HOLE]'
      elif '[MASK]' in sample.input_ids:
        mask_type = '[MASK]'
      else:
        mask_type = ''
      if mask_type == '[HOLE]':
        input_ids = sample.input_ids.split(mask_type)
        mask_num = sample.num_targets
        for i in range(mask_num):
          processed_input_ids += [
            {
              'text': input_ids[i],
              'color': 'plain',
              'length': len(input_ids[i]),
            },
            {
              'text': mask_type,
              'color': 'hole',
              'length': int(sample.masked_lm_lengths.split(',')[i]),
            },
            {
              'text': sample.masked_lm_predictions.split('\n')[i].replace(' ', '[ ]').replace('\n', '\\n'),
              'color': 'prediction',
              'length': 1,
            },
            {
              'text': sample.masked_lm_ids.split('\n')[i].replace(' ', '[ ]').replace('\n', '\\n'),
              'color': 'target',
              'length': 1,
            },
          ]
        # Append the remaining plain segments after the last hole.
        while i < len(input_ids) - 1:
          i += 1
          processed_input_ids.append({
            'text': input_ids[i],
            'color': 'plain',
            'length': len(input_ids[i]),
          })
      elif mask_type == '[MASK]':
        processed_input_ids = [{
          'text': sample.input_ids,
          'color': 'plain',
        }]
      sample.input_ids = processed_input_ids
  validation['summary'] = current_model['summary']
  validation['workspace'] = workspace
  validation['model_sha'] = model_sha
  return flask.render_template("validation_samples.html", data=validation, **GetBaseTemplateArgs())
def DriveSource(src: str,
                group_name: str,
                feats: typing.Dict[str, float],
                cldrive_db: cldrive.CLDriveExecutions,
                ) -> typing.Generator:
  """
  For a given source code, drive it through CLDrive and yield ready DataFrame rows.

  Args:
    src        : Source code to process.
    group_name : Name of the dataset group the source belongs to.
    feats      : Grewe feature vector of the source code.
    cldrive_db : Caches CLDrive executions of source code.
  """
  # for gsize in tqdm.tqdm([2**6, 2**7, 2**8, 2**10, 2**12, 2**14, 2**16, 2**18, 2**20], desc="gsize", leave=False):
  for gsize in tqdm.tqdm([2**10, 2**12, 2**14, 2**16, 2**18, 2**20], desc="gsize", leave=False):
    for lsize in tqdm.tqdm([2**2, 2**3, 2**4, 2**5, 2**6, 2**7, 2**8], desc="lsize", leave=False):
      if lsize > gsize:
        continue
      sha = crypto.sha256_str(src + group_name + str(gsize) + str(lsize))
      if sha in cldrive_db.status_cache:
        # Cached execution: recompute mean times from the stored columns.
        cached = cldrive_db.get_entry(src, group_name, gsize, lsize)
        if cached.status in {"CPU", "GPU"}:
          yield ToDataFrameRow(
            name="{}.cl".format(sha),
            grewe_feats=feats,
            transferred_bytes=cached.transferred_bytes,
            global_size=gsize,
            local_size=lsize,
            label=cached.status,
            cpu_transfer_time_ns=sum([int(float(x)) for x in cached.cpu_transfer_time_ns.split('\n') if x != 'nan']) // len([x for x in cached.cpu_transfer_time_ns.split('\n') if x != 'nan']),
            cpu_kernel_time_ns=sum([int(float(x)) for x in cached.cpu_kernel_time_ns.split('\n') if x != 'nan']) // len([x for x in cached.cpu_kernel_time_ns.split('\n') if x != 'nan']),
            gpu_transfer_time_ns=sum([int(float(x)) for x in cached.gpu_transfer_time_ns.split('\n') if x != 'nan']) // len([x for x in cached.gpu_transfer_time_ns.split('\n') if x != 'nan']),
            gpu_kernel_time_ns=sum([int(float(x)) for x in cached.gpu_kernel_time_ns.split('\n') if x != 'nan']) // len([x for x in cached.gpu_kernel_time_ns.split('\n') if x != 'nan']),
          )
        else:
          yield None
      else:
        # Fresh execution: run CLDrive and cache the result.
        df, label = opencl.CLDriveDataFrame(src, num_runs=100, gsize=gsize, lsize=lsize, timeout=60)
        cldrive_db.add_entry(src, group_name, label, gsize, lsize, df)
        if label not in {"CPU", "GPU"}:
          yield None
        else:
          idx = 0
          transferred_bytes = float('NaN')
          while idx < len(df.transferred_bytes) and math.isnan(transferred_bytes):
            try:
              transferred_bytes = int(df.transferred_bytes[idx])
            except ValueError:
              idx += 1
          yield ToDataFrameRow(
            name="{}.cl".format(sha),
            grewe_feats=feats,
            transferred_bytes=transferred_bytes,
            global_size=gsize,
            local_size=lsize,
            label=label,
            cpu_transfer_time_ns=df[df['device'].str.contains("CPU")].transfer_time_ns.mean(),
            cpu_kernel_time_ns=df[df['device'].str.contains("CPU")].kernel_time_ns.mean(),
            gpu_transfer_time_ns=df[df['device'].str.contains("GPU")].transfer_time_ns.mean(),
            gpu_kernel_time_ns=df[df['device'].str.contains("GPU")].kernel_time_ns.mean(),
          )
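# The cached branch above recomputes an integer mean from the stored
# newline-joined strings, dropping 'nan' entries; a standalone sketch of
# that reduction with made-up values:
cell = "1000\n2000\nnan\n4000"            # hypothetical cached time column
vals = [int(float(x)) for x in cell.split('\n') if x != 'nan']
mean_ns = sum(vals) // len(vals)
assert mean_ns == 2333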
def OnSample(self, sample: model_pb2.Sample) -> bool:
  """Sample receive callback. Returns True if sampling should continue."""
  sample_id = crypto.sha256_str(sample.text)
  path = self.path / f"{sample_id}.txt"
  fs.Write(path, sample.text.encode("utf-8"))
  return True
def sample_files(workspace: str, model_sha: str, sampler_sha: str, sample_db: str):
  global data
  global cached_models
  if data == {}:
    data = parseData()
  current_sampler = {}
  target_sha = crypto.sha256_str(str(workspace) + model_sha)
  for sampler in cached_models[target_sha]['samplers']:
    if sampler['sha'] == sampler_sha:
      current_sampler = sampler
      break
  db_file = current_sampler['path'] / "{}.db".format(sample_db)
  samples_db = samples_database.SamplesDatabase("sqlite:///{}".format(db_file), must_exist=True)
  with samples_db.Session() as session:
    sample_files = session.query(samples_database.Sample).all()
  for sample in sample_files:
    processed_feed = []
    processed_indices = []
    if '[HOLE]' in sample.sample_feed:
      mask_type = '[HOLE]'
    elif '[MASK]' in sample.sample_feed:
      mask_type = '[MASK]'
    else:
      mask_type = ''
    sample_feed = sample.sample_feed.split(mask_type)
    sample_indices = sample.sample_indices.split('\n')
    assert len(sample_feed) - 1 == len(sample_indices), (
      "sample hole length/generation mismatch: {}, {}".format(
        len(sample_feed), len(sample_indices)))
    prediction = sample.text
    for i in range(len(sample_feed) - 1):
      processed_feed += [
        {'text': sample_feed[i], 'color': 'plain'},
        {'text': mask_type, 'color': 'mask'},
      ]
      processed_indices += [
        {'text': sample_feed[i], 'color': 'plain'},
        {'text': mask_type, 'color': 'mask'},
        {'text': sample_indices[i].replace("\\n", "\n"), 'color': 'prediction'},
      ]
    # Append the remaining plain segments after the last mask.
    while i < len(sample_feed) - 1:
      i += 1
      processed_indices.append({'text': sample_feed[i], 'color': 'plain'})
      processed_feed.append({'text': sample_feed[i], 'color': 'plain'})
    sample.sample_indices = processed_indices
    sample.sample_feed = processed_feed
  sample_specs = {
    'summary': cached_models[target_sha]['summary'],
    'workspace': workspace,
    'model_sha': model_sha,
    'samples': sample_files,
  }
  return flask.render_template("sample_files.html", data=sample_specs, **GetBaseTemplateArgs())
def ResolveContentId(config: typing.Union[corpus_pb2.Corpus, corpus_pb2.PreTrainCorpus]) -> str:
  """Compute the hash of the input contentfiles.

  This function resolves the unique checksum of a set of content files.

  Args:
    config: The corpus config proto.

  Returns:
    A hex-encoded hash string.
  """
  # We can take a massive shortcut if the content ID is already set in the
  # config proto.
  if config.HasField("content_id"):
    # TODO(github.com/ChrisCummins/clgen/issues/130): Refactor this after
    # splitting out Corpus class.
    return config.content_id
  elif config.HasField("pre_encoded_corpus_url"):
    # TODO(github.com/ChrisCummins/clgen/issues/130): Refactor this after
    # splitting out Corpus class.
    return crypto.sha1_str(config.pre_encoded_corpus_url)
  start_time = time.time()
  if config.HasField("local_directory"):
    local_directory = ExpandConfigPath(
      config.local_directory, path_prefix=FLAGS.clgen_local_path_prefix)
    # After the first time we compute the hash of a directory, we write it
    # into a file. This is a shortcut to work around the fact that computing
    # the directory checksum is O(n) with respect to the number of files in
    # the directory (even if the directory is already cached by the hash
    # cache). This means that it is the responsibility of the user to delete
    # this cached file if the directory is changed.
    hash_file_path = pathlib.Path(str(local_directory) + ".sha1.txt")
    if hash_file_path.is_file():
      l.logger().info("Reading directory hash: '{}'.".format(hash_file_path))
      with open(hash_file_path) as f:
        content_id = f.read().rstrip()
    else:
      # No hash file, so compute the directory hash and create it.
      try:
        # content_id = hc.GetHash(local_directory)
        content_id = crypto.sha256_str(str(local_directory))
      except FileNotFoundError as e:
        raise ValueError(e)
      # Create the hash file in the directory so that next time we don't need
      # to reference the hash cache.
      with open(hash_file_path, "w") as f:
        print(content_id, file=f)
      l.logger().info("Wrote directory hash: '{}'.".format(hash_file_path))
  elif config.HasField("local_tar_archive"):
    # This is not an efficient means of getting the hash, as it requires
    # always unpacking the archive and reading the entire contents. It would
    # be nicer to maintain a cache which maps the mtime of tarballs to their
    # content ID, similar to how local_directory is implemented.
    content_id = GetHashOfArchiveContents(
      ExpandConfigPath(config.local_tar_archive, path_prefix=FLAGS.clgen_local_path_prefix))
  elif config.HasField("bq_database"):
    content_id = crypto.sha256_str(str(config.bq_database))
  # elif config.HasField("fetch_github"):
  #   gitfile_path = ExpandConfigPath(
  #     config.fetch_github, path_prefix=FLAGS.clgen_local_path_prefix
  #   )
  #   gitfile_path.mkdir(exist_ok=True, parents=True)
  #   github_fetcher = github.GithubFetcher(gitfile_path)
  #   github_fetcher.fetch()
  #   hash_file_path = pathlib.Path(str(gitfile_path) + ".sha1.txt")
  #   if hash_file_path.is_file():
  #     l.logger().info("Reading directory hash: '{}'.".format(hash_file_path))
  #     with open(hash_file_path) as f:
  #       content_id = f.read().rstrip()
  #   else:
  #     # No hash file, so compute the directory hash and create it.
  #     try:
  #       content_id = hc.GetHash(gitfile_path)
  #     except FileNotFoundError as e:
  #       raise ValueError(e)
  #     # Create the hash file in the directory so that next time we don't
  #     # need to reference the hash cache.
  #     with open(hash_file_path, "w") as f:
  #       print(content_id, file=f)
  #     l.logger().info("Wrote directory hash: '{}'.".format(hash_file_path))
  else:
    raise NotImplementedError("Unsupported Corpus.contentfiles field value")
  return content_id
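# Standalone sketch of the hash-file shortcut described above: cache a
# computed content ID next to the directory and reuse it on later runs.
# hashlib stands in for the project's hashing helpers, and the ID here is a
# hash of the path only, as in the local_directory branch above.
import hashlib
import pathlib

def cached_directory_id(directory: pathlib.Path) -> str:
  hash_file = pathlib.Path(str(directory) + ".sha1.txt")
  if hash_file.is_file():
    return hash_file.read_text().rstrip()            # fast path: reuse cached ID
  content_id = hashlib.sha256(str(directory).encode()).hexdigest()
  hash_file.write_text(content_id + "\n")            # user must delete on change
  return content_id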
def FromArgs(
  cls,
  tokenizer,
  id: int,
  input_feed: np.array,
  input_ids: np.array,
  input_features: typing.Dict[str, float],
  input_score: float,
  hole_lengths: typing.List[int],
  sample: np.array,
  sample_indices: np.array,
  output_features: typing.Dict[str, float],
  sample_score: float,
  target_benchmark: typing.Tuple[str, str],
  target_features: typing.Dict[str, float],
  compile_status: bool,
  generation_id: int,
  # timestep: int,
) -> typing.TypeVar("SearchCandidate"):
  """Construct SearchCandidate table entry from arguments."""
  str_input_feed = tokenizer.tokensToString(
    input_ids, ignore_token=tokenizer.padToken, with_formatting=True)
  str_sample = tokenizer.ArrayToCode(sample, with_formatting=True)
  len_indices = len(sample_indices)
  sample_indices = tokenizer.tokensToString(sample_indices, ignore_token=tokenizer.padToken)
  num_tokens = len(sample)
  if tokenizer.padToken in sample:
    num_tokens = np.where(sample == tokenizer.padToken)[0][0]
  # Subtract 3, presumably to exclude special tokens from the effective feed length.
  actual_length = len(input_ids) - 3
  if tokenizer.padToken in input_ids:
    actual_length = np.where(input_ids == tokenizer.padToken)[0][0] - 3
  return SearchCandidate(
    id=id,
    sha256=crypto.sha256_str(str_input_feed + str_sample + str(hole_lengths)),
    sample_sha256=crypto.sha256_str(str_sample),
    generation_id=generation_id,
    frequency=1,
    abs_hole_lengths=','.join([str(hl) for hl in hole_lengths if hl >= 0]),
    rel_hole_lengths=','.join([str(hl / (hl + actual_length)) for hl in hole_lengths if hl >= 0]),
    hole_ind_length=len_indices,
    input_feed=tokenizer.ArrayToCode(input_feed, with_formatting=True),
    input_ids=str_input_feed,
    encoded_input_ids=','.join([str(x) for x in input_ids]),
    input_features='\n'.join(
      ["{}:{}".format(k, v) for k, v in input_features.items()]) if input_features else "None",
    input_score=input_score,
    sample=str_sample,
    sample_indices=sample_indices,
    num_tokens=int(num_tokens),
    output_features='\n'.join(
      ["{}:{}".format(k, v) for k, v in output_features.items()]) if output_features else "None",
    sample_score=sample_score,
    target_benchmark="// {}\n{}".format(target_benchmark[0], target_benchmark[1]),
    target_features='\n'.join(
      ["{}:{}".format(k, v) for k, v in target_features.items()]) if target_features else "None",
    compile_status=compile_status,
    score_delta=(sample_score - input_score) / input_score if not math.isinf(input_score) else math.inf,
    features_delta='\n'.join([
      "{}:{}".format(k, output_features[k] - input_features[k])
      for k in input_features.keys()
      if (output_features[k] - input_features[k] != 0)
    ]) if input_features and output_features else math.inf,
    date_added=datetime.datetime.utcnow(),
  )
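# Sketch of the rel_hole_lengths computation above: each non-negative hole
# length is normalized by (hole length + effective feed length). Numbers are
# made up; the negative entry is filtered out as in the code.
hole_lengths, actual_length = [4, 0, -2], 16
rel = ','.join([str(hl / (hl + actual_length)) for hl in hole_lengths if hl >= 0])
assert rel == "0.2,0.0"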