def train(self, data_from_labeled_set: List[tf.Tensor],
          data_from_unlabeled_set: List[tf.Tensor]):
    assert len(data_from_labeled_set) == len(data_from_unlabeled_set)
    # Label 0 for samples from the labeled set, 1 for the unlabeled set.
    y = tf.concat([
        tf.zeros(dtype=tf.int32, shape=[len(data_from_labeled_set)]),
        tf.ones(dtype=tf.int32, shape=[len(data_from_unlabeled_set)])
    ], axis=0)
    x = tf.concat([data_from_labeled_set, data_from_unlabeled_set], axis=0)
    utils.check_equal(len(y), len(x))

    self._model.compile(optimizer="adam",
                        loss="binary_crossentropy",
                        metrics=["accuracy"])
    logging.info("Fitting Model")

    # Shuffle the two sets together, then split into train / validation.
    shuffle = np.random.permutation(len(x))
    x: np.ndarray = x.numpy()[shuffle].astype(np.int32)
    y = y.numpy()[shuffle]
    x_tr = x[:int(SPLIT * len(x))]
    x_va = x[int(SPLIT * len(x)):]
    y_tr = y[:int(SPLIT * len(y))]
    y_va = y[int(SPLIT * len(y)):]

    self._model.fit(x=x_tr,
                    y=y_tr,
                    batch_size=self.batch_size,
                    validation_data=(x_va, y_va),
                    verbose=True)

    # With a binary_crossentropy head, `predict` returns probabilities;
    # threshold them before comparing to the integer labels.
    predictions = (self._model.predict(x_va) > 0.5).astype(np.int32).reshape(-1)
    accuracy = np.mean(predictions == y_va)
    logging.info(f"Eval: {accuracy:0.2%}")
    logging.info(f"Done {type(self)}")
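A minimal usage sketch for the training method above. The `Filter` class name, its constructor, and the toy tensors are assumptions for illustration only; they are not part of the original code.

# Hypothetical usage (class name, constructor, and shapes are illustrative).
labeled = [tf.constant([101, 2054, 2003, 102], dtype=tf.int32) for _ in range(128)]
unlabeled = [tf.constant([101, 2129, 2055, 102], dtype=tf.int32) for _ in range(128)]
discriminator = Filter(batch_size=32)  # assumed constructor
discriminator.train(labeled, unlabeled)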
def _create_float_feature(values, feature_len):
    feature_list = list(values)
    utils.check_equal(len(feature_list), feature_len)
    feature = tf.train.Feature(
        float_list=tf.train.FloatList(value=list(feature_list)))
    return feature
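A short, hedged example of wrapping the returned feature in a `tf.train.Example`; the "embedding" field name is illustrative, not taken from the original code.

# Hypothetical usage: serialize a fixed-length float vector.
values = [0.1, 0.2, 0.3, 0.4]
feature = _create_float_feature(values, feature_len=4)
example = tf.train.Example(
    features=tf.train.Features(feature={"embedding": feature}))
serialized = example.SerializeToString()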
def process_strat_output(
    strategy_outputs,
    name,
    strategy,
    current_batch_size,
):
    """Uniformizes the different outputs of strategy.run calls."""
    if isinstance(strategy_outputs, values.PerReplica):
        strategy_outputs: values.PerReplica
        # LOGGER.debug("process_strat_output: %s: %s", name, str(strategy_outputs))
        output = deal_w_entry(strategy_outputs)
        utils.check_equal(output.shape, current_batch_size)
    elif (isinstance(strategy_outputs, tuple)
          and isinstance(strategy_outputs[0], values.PerReplica)):
        strategy_outputs: Tuple[values.PerReplica, Ellipsis]
        output = []
        for indiv_val in strategy_outputs:
            output.append(deal_w_entry(indiv_val))
        output = tuple(output)
    elif (isinstance(strategy_outputs, dict)
          and isinstance(next(iter(strategy_outputs.values())),
                         values.PerReplica)):
        strategy_outputs: Dict[str, values.PerReplica]
        output = {}
        for k, indiv_val in strategy_outputs.items():
            output[k] = deal_w_entry(indiv_val)
    elif isinstance(strategy_outputs, ops.EagerTensor) or (
            isinstance(strategy_outputs, tuple)
            and isinstance(strategy_outputs[0], ops.EagerTensor)):
        output = strategy_outputs
    else:
        raise RuntimeError(
            f"{name}: {type(strategy_outputs)}, {type(strategy)}")
    return output
def validate_instance_type_flag():
    # Validate the value:
    instance_tuple = _FLAG_INSTANCE_TYPE.value.strip().split("-")
    utils.check_equal(len(instance_tuple), 3)
    utils.check_contained(instance_tuple[0], {"n1", "n2"})
    utils.check_contained(instance_tuple[1], {"standard", "highmem"})
    num_cpus = int(instance_tuple[2])
    utils.check_operator(operator.le, num_cpus, 64)
    utils.check_operator(operator.ge, num_cpus, 0)
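For reference, a machine type string that satisfies the checks above (the flag's actual default is not shown in this snippet):

# e.g. with _FLAG_INSTANCE_TYPE.value == "n1-standard-8":
#   "n1" is in {"n1", "n2"}, "standard" is in {"standard", "highmem"},
#   and 0 <= 8 <= 64, so validate_instance_type_flag() passes.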
def _stack_per_sent(samples_a, samples_b):
    """Extracts both sentences of each sample and stacks all of them.

    We need both sets of samples because we need to pad to the longest
    sentence of both sets.

    There are two sentences in a sample. We want to train the filter as if
    they were independent samples. So, we extract the sentences from the
    samples by using the segment_ids. We added a third segment id for the
    padding in order to not get the padding when we filter with the
    segment_ids.
    """
    lengths = []
    packs = []
    for i, samples in enumerate([samples_a, samples_b]):
        # The [1:-1] slice removes the <cls> token and the <sep> token from
        # the first sentence of a sample.
        sents_0 = [
            sample["input_ids"][sample["segment_ids"] == 0][1:-1]
            for sample in tqdm.tqdm(samples)
        ]
        sents_1 = [
            sample["input_ids"][sample["segment_ids"] == 1][:-1]
            for sample in tqdm.tqdm(samples)
        ]
        # if i == 1:
        #     for sample in itertools.islice(samples, 0, 100, 10):
        #         logging.info(sample["segment_ids"])

        # itertools.chain simply chains the iteration over two iterables:
        # [x for x in itertools.chain(range(3), range(3))] gives
        # [0, 1, 2, 0, 1, 2].
        length = max(itertools.chain(map(len, sents_0), map(len, sents_1)))
        packs.append((sents_0, sents_1))
        lengths.append(length)

    maxlen = max(lengths)
    output = []
    for pack in tqdm.tqdm(packs):
        sents = [
            tf.pad(sent, [[0, maxlen - len(sent)]])
            for sent in itertools.chain(*pack)
        ]
        output.append(sents)

    utils.check_equal(len(output), 2)
    return tf.stack(output[0]), tf.stack(output[1])
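A hedged sketch of how `_stack_per_sent` might be called; the sample dict below and its token ids are assumptions for illustration (segment id 0 for the first sentence, 1 for the second, 2 for padding, as described in the docstring).

# Hypothetical input: each sample is a dict of 1-D numpy int arrays.
sample = {
    "input_ids": np.array([101, 2054, 2003, 102, 2023, 2003, 102, 0, 0]),
    "segment_ids": np.array([0, 0, 0, 0, 1, 1, 1, 2, 2]),
}
stacked_a, stacked_b = _stack_per_sent([sample] * 4, [sample] * 4)
# Both outputs are [num_samples, maxlen] tensors padded to the same length.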
def process_strat_output(
    strategy_outputs,
    name,
    strategy,
    current_batch_size,
):
    """Uniformizes the different outputs of strategy.run calls."""
    ############################################################################
    # Single PerReplica
    ############################################################################
    if isinstance(strategy_outputs, values.PerReplica):
        strategy_outputs: values.PerReplica
        output = to_eager_tensor(strategy_outputs)
        utils.check_equal(output.shape, current_batch_size)
    ############################################################################
    # Tuple of PerReplicas
    ############################################################################
    elif (isinstance(strategy_outputs, tuple)
          and isinstance(strategy_outputs[0], values.PerReplica)):
        strategy_outputs: Tuple[values.PerReplica, Ellipsis]
        output = []
        for indiv_val in strategy_outputs:
            output.append(to_eager_tensor(indiv_val))
        output = tuple(output)
    ############################################################################
    # Dict of PerReplicas
    ############################################################################
    elif (isinstance(strategy_outputs, dict)
          and isinstance(next(iter(strategy_outputs.values())),
                         values.PerReplica)):
        strategy_outputs: Dict[str, values.PerReplica]
        output = {}
        for k, indiv_val in strategy_outputs.items():
            output[k] = to_eager_tensor(indiv_val)
    ############################################################################
    # EagerTensor
    ############################################################################
    elif (isinstance(strategy_outputs, ops.EagerTensor)
          or (isinstance(strategy_outputs, tuple)
              and isinstance(strategy_outputs[0], ops.EagerTensor))):
        output = strategy_outputs
    else:
        raise RuntimeError(
            f"{name}: {type(strategy_outputs)}, {type(strategy)}")
    return output
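A hedged sketch of how this helper is typically used right after a `strategy.run` call, assuming an already-constructed multi-replica `strategy`, a distributed `dist_batch`, and a known `batch_size`; the step function is illustrative only.

# Hypothetical usage (names `strategy`, `dist_batch`, `batch_size` assumed).
@tf.function
def _step(batch):
    return {"embeddings": tf.cast(batch["input_ids"], tf.float32)}

per_replica_out = strategy.run(_step, args=(dist_batch,))
merged = process_strat_output(
    per_replica_out, "embeddings", strategy, current_batch_size=batch_size)
# With multiple replicas, `merged["embeddings"]` is a single eager tensor
# covering the whole global batch.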
async def get_domain_attrs(request):
    data = await request.post()
    domain = data.get("domain", "")
    nginx_user, nginxs = config.get_domain_nginxs(domain)
    config_file = config.get_domain(domain).get("config_file", "")
    backend_port = config.get_domain(domain).get("backend_port")
    all_servers = [
        GatewayNGINX(nginx_user, host).get_servers(config_file, backend_port)
        for host in nginxs
    ]
    if not check_equal(all_servers):
        # The gateway data differs, possibly because a host connection failed;
        # print it to the terminal for debugging.
        logger.info("Fetched gateway data for {0}; the data is inconsistent: {1}"
                    .format(domain, all_servers))
        response = dict(servers=[], status="501",
                        err_msg="Error: gateway data is inconsistent")
    else:
        ok, servers = all_servers.pop()
        if ok and servers:
            response = dict(servers=tuple(servers), status="200", err_msg="")
        else:
            # If the remote command failed, the `servers` variable holds the
            # standard error output.
            if not servers:
                stderr = ("Failed to fetch backend servers; please contact an "
                          "administrator to verify the configuration")
            else:
                stderr = servers
            logger.info("Failed to fetch upstreams; output: {0}".format(stderr))
            response = dict(servers=[], status="500", err_msg=str(stderr))
    return web.json_response(response)
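A hedged sketch of wiring the handler above into an aiohttp application; the route path is an assumption, not taken from the original project.

# Hypothetical registration (route path is illustrative).
from aiohttp import web

app = web.Application()
app.router.add_post("/api/domain/servers", get_domain_attrs)
# web.run_app(app, port=8080)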
def create_one_vm_vm():
    runtime = _ONEVM_RUNTIME_VERSION
    if runtime == "v2-alpha":
        utils.check_equal(_FLAG_TPU_QTY.value, "8")
    command = [
        "gcloud", "alpha", "compute", "tpus", "tpu-vm", "create",
        f"{_FLAG_INSTANCE_NAME.value}",
        f"--zone={_FLAG_ZONE.value}",
        f"--accelerator-type={make_accelerator_type()}",
        f"--version={runtime}",
    ]
    run_gcloud_command(command)
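For reference, with illustrative flag values the list above corresponds to a command of this shape (the instance name and zone are placeholders):

# gcloud alpha compute tpus tpu-vm create my-instance \
#     --zone=europe-west4-a \
#     --accelerator-type=v3-8 \
#     --version=v2-alpha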
        qty_shuffle=1,  # Will never change
        max_length_generation=350,
    ), tokenizer, BATCH_SIZE, SPLIT)

num_entries_in_split = (
    task_specific.DATASET_CARDINALITIES["kilt_eli5"][SPLIT])
entries_counter = tqdm.tqdm(total=num_entries_in_split)

for batch_no, batch in enumerate(itertools.islice(ds, NUM_ENTRIES)):
    #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    # Display the inputs and outputs.
    #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    rich_console = rich.console.Console(color_system="256")
    print_sample = generation.make_print_sample()

    assert not np.all(batch[0] == batch[1]), batch[0] == batch[1]

    with utils.log_duration(
            LOGGER, "main", "all of tokenizer.decode for a batch."):
        for i in range(batch.shape[0]):
            print(f"{batch.shape = }")
            utils.check_equal(len(batch.shape), 2)
            utils.check_equal(batch.shape[0], BATCH_SIZE)
            tokens = batch.numpy()[i]
            input_text = tokenizer.decode(tokens)
            print(f"Batch {batch_no}, Sample {i} / {BATCH_SIZE} of batch:")
            print(f"\tNum tokens: {len(tokens)}")
            print_sample(
                input_text, f"input batch_no {batch_no}", rich_console)
__email__ = "*****@*****.**"
__credits__ = ["???"]
__license__ = "???"
__version__ = "1.0.0"
__status__ = "Production"

from utils import (
    check_empty,
    check_equal,
    check_equal_re,
    check_equals,
    check_not_empty,
    check_return_code,
    print_header,
    view_output,
    print_warning,
    print_info,
)
from utils import x, assert_contains

# print_header("3. Special Purpose Services")

# print_header("3.1 Set Daemon umask (Scored)")
check_equal("grep umask /etc/sysconfig/init", "umask 027")

# print_header("3.2 Remove X Windows (Scored)")
# Original CIS test
# check_equal(
#     'grep "^id:" /etc/inittab',
#     "id:3:initdefault"
# )
# Syco hardened servers use this.
check_equal('grep "^\~\~\:S\:wait\:\/sbin\/sulogin" /etc/inittab',
            "~~:S:wait:/sbin/sulogin")

result = x('yum grouplist "X Window System"')
max_lines = len(result)
assert_contains(result[max_lines - 3], "Available Groups:")
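The `check_*` helpers come from the repo's local `utils` module, which is not shown here. A minimal sketch of the assumed behaviour of `check_equal` (run the shell command and verify the expected text appears in its output); the real implementation may differ.

# Assumed behaviour only -- the real helper lives in the local `utils` module.
import subprocess

def check_equal(command, expected):
    output = subprocess.run(
        command, shell=True, capture_output=True, text=True).stdout
    if expected in output:
        print(f"OK   {command}")
    else:
        print(f"FAIL {command}: expected {expected!r} in output")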
def main(argv): if len(argv) > 1: raise RuntimeError(argv) absl_logging.use_python_logging() retriever_config = tf_utils.REALMSave( **utils.from_json_file(_FLAG_RETRIEVER_CONFIG_PATH.value)) extra = "_FROM_SUBSET" if _FLAG_USE_SUBSET.value else "" time_stamp = time.strftime("%Y%m%d-%H%M%S") target_path = os.path.join(_FLAG_OUTPUT_PATH.value, time_stamp + extra).strip() if target_path[-1] != "/": target_path += "/" ############################################################################## # Setup devices and strategy ############################################################################## with utils.log_duration(LOGGER, "main", "Initializing devices"): tpu_config = tf_utils.init_tpus() device_type = tf_utils.current_accelerator_type() LOGGER.debug("Devices: %s", str(tf_utils.devices_to_use())) if device_type == "TPU": if tpu_config is None: raise RuntimeError("We should have a tpu_config.") strategy = tf.distribute.TPUStrategy(tpu_config.resolver) batch_size = len( tf_utils.devices_to_use()) * _FLAG_BATCH_SIZE.value elif device_type == "GPU" or device_type == "CPU": strategy = tf.distribute.MirroredStrategy() batch_size = len( tf_utils.devices_to_use()) * _FLAG_BATCH_SIZE.value else: raise RuntimeError(device_type) ############################################################################## # Load the dataset. ############################################################################## eli5 = {} keys = ["train", "eval", "test"] gpt2_tokenizer = transformers.GPT2TokenizerFast.from_pretrained("gpt2-xl") gpt2_tokenizer.pad_token = gpt2_tokenizer.eos_token with utils.log_duration(LOGGER, "main", "Loading the ELI5 datasets."): for split in tqdm.tqdm(keys): load_path = os.path.join(_FLAGS_DATASET_ROOT.value, "HuggingfaceDatasets", f"{split}_kilt_eli5.hf") with tf.device("/job:localhost"): eli5[split] = datasets.load_from_disk(load_path) if _FLAG_USE_SUBSET.value: _warn_subset() ############################################################################## # ############################################################################## with utils.log_duration(LOGGER, "Main", "Load the textual dataset"): # Extract the appropriate text # The buffer_size is taken from the original ORQA code. blocks_dataset = tf.data.TFRecordDataset(retriever_config.text_records, buffer_size=512 * 1024 * 1024) blocks_dataset = blocks_dataset.batch( retriever_config.num_block_records, drop_remainder=True) blocks = tf.data.experimental.get_single_element(blocks_dataset) with tempfile.TemporaryDirectory() as tmp_dir: ############################################################################ # Prepare the output file. 
############################################################################ tmp_dir = pathlib.Path(tmp_dir) h5_output_path = tmp_dir / "codes.h5" output_file = h5py.File(h5_output_path, "w") flags_dict = { flag.name: flag.value for flag in flags.FLAGS.flags_by_module_dict()[argv[0]] } utils.to_json_file(tmp_dir / "params.json", flags_dict) for split in keys: with utils.log_duration( LOGGER, "main", "Creating the output hdf5 file, embeddings."): num_entries = len(eli5[split]["id"]) if _FLAG_USE_SUBSET.value: num_entries = min(num_entries, _FLAG_SUBSET_AMOUNT.value) split_group = output_file.create_group(split) with utils.log_duration( LOGGER, "main", "Creating the output hdf5 file, retrieval."): split_group.create_dataset( constants.CTH5Fields.distances, shape=(num_entries, _FLAG_NUM_RETRIEVALS.value), dtype=np.float32, ) split_group.create_dataset( constants.CTH5Fields.gpt2_question_ids_inputs, shape=(num_entries, _FLAG_CONTEXT_SIZE.value), dtype=np.int32) if split != "test": split_group.create_dataset( constants.CTH5Fields.gpt2_answer_ids_inputs, shape=(num_entries, _FLAG_CONTEXT_SIZE.value), dtype=np.int32) split_group.create_dataset( constants.CTH5Fields.gpt2_retrieved_ids, shape=( num_entries, _FLAG_NUM_RETRIEVALS.value, _FLAG_MAX_LENGTH_RETRIEVALS.value, ), dtype=np.int32) with utils.log_duration(LOGGER, "main", "Loading the reference db."): checkpoint_path = os.path.join( retriever_config.query_embedder_path, "encoded", "encoded.ckpt") reference_db_device = tf_utils.device_mapping().CPUs[0].name with tf.device(reference_db_device): reference_db = tf_utils.load_reference_db( checkpoint_path, variable_name="block_emb", ) ############################################################################ # Prep the encoder and the tokenizer ############################################################################ with utils.log_duration( LOGGER, "main", "Loading the encoder model and the tokenizer."): with strategy.scope(): query_encoder = hub.load(retriever_config.query_embedder_path, tags={}) encode_fn = _make_encode_fn(query_encoder) encode_fn_strategy_run = _make_encode_fn_strategy_run_fn( strategy=strategy, encode_fn=encode_fn, ) vocab_file = os.path.join(retriever_config.query_embedder_path, "assets", "vocab.txt") utils.check_exists(vocab_file) do_lower_case = query_encoder.signatures["tokenization_info"]( )["do_lower_case"] tokenization_info = dict(vocab_file=vocab_file, do_lower_case=do_lower_case) tokenizer, vocab_lookup_table = bert_utils.get_tf_tokenizer( query_encoder, tokenization_info) ############################################################################ # Preprocess the dataset ############################################################################ cls_token_id = tf.cast(vocab_lookup_table.lookup(tf.constant("[CLS]")), tf.int32) sep_token_id = tf.cast(vocab_lookup_table.lookup(tf.constant("[SEP]")), tf.int32) transform = _make_transform_fn( bert_tokenizer=tokenizer, bert_cls_token_id=cls_token_id, bert_sep_token_id=sep_token_id, ) with utils.log_duration(LOGGER, "main", "generating codes"): tqdm_splits = tqdm.tqdm(keys) for split in tqdm_splits: tqdm_splits.set_description(f"Split `{split}`") eli5: Dict[str, datasets.Dataset] write_start = 0 if _FLAG_USE_SUBSET.value: _warn_subset(tqdm_splits) eli5[split] = eli5[split][:_FLAG_SUBSET_AMOUNT.value] utils.check_operator(operator.le, len(eli5[split]["id"]), _FLAG_SUBSET_AMOUNT.value) utils.check_operator(operator.le, len(eli5[split]["input"]), _FLAG_SUBSET_AMOUNT.value) else: utils.check_equal(len(eli5[split]), 
len(eli5[split]["id"])) utils.check_equal(len(eli5[split]), len(eli5[split]["input"])) if split != "test": for_slices = dict(sample_id=eli5[split]["id"], question=eli5[split]["input"], answer=[ sample["answer"][0] for sample in eli5[split]["output"] ]) else: for_slices = dict( sample_id=eli5[split]["id"], question=eli5[split]["input"], ) ds = tf.data.Dataset.from_tensor_slices(for_slices) ds = ds.map(transform, num_parallel_calls=tf.data.experimental.AUTOTUNE) ds = ds.apply( tf.data.experimental.dense_to_ragged_batch(batch_size)) ds = ds.map(_squeeze, num_parallel_calls=tf.data.experimental.AUTOTUNE) tqdm_inner = tqdm.tqdm(enumerate(ds), total=len(eli5[split]["id"]) // _FLAG_BATCH_SIZE.value, desc=f"Split `{split}`: Batches") for i, batch in tqdm_inner: ###################################################################### # Enforce the current real batch size ###################################################################### current_batch_size = batch["sample_id"].shape[0] for k, v in batch.items(): utils.check_equal(v.shape[0], current_batch_size) ###################################################################### gpt2_question_ids_inputs = _prep_field( batch["question"], gpt2_tokenizer) utils.check_equal(gpt2_question_ids_inputs.dtype, np.int32) utils.check_equal(gpt2_question_ids_inputs.shape[0], current_batch_size) if split != "test": gpt2_answer_ids_inputs = _prep_field( batch["answer"], gpt2_tokenizer) utils.check_equal(gpt2_answer_ids_inputs.dtype, np.int32) utils.check_equal(gpt2_answer_ids_inputs.shape[0], current_batch_size) assert len(gpt2_answer_ids_inputs.shape) == 2, ( gpt2_answer_ids_inputs.shape) ###################################################################### # Save the gpt2 tokenized question and answer ###################################################################### end = write_start + current_batch_size utils.check_equal( output_file[split][ constants.CTH5Fields.gpt2_question_ids_inputs] [write_start:end].shape[0], current_batch_size) output_file[split][ constants.CTH5Fields.gpt2_question_ids_inputs][ write_start:end] = gpt2_question_ids_inputs if split != "test": output_file[split][ constants.CTH5Fields.gpt2_answer_ids_inputs][ write_start:end] = gpt2_answer_ids_inputs ###################################################################### # Encode the samples. ###################################################################### batch = strategy.experimental_distribute_values_from_function( tf_utils.make_dict_distribute_fn(batch)) embeddings = encode_fn_strategy_run(batch) embeddings = tf_utils.process_strat_output( embeddings, "embeddings", strategy, current_batch_size) utils.check_isinstance(embeddings, ops.EagerTensor) utils.check_equal(embeddings.shape[0], current_batch_size) # pytype doesn't seem to see that we check the type utils.check_equal(embeddings.shape[1], _FLAG_EMBEDDING_DEPTH.value) # pytype: disable=attribute-error ###################################################################### # Retrieve. 
###################################################################### with tf.device(reference_db_device): top_k, inner_prods = tf_utils.mips_exact_search( embeddings, _FLAG_NUM_RETRIEVALS.value, reference_db) top_k = tf_utils.process_strat_output( top_k, "top_k", strategy, current_batch_size) utils.check_equal( inner_prods.shape, (current_batch_size, _FLAG_NUM_RETRIEVALS.value)) utils.check_equal( top_k.shape, (current_batch_size, _FLAG_NUM_RETRIEVALS.value)) output_file[split]["distances"][ write_start:end] = inner_prods gathered = tf.gather(blocks, top_k).numpy() utils.check_equal(gathered.shape[0], current_batch_size) utils.check_equal(write_start + gathered.shape[0], end) for j in range(gathered.shape[0]): local_gathered = gathered[j].tolist() utils.check_equal(len(local_gathered), _FLAG_NUM_RETRIEVALS.value) local_gathered = [ sample.decode() for sample in local_gathered ] token_ids = np.array( gpt2_tokenizer.batch_encode_plus( local_gathered, padding="max_length", truncation=True, ).input_ids) for line in token_ids: assert not np.all(line == 0), line token_ids[token_ids == gpt2_tokenizer.eos_token_id] = -1 output_file[split][ constants.CTH5Fields.gpt2_retrieved_ids][ write_start + j] = token_ids[:, :_FLAG_MAX_LENGTH_RETRIEVALS. value] write_start += current_batch_size ############################################################################ # Upload the results to GCS ############################################################################ LOGGER.debug("DONE WITH THE PRODUCTION") output_file.close() with utils.log_duration(LOGGER, "main", "gsutil transfer"): command = [ "/root/google-cloud-sdk/bin/gsutil", "-m", "cp", "-r", str(tmp_dir / "*"), target_path ] LOGGER.debug("Command: %s", " ".join(command)) subprocess.check_call(command) LOGGER.debug("ALL DONE")
    check_return_code,
    print_header,
    view_output,
    print_warning,
    print_info,
)

# print_header("1. Install Updates, Patches and Additional Security Software")

# print_header("1.1 Filesystem Configuration")

# print_header("1.1.1 Create Separate Partition for /tmp (Scored)")
check_equal('grep "[[:space:]]/tmp[[:space:]]" /etc/fstab', "/tmp")

# print_header("1.1.2 Set nodev option for /tmp Partition (Scored)")
# No tmp partition should have nodev.
check_equal("grep /tmp /etc/fstab", "nodev")
check_equal("mount | grep /tmp", "nodev")

# print_header("1.1.3 Set nosuid option for /tmp Partition (Scored)")
# No tmp partition should have nosuid.
check_equal("grep /tmp /etc/fstab", "nosuid")
check_equal("mount | grep /tmp", "nosuid")

# print_header("1.1.4 Set noexec option for /tmp Partition (Scored)")
def create_lm_ds_kilt_eli5(
    *,
    tokenizer,
    context_window_size,
    dataset_name,  # pylint: disable=unused-argument
    batch_size,
    split,
    db_path,  # pylint: disable=unused-argument
    random_seed,
    use_subset,  # pylint: disable=unused-argument
    subset_size,  # pylint: disable=unused-argument
    repeat,
    use_helper_words,
    approach_type,
    retriever,
    num_retrievals,
    retrieval_temperature,
    enable_debug_checks,
    retrieval_bank_size,  # pylint: disable=unused-argument
    dataset_type,
    qty_shuffle,
    tfr_prefix,
    max_length_generation,
):
    """Dataset preparation function for the Kilt version of the ELI5 dataset.

    This is for when the dataset is consumed by language models.

    Args:
      tokenizer: Tokenizer of the reader model.
      context_window_size: Size of the context of the reader model. Not used
        here.
      dataset_name: Exact name of the dataset. Some datasets share the same
        function, with small specific differences. Not used here.
      batch_size: Size of the batch for the reader model.
      split: The train, evaluation or test split.
      db_path: Path to the retrieval database. Not used here.
      random_seed: Seed used to shuffle the dataset. Should change at each
        epoch.
      use_subset: Whether to use a subset of the data. Not used here.
      subset_size: Size of the subset. Not used here.
      repeat: Whether to repeat the dataset.
      use_helper_words: Whether to add helper words in the merged samples.
      approach_type: Type of overall solution we are using.
      retriever: Object that does the retrieval.
      num_retrievals: Number of retrievals to do.
      retrieval_temperature: For the retrieval methods that do sampling, what
        temperature to use.
      enable_debug_checks: Whether to run additional (slower) sanity checks.
      retrieval_bank_size: Size of the retrieval bank. Not used here.
      dataset_type: Storage format of the dataset. Only TFRecords are
        supported here.
      qty_shuffle: Size of the shuffle buffer.
      tfr_prefix: Path prefix of the TFRecord shards.
      max_length_generation: Maximum length used for generation.

    Returns:
      A tf.data.Dataset object that generates input_ids and label_ids for the
      generator model.

    Raises:
      RuntimeError: If we didn't find any files with the glob pattern.
      RuntimeError: If we are using a dataset type that is not supported.
    """
    maybe_retrieve_and_merge = _make_maybe_retrieve_and_merge_fn(
        tokenizer=tokenizer,
        context_size=context_window_size,
        retriever=retriever,
        temperature=retrieval_temperature,
        num_retrievals=num_retrievals,
        ds_split=split,
        approach_type=approach_type,  # FLAG_APPROACH_TYPE.value
        use_helper_words=use_helper_words,  # FLAG_USE_HELPER_WORDS
        enable_debug_checks=enable_debug_checks,
        max_length_generation=max_length_generation,
    )
    utils.check_equal(dataset_type, constants.DatasetTypeChoices.tfr)

    glob_pattern = os.path.join(tfr_prefix, f"{split}*")
    filenames = list(tf.io.gfile.glob(glob_pattern))
    if not filenames:
        raise RuntimeError(
            f"filenames is empty. Glob pattern was: {glob_pattern}")

    parse = make_parse_fn(split, context_window_size)

    ds = tf.data.TFRecordDataset(
        filenames=filenames,
        num_parallel_reads=tf.data.experimental.AUTOTUNE,
    )
    ds = ds.map(
        parse,
        num_parallel_calls=tf.data.experimental.AUTOTUNE,
        deterministic=False,
    )
    if repeat:
        ds = ds.repeat()

    utils.check_not_none(random_seed)
    utils.check_not_none(qty_shuffle)
    ds = ds.shuffle(qty_shuffle, seed=random_seed)
    ds = ds.batch(
        batch_size,
        drop_remainder=split != constants.SplitChoices.test,
    )
    # We can't use parallel calls here, the huggingface Rust fast tokenizer
    # breaks with multiple threads. It seems to still be worth it over their
    # slow one though, vs using parallel threads.
    ds = ds.map(maybe_retrieve_and_merge)

    # return map(maybe_retrieve_and_merge, ds)
    return ds
__version__ = "1.0.0" __status__ = "Production" from utils import check_empty, check_equal, check_equal_re, check_equals, check_not_empty, check_return_code, print_header, view_output, print_warning, print_info # print_header("2. OS Services") # print_header("2.1 Remove Legacy Services") # print_header("2.1.1 Remove telnet-server (Scored)") check_equal( "rpm -q telnet-server", "package telnet-server is not installed" ) # print_header("2.1.2 Remove telnet Clients (Scored)") check_equal( "rpm -q telnet", "package telnet is not installed" ) # print_header("2.1.3 Remove rsh-server (Scored)") check_equal( "rpm -q rsh-server", "package rsh-server is not installed" )
__license__ = "???" __version__ = "1.0.0" __status__ = "Production" from utils import check_empty, check_equal, check_equal_re, check_equals, check_not_empty, check_return_code, print_header, view_output, print_warning, print_info from utils import x, assert_contains # print_header("3. Special Purpose Services") # print_header("3.1 Set Daemon umask (Scored)") check_equal( "grep umask /etc/sysconfig/init", "umask 027" ) # print_header("3.2 Remove X Windows (Scored)") # Original CIS test # check_equal( # 'grep "^id:" /etc/inittab', # "id:3:initdefault" # ) # Syco hardened servers use this. check_equal( 'grep "^\~\~\:S\:wait\:\/sbin\/sulogin" /etc/inittab', "~~:S:wait:/sbin/sulogin" )
def main(argv): ####################################################################### # Initial Setup. Logging, Flags, Random seeds. ####################################################################### if len(argv) > 1: raise app.UsageError("Too many command-line arguments.") absl_logging.use_python_logging() flags_dict = { flag.name: flag.value for flag in FLAGS.flags_by_module_dict()[argv[0]] } if FLAGS.use_subset: message = (f"{colorama.Back.RED}{colorama.Fore.WHITE}" f"{colorama.Style.BRIGHT}USING A SUBSET OF THE DATASET" f"{colorama.Style.RESET_ALL}") LOGGER.warning(message) utils.log_module_args(LOGGER, argv[0]) if not FLAGS.output_dir.startswith("gs://"): utils.check_exists(FLAG_OUTPUT_DIR.value) if not tf.io.gfile.isdir(FLAG_OUTPUT_DIR.value): raise RuntimeError("Output dir needs to be a directory.") tf.random.set_seed(FLAG_RANDOM_SEED.value) np.random.seed(FLAG_RANDOM_SEED.value) # Prepare the instance output directory path and save the config there folder_name = time.strftime( f"{FLAG_RUN_NAME.value}_{FLAG_APPROACH_TYPE.value}_%Y%m%d-%H%M%S") instance_output_dir = os.path.join(FLAG_OUTPUT_DIR.value, folder_name).strip() if not instance_output_dir.endswith("/"): instance_output_dir += "/" json_target = os.path.join(instance_output_dir, "training_params.json") if not json_target.strip().startswith("gs://"): subprocess.check_call(["mkdir", "-p", instance_output_dir]) utils.to_json_file(json_target, instance_output_dir) ############################################################################## # Initialization and Configuration of the Devices. ############################################################################## tpu_setup = None # current_acelerator_type is always "CPU" in the beginning with TPUs if tf_utils.current_accelerator_type() == "CPU": tpu_setup = tf_utils.init_tpus() LOGGER.debug("Devices we are computing on:\n%s", utils.wrap_iterable(map(str, tf_utils.devices_to_use()))) LOGGER.debug("All devices:") LOGGER.debug(tf_utils.device_mapping()) if tf_utils.current_accelerator_type() == "GPU": tf.config.set_soft_device_placement(True) if tf_utils.current_accelerator_type() != "TPU": tf.debugging.set_log_device_placement(True) if FLAG_DISTRIBUTE_MODE.value in constants.PURE_DATA_PARALLEL_STRATEGIES: actual_num_replicas = len(tf_utils.devices_to_use()) elif FLAG_DISTRIBUTE_MODE.value in constants.DATA_PARALLEL_DMC: actual_num_replicas = FLAG_NUM_REPLICAS.value else: actual_num_replicas = 1 ############################################################################## # We load the retriever model if it is needed. ############################################################################## # Not currently used. 
retriever = None # if (FLAG_APPROACH_TYPE.value == # constants.ApproachTypeChoices.lm_and_realm): # raise NotImplementedError("This part needs to be tested anew.") # config_path = FLAG_RETRIEVER_CONFIG_PATH.value # realm_save = tf_utils.REALMSave(**utils.from_json_file(config_path)) # # # Approx 15 min when not in dev mode, on CPU # with utils.log_duration(LOGGER, "main", # "whole of BERTScaNNRetriever.__init__", # logging.INFO): # scann_config = retrievers.ScannConfig( # **utils.from_json_file(FLAG_SCANN_CONFIG_PATH.value)) # retriever = retrievers.BERTScaNNRetriever( # retriever_module_path=realm_save.query_embedder_path, # block_records_path=realm_save.text_records, # num_block_records=realm_save.num_block_records, # mode=tf.estimator.ModeKeys.EVAL, # scann_config=scann_config) # elif (FLAG_APPROACH_TYPE.value == # constants.ApproachTypeChoices.cached_realm): # raise NotImplementedError("This part needs to be tested anew.") # config_path = FLAG_RETRIEVER_CONFIG_PATH.value # realm_save = tf_utils.REALMSave(**utils.from_json_file(config_path)) # # # Approx 15 min when not in dev mode, on CPU # with utils.log_duration(LOGGER, "main", # "whole of FullyCachedRetriever.__init__", # logging.INFO): # # retriever = retrievers.FullyCachedRetriever( # db_path=FLAG_FULLYCACHED_H5_PATH.value, # block_records_path=realm_save.text_records, # num_block_records=realm_save.num_block_records, # ) ############################################################################## # Distributed training task ############################################################################## if FLAG_TASK.value == constants.TaskChoices.train: with utils.log_duration(LOGGER, "main", "Load model"): utils.print_mem("before loading model", LOGGER) model_specific = task_specific.load_model( FLAG_MODEL_LOAD_PATH.value, FLAG_MODEL_KEY.value, FLAG_DISTRIBUTE_MODE.value, tpu_setup, FLAG_NUM_REPLICAS.value) utils.print_mem("after loading model", LOGGER) model_or_replicas = model_specific.model if isinstance(model_or_replicas, list): model_or_replicas: List[transformers.TFGPT2LMHeadModel] else: model_or_replicas: transformers.TFGPT2LMHeadModel tokenizer = model_specific.tokenizer def make_optimizer(): return tensor2tensor.utils.adafactor.AdafactorOptimizer( learning_rate=FLAG_LEARNING_RATE.value) if model_specific.strategy: with model_specific.strategy.scope(): optimizer = make_optimizer() else: optimizer = make_optimizer() ############################################################################ # Prepare the dataset functions ############################################################################ rg = np.random.default_rng(FLAG_RANDOM_SEED.value) def call_lm_preproc(repeat, split, random_seed): """Using functools.partial prevents the linter from doing its job.""" if FLAG_DATASET_NAME.value == constants.DatasetNameChoices.kilt_eli5: return task_specific.create_lm_ds_kilt_eli5( tokenizer=tokenizer, context_window_size=( model_or_replicas[0].config.n_positions if isinstance( model_or_replicas, list) else model_or_replicas.config.n_positions), dataset_name=FLAG_DATASET_NAME.value, # Batches are split over the replicas: batch_size=FLAG_BATCH_SIZE.value * actual_num_replicas, db_path=FLAG_DB_PATH.value, random_seed=random_seed, use_subset=FLAG_USE_SUBSET.value, subset_size=FLAG_SUBSET_SIZE.value, use_helper_words=FLAG_USE_HELPER_WORDS.value, approach_type=FLAG_APPROACH_TYPE.value, num_retrievals=FLAG_NUM_RETRIEVALS.value, retrieval_temperature=FLAG_RETRIEVAL_TEMPERATURE.value, retriever=retriever, repeat=repeat, split=split, 
enable_debug_checks=FLAG_DATASET_DEBUG.value, retrieval_bank_size=FLAG_RETRIEVAL_BANK_SIZE.value, dataset_type=FLAG_DATASET_TYPE.value, qty_shuffle=FLAG_QTY_SHUFFLE.value, tfr_prefix=FLAG_TFR_PREFIX.value, max_length_generation=FLAG_MAX_LENGTH_GENERATION.value, ) else: raise NotImplementedError( f"FLAG_DATASET_NAME.value unsupported: `{FLAG_DATASET_NAME.value}`" ) make_training_dataset: Callable[Ellipsis, tf.data.Dataset] = functools.partial( call_lm_preproc, split="train", repeat=False, ) make_eval_dataset: Callable[Ellipsis, tf.data.Dataset] = functools.partial( call_lm_preproc, split="eval", repeat=True, ) ############################################################################ # Prepare the step functions ############################################################################ utils.check_contained(FLAG_DISTRIBUTE_MODE.value, constants.DistributeModeChoices.choices()) tf_function_flags = dict( experimental_compile=FLAG_EXPERIMENTAL_COMPILE.value, experimental_relax_shapes=not FLAG_INPUT_FIXED_SIZE.value) if (FLAG_DISTRIBUTE_MODE.value == constants.DistributeModeChoices.split_and_data_parallel): if not isinstance(model_or_replicas, list): raise RuntimeError(type(model_or_replicas)) training_step = build_manual_data_parallel_training_step( model_or_replicas, optimizer, tf_function_flags) else: training_step = build_regular_training_step( model_or_replicas, optimizer, strategy=model_specific.strategy, tf_function_kwargs=tf_function_flags) evaluation_step = build_evaluation_step(model_or_replicas, tf_function_flags) secs_since_last_ckpt = time.time() # Model checkpoints are saved to the tmp_directory and then rsynced to GCS ########################################################################## # Prepare the different logging facilities ########################################################################## train_log_dir = os.path.join(instance_output_dir, "tensorboard", "train") eval_log_dir = os.path.join(instance_output_dir, "tensorboard", "eval") flags_log_dir = os.path.join(instance_output_dir, "tensorboard", "params") writers = dict(train=tf.summary.create_file_writer(train_log_dir), eval=tf.summary.create_file_writer(eval_log_dir), flags=tf.summary.create_file_writer(flags_log_dir)) with writers["flags"].as_default(): tf.summary.text( "Flags", # Tensorboard takes Markdown: json.dumps(flags_dict, indent=4).replace("\n", "\n\n"), step=0) ma_loss = dict(train=utils.MovingAverage(0.9), eval=utils.MovingAverage(0.9)) step_counters = dict(train=0, eval=0) batch_counters = dict(train=0, eval=0) prev_batch_end = time.time() # The eval ds has no real concept of epoch, repeats forever, shuffling # each time it reaches its end with utils.log_duration(LOGGER, "main", "All of make_eval_dataset"): eval_ds_instance = make_eval_dataset(random_seed=rg.integers( -2**63, 2**63 - 1), ) LOGGER.debug("Distributing the eval dataset to the replicas.") if FLAG_DATASET_TYPE.value == "tfr": eval_ds_instance = ( model_specific.strategy.experimental_distribute_dataset( eval_ds_instance)) LOGGER.debug("Done distributing the eval dataset to the replcias.") eval_ds_instance = iter(eval_ds_instance) ########################################################################## # Training Loop ########################################################################## for epoch in itertools.count(): #################################################################### # Epoch Setup #################################################################### LOGGER.debug("EPOCH %d START", epoch) # Shuffle differently 
every epoch with utils.log_duration(LOGGER, "main", "All of make_training_dataset"): train_ds_instance = make_training_dataset( random_seed=rg.integers(-2**63, 2**63 - 1), ) LOGGER.debug( "Attempting to distribute the training dataset to the replicas." ) if FLAG_DATASET_TYPE.value == "tfr": train_ds_instance = ( model_specific.strategy.experimental_distribute_dataset( train_ds_instance)) LOGGER.debug( "Done distributing the training dataset to the replicas.") train_ds_instance = iter(train_ds_instance) # This allows us to see if we reached the end of the training iterator, # in which case "did_at_least_one_training_batch == False". # We could also test that it did all the batches, to similar results. did_at_least_one_training_batch = True split = "eval" while did_at_least_one_training_batch: # Invert split if split == "train": split = "eval" else: split = "train" # Prepare to test if we did at least one training batch if split == "train": did_at_least_one_training_batch = False if split == "train": dataset_iterator = itertools.islice( train_ds_instance, FLAG_BATCHES_BETWEEN_EVALS.value) else: # The evaluation DS is tiny, so we reshuffle and take a random dataset_iterator = itertools.islice( eval_ds_instance, FLAG_NUMBER_EVAL_BATCHES.value) LOGGER.debug("Batching") for batch in dataset_iterator: # LOGGER.debug("Input sentence:\n\"%s\"", # tokenizer.decode([x for x in batch["input_ids"][0] # if x != tokenizer.eos_token_id])) # LOGGER.debug("Label:\n\"%s\"", # tokenizer.decode([(x if x != -100 else 0) # for x in batch["label_ids"][0]])) if FLAG_DATASET_TYPE.value != "tfr": batch = (model_specific.strategy. experimental_distribute_values_from_function( tf_utils.make_dict_distribute_fn(batch))) # We only care about training epochs as, obviously, we don't train # over eval samples; the number of eval samples seen only # contributes to lowering the variance in the evaluation of when to # do early stopping. if split == "train": did_at_least_one_training_batch = True input_ids = batch["input_ids"] label_ids = batch["label_ids"] #################################################################### # Training Step #################################################################### step_counters[split] += (FLAG_BATCH_SIZE.value * actual_num_replicas) if split == "train": batch_counters[split] += 1 training_kwargs = dict( input_ids=input_ids, label_ids=label_ids, ) if model_specific.strategy: utils.print_mem("before running", LOGGER) LOGGER.debug("Training, Calling strategy.run") loss = model_specific.strategy.run( training_step, kwargs=training_kwargs) LOGGER.debug("Training, Done with strategy.run") utils.print_mem("after running", LOGGER) else: loss = training_step(**training_kwargs) # pytype: disable=wrong-arg-count # If we are in the strategy-free data parallel mode, we need # to change the weights of all replicas to those of the model at # index 0 if (FLAG_DISTRIBUTE_MODE.value == constants.DistributeModeChoices. 
split_and_data_parallel): for replica in model_or_replicas[1:]: replica.set_weights( model_or_replicas[0].get_weights()) #################################################################### # Evaluation Step #################################################################### elif split == "eval": evaluation_kwargs = dict( input_ids=input_ids, label_ids=label_ids, ) if model_specific.strategy: loss = model_specific.strategy.run( evaluation_step, kwargs=evaluation_kwargs) else: loss = evaluation_step(**evaluation_kwargs) else: raise ValueError( f"Unexpected value for split: {split}") #################################################################### # Logging #################################################################### if (FLAG_DISTRIBUTE_MODE.value in constants.PURE_DATA_PARALLEL_STRATEGIES): utils.check_equal(len(loss.values), actual_num_replicas) LOGGER.debug("Split: %s", split) LOGGER.debug("Real num replicas: %s", actual_num_replicas) LOGGER.debug("Loss: %s", loss) LOGGER.debug("Loss values: %s", loss.values) average_loss = float( tf.math.reduce_mean(loss.values).numpy()) else: average_loss = float(loss.numpy()) # tf.debugging.check_numerics(loss) now = time.time() batch_duration = now - prev_batch_end prev_batch_end = now ma_loss[split].update(average_loss) # Actual logging LOGGER.info("Epoch: # %d", epoch) LOGGER.info("Tensorboard_dir: %s", instance_output_dir) LOGGER.info("Batch: %s # %d", split, batch_counters[split]) LOGGER.info("Step: %s # %d", split, step_counters[split]) if FLAG_USE_SUBSET.value: LOGGER.warning(">> USING A SUBSET OF THE DATASET <<") LOGGER.info("%(split)s Batch loss: %(metric)f", dict(split=split, metric=average_loss)) LOGGER.info( "%(split)s Moving average loss: %(metric)f", dict(split=split, metric=ma_loss[split].average)) LOGGER.info( "%(split)s Moving average ppl: %(metric)f", dict(split=split, metric=np.exp(ma_loss[split].average))) LOGGER.info( "%(split)s Batch duration: %(duration)s", dict(split=split, duration=utils.TimeStamp.from_seconds( batch_duration).format())) if FLAG_DISTRIBUTE_MODE.value in constants.DATA_PARALLEL_DMC: LOGGER.info( "%(split)s Duration per sample: %(duration)s", dict(split=split, duration=utils.TimeStamp.from_seconds( batch_duration / (FLAG_BATCH_SIZE.value * actual_num_replicas)))) # Write to Tensorboard with writers[split].as_default(): tf.summary.scalar(f"Loss/{split}", average_loss, step_counters[split]) tf.summary.scalar(f"PPL/{split}", np.exp(average_loss), step_counters[split]) writers[split].flush() # Save every 5 min if (time.time() - secs_since_last_ckpt) / (60 * 20) >= 1: secs_since_last_ckpt = time.time() save_model(train_steps=step_counters["train"], model_or_replicas=model_or_replicas, instance_output_dir=instance_output_dir) secs_since_last_ckpt = time.time() save_model(train_steps=step_counters["train"], model_or_replicas=model_or_replicas, instance_output_dir=instance_output_dir) ############################################################# # Post Training Cleanup ####################################################################### for writer in writers.values(): writer.close()
def generate_textid_corpus(args: argparse.Namespace) -> None:
    """
    Read raw files (in specified directory), parse and filter, then output
    the Bert token-ids for all files to another directory
    :param args: ArgumentParser-parsed arguments
    :return: None
    """
    if args.mode not in VALID_MODES:
        raise ValueError(f"The argument 'mode' needs to be one of "
                         f"{VALID_MODES}, got {args.mode}.")
    if platform.system() == "Darwin" and args.mode in MODES_NEEDING_BLINGFIRE:
        raise Exception(
            f"Got a mode requiring Blingfire (mode = {args.mode}), "
            "yet Blingfire doesn't support macOS.")
    if not blingfire:
        # If we aren't using blingfire, then we must use spacy
        # for sentence segmentation.
        try:
            spacy_model = spacy.load("en_core_web_sm")
        except OSError:
            print()
            print("Exception:")
            print("Didn't find the model for spacy.")
            print("Run 'python -m spacy download en_core_web_sm'")
            exit(-1)

    # Get list of input file paths
    in_list = sorted(glob.glob(os.path.join(args.input_dir, "*.txt")))
    if args.max_number_of_books:
        in_list = in_list[:args.max_number_of_books]
        logging.warning(
            f"{colorama.Fore.RED}>>> USING A MAX NUMBER OF BOOKS <<<"
            f"{colorama.Style.RESET_ALL}")

    # Load blingfire textid model
    if args.mode == "blingfire" and platform.system() == "Darwin":
        raise Exception("BlingFire is not compatible with macOS.")
    idtok_model = None
    if blingfire and args.mode in MODES_NEEDING_BLINGFIRE:
        model_path = os.path.join(args.textid_dir, args.base_tok_file)
        utils.check_file_exists(model_path)
        idtok_model = blingfire.load_model(model_path)

    utils.check_file_exists(args.vocab_path)
    bert_full_tokenizer = tokenization.FullTokenizer(
        vocab_file=str(args.vocab_path), do_lower_case=False)

    if args.mode == "check":
        with open(args.vocab_path) as fin:
            ids_to_words = fin.read().strip().split("\n")
            words_to_ids = {i: word for i, word in enumerate(ids_to_words)}

    # Iterate through each raw file
    if args.mode != "blingfire":
        print("WARNING: We aren't in a mode that exclusively uses Blingfire. "
              f"This will be slow.\nMode: {args.mode}")
    logging.info(f"Main Loop - {args.mode}")
    for i, in_file_path in enumerate(tqdm.tqdm(in_list)):
        # Generate output file path
        file_basename = os.path.splitext(os.path.basename(in_file_path))[0]
        out_file_path = os.path.join(args.output_dir, file_basename)

        # Read file chunk by chunk
        with open(in_file_path) as in_file:
            # We read the whole file, then cut to CHUNK_MAX_LEN characters
            # long. This seems like a more resistant way to guarantee that we
            # correctly get full sentences. The length of the chunks at 100k
            # is the longest that doesn't break spacy's sentence tokenizer.
            logging.debug("Loading a file >")
            file_text = in_file.read().strip()
            if not file_text:
                continue
            logging.debug("< Done loading a file")

            # `chunk_idx` replaces an inner loop variable that used to shadow
            # the file index `i`.
            for chunk_idx in range(len(file_text) // CHUNK_MAX_LEN):
                logging.debug("Chunking. >")
                chunk = file_text[chunk_idx * CHUNK_MAX_LEN:
                                  (chunk_idx + 1) * CHUNK_MAX_LEN]

                # Get the blingfire-processed sentences from this chunk
                # (NOTE: maybe redundant, look into it maybe removing if slow)
                sent_tok_start = time.time()
                logging.debug("< Done chunking.")
                logging.debug("Segmentizing sentence. >")
                if blingfire:
                    sentences = chunk_to_sentences(chunk)
                else:
                    sentences = [str(x) for x in spacy_model(chunk).sents]

                # Ignore the first and last sentences, as they've likely been
                # cut weirdly by the chunking process. We lose less than
                # 1/1000th of all sentences by doing this
                # (with a CHUNK_MAX_LEN of 100k).
                logging.debug(f"Number of sentences: {len(sentences)}")
                sentences = sentences[1:-1]
                logging.debug(f"< Done segmentizing sentence. It took "
                              f"{time.time() - sent_tok_start} seconds.")

                # Additional filtering for plaintext sentences
                filter_time_start = time.time()
                logging.debug("Filtering sentences >")
                ft_sentences = filter_sentences(sentences)
                logging.debug(f"< Done filtering sentences. It took "
                              f"{time.time() - filter_time_start} seconds.")

                # Convert each sentence to their textid
                bpe_tok_time_start = time.time()
                logging.debug("Tokenizing sentences >")
                curr_ids = utils.TypedList(np.ndarray)
                for ft_sent in ft_sentences:
                    ids = None
                    if blingfire:
                        ids = blingfire.text_to_ids(idtok_model, ft_sent,
                                                    args.id_seq_length,
                                                    args.oov_id)
                    if args.mode == "bert-native" or args.mode == "check":
                        bert_tokens = bert_full_tokenizer.tokenize(ft_sent)
                        bert_tok_ids = bert_full_tokenizer.convert_tokens_to_ids(
                            bert_tokens)
                        bert_tok_ids_ = utils.TypedList(int)
                        for x in bert_tok_ids:
                            bert_tok_ids_.append(x)
                        bert_tok_ids = bert_tok_ids_
                        while len(bert_tok_ids) < args.id_seq_length:
                            bert_tok_ids.append(0)
                        bert_tok_ids = np.array(
                            list(bert_tok_ids),
                            dtype=np.int32)[:args.id_seq_length]
                        if args.mode == "bert-native":
                            ids = bert_tok_ids
                    if args.mode == "check":
                        # In the "check" mode, we test that both the bert
                        # native tokenizer and blingfire return the same
                        # thing.
                        utils.check_equal(ids.shape, bert_tok_ids.shape)
                        comp = ids == bert_tok_ids
                        if not np.all(comp):

                            def bert_decode(ids):
                                return " ".join(
                                    ids_to_words[wid] for wid in ids
                                    if wid != 0)  # .replace(" ##", "")

                            # print("Blingfire ids:")
                            # print(ids)
                            print("\n################################################")
                            print("Mismatch between decoders:")
                            print(f"\t Blingfire decoded: \"{bert_decode(ids)}\"")
                            print(f"\t- Bert-native decoded: \"{bert_decode(bert_tok_ids)}\"")
                            print("################################################\n")
                            # print("Bert-native tokenizer ids:")
                            # print(bert_tok_ids)
                            num_errors = np.sum(np.logical_not(comp))
                            out_of = max(np.sum(ids != 0),
                                         np.sum(bert_tok_ids != 0))
                            if num_errors / out_of >= 1:
                                raise ValueError(f"{num_errors} "
                                                 f"different out of {out_of} "
                                                 f"non padding values")
                    curr_ids.append(ids)

                logging.debug(f"< Done tokenizing sentences. It took "
                              f"{time.time() - bpe_tok_time_start} seconds.")

                concat_time_start = time.time()
                logging.debug("Concatenating the ids. >")
                if not curr_ids:
                    logging.warning(">> Warning: empty cur_file_ids")
                id_mat = np.array(list(curr_ids), dtype=np.int32)
                logging.debug(f"< Done Concatenating the ids. Took "
                              f"{time.time() - concat_time_start} seconds.")

                if len(id_mat) == 0:
                    logging.warning(
                        f"We got an id_mat of size 0.\nFile index = {i}."
                        f"\nBook file path = {in_file_path}.")

                logging.debug("Saving >")
                path = pathlib.Path(out_file_path)
                np.save(path.parent / (f"{chunk_idx}_" + str(path.name)),
                        id_mat)
                logging.debug("< Done saving.")

    # Free model
    if blingfire:
        blingfire.free_model(idtok_model)
print_header("5 Logging and Auditing") # print_header("5.1 Configure Syslog") # print_header("5.1.1 Install the rsyslog package (Scored)") check_equal_re( "rpm -q rsyslog", "rsyslog.*" ) # print_header("5.1.2 Activate the rsyslog Service (Scored)") check_equal( "rpm -q syslog", "package syslog is not installed" ) check_empty("chkconfig --list | grep syslog") check_equal_re( "chkconfig --list rsyslog", "rsyslog.*0:off.*1:off.*2:on.*3:on.*4:on.*5:on.*6:off" ) # print_header("5.1.3 Configure /etc/rsyslog.conf (Not Scored)") print_warning("Manually review the contents of the /etc/rsyslog.conf file to ensure appropriate logging is set. ") view_output("ls -l /var/log/") # print_header("5.1.4 Create and Set Permissions on rsyslog Log Files (Scored)") print_header(" TODO - Ensure that the log files are logging information")
def make_accelerator_type() -> str:
    utils.check_equal(_FLAG_TPU_TYPE.value, "v3")
    utils.check_equal(_FLAG_TPU_QTY.value, "8")
    assert not _FLAG_PREEMPTIBLE_TPU.value, _FLAG_PREEMPTIBLE_TPU.value
    return f"{_FLAG_TPU_TYPE.value}-{_FLAG_TPU_QTY.value}"
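With the flag values enforced by the checks above, the function can only return "v3-8"; a quick illustration (flag plumbing is assumed to be set up elsewhere):

# Assuming _FLAG_TPU_TYPE.value == "v3" and _FLAG_TPU_QTY.value == "8":
accelerator_type = make_accelerator_type()
assert accelerator_type == "v3-8"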
__version__ = "1.0.0" __status__ = "Production" from utils import check_empty, check_equal, check_equal_re, check_equals, check_not_empty, check_return_code, print_header, view_output, print_warning, print_info # print_header("4 Network Configuration and Firewalls") # print_header("4.1 Modify Network Parameters (Host Only)") # print_header("4.1.1 Disable IP Forwarding (Scored)") check_equal( "/sbin/sysctl net.ipv4.ip_forward", "net.ipv4.ip_forward = 0" ) # print_header("4.1.2 Disable Send Packet Redirects (Scored)") check_equal( "/sbin/sysctl net.ipv4.conf.all.send_redirects", "net.ipv4.conf.all.send_redirects = 0" ) check_equal( "/sbin/sysctl net.ipv4.conf.default.send_redirects", "net.ipv4.conf.default.send_redirects = 0" ) # print_header("4.2 Modify Network Parameters (Host and Router)")
def main(argv): # Arguments and logging boilerplate if len(argv) > 1: raise RuntimeError(argv) absl_logging.use_python_logging() utils.log_module_args(LOGGER, argv[0]) # Load a retriever config. retriever_config = tf_utils.REALMConfig( **utils.from_json_file(_FLAG_RETRIEVER_CONFIG_PATH.value)) assert not _FLAG_USE_SUBSET.value # Preparation of the output path time_stamp = time.strftime("%Y%m%d-%H%M%S") target_path = os.path.join(_FLAG_OUTPUT_PATH.value, time_stamp.strip()) if target_path[-1] != "/": target_path += "/" ############################################################################## # Setup devices and strategy ############################################################################## # Duration is pretty much instantaneous with utils.log_duration(LOGGER, "main", "Initializing devices"): tpu_config = tf_utils.init_tpus(local=_FLAG_TPU_IS_LOCAL.value, tpu_name=_FLAG_TPU_NAME.value) device_type = tf_utils.current_accelerator_type() LOGGER.debug("Devices: %s", str(tf_utils.devices_to_use())) if _FLAG_TPU_NAME.value and device_type == "CPU": raise RuntimeError("Device is CPU and we expected a TPU.") if device_type == "TPU": if tpu_config is None: raise RuntimeError("We should have a tpu_config.") strategy = tf.distribute.TPUStrategy(tpu_config.resolver) batch_size = len( tf_utils.devices_to_use()) * _FLAG_BATCH_SIZE.value elif device_type == "GPU" or device_type == "CPU": strategy = tf.distribute.MirroredStrategy() batch_size = len( tf_utils.devices_to_use()) * _FLAG_BATCH_SIZE.value else: raise RuntimeError(device_type) ############################################################################## # Load the KILT ELI5 dataset. ############################################################################## # Takes a while eli5 = {} keys = ["train", "validation", "test"] gpt2_tokenizer = transformers.GPT2TokenizerFast.from_pretrained("gpt2-xl") gpt2_tokenizer.pad_token = gpt2_tokenizer.eos_token with utils.log_duration(LOGGER, "main", "Loading the ELI5 datasets."): if _FLAG_DATASET_ROOT.value: for split in tqdm.tqdm(keys): load_path = os.path.join(_FLAG_DATASET_ROOT.value, "HuggingfaceDatasets", f"{split}_kilt_eli5.hf") with tf.device("/job:localhost"): eli5[split] = datasets.load_from_disk(load_path) else: eli5 = datasets.load_dataset("kilt_tasks", "eli5") ############################################################################## # Load the dataset of the text that will be retrieved. ############################################################################## # Takes a long time with utils.log_duration(LOGGER, "Main", "Load the textual dataset"): # Extract the appropriate text # The buffer_size is taken from the original ORQA code. blocks_dataset = tf.data.TFRecordDataset(retriever_config.text_records, buffer_size=512 * 1024 * 1024) blocks_dataset = blocks_dataset.batch( retriever_config.num_block_records, drop_remainder=False) blocks: tf.Tensor = tf.data.experimental.get_single_element( blocks_dataset) ############################################################################ # Increase the number of maximum open file descriptors to make space # for all the shards. ############################################################################ max_num_fd = _FLAG_NUM_SHARDS.value * 3 + _MIN_N_FD resource.setrlimit(resource.RLIMIT_NOFILE, (max_num_fd, max_num_fd)) ############################################################################ # Prepare the output files. 
############################################################################ writers = {} all_paths = {} for split in keys: maybe_subset = "_subset" if _FLAG_USE_SUBSET.value else "" # Prepare paths. They can't be in a generator. A function generator would be # fine though. paths = [ os.path.join(target_path + maybe_subset, f"{split}_{i}.tfr") for i in range(_FLAG_NUM_SHARDS.value) ] all_paths[split] = paths writers[split] = [] # Create The TFR writers. for i, path in enumerate(paths): writers[split].append(tf.io.TFRecordWriter(path)) # Load the reference DB. We used to accidentally do this once per split :O with utils.log_duration(LOGGER, "main", "Loading the reference db."): checkpoint_path = os.path.join(retriever_config.query_embedder_path, "encoded", "encoded.ckpt") reference_db_device = tf_utils.device_mapping().CPUs[0].name with tf.device(reference_db_device): reference_db = tf_utils.load_reference_db( checkpoint_path, variable_name="block_emb", ) ############################################################################ # Prep the encoder and the tokenizer ############################################################################ with utils.log_duration(LOGGER, "main", "Loading the encoder model and the tokenizer."): with strategy.scope(): query_encoder = hub.load(retriever_config.query_embedder_path, tags={}) encode_fn = _make_encode_fn(query_encoder) encode_fn_strategy_run = make_encode_fn_strategy_run_fn( strategy=strategy, encode_fn=encode_fn, ) vocab_file = os.path.join(retriever_config.query_embedder_path, "assets", "vocab.txt") utils.check_exists(vocab_file) do_lower_case = query_encoder.signatures["tokenization_info"]( )["do_lower_case"] tokenization_info = dict(vocab_file=vocab_file, do_lower_case=do_lower_case) tokenizer, vocab_lookup_table = bert_utils.get_tf_tokenizer( query_encoder, tokenization_info) ############################################################################ # Preprocess the dataset ############################################################################ cls_token_id = tf.cast(vocab_lookup_table.lookup(tf.constant("[CLS]")), tf.int32) sep_token_id = tf.cast(vocab_lookup_table.lookup(tf.constant("[SEP]")), tf.int32) transform = _make_transform_fn( bert_tokenizer=tokenizer, bert_cls_token_id=cls_token_id, bert_sep_token_id=sep_token_id, ) feature_dtypes = { constants.CTH5Fields.distances: tf.float32, constants.CTH5Fields.gpt2_retrieved_ids: tf.int32, constants.CTH5Fields.gpt2_answer_ids_inputs: tf.int32, constants.CTH5Fields.gpt2_question_ids_inputs: tf.int32, } with utils.log_duration(LOGGER, "main", "generating codes"): for split in keys: sample_count = 0 eli5: Dict[str, datasets.Dataset] if split != "test": for_slices = dict(sample_id=eli5[split]["id"], question=eli5[split]["input"], answer=[ sample[0]["answer"] for sample in eli5[split]["output"] ]) else: for_slices = dict( sample_id=eli5[split]["id"], question=eli5[split]["input"], ) ds = tf.data.Dataset.from_tensor_slices(for_slices) ds = ds.map(transform, num_parallel_calls=tf.data.experimental.AUTOTUNE) ds = ds.apply( tf.data.experimental.dense_to_ragged_batch(batch_size)) ds = ds.map(_squeeze, num_parallel_calls=tf.data.experimental.AUTOTUNE) tqdm_inner = tqdm.tqdm(enumerate(ds), total=len(eli5[split]["id"]) // _FLAG_BATCH_SIZE.value, desc=f"Split `{split}`: Batches") for i, batch in tqdm_inner: features = collections.defaultdict(list) ###################################################################### # Enforce the current real batch size 
###################################################################### current_batch_size = batch["sample_id"].shape[0] for k, v in batch.items(): utils.check_equal(v.shape[0], current_batch_size) ###################################################################### gpt2_question_ids_inputs = _prep_field(batch["question"], gpt2_tokenizer) utils.check_equal(gpt2_question_ids_inputs.dtype, np.int32) utils.check_equal(gpt2_question_ids_inputs.shape[0], current_batch_size) if split != "test": gpt2_answer_ids_inputs = _prep_field( batch["answer"], gpt2_tokenizer) utils.check_equal(gpt2_answer_ids_inputs.dtype, np.int32) utils.check_equal(gpt2_answer_ids_inputs.shape[0], current_batch_size) assert len(gpt2_answer_ids_inputs.shape) == 2, ( gpt2_answer_ids_inputs.shape) ###################################################################### # Save the gpt2 tokenized question and answer ###################################################################### features[constants.CTH5Fields.gpt2_question_ids_inputs].extend( gpt2_question_ids_inputs) if split != "test": features[ constants.CTH5Fields.gpt2_answer_ids_inputs].extend( gpt2_answer_ids_inputs) ###################################################################### # Encode the samples. ###################################################################### batch = strategy.experimental_distribute_values_from_function( tf_utils.make_dict_distribute_fn(batch)) embeddings = encode_fn_strategy_run(batch) embeddings = tf_utils.process_strat_output( embeddings, "embeddings", strategy, current_batch_size) utils.check_isinstance(embeddings, ops.EagerTensor) utils.check_equal(embeddings.shape[0], current_batch_size) # pytype doesn't seem to see that we check the type utils.check_equal(embeddings.shape[1], _FLAG_EMBEDDING_DEPTH.value) # pytype: disable=attribute-error ###################################################################### # Retrieve. 
###################################################################### # Do exact retrieval with tf.device(reference_db_device): top_k, inner_prods = tf_utils.mips_exact_search( embeddings, _FLAG_NUM_RETRIEVALS.value, reference_db) # Collate the results top_k = tf_utils.process_strat_output(top_k, "top_k", strategy, current_batch_size) # Check the shapes utils.check_equal( inner_prods.shape, (current_batch_size, _FLAG_NUM_RETRIEVALS.value)) utils.check_equal( top_k.shape, (current_batch_size, _FLAG_NUM_RETRIEVALS.value)) # Save the distances features[constants.CTH5Fields.distances].extend(inner_prods) # Retrieve the text fields associated to the indices gathered = tf.gather(blocks, top_k).numpy() utils.check_equal(gathered.shape[0], current_batch_size) utils.check_equal(gathered.shape[1], _FLAG_NUM_RETRIEVALS.value) retrievals = [] for index_in_batch in range(current_batch_size): # Put the appropriate byte strings in a list local_gathered = gathered[index_in_batch].tolist() utils.check_equal(len(local_gathered), _FLAG_NUM_RETRIEVALS.value) # Decode to utf-8 local_gathered = [ sample.decode() for sample in local_gathered ] # Encode to GPT2 BPE token_ids = np.array( gpt2_tokenizer.batch_encode_plus( local_gathered, padding="max_length", truncation=True, ).input_ids) # Make sure no line is empty # TODO(julesgm): Maybe optional for line in token_ids: assert not np.all(line == 0), line # Convert the eos_tokens token_ids[token_ids == gpt2_tokenizer.eos_token_id] = -1 # Save the retrievals retrievals.append(token_ids) # Save the feature features[constants.CTH5Fields.gpt2_retrieved_ids] = retrievals utils.check_equal( retrievals[0].shape, (_FLAG_NUM_RETRIEVALS.value, _FLAG_CONTEXT_SIZE.value)) for k, v in features.items(): utils.check_equal(len(v), current_batch_size) for index_in_batch in range(current_batch_size): feature_dict = {} for feature_k, feature_v in features.items(): # Cast the feature to its appropriate dtype casted_feats = tf.cast(feature_v[index_in_batch], feature_dtypes[feature_k]) # Serialize the tensor to bytes feature_bytes = tf.io.serialize_tensor(casted_feats) # Build a bytes list tf.train.Feature object, # the serialization tree node feature_dict[feature_k] = _bytes_feature(feature_bytes) # Create the serialization tree root # Expects a list of features feature = tf.train.Features(feature=feature_dict) # Expects a tf.train.Features object example_obj = tf.train.Example(features=feature) # Serialize that to bytes serialized_example = example_obj.SerializeToString() # Write the bytes # TODO(julesgm): Parallelize this with a thread or a process pool & # futures. writers[split][sample_count % _FLAG_NUM_SHARDS.value].write( serialized_example) sample_count += 1 if sample_count % 1000 == 0: LOGGER.debug("Paths: %s", str(all_paths[split][0])) LOGGER.debug("Flushing and closing the `%s` writers", split) for writer in tqdm.tqdm(writers[split]): writer.flush() writer.close() LOGGER.debug("Done.")
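For reference, each example written above stores every field as a bytes feature wrapping tf.io.serialize_tensor, so reading a shard back means parsing the bytes feature and then deserializing the tensor. A minimal sketch of that inverse path, assuming the constants.CTH5Fields values resolve to the plain strings used below; the real keys, the shard path pattern, and the fact that the "test" split has no answer field all come from the script above and may differ in the actual constants module.

import tensorflow as tf

# Hypothetical plain-string field names; the real keys are the values of
# constants.CTH5Fields used by the writer above, and the "test" split has
# no gpt2_answer_ids_inputs field.
FEATURE_DTYPES = {
    "distances": tf.float32,
    "gpt2_retrieved_ids": tf.int32,
    "gpt2_answer_ids_inputs": tf.int32,
    "gpt2_question_ids_inputs": tf.int32,
}

def parse_example(serialized):
    # Every field was written as a bytes feature holding a serialized tensor.
    spec = {k: tf.io.FixedLenFeature([], tf.string) for k in FEATURE_DTYPES}
    raw = tf.io.parse_single_example(serialized, spec)
    return {k: tf.io.parse_tensor(v, out_type=FEATURE_DTYPES[k])
            for k, v in raw.items()}

# The shard pattern is illustrative; it should match the paths built above.
ds = tf.data.TFRecordDataset(tf.io.gfile.glob("train_*.tfr"))
ds = ds.map(parse_example)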
__credits__ = ["???"] __license__ = "???" __version__ = "1.0.0" __status__ = "Production" from utils import check_empty, check_equal, check_equal_re, check_equals, check_not_empty, check_return_code, print_header, view_output, print_warning, print_info # print_header("1. Install Updates, Patches and Additional Security Software") # print_header("1.1 Filesystem Configuration") # print_header("1.1.1 Create Separate Partition for /tmp (Scored)") check_equal('grep "[[:space:]]/tmp[[:space:]]" /etc/fstab', '/tmp') # print_header("1.1.2 Set nodev option for /tmp Partition (Scored)") # No tmp partition should have nodev. check_equal("grep /tmp /etc/fstab", "nodev") check_equal("mount | grep /tmp", "nodev") # print_header("1.1.3 Set nosuid option for /tmp Partition (Scored)") # No tmp partition should have nosuid. check_equal("grep /tmp /etc/fstab", "nosuid") check_equal("mount | grep /tmp", "nosuid") # print_header("1.1.4 Set noexec option for /tmp Partition (Scored)")
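These audit fragments lean on a small utils module that is not shown here. A plausible sketch of check_equal, under the assumption that it runs the shell command and reports whether the expected string appears in its output; the project's real helper may color, aggregate, or invert results differently.

import subprocess

def check_equal(command, expected):
    # Run the shell command and report whether the expected string shows up
    # in its combined output. (Assumed behavior; the real helper may differ.)
    result = subprocess.run(command, shell=True, capture_output=True, text=True)
    output = result.stdout + result.stderr
    status = "OK  " if expected in output else "FAIL"
    print(f"{status} {command} (expected: {expected!r})")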
def _create_int_feature(values, feature_len): feature_list = list(values) utils.check_equal(len(feature_list), feature_len) feature = tf.train.Feature(int64_list=tf.train.Int64List( value=feature_list)) return feature
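A short usage example for the helper above; the feature name "token_ids" and the values are made up for illustration.

# Build a tf.train.Example around the int64 feature produced by the helper.
feat = _create_int_feature([3, 1, 4, 1, 5], feature_len=5)
example = tf.train.Example(
    features=tf.train.Features(feature={"token_ids": feat}))
serialized = example.SerializeToString()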
__credits__ = ["???"] __license__ = "???" __version__ = "1.0.0" __status__ = "Production" from utils import check_empty, check_equal, check_equal_re, check_equals, check_not_empty, check_return_code, print_header, view_output, print_warning, print_info # print_header("2. OS Services") # print_header("2.1 Remove Legacy Services") # print_header("2.1.1 Remove telnet-server (Scored)") check_equal("rpm -q telnet-server", "package telnet-server is not installed") # print_header("2.1.2 Remove telnet Clients (Scored)") check_equal("rpm -q telnet", "package telnet is not installed") # print_header("2.1.3 Remove rsh-server (Scored)") check_equal("rpm -q rsh-server", "package rsh-server is not installed") # print_header("2.1.4 Remove rsh (Scored)") check_equal("rpm -q rsh", "package rsh is not installed") # print_header("2.1.5 Remove NIS Client (Scored)")
def main(argv): if len(argv) > 1: raise app.UsageError("Too many command-line arguments.") absl_logging.use_python_logging() utils.log_module_args(LOGGER, argv[0]) # Some checks for the flags utils.check_exists(FLAGS.source_text_path) utils.check_exists(os.path.dirname(FLAGS.subset_text_path)) utils.check_exists(os.path.dirname(FLAGS.subset_embeddings_ds_path)) utils.check_operator(operator.lt, FLAGS.subset_total, FLAGS.source_total) utils.check_glob_prefix(FLAGS.source_embeddings_prefix) # Select a random subset with utils.log_duration(LOGGER, "main", "preparing indices"): indices = np.random.choice(FLAGS.source_total, FLAGS.subset_total, replace=False) indices.sort() # Process the textual data # Much (5 min vs 2 h) faster than iterating through the records and writing # only those we want. An hypothesis for this is that # get_single_element would allow to get elements without parsing all of the # elements along the way, like simply iterating through the records would. # Or did they get constant time indexing in TFRecords? # Inspired by the ORQA codebase: # https://github.com/google-research/language/blob/master/language/orqa/models/orqa_model.py#L147 with utils.log_duration(LOGGER, "main", "preparing data"): text_ds = tf.data.TFRecordDataset(FLAGS.source_text_path, buffer_size=512 * 1024 * 1024, num_parallel_reads=os.cpu_count()) text_ds = text_ds.batch(FLAGS.source_total) text_ds = tf.data.experimental.get_single_element(text_ds) subset = tf.gather(text_ds, tf.constant(indices)) with utils.log_duration(LOGGER, "main", "writing text data"): with tf.io.TFRecordWriter(FLAGS.subset_text_path) as text_writer: for text in tqdm.tqdm(subset, total=FLAGS.subset_total): text = text.numpy() # REALM's data uses no packaging of the data into features, etc. text_writer.write(text) with utils.log_duration(LOGGER, "main", "All of the embedding task"): # Process the embeddings data with tf.device("/cpu:0"): with utils.log_duration(LOGGER, "main", "Loading the checkpoint"): embs = tf.train.load_checkpoint( FLAGS.source_embeddings_prefix).get_tensor("block_emb") utils.check_equal(embs.shape[0], FLAGS.source_total) with utils.log_duration(LOGGER, "main", "taking a subset of the indices"): subset = embs[indices] tf_db = tf.Variable(subset, shape=subset.shape) ckpt = tf.train.Checkpoint(block_emb=tf_db) with utils.log_duration(LOGGER, "main", "Saving the checkpoint"): ckpt.save(FLAGS.subset_embeddings_ds_path) LOGGER.debug("Done")
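The batching trick described in the comments above (batch the whole TFRecord into one tensor, materialize it with get_single_element, then tf.gather the wanted rows) can be seen on a toy dataset; a self-contained sketch, independent of the flags used above.

import numpy as np
import tensorflow as tf

# Toy stand-in for the big TFRecord: ten byte-string records.
records = [f"record-{i}".encode() for i in range(10)]
ds = tf.data.Dataset.from_tensor_slices(records)

# Batch everything into a single tensor, materialize it in one call,
# then keep only the rows at the sampled indices.
everything = tf.data.experimental.get_single_element(ds.batch(10))
indices = np.sort(np.random.choice(10, size=3, replace=False))
subset = tf.gather(everything, indices)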
__license__ = "???" __version__ = "1.0.0" __status__ = "Production" from utils import check_empty, check_equal, check_equal_re, check_equals, check_not_empty, check_return_code, print_header, view_output, print_warning, print_info # print_header("6 System Access, Authentication and Authorization") # print_header("6.1 Configure cron and anacron") # print_header("6.1.1 Enable anacron Daemon (Scored)") check_equal("rpm -q anacron", "package anacron is not installed") print_info("Not installed syco servers.") print_header("6.1.2 Enable crond Daemon (Scored)") check_equal_re( "chkconfig --list crond", "crond.*0:off.*1:off.*2:on.*3:on.*4:on.*5:on.*6:off" ) # print_header("6.1.3 Set User/Group Owner and Permission on /etc/anacrontab (Scored)") check_equal('stat -c "%a %u %g" /etc/anacrontab | egrep "600 0 0"', "600 0 0") # print_header("6.1.4 Set User/Group Owner and Permission on /etc/crontab (Scored)") check_equal('stat -c "%a %u %g" /etc/crontab | egrep "600 0 0"', "600 0 0")
def main(argv): if len(argv) > 1: raise RuntimeError(argv) absl_logging.use_python_logging() utils.log_module_args(LOGGER, argv[0]) retriever_config = tf_utils.REALMSave( **utils.from_json_file(_FLAG_RETRIEVER_CONFIG_PATH.value)) assert not _FLAG_USE_SUBSET.value time_stamp = time.strftime("%Y%m%d-%H%M%S") target_path = os.path.join(_FLAG_OUTPUT_PATH.value, time_stamp.strip()) if target_path[-1] != "/": target_path += "/" ############################################################################## # Setup devices and strategy ############################################################################## with utils.log_duration(LOGGER, "main", "Initializing devices"): tpu_config = tf_utils.init_tpus() device_type = tf_utils.current_accelerator_type() LOGGER.debug("Devices: %s", str(tf_utils.devices_to_use())) if device_type == "TPU": if tpu_config is None: raise RuntimeError("We should have a tpu_config.") strategy = tf.distribute.TPUStrategy(tpu_config.resolver) batch_size = len( tf_utils.devices_to_use()) * _FLAG_BATCH_SIZE.value elif device_type == "GPU" or device_type == "CPU": strategy = tf.distribute.MirroredStrategy() batch_size = len( tf_utils.devices_to_use()) * _FLAG_BATCH_SIZE.value else: raise RuntimeError(device_type) ############################################################################## # Load the dataset. ############################################################################## eli5 = {} keys = ["train", "eval", "test"] gpt2_tokenizer = transformers.GPT2TokenizerFast.from_pretrained("gpt2-xl") gpt2_tokenizer.pad_token = gpt2_tokenizer.eos_token with utils.log_duration(LOGGER, "main", "Loading the ELI5 datasets."): for split in tqdm.tqdm(keys): load_path = os.path.join(_FLAG_DATASET_ROOT.value, "HuggingfaceDatasets", f"{split}_kilt_eli5.hf") with tf.device("/job:localhost"): eli5[split] = datasets.load_from_disk(load_path) ############################################################################## # ############################################################################## with utils.log_duration(LOGGER, "Main", "Load the textual dataset"): # Extract the appropriate text # The buffer_size is taken from the original ORQA code. blocks_dataset = tf.data.TFRecordDataset(retriever_config.text_records, buffer_size=512 * 1024 * 1024) blocks_dataset = blocks_dataset.batch( retriever_config.num_block_records, drop_remainder=True) blocks = tf.data.experimental.get_single_element(blocks_dataset) ############################################################################ # Prepare the output file. 
############################################################################ writers = {} all_paths = {} for split in keys: maybe_subset = "_subset" if _FLAG_USE_SUBSET.value else "" paths = [ os.path.join(target_path + maybe_subset, f"{split}_{i}.tfr") for i in range(_FLAG_NUM_SHARDS.value) ] all_paths[split] = paths writers[split] = [tf.io.TFRecordWriter(filename) for filename in paths] with utils.log_duration(LOGGER, "main", "Loading the reference db."): checkpoint_path = os.path.join( retriever_config.query_embedder_path, "encoded", "encoded.ckpt") reference_db_device = tf_utils.device_mapping().CPUs[0].name with tf.device(reference_db_device): reference_db = tf_utils.load_reference_db( checkpoint_path, variable_name="block_emb", ) ############################################################################ # Prep the encoder and the tokenizer ############################################################################ with utils.log_duration(LOGGER, "main", "Loading the encoder model and the tokenizer."): with strategy.scope(): query_encoder = hub.load(retriever_config.query_embedder_path, tags={}) encode_fn = _make_encode_fn(query_encoder) encode_fn_strategy_run = make_encode_fn_strategy_run_fn( strategy=strategy, encode_fn=encode_fn, ) vocab_file = os.path.join(retriever_config.query_embedder_path, "assets", "vocab.txt") utils.check_exists(vocab_file) do_lower_case = query_encoder.signatures["tokenization_info"]( )["do_lower_case"] tokenization_info = dict(vocab_file=vocab_file, do_lower_case=do_lower_case) tokenizer, vocab_lookup_table = bert_utils.get_tf_tokenizer( query_encoder, tokenization_info) ############################################################################ # Preprocess the dataset ############################################################################ cls_token_id = tf.cast(vocab_lookup_table.lookup(tf.constant("[CLS]")), tf.int32) sep_token_id = tf.cast(vocab_lookup_table.lookup(tf.constant("[SEP]")), tf.int32) transform = _make_transform_fn( bert_tokenizer=tokenizer, bert_cls_token_id=cls_token_id, bert_sep_token_id=sep_token_id, ) feature_dtypes = { constants.CTH5Fields.distances: tf.float32, constants.CTH5Fields.gpt2_retrieved_ids: tf.int32, constants.CTH5Fields.gpt2_answer_ids_inputs: tf.int32, constants.CTH5Fields.gpt2_question_ids_inputs: tf.int32, } with utils.log_duration(LOGGER, "main", "generating codes"): for split in keys: sample_count = 0 eli5: Dict[str, datasets.Dataset] if split != "test": for_slices = dict(sample_id=eli5[split]["id"], question=eli5[split]["input"], answer=[ sample["answer"][0] for sample in eli5[split]["output"] ]) else: for_slices = dict( sample_id=eli5[split]["id"], question=eli5[split]["input"], ) ds = tf.data.Dataset.from_tensor_slices(for_slices) ds = ds.map(transform, num_parallel_calls=tf.data.experimental.AUTOTUNE) ds = ds.apply( tf.data.experimental.dense_to_ragged_batch(batch_size)) ds = ds.map(_squeeze, num_parallel_calls=tf.data.experimental.AUTOTUNE) tqdm_inner = tqdm.tqdm(enumerate(ds), total=len(eli5[split]["id"]) // _FLAG_BATCH_SIZE.value, desc=f"Split `{split}`: Batches") for i, batch in tqdm_inner: features = collections.defaultdict(list) ###################################################################### # Enforce the current real batch size ###################################################################### current_batch_size = batch["sample_id"].shape[0] for k, v in batch.items(): utils.check_equal(v.shape[0], current_batch_size) 
###################################################################### gpt2_question_ids_inputs = _prep_field(batch["question"], gpt2_tokenizer) utils.check_equal(gpt2_question_ids_inputs.dtype, np.int32) utils.check_equal(gpt2_question_ids_inputs.shape[0], current_batch_size) if split != "test": gpt2_answer_ids_inputs = _prep_field( batch["answer"], gpt2_tokenizer) utils.check_equal(gpt2_answer_ids_inputs.dtype, np.int32) utils.check_equal(gpt2_answer_ids_inputs.shape[0], current_batch_size) assert len(gpt2_answer_ids_inputs.shape) == 2, ( gpt2_answer_ids_inputs.shape) ###################################################################### # Save the gpt2 tokenized question and answer ###################################################################### features[constants.CTH5Fields.gpt2_question_ids_inputs].extend( gpt2_question_ids_inputs) if split != "test": features[ constants.CTH5Fields.gpt2_answer_ids_inputs].extend( gpt2_answer_ids_inputs) ###################################################################### # Encode the samples. ###################################################################### batch = strategy.experimental_distribute_values_from_function( tf_utils.make_dict_distribute_fn(batch)) embeddings = encode_fn_strategy_run(batch) embeddings = tf_utils.process_strat_output( embeddings, "embeddings", strategy, current_batch_size) utils.check_isinstance(embeddings, ops.EagerTensor) utils.check_equal(embeddings.shape[0], current_batch_size) # pytype doesn't seem to see that we check the type utils.check_equal(embeddings.shape[1], _FLAG_EMBEDDING_DEPTH.value) # pytype: disable=attribute-error ###################################################################### # Retrieve. ###################################################################### with tf.device(reference_db_device): top_k, inner_prods = tf_utils.mips_exact_search( embeddings, _FLAG_NUM_RETRIEVALS.value, reference_db) top_k = tf_utils.process_strat_output(top_k, "top_k", strategy, current_batch_size) utils.check_equal( inner_prods.shape, (current_batch_size, _FLAG_NUM_RETRIEVALS.value)) utils.check_equal( top_k.shape, (current_batch_size, _FLAG_NUM_RETRIEVALS.value)) features[constants.CTH5Fields.distances].extend(inner_prods) gathered = tf.gather(blocks, top_k).numpy() utils.check_equal(gathered.shape[0], current_batch_size) retrievals = [] for j in range(gathered.shape[0]): local_gathered = gathered[j].tolist() utils.check_equal(len(local_gathered), _FLAG_NUM_RETRIEVALS.value) local_gathered = [ sample.decode() for sample in local_gathered ] token_ids = np.array( gpt2_tokenizer.batch_encode_plus( local_gathered, padding="max_length", truncation=True, ).input_ids) for line in token_ids: assert not np.all(line == 0), line token_ids[token_ids == gpt2_tokenizer.eos_token_id] = -1 retrievals.append(token_ids) features[constants.CTH5Fields.gpt2_retrieved_ids] = retrievals utils.check_equal( retrievals[0].shape, (_FLAG_NUM_RETRIEVALS.value, _FLAG_CONTEXT_SIZE.value)) for k, v in features.items(): utils.check_equal(len(v), current_batch_size) for index_in_batch in range(current_batch_size): feature = tf.train.Features( feature={ k: _bytes_feature( tf.io.serialize_tensor( tf.cast(v[index_in_batch], feature_dtypes[k]))) for k, v in features.items() }) writers[split][ sample_count % _FLAG_NUM_SHARDS.value].write( tf.train.Example( features=feature).SerializeToString()) sample_count += 1 if sample_count % 1000 == 0: LOGGER.debug("Paths: %s", str(all_paths[split][0])) LOGGER.debug("Done.")
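_bytes_feature is used by the writers above but is not defined in this listing. A plausible definition, following the usual TensorFlow pattern for wrapping serialized tensors in a feature; the project's actual helper may differ.

def _bytes_feature(value):
    # Wrap a bytes value (or an eager string tensor such as the output of
    # tf.io.serialize_tensor) into a tf.train.Feature.
    if isinstance(value, tf.Tensor):
        value = value.numpy()
    return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value]))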
__credits__ = ["???"] __license__ = "???" __version__ = "1.0.0" __status__ = "Production" from utils import check_empty, check_equal, check_equal_re, check_equals, check_not_empty, check_return_code, print_header, view_output, print_warning, print_info # print_header("4 Network Configuration and Firewalls") # print_header("4.1 Modify Network Parameters (Host Only)") # print_header("4.1.1 Disable IP Forwarding (Scored)") check_equal("/sbin/sysctl net.ipv4.ip_forward", "net.ipv4.ip_forward = 0") # print_header("4.1.2 Disable Send Packet Redirects (Scored)") check_equal("/sbin/sysctl net.ipv4.conf.all.send_redirects", "net.ipv4.conf.all.send_redirects = 0") check_equal("/sbin/sysctl net.ipv4.conf.default.send_redirects", "net.ipv4.conf.default.send_redirects = 0") # print_header("4.2 Modify Network Parameters (Host and Router)") # print_header("4.2.1 Disable Source Routed Packet Acceptance (Scored)") check_equal("/sbin/sysctl net.ipv4.conf.all.accept_source_route", "net.ipv4.conf.all.accept_source_route = 0")
def main(argv): if len(argv) > 1: raise RuntimeError(argv[1:]) absl_logging.use_python_logging() utils.check_contained(_FLAG_APPROACH_TYPE.value, _ACCEPTABLE_APPROACHES) db_path = _FLAG_DB_PATH.value model_path = _FLAG_MODEL_PATH.value tpu_config = tf_utils.init_tpus() device_type = tf_utils.devices_to_use()[0].device_type if device_type == "TPU": assert isinstance(tpu_config, tf_utils.TpuConfigType) strategy = tf.distribute.TPUStrategy(tpu_config.resolver) elif device_type == "GPU" or device_type == "CPU": # MirroredStrategy automatically becomes OneDeviceStrategy if there is # just one device, like one GPU or only CPUs. strategy = tf.distribute.MirroredStrategy() else: raise RuntimeError() ############################################################################## # Load Model ############################################################################## with utils.log_duration(LOGGER, main.__name__, "All of model preparation"): def make_model_tf(path): with utils.log_duration(LOGGER, make_model_tf.__name__, "Load model."): if os.path.exists(path): config_path = os.path.join(path, "config.json") model_path = os.path.join(path, "tf_model.h5") utils.check_exists(config_path) utils.check_exists(model_path) config = transformers.GPT2Config.from_pretrained( config_path) return transformers.TFGPT2LMHeadModel.from_pretrained( model_path, config=config) else: return transformers.TFGPT2LMHeadModel.from_pretrained( path, ) with strategy.scope(): if model_path.startswith("gs://"): with utils.log_duration(LOGGER, main.__name__, "Download model from GS"): with tempfile.TemporaryDirectory() as td: td += os.path.sep if os.path.exists("/root/google-cloud-sdk/bin/gsutil"): exec_ = "/root/google-cloud-sdk/bin/gsutil" else: exec_ = "gsutil" command = [ exec_, "-m", "cp", "-r", os.path.join(model_path, "*"), td, ] LOGGER.debug("Running bash command: %s", " ".join(command)) subprocess.check_call(command) LOGGER.debug("Files at the temp dir(%s): %s", td, str(os.listdir(td))) model = make_model_tf(td) else: model = make_model_tf(model_path) model.__call__ = tf.function( model.__call__, experimental_relax_shapes=True, experimental_compile=True, ) ############################################################################## # Load Dataset Pipeline ############################################################################## utils.check_contained( _FLAG_APPROACH_TYPE.value, { constants.ApproachTypeChoices.naked_lm, constants.ApproachTypeChoices.naked_lm }) devices = tf_utils.devices_to_use() num_replicas = len(devices) if devices[0].device_type in {"GPU", "TPU" } else 1 # Only a batch size of 1 is currently supported. 
We need attention masks utils.check_equal(_FLAG_BATCH_SIZE.value, 1) batch_size = _FLAG_BATCH_SIZE.value * num_replicas approach_type = _FLAG_APPROACH_TYPE.value # Things that will never change: random_seed = 0 use_helper_words = True retrieval_temperature = 1 context_window_size = 1024 logging.debug("Loading dataset.") tokenizer = transformers.GPT2TokenizerFast.from_pretrained("gpt2-xl") ds = task_specific.create_lm_ds_kilt_eli5( tokenizer=tokenizer, context_window_size=context_window_size, dataset_name="kilt_eli5", batch_size=1, # >> We set our own batch size elsewhere db_path=db_path, random_seed=random_seed, use_subset=False, subset_size=-1, use_helper_words=use_helper_words, approach_type=approach_type, num_retrievals=5, # Will never change retrieval_temperature=retrieval_temperature, retriever=None, # Cached retrievals don't need a retriever repeat=False, # Will never change split=_FLAG_SPLIT.value, enable_debug_checks=False, retrieval_bank_size=5, # Will never change dataset_type=_FLAG_DATASET_TYPE.value, tfr_prefix=_FLAG_TFR_PREFIX.value, qty_shuffle=1, # Will never change max_length_generation=_FLAG_GENERATION_LENGTH_LIMIT.value) def further_prep_generate_not_test(batch): batch = tf.boolean_mask( batch["input_ids"], tf.logical_and(batch["label_ids"] == -100, batch["input_ids"] != tokenizer.eos_token_id)) return batch @tf.function def further_prep_generate_test(batch): batch = tf.boolean_mask(batch["input_ids"], batch["input_ids"] != tokenizer.eos_token_id) return batch if _FLAG_SPLIT.value == constants.SplitChoices.test: ds = ds.map(further_prep_generate_test) else: ds = ds.map(further_prep_generate_not_test) ds = ds.padded_batch(batch_size=batch_size, padding_values=tokenizer.eos_token_id) ds = strategy.experimental_distribute_dataset(ds) ############################################################################## # Generate ############################################################################## LOGGER.debug("Generating.") generations = [] counter = tqdm.tqdm(ds, total=task_specific.DATASET_CARDINALITIES["kilt_eli5"][ _FLAG_SPLIT.value]) for batch_no, batch in enumerate(counter): output = strategy.run( model.generate, kwargs=dict(input_ids=batch, max_length=_FLAG_GENERATION_LENGTH_LIMIT.value, use_cache=True, attention_mask=tf.cast(batch != tokenizer.eos_token_id, tf.int32))) LOGGER.debug("INPUT: %s", tokenizer.decode(batch[0])) output = tf_utils.process_strat_output(strategy_outputs=output, current_batch_size=batch_size, strategy=strategy, name="generations") with utils.log_duration(LOGGER, "main", "all of tokenizer.decode for a batch."): for i in range(batch_size): text = tokenizer.decode(output.numpy()[i]) LOGGER.debug("Batch %d Generation %d", batch_no, i) LOGGER.debug(text.replace("\n", " <\\n> ")) generations.append(text) counter.update(batch.shape[0]) utils.to_json_file( os.path.join(_FLAG_OUTPUT_PATH.value, _FLAG_SPLIT.value, _FLAG_APPROACH_TYPE.value, time.strftime("%Y%m%d-%H%M%S.json")), dict(flags={ flag.name: flag.value for flag in flags.FLAGS.flags_by_module_dict()[argv[0]] }, generations=generations)) logging.debug("Saved to: %s", _FLAG_OUTPUT_PATH.value)
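Since padded_batch pads with the GPT-2 eos token, the attention mask passed to model.generate is simply the complement of that padding: 1 on real tokens, 0 on padding. A small illustration, with the eos id hard-coded to GPT-2's 50256 and made-up token ids.

import tensorflow as tf

EOS = 50256  # GPT-2's eos_token_id, hard-coded here for illustration.

batch = tf.constant([[10, 11, 12, EOS, EOS],
                     [20, 21, EOS, EOS, EOS]])
# Real tokens are exactly the positions that differ from the padding token.
attention_mask = tf.cast(batch != EOS, tf.int32)
# attention_mask -> [[1, 1, 1, 0, 0],
#                    [1, 1, 0, 0, 0]]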
def main(argv): ############################################################################## # Initial Setup. Logging, Flags, Random seeds. ############################################################################## if len(argv) > 1: raise app.UsageError("Too many command-line arguments.") absl_logging.use_python_logging() flags_dict = { flag.name: flag.value for flag in FLAGS.flags_by_module_dict()[argv[0]] } if FLAGS.use_subset: message = (f"{colorama.Back.RED}{colorama.Fore.WHITE}" f"{colorama.Style.BRIGHT}USING A SUBSET OF THE DATASET" f"{colorama.Style.RESET_ALL}") LOGGER.warning(message) utils.log_module_args(LOGGER, argv[0]) if not FLAGS.output_dir.startswith("gs://"): utils.check_exists(FLAG_OUTPUT_DIR.value) if not tf.io.gfile.isdir(FLAG_OUTPUT_DIR.value): raise RuntimeError("Output dir needs to be a directory.") tf.random.set_seed(FLAG_RANDOM_SEED.value) np.random.seed(FLAG_RANDOM_SEED.value) # Prepare the instance output directory path and save the config there # Prepare the path folder_name = time.strftime( f"{FLAG_RUN_NAME.value}_{FLAG_APPROACH_TYPE.value}_%Y%m%d-%H%M%S") instance_output_dir = os.path.join(FLAG_OUTPUT_DIR.value, folder_name).strip() if not instance_output_dir.endswith("/"): instance_output_dir += "/" json_target = os.path.join(instance_output_dir, "training_params.json") # Make the folder if we're not on gcloud if not json_target.strip().startswith("gs://"): subprocess.check_call(["mkdir", "-p", instance_output_dir]) # Safe the config file utils.to_json_file(json_target, flags_dict) ############################################################################## # Initialization and Configuration of the Devices. ############################################################################## tpu_setup = None accel = tf_utils.current_accelerator_type() if FLAG_TPU_IS_LOCAL.value: assert accel == "TPU", accel if accel == "TPU": assert FLAG_TPU_IS_LOCAL.value, FLAG_TPU_IS_LOCAL.value if tf_utils.current_accelerator_type() in {"CPU", "TPU"}: tpu_setup = tf_utils.init_tpus(tpu_name=FLAG_TPU_NAME.value, local=FLAG_TPU_IS_LOCAL.value) LOGGER.debug("Devices we are computing on:\n%s", utils.wrap_iterable(map(str, tf_utils.devices_to_use()))) LOGGER.debug("All devices:") LOGGER.debug(tf_utils.device_mapping()) if tf_utils.current_accelerator_type() == "GPU": tf.config.set_soft_device_placement(True) if tf_utils.current_accelerator_type() != "TPU": tf.debugging.set_log_device_placement(True) utils.check_operator(operator.ne, tf_utils.current_accelerator_type(), "CPU") assert FLAG_TPU_NAME.value == socket.gethostname(), ( "This is a configuration choice. You can remove this. " "There will be no side effects.") if FLAG_DISTRIBUTE_MODE.value in constants.PURE_DATA_PARALLEL_STRATEGIES: actual_num_replicas = len(tf_utils.devices_to_use()) elif FLAG_DISTRIBUTE_MODE.value in constants.DATA_PARALLEL_DMC: actual_num_replicas = FLAG_NUM_REPLICAS.value else: actual_num_replicas = 1 ############################################################################## # We load the retriever model if it is needed. ############################################################################## # Not currently used. See old commits. 
retriever = None ############################################################################## # Distributed training task ############################################################################## if FLAG_TASK.value == constants.TaskChoices.train: with utils.log_duration(LOGGER, "main", "Load model"): utils.print_mem("before loading model", LOGGER) model_specific = task_specific.load_model( FLAG_MODEL_KEY.value, FLAG_DISTRIBUTE_MODE.value, tpu_setup, FLAG_NUM_REPLICAS.value) utils.print_mem("after loading model", LOGGER) model = model_specific.model if isinstance(model, list): model: List[transformers.TFGPT2LMHeadModel] else: model: transformers.TFGPT2LMHeadModel tokenizer = model_specific.tokenizer def make_optimizer(): if FLAG_OPTIMIZER_TYPE.value == constants.OptimizerTypes.adafactor: return tensor2tensor.utils.adafactor.AdafactorOptimizer( learning_rate=FLAG_LEARNING_RATE.value) elif FLAG_OPTIMIZER_TYPE.value == constants.OptimizerTypes.adam: return tf.keras.optimizers.Adam( learning_rate=FLAG_LEARNING_RATE.value) else: raise ValueError(FLAG_OPTIMIZER_TYPE.value) if model_specific.strategy: with model_specific.strategy.scope(): optimizer = make_optimizer() else: optimizer = make_optimizer() ############################################################################ # Prepare the dataset functions ############################################################################ rg = np.random.default_rng(FLAG_RANDOM_SEED.value) def call_lm_preproc(repeat, split, random_seed): """Using functools.partial prevents the linter from doing its job.""" if FLAG_DATASET_NAME.value == constants.DatasetNameChoices.kilt_eli5: return task_specific.create_lm_ds_kilt_eli5( tokenizer=tokenizer, context_window_size=model.config.n_positions, dataset_name=FLAG_DATASET_NAME.value, # Batches are split over the replicas: batch_size=FLAG_BATCH_SIZE.value * actual_num_replicas, db_path=FLAG_DB_PATH.value, random_seed=random_seed, use_subset=FLAG_USE_SUBSET.value, subset_size=FLAG_SUBSET_SIZE.value, use_helper_words=FLAG_USE_HELPER_WORDS.value, approach_type=FLAG_APPROACH_TYPE.value, num_retrievals=FLAG_NUM_RETRIEVALS.value, retrieval_temperature=FLAG_RETRIEVAL_TEMPERATURE.value, retriever=retriever, repeat=repeat, split=split, enable_debug_checks=FLAG_DATASET_DEBUG.value, retrieval_bank_size=FLAG_RETRIEVAL_BANK_SIZE.value, dataset_type=FLAG_DATASET_TYPE.value, qty_shuffle=FLAG_QTY_SHUFFLE.value, tfr_prefix=FLAG_TFR_PREFIX.value, max_length_generation=FLAG_MAX_LENGTH_GENERATION.value, ) else: raise NotImplementedError( f"FLAG_DATASET_NAME.value unsupported: `{FLAG_DATASET_NAME.value}`" ) make_training_dataset: Callable[..., tf.data.Dataset] = functools.partial( call_lm_preproc, split="train", repeat=False, ) make_eval_dataset: Callable[..., tf.data.Dataset] = functools.partial( call_lm_preproc, split="eval", repeat=True, ) ############################################################################ # Prepare the step functions ############################################################################ utils.check_contained(FLAG_DISTRIBUTE_MODE.value, constants.DistributeModeChoices.choices()) tf_function_flags = dict( experimental_compile=FLAG_EXPERIMENTAL_COMPILE.value, experimental_relax_shapes=not FLAG_INPUT_FIXED_SIZE.value) training_step = build_regular_training_step( model, optimizer, strategy=model_specific.strategy, tf_function_kwargs=tf_function_flags) evaluation_step = build_evaluation_step(model, tf_function_flags) timestamp_last_ckpt_secs = time.time() # Model checkpoints are saved to the 
tmp_directory and then rsynced to GCS ############################################################################ # Prepare the statistics and the logging facilities. ############################################################################ # Tensorboard with model_specific.strategy.scope(): checkpoint = tf.train.Checkpoint(optimizer=optimizer, model=model) saver = Saver(instance_output_dir, checkpoint) train_log_dir = os.path.join(instance_output_dir, "tensorboard", "train") eval_log_dir = os.path.join(instance_output_dir, "tensorboard", "eval") flags_log_dir = os.path.join(instance_output_dir, "tensorboard", "params") writers = dict(train=tf.summary.create_file_writer(train_log_dir), eval=tf.summary.create_file_writer(eval_log_dir), flags=tf.summary.create_file_writer(flags_log_dir)) with writers["flags"].as_default(): tf.summary.text( "Flags", # Tensorboard takes Markdown: json.dumps(flags_dict, indent=4).replace("\n", "\n\n"), step=0) # Different information to log. ma_loss = dict(train=utils.MovingAverage(0.9), eval=utils.MovingAverage(0.9)) step_counters = dict(train=0, eval=0) batch_counters = dict(train=0, eval=0) prev_batch_end = time.time() ############################################################################ # Create the Eval DS object. # ========================================================================== # The eval ds has no real concept of epoch, repeats forever, shuffling # each time it reaches its end. ############################################################################ # Create with utils.log_duration(LOGGER, "main", "All of make_eval_dataset"): eval_ds_instance = make_eval_dataset(random_seed=rg.integers( -2**63, 2**63 - 1), ) # Maybe distribute LOGGER.debug("Distributing the eval dataset to the replicas.") if FLAG_DATASET_TYPE.value == "tfr": eval_ds_instance = ( model_specific.strategy.experimental_distribute_dataset( eval_ds_instance)) # Start the iteration. We step by calling `next(...)`. LOGGER.debug("Done distributing the eval dataset to the replicas.") eval_ds_instance = iter(eval_ds_instance) step_function = dict(train=training_step, eval=evaluation_step) ############################################################################ # Training Loop # ========================================================================== # Create a new training dataset object that lasts for one epoch. # This is different from the eval training dataset object, which loops # forever. ############################################################################ for epoch in itertools.count(): ########################################################################## # Epoch Setup ########################################################################## LOGGER.debug("EPOCH %d START", epoch) # Shuffle differently every epoch with utils.log_duration(LOGGER, "main", "All of make_training_dataset"): train_ds_instance = make_training_dataset( random_seed=rg.integers(-2**63, 2**63 - 1), ) LOGGER.debug( "Attempting to distribute the training dataset to the replicas." ) if FLAG_DATASET_TYPE.value == "tfr": train_ds_instance = ( model_specific.strategy.experimental_distribute_dataset( train_ds_instance)) LOGGER.debug( "Done distributing the training dataset to the replicas.") train_ds_instance = iter(train_ds_instance) # To change splits, we use `itertools.islice` over the dataset generator. 
# When the training dataset generator is done, a new loop of the following # while loop occurs, but no training batch is done because we are taking # an `islice` of a generator that is done. did_at_least_one_training_batch = True split = "eval" while did_at_least_one_training_batch: utils.check_operator(operator.ne, tf_utils.current_accelerator_type(), "CPU") # Invert split if split == "train": split = "eval" else: split = "train" # Prepare to test if we did at least one training batch if split == "train": did_at_least_one_training_batch = False ######################################################################## # Take slices from the dataset iterator # ====================================================================== # We only want to do a certain number of batches before switching splits # We do this by using an `itertools.islice` of the dataset iterators. ######################################################################## if split == "train": dataset_iterator = toolz.take( FLAG_BATCHES_BETWEEN_EVALS.value, train_ds_instance) else: # The evaluation dataset generator is infinite, reshuffles everytime # it gets to its end. # Still, we take a fixed size slice form that infinite generator. dataset_iterator = toolz.take( FLAG_NUMBER_EVAL_BATCHES.value, eval_ds_instance) LOGGER.debug("Batching") for batch in dataset_iterator: if FLAG_LOG_SAMPLES.value: #################################################################### # Print elements of the dataset #################################################################### # Make ourselves resistant to values possibly being a PerReplica # object LOGGER.warning( f"%(red)sLOGGING SAMPLES. THIS IS VERY SLOW.%(reset)s", dict( red=colorama.Fore.RED, reset=colorama.Style.RESET_ALL, )) is_distributed = isinstance(batch["input_ids"], values.PerReplica) for in_batch_idx in range(FLAG_BATCH_SIZE.value): for replica_idx in (range(actual_num_replicas) if is_distributed else [0]): if is_distributed: sample = { k: batch[k].values[replica_idx] for k in batch } else: sample = batch # input_sentence = tokenizer.decode( # [x for x in sample["input_ids"][i] if x != tokenizer.eos_token_id] # ) # LOGGER.debug( # "%sInput [%d / %d]%s:\n\"%s\"", # colorama.Fore.GREEN, # replica_idx + 1, # actual_num_replicas, # colorama.Style.RESET_ALL, # input_sentence, # ) # # answer = tokenizer.decode( # [(x if x != -100 else 0) for x in sample["label_ids"][i]] # ) # LOGGER.debug( # "%sLabel [%d / %d]%s:\n\"%s\"", # colorama.Fore.GREEN, # replica_idx + 1, # actual_num_replicas, # colorama.Style.RESET_ALL, # answer, # ) cons = console.Console() sentences = table.Table() sentences.add_column("BPE Index", justify="center") sentences.add_column("Inputs", justify="center") sentences.add_column("Labels", justify="center") for bpe_idx, (x, y) in enumerate( itertools.zip_longest( sample["input_ids"] [in_batch_idx].numpy(), sample["label_ids"] [in_batch_idx].numpy(), fillvalue=None, )): x_w = tokenizer.decode( [x]) if x >= 0 else f"[ {x} ]" y_w = tokenizer.decode( [y]) if y >= 0 else f"[ {y} ]" sentences.add_row(str(bpe_idx), x_w, y_w) cons.print(sentences) # We only care about training epochs as, obviously, we don't train # over eval samples; the number of eval samples seen only # contributes to lowering the variance in the evaluation of when to # do early stopping. 
if split == "train": did_at_least_one_training_batch = True input_ids = batch["input_ids"] label_ids = batch["label_ids"] # Per split step counter step_counters[ split] += FLAG_BATCH_SIZE.value * actual_num_replicas batch_counters[split] += 1 ###################################################################### # Model step function. ###################################################################### step_function_kwargs = dict( input_ids=input_ids, label_ids=label_ids, ) utils.print_mem(f"[{split}] - Mem before `strategy.run`", LOGGER) LOGGER.debug("[%s] - Calling `strategy.run`", split) loss = model_specific.strategy.run( step_function[split], kwargs=step_function_kwargs) LOGGER.debug("[%s] - Done `strategy.run`", split) utils.print_mem(f"[{split}] - Mem after `strategy.run`", LOGGER) #################################################################### # End of logging step code / Logging and saving the model. #################################################################### if (FLAG_DISTRIBUTE_MODE.value in constants.PURE_DATA_PARALLEL_STRATEGIES): utils.check_equal(len(loss.values), actual_num_replicas) LOGGER.debug("[%s] - Real num replicas: %s", split, actual_num_replicas) average_loss = float( tf.math.reduce_mean(loss.values).numpy()) LOGGER.debug("[%s] - Loss: %s", str(split), str(average_loss)) else: average_loss = float(loss.numpy()) tf.debugging.check_numerics( loss.values if isinstance(loss, values.PerReplica) else loss, "Numerics failed.") now = time.time() batch_duration = now - prev_batch_end prev_batch_end = now ma_loss[split].update(average_loss) LOGGER.info("[%s] - Epoch: # %d", split, epoch) LOGGER.info("[%s] - Tensorboard_dir: %s", split, instance_output_dir) LOGGER.info("[%s] - Batch: # %d", split, batch_counters[split]) LOGGER.info("[%s] - Step: # %d", split, step_counters[split]) if FLAG_USE_SUBSET.value: LOGGER.warning(">> USING A SUBSET OF THE DATASET <<") LOGGER.info( "[%(split)s] - Batch loss: %(metric)f", dict(split=split, metric=average_loss)) LOGGER.info( "[%(split)s] - Moving average loss: %(metric)f", dict(split=split, metric=ma_loss[split].average)) LOGGER.info( "[%(split)s] - Moving average ppl: %(metric)f", dict(split=split, metric=np.exp(ma_loss[split].average))) LOGGER.info( "[%(split)s] - Batch duration: %(duration)s", dict(split=split, duration=utils.TimeStamp.from_seconds( batch_duration).format())) # Write to Tensorboard with writers[split].as_default(): tf.summary.scalar(f"Loss/{split}", average_loss, step_counters[split]) tf.summary.scalar(f"PPL/{split}", np.exp(average_loss), step_counters[split]) writers[split].flush() ###################################################################### # Save every `FLAG_SAVE_PERIOD_MIN.value` minutes. 
###################################################################### delta_sec = time.time() - timestamp_last_ckpt_secs utils.check_operator(operator.gt, delta_sec, 0) period_sec = 60 * FLAG_SAVE_PERIOD_MIN.value utils.check_operator(operator.gt, period_sec, 0) ratio = delta_sec / period_sec LOGGER.info( "[%(split)s] - RATIO: %(ratio)s", dict(split=split, ratio=str(ratio))) LOGGER.info( "[%(split)s] - Target: %(target)s, Present: %(present)s", dict( split=split, target=str(period_sec), present=str(delta_sec), )) if ratio >= 1: dur = delta_sec / 60 timestamp_last_ckpt_secs = time.time() LOGGER.debug( "SAVING MODEL - CAUSE: DURATION - %0.2f min", dur) # checkpoint.save(ckpt_prefix) saver.save_model( train_steps=step_counters["train"], model_or_replicas=model, optimizer=optimizer, ) ############################################################################ # Post Training Cleanup ############################################################################ for writer in writers.values(): writer.close()
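utils.MovingAverage is used throughout the logging above (constructed with a decay of 0.9, updated with the batch loss, read through .average) but is not defined in this listing. A plausible minimal implementation matching that interface; the project's real helper may differ.

class MovingAverage:
    """Exponential moving average; a decay closer to 1 smooths more."""

    def __init__(self, decay):
        self.decay = decay
        self.average = None

    def update(self, value):
        # Seed with the first value, then blend new values in with the decay.
        if self.average is None:
            self.average = value
        else:
            self.average = self.decay * self.average + (1 - self.decay) * value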
__credits__ = ["???"] __license__ = "???" __version__ = "1.0.0" __status__ = "Production" from utils import check_empty, check_equal, check_equal_re, check_equals, check_not_empty, check_return_code, print_header, view_output, print_warning, print_info # print_header("6 System Access, Authentication and Authorization") # print_header("6.1 Configure cron and anacron") # print_header("6.1.1 Enable anacron Daemon (Scored)") check_equal("rpm -q anacron", "package anacron is not installed") print_info("Not installed syco servers.") print_header("6.1.2 Enable crond Daemon (Scored)") check_equal_re("chkconfig --list crond", "crond.*0:off.*1:off.*2:on.*3:on.*4:on.*5:on.*6:off") # print_header( "6.1.3 Set User/Group Owner and Permission on /etc/anacrontab (Scored)") check_equal('stat -c "%a %u %g" /etc/anacrontab | egrep "600 0 0"', "600 0 0") # print_header( "6.1.4 Set User/Group Owner and Permission on /etc/crontab (Scored)") check_equal('stat -c "%a %u %g" /etc/crontab | egrep "600 0 0"', "600 0 0")
import config # print_header("5 Logging and Auditing") # print_header("5.1 Configure Syslog") # print_header("5.1.1 Install the rsyslog package (Scored)") check_equal_re("rpm -q rsyslog", "rsyslog.*") # print_header("5.1.2 Activate the rsyslog Service (Scored)") check_equal("rpm -q syslog", "package syslog is not installed") check_empty("chkconfig --list | grep syslog") check_equal_re("chkconfig --list rsyslog", "rsyslog.*0:off.*1:off.*2:on.*3:on.*4:on.*5:on.*6:off") # print_header("5.1.3 Configure /etc/rsyslog.conf (Not Scored)") print_warning( "Manually review the contents of the /etc/rsyslog.conf file to ensure appropriate logging is set. " ) view_output("ls -l /var/log/") # print_header("5.1.4 Create and Set Permissions on rsyslog Log Files (Scored)") print_header(" TODO - Ensure that the log files are logging information")
__version__ = "1.0.0" __status__ = "Production" from utils import check_empty, check_equal, check_equal_re, check_equals, check_not_empty, check_return_code, print_header, view_output, print_warning, print_info # print_header("7. User Accounts and Environment") # print_header("7.1 Set Shadow Password Suite Parameters (/etc/login.defs)") # print_header("7.1.1 Set Password Expiration Days (Scored)") check_equal( "grep ^PASS_MAX_DAYS /etc/login.defs", "PASS_MAX_DAYS\t90" ) check_empty( 'awk -F: \'($3 > 0) {print $1}\' /etc/passwd | xargs -I {} ' + 'chage --list {}|' + 'grep "^Maximum number of days between password change"|'+ 'grep -v ": 99$"' ) # print_header("7.1.2 Set Password Change Minimum Number of Days (Scored)") check_equal( "grep ^PASS_MIN_DAYS /etc/login.defs", "PASS_MIN_DAYS\t7" )
from utils import check_empty, check_equal, check_equal_re, check_equals, check_not_empty, check_return_code, print_header, view_output, print_warning, print_info # print_header("9 System Maintenance") # print_header("9.1 Verify System File Permissions)") # print_header("9.1.1 Verify System File Permissions (Not Scored)") print_warning("Check manually for changed files.") view_output("rpm -Va --nomtime --nosize --nomd5 --nolinkto") # print_header("9.1.2 Verify Permissions on /etc/passwd (Scored)") check_equal('stat -c "%a %u %g" /etc/passwd | egrep "644 0 0"', "644 0 0") # print_header("9.1.3 Verify Permissions on /etc/shadow (Scored)") check_equal('stat -c "%a %u %g" /etc/shadow | egrep "0 0 0"', "0 0 0") # print_header("9.1.4 Verify Permissions on /etc/gshadow (Scored)") check_equal('stat -c "%a %u %g" /etc/gshadow | egrep "0 0 0"', "0 0 0") # print_header("9.1.5 Verify Permissions on /etc/group (Scored)") check_equal('stat -c "%a %u %g" /etc/group | egrep "644 0 0"', "644 0 0") # print_header("9.1.6 Verify User/Group Ownership on /etc/passwd (Scored)")
def main(argv): if len(argv) > 1: raise RuntimeError(argv[1:]) absl_logging.use_python_logging() utils.check_contained(_FLAG_APPROACH_TYPE.value, _ACCEPTABLE_APPROACHES) utils.check_operator(operator.xor, bool(_FLAG_H5_MODEL_PATH.value), bool(_FLAG_CKPT_MODEL_PATH.value)) if _FLAG_H5_MODEL_PATH.value: model_path = _FLAG_H5_MODEL_PATH.value mode = constants.SaveModeChoices.hfh5 elif _FLAG_CKPT_MODEL_PATH.value: model_path = _FLAG_CKPT_MODEL_PATH.value mode = constants.SaveModeChoices.ckpt else: raise RuntimeError("Logically should never happen.") utils.check_exists(model_path) device_type = tf_utils.devices_to_use()[0].device_type # ONLY GPU IS SUPPORTED utils.check_equal(device_type, "GPU") #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # Build the distribution strategy #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ if device_type == "TPU": # ONLY LOCAL TPU IS "SUPPORTED" utils.check_isinstance(_FLAG_IS_LOCAL_TPU.value, bool) assert _FLAG_IS_LOCAL_TPU.value tpu_config = tf_utils.init_tpus(local=True) utils.check_isinstance(tpu_config, tf_utils.TpuConfigType) utils.check_not_none(tpu_config) strategy = tf.distribute.TPUStrategy(tpu_config.resolver) elif device_type == "GPU": strategy = tf.distribute.MirroredStrategy( devices=tf.config.experimental.list_logical_devices('GPU')) else: raise RuntimeError(device_type) # ONLY GPU IS SUPPORTED print(tf.config.list_logical_devices()) utils.check_isinstance(strategy, tf.distribute.MirroredStrategy) ############################################################################## # Load Model ############################################################################## with utils.log_duration(LOGGER, main.__name__, "All of model preparation"): with strategy.scope(): # HF isn't able to read directly from GCS if (model_path.startswith("gs://") and mode == constants.SaveModeChoices.hfh5): with utils.log_duration(LOGGER, main.__name__, "Download model from GS"): with tempfile.TemporaryDirectory() as td: td += os.path.sep if os.path.exists("/root/google-cloud-sdk/bin/gsutil"): exec_ = "/root/google-cloud-sdk/bin/gsutil" else: exec_ = "gsutil" command = [ exec_, "-m", "cp", "-r", os.path.join(model_path, "*"), td, ] LOGGER.debug("Running bash command: %s", " ".join(command)) subprocess.check_call(command) LOGGER.debug("Files at the temp dir(%s): %s", td, str(os.listdir(td))) model = make_model_tf(td, mode=mode) else: model = make_model_tf(model_path, mode=mode) utils.check_not_none(model) ############################################################################## # Load Dataset Pipeline ############################################################################## utils.check_contained( _FLAG_APPROACH_TYPE.value, { constants.ApproachTypeChoices.naked_lm, constants.ApproachTypeChoices.cached_pretok }) devices = tf_utils.devices_to_use() num_replicas = (len(devices) if devices[0].device_type in {"GPU", "TPU"} else 1) utils.check_equal(devices[0].device_type, "GPU") # Only a batch size of 1 is currently supported. 
We need attention masks batch_size = _FLAG_BATCH_SIZE.value * num_replicas approach_type = _FLAG_APPROACH_TYPE.value logging.debug("Loading dataset.") tokenizer = transformers.GPT2TokenizerFast.from_pretrained("gpt2-xl") ds = prep_ds_for_generation( dict( tokenizer=tokenizer, context_window_size=1024, dataset_name="kilt_eli5", batch_size=1, # >> We set our own batch size elsewhere db_path=None, # None, random_seed=0, use_subset=False, subset_size=-1, use_helper_words=True, approach_type=approach_type, num_retrievals=5, # Will never change retrieval_temperature=1., retriever=None, # Cached retrievals don't need a retriever repeat=False, # Will never change split=_FLAG_SPLIT.value, enable_debug_checks=False, retrieval_bank_size=5, # Will never change dataset_type=_FLAG_DATASET_TYPE.value, tfr_prefix=_FLAG_TFR_PREFIX.value, qty_shuffle=1, # Will never change max_length_generation=350), tokenizer, _FLAG_SPLIT.value) ds = strategy.experimental_distribute_dataset(ds) ############################################################################## # Generate ############################################################################## LOGGER.debug("Generating.") generations = [] num_entries_in_split = ( task_specific.DATASET_CARDINALITIES["kilt_eli5"][_FLAG_SPLIT.value]) entries_counter = tqdm.tqdm(total=num_entries_in_split) for batch_no, batch in enumerate(ds): # Calling model.generate. We should make a config file with the # hyperparameters for generation, or make a facility in the one we already # have. I feel like a separate one would be better, separating concerns. output = strategy.run( model.generate, kwargs=dict( input_ids=batch, max_length=_FLAG_GENERATION_LENGTH_LIMIT.value, use_cache=True, attention_mask=tf.cast(batch != tokenizer.eos_token_id, tf.int32), repetition_penalty=2., num_beams=5, )) output = tf_utils.process_strat_output(strategy_outputs=output, current_batch_size=batch_size, strategy=strategy, name="generations") #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # Display the inputs and outputs. #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ rich_console = rich.console.Console(color_system="256") print_sample = make_print_sample() with utils.log_duration(LOGGER, "main", "all of tokenizer.decode for a batch."): for i in range(batch_size): input_text = tokenizer.decode(batch.numpy()[i]) output_text = tokenizer.decode(output.numpy()[i]) print("#" * 1000) print(f"Batch {batch_no} Generation {i}") print_sample(input_text, f"input batch_no {batch_no}", rich_console) print_sample(output_text, f"output batch_no {batch_no}", rich_console) generations.append(output_text) print("#" * 1000) entries_counter.update(batch.shape[0]) #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # Save the output to a JSON File. #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ utils.to_json_file( os.path.join(_FLAG_OUTPUT_PATH.value, _FLAG_SPLIT.value, _FLAG_APPROACH_TYPE.value, time.strftime("%Y%m%d-%H%M%S.json")), dict(flags={ flag.name: flag.value for flag in flags.FLAGS.flags_by_module_dict()[argv[0]] }, generations=generations)) logging.debug("Saved to: %s", _FLAG_OUTPUT_PATH.value)
__license__ = "???" __version__ = "1.0.0" __status__ = "Production" from utils import check_empty, check_equal, check_equal_re, check_equals, check_not_empty, check_return_code, print_header, view_output, print_warning, print_info import app # print_header("8 Warning Banners") # print_header("8.1 Set Warning Banner for Standard Login Services (Scored)") check_empty("diff %s/hardening/issue.net /etc/motd" % app.SYCO_VAR_PATH) check_empty("diff %s/hardening/issue.net /etc/issue" % app.SYCO_VAR_PATH) check_empty("diff %s/hardening/issue.net /etc/issue.net" % app.SYCO_VAR_PATH) check_equal('stat -c "%a %u %g" /etc/motd | egrep "644 0 0"', "644 0 0") check_equal('stat -c "%a %u %g" /etc/issue | egrep "644 0 0"', "644 0 0") check_equal('stat -c "%a %u %g" /etc/issue.net | egrep "644 0 0"', "644 0 0") # print_header("8.2 Remove OS Information from Login Warning Banners (Scored)") check_empty("egrep '(\\\\v|\\\\r|\\\\m|\\\\s)' /etc/issue") check_empty("egrep '(\\\\v|\\\\r|\\\\m|\\\\s)' /etc/motd") check_empty("egrep '(\\\\v|\\\\r|\\\\m|\\\\s)' /etc/issue.net") # print_header("8.3 Set GNOME Warning Banner (Not Scored)") print_info("Not using gnome.")