def count_file(self, path, verbose=False, add_eos=False):
    """Tokenize every line of a text file and accumulate symbol counts.

    Each line is passed to ``self.tokenize`` and the resulting symbols are
    added to ``self.counter``.

    Args:
        path: Path of the text file to read; it must exist.
        verbose: If true, print a progress line every 500000 lines.
        add_eos: Forwarded to ``self.tokenize`` for every line.

    Returns:
        A list with the tokenized symbols of each line, in file order.
    """
    if verbose:
        print('counting file {} ...'.format(path))
    assert exists(path)

    sents = []
    with open(path, 'r') as f:
        for idx, line in enumerate(f):
            if verbose and idx > 0 and idx % 500000 == 0:
                print(' line {}'.format(idx))
            # Pass the caller's flag through; it used to be hard-coded to
            # True, which made the ``add_eos`` parameter a no-op.
            symbols = self.tokenize(line, add_eos=add_eos)
            self.counter.update(symbols)
            sents.append(symbols)
    return sents
def create_vocabulary(vocabulary_path, data_path, max_vocabulary_size,
                      tokenizer=None, normalize_digits=True):
    """Build a frequency-ranked vocabulary file unless one already exists.

    The data file is read line by line (one sentence per line is assumed).
    Every line is tokenized, digits are optionally rewritten to ``0``, and
    token occurrences are counted. The vocabulary is ``_START_VOCAB``
    followed by the counted tokens in descending frequency, cut to at most
    ``max_vocabulary_size`` entries. It is written one token per line, so
    the token on the first line gets id 0, the second id 1, and so on.

    Args:
        vocabulary_path: Path where the vocabulary file is created.
        data_path: Data file used to build the vocabulary.
        max_vocabulary_size: Upper bound on the number of vocabulary entries.
        tokenizer: Callable applied to each line; ``basic_tokenizer`` is used
            when this is falsy.
        normalize_digits: If true, every digit is replaced by ``0``.
    """
    if gfile.exists(vocabulary_path):
        # An existing vocabulary file is never regenerated.
        return

    print("Creating vocabulary %s from data %s" % (vocabulary_path, data_path))
    token_counts = {}
    with gfile.GFile(data_path, mode="rb") as data_file:
        for line_number, raw_line in enumerate(data_file, start=1):
            if line_number % 100000 == 0:
                print(" processing line %d" % line_number)
            raw_line = tf.compat.as_bytes(raw_line)
            if tokenizer:
                tokens = tokenizer(raw_line)
            else:
                tokens = basic_tokenizer(raw_line)
            for token in tokens:
                if normalize_digits:
                    token = _DIGIT_RE.sub(b"0", token)
                token_counts[token] = token_counts.get(token, 0) + 1

    ranked_tokens = sorted(token_counts, key=token_counts.get, reverse=True)
    vocab_list = _START_VOCAB + ranked_tokens
    if len(vocab_list) > max_vocabulary_size:
        vocab_list = vocab_list[:max_vocabulary_size]

    with gfile.GFile(vocabulary_path, mode="wb") as vocab_file:
        for entry in vocab_list:
            # Entries may be str or bytes; the file is always written as bytes.
            encoded = entry.encode('utf-8') if isinstance(entry, str) else entry
            vocab_file.write(encoded + b"\n")
def append_json_object(self, json_obj):
    """Append a JSON-serializable object to the list stored in the json file.

    If the file at ``self._json_path`` exists, its content is parsed, the new
    object is appended, and the whole list is written back. Otherwise a new
    single-element list is written.

    Args:
        json_obj: The object to append.

    Raises:
        ValueError: If ``self._json_path`` is empty or None.
    """
    target = self._json_path
    if not target:
        raise ValueError('Attempting to write to a null json path')

    if not gfile.exists(target):
        records = [json_obj]
    else:
        with gfile.GFile(target) as existing_file:
            records = json.loads(existing_file.read())
        records.append(json_obj)

    # TODO(gdahl,gilmer): Should this be an atomic file?
    with gfile.GFile(target, 'w') as out_file:
        out_file.write(json.dumps(records))
def save_yaml(output_dir, config):
    """Write the config and meta yaml files into ``output_dir``.

    The directory is created first when it does not exist. The two files are
    produced by ``_save_config_yaml`` and ``_save_meta_yaml``, in that order.

    Args:
        output_dir: Directory that receives the yaml files.
        config: Config object handed to both helper writers.

    Returns:
        A ``(config_yaml_path, meta_yaml_path)`` tuple with the helpers'
        return values.
    """
    directory_missing = not gfile.exists(output_dir)
    if directory_missing:
        gfile.makedirs(output_dir)

    return (
        _save_config_yaml(output_dir, config),
        _save_meta_yaml(output_dir, config),
    )
def restore_state(output_dir):
    """Load training state from ``model.pkl`` in ``output_dir``.

    Args:
        output_dir: Directory expected to contain ``model.pkl``.

    Returns:
        A ``State``. When the pickle file does not exist, the state has
        ``None`` for step, optimizer state and model state, and a fresh
        ``trax_history.History()``. Otherwise it is populated from the
        unpickled ``(opt_state, step, history, model_state)`` tuple.
    """
    checkpoint_path = os.path.join(output_dir, "model.pkl")

    if gfile.exists(checkpoint_path):
        pickler = utils.get_pickle_module()
        with gfile.GFile(checkpoint_path, "rb") as checkpoint_file:
            loaded = pickler.load(checkpoint_file)
        opt_state, step, history, model_state = loaded
        log("Model loaded from %s at step %d" % (checkpoint_path, step))
        logging.debug("From loaded model : history = %s", history)
        return State(
            step=step,
            opt_state=OptState(*opt_state),
            history=history,
            model_state=model_state,
        )

    # Nothing saved yet: hand back an empty state.
    return State(
        step=None,
        opt_state=None,
        history=trax_history.History(),
        model_state=None,
    )
def make_concepts_targets_and_randoms(source_dir, number_of_images_per_folder,
                                      number_of_random_folders):
    """Fetch data and build target, concept and random image folders.

    Creates ``source_dir`` if needed, runs ``FetchDataAndModels.sh`` when
    either ``broden1_224/`` or ``inception5h`` is missing under it, then uses
    ``fetcher`` to populate ImageNet target classes, Broden texture concepts
    and random folders.

    Args:
        source_dir: Working directory that receives all downloaded data.
        number_of_images_per_folder: Image count requested for every folder.
        number_of_random_folders: Number of random experiments; one extra
            random folder is generated.
    """
    # Run script to download data to source_dir
    if not gfile.exists(source_dir):
        gfile.makedirs(source_dir)
    broden_missing = not gfile.exists(os.path.join(source_dir, 'broden1_224/'))
    if broden_missing or not gfile.exists(
            os.path.join(source_dir, 'inception5h')):
        subprocess.call(['bash', 'FetchDataAndModels.sh', source_dir])

    # Determine classes that we will fetch
    target_classes = ['zebra']
    texture_concepts = ['striped', 'dotted', 'zigzagged']

    # make targets from imagenet
    url_frame = fetcher.make_imagenet_dataframe("./imagenet_url_map.csv")
    for class_name in target_classes:
        fetcher.fetch_imagenet_class(
            source_dir, class_name, number_of_images_per_folder, url_frame)

    # Make concepts from broden
    broden_root = os.path.join(source_dir, 'broden1_224')
    for texture in texture_concepts:
        fetcher.download_texture_to_working_folder(
            broden_path=broden_root,
            saving_path=source_dir,
            texture_name=texture,
            number_of_images=number_of_images_per_folder)

    # Make random folders. If we want to run N random experiments with tcav,
    # we need N+1 folders.
    fetcher.generate_random_folders(
        working_directory=source_dir,
        random_folder_prefix="random500",
        number_of_random_folders=number_of_random_folders + 1,
        number_of_examples_per_folder=number_of_images_per_folder,
        imagenet_dataframe=url_frame)
def add_from_files(self, filestem, unbonded_right_tail_mass):
    """Add length distributions loaded from a family of files.

    Files are expected to be named
    ``{filestem}.{atom_a}.{bond_type}.{atom_b}`` where

    * atom_a, atom_b: atomic numbers for H, C, N, O, F. Pairs are generated
      by ``combinations_with_replacement`` over that ordering.
    * bond_type: integer value of the bond type enum (undefined/unbonded,
      single, double, triple).

    A file that does not exist is logged at info level and skipped. File
    contents are parsed by ``EmpiricalLengthDistribution.from_file``.

    Args:
        filestem: Prefix of the files to load.
        unbonded_right_tail_mass: ``right_tail_mass`` passed to ``from_file``
            for the undefined (unbonded) bond type; other bond types pass
            ``None``.
    """
    topology = dataset_pb2.BondTopology
    atom_types = [
        topology.ATOM_H,
        topology.ATOM_C,
        topology.ATOM_N,
        topology.ATOM_O,
        topology.ATOM_F,
    ]
    bond_types = [
        topology.BOND_UNDEFINED,
        topology.BOND_SINGLE,
        topology.BOND_DOUBLE,
        topology.BOND_TRIPLE,
    ]
    atomic_number = smu_utils_lib.ATOM_TYPE_TO_ATOMIC_NUMBER

    for atom_a, atom_b in itertools.combinations_with_replacement(
            atom_types, 2):
        for bond_type in bond_types:
            fname = '{}.{}.{}.{}'.format(
                filestem, atomic_number[atom_a], int(bond_type),
                atomic_number[atom_b])
            if not gfile.exists(fname):
                logging.info('Skipping non existent file %s', fname)
                continue

            # Only the unbonded case carries a right-tail mass.
            if bond_type == topology.BOND_UNDEFINED:
                tail_mass = unbonded_right_tail_mass
            else:
                tail_mass = None
            distribution = EmpiricalLengthDistribution.from_file(
                fname, tail_mass)
            self.add(atom_a, atom_b, bond_type, distribution)
def dump_object(object_to_dump, output_path):
    """Serialize an object with joblib and save it under ``gs://``.

    Args:
        object_to_dump: Python object to be serialized.
        output_path: (string) Destination path without scheme; ``gs://`` is
            prepended unconditionally.

    Returns:
        None
    """
    path = f"gs://{output_path}"
    if not gfile.exists(path):
        gfile.makedirs(os.path.dirname(path))
    # joblib emits a binary stream, so the handle is opened in binary mode
    # rather than text mode.
    with gfile.GFile(path, "wb") as wf:
        joblib.dump(object_to_dump, wf)
def main(argv):
    """Parse the gin config stored in the model directory and run the eval.

    Args:
        argv: Unused command-line arguments.

    Raises:
        ValueError: If ``config.gin`` is not present in ``FLAGS.base_dir``.
    """
    del argv
    base_dir = FLAGS.base_dir

    # Load gin.config settings stored in model directory. The file must
    # already exist: there is no waiting or retrying here, a missing file
    # fails immediately.
    gin_config_path = os.path.join(base_dir, 'config.gin')
    config_found = gfile.exists(gin_config_path)
    if not config_found:
        raise ValueError(f'Could not find config.gin in "{base_dir}"')

    gin.parse_config_file(gin_config_path, skip_unknown=True)
    gin.finalize()

    reranking_eval(base_dir)
def eval_agent(env_loop, task, eval_seed, increment_eval_seed,
               num_eval_episodes, loaded_ckpt, collapse_in_eval=True,
               stop_if_stuck=False, num_trained_episodes=None,
               total_steps=None, logdir=None, summary_writer=None,
               eval_id=''):
    """Evaluate success rate of a trained agent in an environment loop.

    Does nothing when ``num_eval_episodes`` is not positive, or when a
    ``<eval_path>_success.txt`` file for this evaluation already exists under
    ``logdir``. Otherwise ``env_loop.eval_policy`` is called repeatedly until
    it reports that the evaluation finished, and the resulting success rate
    is optionally written as TensorBoard scalars.

    Args:
        env_loop: Object exposing ``eval_policy``.
        task: Passed to ``format_eval_filename``.
        eval_seed: Seed passed to ``eval_policy`` and the filename formatter.
        increment_eval_seed: Passed to ``eval_policy`` as ``increment_seed``.
        num_eval_episodes: Number of episodes; also used as the number of
            videos to save.
        loaded_ckpt: Passed to ``format_eval_filename``.
        collapse_in_eval: Passed to ``eval_policy`` as ``collapse_policy``.
        stop_if_stuck: Passed to ``eval_policy``.
        num_trained_episodes: If not None, used as the step of the
            ``{eval_id}_success_rate`` scalar.
        total_steps: If not None, used as the step of the
            ``{eval_id}_success_rate_env_steps`` scalar.
        logdir: If not None, results go under ``<logdir>/eval/``.
        summary_writer: If not None, scalars are written with it as default.
        eval_id: Prefix for the scalar names.
    """
    if num_eval_episodes <= 0:
        return
    eval_path = None
    # The filename is computed whenever any output sink is configured, even
    # though only the logdir branch below consumes it.
    if logdir is not None or summary_writer is not None:
        eval_filename = format_eval_filename(task, eval_seed, num_eval_episodes, loaded_ckpt, collapse_in_eval, stop_if_stuck, eval_id)
    if logdir is not None:
        eval_path = os.path.join(logdir, 'eval', f'eval{eval_filename}')
        # A success file marks a completed evaluation; do not repeat it.
        if gfile.exists(eval_path + '_success.txt'):
            print('Evaluation', eval_path, 'already exists; skipping')
            return
        print('Writing evaluation to', eval_path)
    finished_eval = False
    # Keep calling eval_policy until it reports completion.
    while not finished_eval:
        success_rate, finished_eval = env_loop.eval_policy(
            num_episodes=num_eval_episodes, collapse_policy=collapse_in_eval, eval_path=eval_path, num_videos_to_save=num_eval_episodes, seed=eval_seed, increment_seed=increment_eval_seed, stop_if_stuck=stop_if_stuck)
    # NOTE(review): the source this was recovered from had lost its
    # indentation; the summary write is assumed to run once, after the loop
    # finishes. Confirm against version control.
    if summary_writer is not None:
        with summary_writer.as_default():
            if num_trained_episodes is not None:
                tf.summary.scalar(f'{eval_id}_success_rate', success_rate, step=num_trained_episodes)
            if total_steps is not None:
                tf.summary.scalar(f'{eval_id}_success_rate_env_steps', success_rate, step=total_steps)
def _input_generator(self):
    """Yield the lines of ``self.input_file`` without trailing newlines.

    Files whose name ends in ``.bz2`` are read through ``bz2.BZ2File`` and
    each line is decoded as UTF-8; every other file is read as text.

    Because this is a generator, the existence check runs when the first
    item is requested, not when the generator is created.

    Yields:
        One line of the input file as ``str``, with trailing ``\\n`` removed.

    Raises:
        FileNotFoundError: If ``self.input_file`` does not exist.
    """
    if not gfile.exists(self.input_file):
        # Name the missing file so the failure can be acted on.
        raise FileNotFoundError(
            'Input file not found: {}'.format(self.input_file))

    if self.input_file.endswith('.bz2'):
        logging.info('Opening %s as bzip2 file', self.input_file)
        with gfile.GFile(self.input_file, 'rb') as compressed_f:
            with bz2.BZ2File(compressed_f, 'rb') as f:
                for bin_line in f:
                    yield bin_line.decode('utf-8').rstrip('\n')
    else:
        logging.info('Opening %s via gfile', self.input_file)
        with gfile.GFile(self.input_file, 'r') as f:
            for line in f:
                yield line.rstrip('\n')
def __init__(self, forward_pass=None, use_pmap=True, save_every=1,
             metrics_logger=None):
    """Set up a debugger for a model's forward and backward pass.

    Args:
        forward_pass: A function mapping batch to a dict of intermediate
            values, or None.
        use_pmap: Whether computations are meant to be pmapped. When true,
            the norm function and any given ``forward_pass`` are wrapped
            with ``pmap_then_unreplicate``.
        save_every: Stored as ``self._save_every``; intended as the saving
            period in steps.
        metrics_logger: utils.MetricsLogger object or None. When given, it
            must have a json path, and metrics previously stored under its
            pytree path are reloaded.

    Raises:
        ValueError: If ``metrics_logger`` is given without a json path.
    """
    if metrics_logger and (metrics_logger._json_path is None):
        raise ValueError(
            'To use the ModelDebugger with a metrics_logger, a json'
            ' path must be specified when building metrics_logger')

    self._save_every = save_every
    self._metrics_logger = metrics_logger
    self._use_pmap = use_pmap

    # In both the pmap case and non-pmap case, _tree_norm_fn_sql2 returns
    # unreplicated results on the host cpu.
    if not use_pmap:
        self._tree_norm_fn_sql2 = tree_norm_sql2
        self.forward_pass = forward_pass
    else:
        self._tree_norm_fn_sql2 = pmap_then_unreplicate(tree_norm_sql2)
        if forward_pass is None:
            self.forward_pass = None
        else:
            self.forward_pass = pmap_then_unreplicate(forward_pass)

    self._stored_metrics = {}
    # In the case of preemption we want to restore prior metrics.
    if metrics_logger:
        previous_metrics = os.path.join(
            metrics_logger._pytree_path, 'training_metrics')
        if gfile.exists(previous_metrics):
            self._stored_metrics = load_pytree(previous_metrics)
def set_eval_paths(ckpt_dir, ckpt, custom_eval_id):
    """Set paths for evaluation and TensorBoard summaries.

    Args:
        ckpt_dir: Directory holding the checkpoint and best-epoch files.
        ckpt: Checkpoint name, or None.
        custom_eval_id: Passed to ``set_eval_path`` and ``set_summary_key``.

    Returns:
        An ``(eval_path, summary_key, best_epoch)`` tuple. All three start
        out as None.
    """
    eval_path, summary_key, best_epoch = None, None, None
    # NOTE(review): the source this was recovered from had lost its
    # indentation; everything up to the return is assumed to belong to this
    # branch, matching the None defaults above. Confirm against version
    # control.
    if ckpt is not None:
        # Derive the best-epoch file name from the checkpoint name by
        # substituting the 'ckpt' markers.
        best_epoch_path = (ckpt.replace('ckpt_best_of_', 'best_epoch_of_').replace(
            'ckpt', 'best_epoch'))
        best_epoch_path = os.path.join(ckpt_dir, best_epoch_path)
        if gfile.exists(best_epoch_path):
            with gfile.GFile(best_epoch_path) as f:
                best_epoch = int(f.read())
        else:
            # No best-epoch file: parse the epoch out of the checkpoint name.
            best_epoch = int(ckpt.replace('ckpt_', ''))
        print('best epoch:', best_epoch)
        eval_path = set_eval_path(ckpt_dir, custom_eval_id, ckpt)
        summary_key = set_summary_key(custom_eval_id, ckpt)
    return eval_path, summary_key, best_epoch
def _get_tfrecords(self, name):
    """Resolve the TFRecord files for the dataset called ``name``.

    ``self.params.data_dir`` is a ``:``-separated list of candidate roots;
    the first one that contains ``name`` is used. Each comma-separated entry
    of ``self.params.data_pattern`` is joined onto that location and the
    resulting patterns are globbed.

    Args:
        name: Entry expected below one of the data directories.

    Returns:
        The list of files matched by the glob patterns.

    Raises:
        AssertionError: If no candidate root contains ``name``.
        IOError: If the patterns match no files.
    """
    candidate_roots = self.params.data_dir.split(':')
    data_dir = next(
        (root for root in candidate_roots if gfile.exists(join(root, name))),
        None)
    assert data_dir is not None, "data_dir not found"

    patterns = [
        join(data_dir, name, pattern)
        for pattern in self.params.data_pattern.split(',')
    ]
    files = gfile.glob(patterns)
    if not files:
        raise IOError("Unable to find files. data_pattern='{}'.".format(
            self.params.data_pattern))

    logging.info("Number of TFRecord files: {}.".format(len(files)))
    return files
def load_trainer_state(output_dir):
    """Return a ``TrainerState`` loaded from ``model.pkl`` in ``output_dir``.

    Args:
        output_dir: Directory expected to contain ``model.pkl``.

    Returns:
        A ``TrainerState``. When the pickle file does not exist, it carries
        ``None`` for step, optimizer state and model state, and a fresh
        ``trax_history.History()``. Otherwise it is built from the unpickled
        ``(opt_state, step, history, model_state)`` tuple.
    """
    pickle_path = os.path.join(output_dir, 'model.pkl')

    if gfile.exists(pickle_path):
        pickler = utils.get_pickle_module()
        with gfile.GFile(pickle_path, 'rb') as pickle_file:
            saved = pickler.load(pickle_file)
        opt_state, step, history, model_state = saved
        log('Model loaded from %s at step %d' % (pickle_path, step))
        logging.debug('From loaded model : history = %s', history)
        return TrainerState(
            step=step,
            opt_state=OptState(*opt_state),
            history=history,
            model_state=model_state,
        )

    # No checkpoint on disk yet.
    return TrainerState(
        step=None,
        opt_state=None,
        history=trax_history.History(),
        model_state=None,
    )
def dump_trajectories(self, force=False):
    """Write buffered trajectories to the shard file of the current epoch.

    Should be called at most once per epoch. Does nothing when
    ``self.trajectory_dump_dir`` is None. A shard is written when the buffer
    has reached ``self._trajectory_dump_min_count_per_shard`` entries or
    when ``force`` is set; the buffer is emptied afterwards.

    Args:
        force: (bool) Whether to complete unfinished trajectories and create
            a new shard even if we have not reached the minimum size.
    """
    pkl_module = utils.get_pickle_module()
    dump_dir = self.trajectory_dump_dir
    if dump_dir is None:
        return
    gfile.makedirs(dump_dir)

    env_trajectories = self.train_env.trajectories
    if force:
        env_trajectories.complete_all_trajectories()

    # complete_all_trajectories() also adds trajectories that were just
    # reset. We don't want them since they have just the initial observation
    # and no actions, so we filter them out.
    self._trajectory_buffer.extend(
        trajectory
        for trajectory in env_trajectories.completed_trajectories
        if trajectory.time_steps
        and trajectory.time_steps[0].action is not None)
    env_trajectories.clear_completed_trajectories()

    enough_buffered = (len(self._trajectory_buffer) >=
                       self._trajectory_dump_min_count_per_shard)
    if not (enough_buffered or force):
        return

    shard_path = os.path.join(dump_dir, "{}.pkl".format(self.epoch))
    if gfile.exists(shard_path):
        # Since we do an extra dump at the end of the training loop, we
        # sometimes dump 2 times in the same epoch. When this happens, merge
        # the two sets of trajectories.
        with gfile.GFile(shard_path, "rb") as shard_file:
            earlier = pkl_module.load(shard_file)
        self._trajectory_buffer = earlier + self._trajectory_buffer
    with gfile.GFile(shard_path, "wb") as shard_file:
        pkl_module.dump(self._trajectory_buffer, shard_file)
    self._trajectory_buffer = []
def gen_csv_from_annotations(
        input_dir: str,
        output_file=constants.DEFAULT_CSV_FILENAME,
        out_path_prefix='',
        dataset_type=constants.DEFAULT_DATASET_TYPE):
    """Generates AutoML dataset CSV from annotation files.

    Every entry of ``input_dir`` is parsed with ``annotation.read`` and one
    CSV row is written per bounding box.

    Args:
        input_dir: Directory of annotation files. A leading ``~`` is
            expanded.
        output_file: Output CSV filename. A leading ``~`` is expanded.
        out_path_prefix: Filepath prefix to prepend to the image files.
            e.g.
            src_image_filename = '/tmp/path/to/image.jpg'
            out_path_prefix = 'gs://bucket/images'
            output_image_filename = 'gs://bucket/images/image.jpg'
        dataset_type: Dataset type (TRAIN, VAL, TEST, UNSPECIFIED) to use
            for all the parsed images.

    Raises:
        ValueError: If the input directory does not exist.
    """
    # Expand once and use the same path for the existence check, the
    # listing and the per-file paths, so '~/...' inputs behave consistently.
    input_dir = os.path.expanduser(input_dir)
    if not gfile.exists(input_dir):
        raise ValueError('Input directory not found.')

    with gfile.GFile(os.path.expanduser(output_file), 'w') as outf:
        writer = csv.writer(outf, delimiter=',')
        for filename in gfile.listdir(input_dir):
            filepath = os.path.join(input_dir, filename)
            image_filename, boxes = annotation.read(filepath)
            out_image_filename = os.path.join(out_path_prefix, image_filename)
            for b in boxes:
                # The empty columns are placeholders between the two corner
                # coordinates that are actually provided.
                row = [
                    dataset_type,
                    out_image_filename,
                    b.label,
                    b.xmin,
                    b.ymin,
                    '',
                    '',
                    b.xmax,
                    b.ymax,
                    '',
                    '',
                ]
                writer.writerow(row)
def write_eval_results(checkpoint_dir, all_gen_sentences, checkpoint_name,
                       mean_train_prob, mean_valid_prob, mean_gen_prob, fid):
    """Record one evaluation row and the generated sentences on disk.

    The row ``checkpoint_name,mean_train_prob,mean_valid_prob,mean_gen_prob,
    fid`` is added to the end of ``EVAL_FILENAME`` in ``checkpoint_dir`` by
    reading any existing content and rewriting the file. The sentences go to
    ``<checkpoint_name>_sentences.txt``, one per line.

    Args:
        checkpoint_dir: Directory receiving both files.
        all_gen_sentences: Iterable of strings to write.
        checkpoint_name: Name of the evaluated checkpoint.
        mean_train_prob: Value for the second column.
        mean_valid_prob: Value for the third column.
        mean_gen_prob: Value for the fourth column.
        fid: Value for the fifth column.
    """
    columns = (checkpoint_name, mean_train_prob, mean_valid_prob,
               mean_gen_prob, fid)
    new_row = ",".join(str(value) for value in columns)

    eval_filepath = os.path.join(checkpoint_dir, EVAL_FILENAME)
    if gfile.exists(eval_filepath):
        with gfile.GFile(eval_filepath, "r") as existing:
            earlier_rows = existing.read()
    else:
        earlier_rows = ""
    with gfile.GFile(eval_filepath, "w") as eval_file:
        eval_file.write(earlier_rows + new_row + "\n")

    sentences_path = os.path.join(
        checkpoint_dir, checkpoint_name + "_sentences.txt")
    with gfile.GFile(sentences_path, "w") as sentences_file:
        sentences_file.write("\n".join(all_gen_sentences))
def prepare_dirs(recreate=False):
    """Make sure the experiment, tensorboard and checkpoint dirs exist.

    Args:
        recreate: When True, directories left over from a previous execution
            are removed before being created again. When False, existing
            directories are kept and only missing ones are created.
    """
    dirs = {
        "experiment_dir": environment.EXPERIMENT_DIR,
        "tensorboard_dir": environment.TENSORBOARD_DIR,
        "checkpoints_dir": environment.CHECKPOINTS_DIR,
    }

    if recreate:
        template = """ Delete and recreate these dirs: experiment_dir: {experiment_dir} tensorboard_dir: {tensorboard_dir} checkpoints_dir: {checkpoints_dir} """
    else:
        template = """ Create these dirs if the dirs dont exist: experiment_dir: {experiment_dir} tensorboard_dir: {tensorboard_dir} checkpoints_dir: {checkpoints_dir} """
    print(template.format(**dirs))

    if recreate:
        for directory in dirs.values():
            if gfile.exists(directory):
                gfile.rmtree(directory)

    for directory in dirs.values():
        if not gfile.exists(directory):
            gfile.makedirs(directory)
def save_checkpoint(optimizer, model_state, directory, epoch):
    """Save the optimizer, model state and epoch as a checkpoint.

    A file named ``checkpoint_<epoch>`` already present in ``directory`` is
    removed before saving.

    Args:
        optimizer: The optimizer containing the model that we are training.
        model_state: Current state associated with the model.
        directory: Directory where the checkpoints should be saved.
        epoch: Number of epochs the model has been trained for.
    """
    train_state = {
        'optimizer': optimizer,
        'model_state': model_state,
        'epoch': epoch,
    }
    existing_path = os.path.join(directory, 'checkpoint_' + str(epoch))
    if gfile.exists(existing_path):
        gfile.remove(existing_path)
    checkpoints.save_checkpoint(directory, train_state, epoch, keep=2)
def _load_backgrounds(self, backgrounds_dir):
    """Load and preprocess every background found in a directory.

    Args:
        backgrounds_dir: Path to the directory containing backgrounds; every
            entry matching ``<backgrounds_dir>/*`` is loaded.

    Produces:
        self.bgs: the loaded backgrounds after
            ``self._preprocess_background`` has been applied to each.
        self.num_bgs: int, number of backgrounds.

    Raises:
        ValueError: If ``backgrounds_dir`` does not exist.
    """
    directory_present = gfile.exists(backgrounds_dir)
    if not directory_present:
        raise ValueError(
            f'Backgrounds directory {backgrounds_dir} does not exist.')

    pattern = path.join(backgrounds_dir, '*')
    background_files = gfile.glob(pattern)
    pool = self._thread_pool
    self.bgs = pool.map(load_image, background_files)
    self.bgs = pool.map(self._preprocess_background, self.bgs)
    self.num_bgs = len(self.bgs)
    print('Backgrounds loaded.')
def _load_yaml(blueoil_config_filename):
    """Load a blueoil config yaml file.

    The file's base name without extension is stored in the returned dict
    under the ``"model_name"`` key.

    Args:
        blueoil_config_filename (str): File path of blueoil config yaml file.

    Returns:
        dict: blueoil config.

    Raises:
        FileNotFoundError: If the config file does not exist.
    """
    if not gfile.exists(blueoil_config_filename):
        # The exception used to be constructed but never raised, so a
        # missing file only failed later inside GFile.
        raise FileNotFoundError(
            "File not found: {}".format(blueoil_config_filename))

    with gfile.GFile(blueoil_config_filename, "r") as f:
        blueoil_config = yaml.load(f, Loader=yaml.SafeLoader)

    model_name, _ = os.path.splitext(os.path.basename(blueoil_config_filename))
    blueoil_config["model_name"] = model_name
    return blueoil_config
def restore_checkpoint(ckpt_dir, target, step=None, prefix='checkpoint_'):
    """Restore last/best checkpoint from checkpoints in path.

    Sorts the checkpoint files naturally, returning the highest-valued
    file, e.g.:
      ckpt_1, ckpt_2, ckpt_3 --> ckpt_3
      ckpt_0.01, ckpt_0.1, ckpt_0.001 --> ckpt_0.1
      ckpt_-1.0, ckpt_1.0, ckpt_1e5 --> ckpt_1e5

    Args:
        ckpt_dir: str: directory of checkpoints to restore from.
        target: matching object to rebuild via deserialized state-dict. If
            None, the deserialized state-dict is returned as-is.
        step: int: step number to load or None to load latest. Step ``0`` is
            a valid explicit step.
        prefix: str: name prefix of checkpoint files.

    Returns:
        Restored `target` updated from checkpoint file, or if no step
        specified and no checkpoint files present, returns the passed-in
        `target` unchanged.

    Raises:
        ValueError: If ``step`` is given and no matching checkpoint exists.
    """
    # Compare against None rather than relying on truthiness, otherwise an
    # explicit step of 0 would silently load the latest checkpoint instead.
    if step is not None:
        ckpt_path = _checkpoint_path(ckpt_dir, step, prefix)
        if not gfile.exists(ckpt_path):
            raise ValueError(f'Matching checkpoint not found: {ckpt_path}')
    else:
        glob_path = os.path.join(ckpt_dir, f'{prefix}*')
        checkpoint_files = natural_sort(gfile.glob(glob_path))
        # The temporary checkpoint path is never a restore candidate.
        ckpt_tmp_path = _checkpoint_path(ckpt_dir, 'tmp', prefix)
        checkpoint_files = [f for f in checkpoint_files if f != ckpt_tmp_path]
        if not checkpoint_files:
            return target
        ckpt_path = checkpoint_files[-1]

    logging.info('Restoring checkpoint from %s', ckpt_path)
    with gfile.GFile(ckpt_path, 'rb') as fp:
        if target is None:
            return serialization.msgpack_restore(fp.read())
        else:
            return serialization.from_bytes(target, fp.read())
def encode_file(self, path, ordered=False, verbose=False,
                add_double_eos=False):
    """Tokenize a text file and convert every line to an array.

    Each line is tokenized with ``add_eos=True`` and the given
    ``add_double_eos``, then converted with ``self.convert_to_nparray``.

    Args:
        path: Path of the text file to read; it must exist.
        ordered: If true, the per-line arrays are concatenated into one.
        verbose: If true, print progress messages.
        add_double_eos: Forwarded to ``self.tokenize``.

    Returns:
        A list with one array per line, or a single concatenated array when
        ``ordered`` is true.
    """
    if verbose:
        print('encoding file {} ...'.format(path))
    assert exists(path)

    per_line_arrays = []
    with open(path, 'r') as handle:
        for line_no, text in enumerate(handle):
            report_progress = (
                verbose and line_no > 0 and line_no % 500000 == 0)
            if report_progress:
                print(' line {}'.format(line_no))
            tokens = self.tokenize(
                text, add_eos=True, add_double_eos=add_double_eos)
            per_line_arrays.append(self.convert_to_nparray(tokens))

    if not ordered:
        return per_line_arrays
    return np.concatenate(per_line_arrays)
def get_lm_corpus(data_dir, dataset):
    """Return the corpus for ``dataset``, using a pickle cache in ``data_dir``.

    When ``cache.pkl`` exists in ``data_dir`` it is unpickled and returned.
    Otherwise a ``Corpus`` is built with dataset-specific options, pickled to
    ``cache.pkl``, and its vocabulary size, cutoffs and dataset name are
    written to ``corpus-info.json``.

    Args:
        data_dir: Directory holding the data, the cache and the info file.
        dataset: Dataset name; selects the options passed to ``Corpus``.

    Returns:
        The loaded or newly built corpus object.
    """
    cache_path = os.path.join(data_dir, "cache.pkl")

    if exists(cache_path):
        print("Loading cached dataset...")
        with open(cache_path, "rb") as cache_file:
            return pickle.load(cache_file)

    print("Producing dataset...")
    if dataset in ["wt103", "wt2"]:
        corpus_options = {"special": ["<eos>"], "lower_case": False}
    elif dataset == "ptb":
        corpus_options = {"special": ["<eos>"], "lower_case": True}
    elif dataset == "lm1b":
        corpus_options = {
            "special": [],
            "lower_case": False,
            "vocab_file": os.path.join(data_dir, "1b_word_vocab.txt"),
        }
    else:
        # enwik8, text8 and any other name use the Corpus defaults.
        corpus_options = {}

    corpus = Corpus(data_dir, dataset, **corpus_options)

    print("Saving dataset...")
    with open(cache_path, "wb") as cache_file:
        pickle.dump(corpus, cache_file, protocol=2)

    summary = {
        "vocab_size": len(corpus.vocab),
        "cutoffs": corpus.cutoffs,
        "dataset": corpus.dataset,
    }
    info_path = os.path.join(data_dir, "corpus-info.json")
    with open(info_path, "w") as info_file:
        json.dump(summary, info_file)

    return corpus
def predict(self, trained_model_chckpt: str = None):
    """Run inference on every test dataset.

    If ``trained_model_chckpt`` is given and exists, it is loaded first. The
    eval batch size is then set to 128 and the GLUE output format is
    enabled before ``self._predict`` is called for each entry of
    ``self.test_datasets_list``.

    Args:
        trained_model_chckpt: Optional path of a checkpoint to load.
    """
    # Load a trained checkpoint if a valid model checkpoint
    checkpoint_usable = bool(trained_model_chckpt) and gfile.exists(
        trained_model_chckpt)
    if checkpoint_usable:
        logger.info(
            f"Running predictions using: {trained_model_chckpt}. This may take 3 minutes."
        )
        self.load(trained_model_chckpt)
        logger.info("Checkpoint loaded.")

    self.config.batch_size_eval = 128
    self.config.use_glue_format = True

    # test eval
    for position, dataset_name in enumerate(self.test_datasets_list):
        # The prefix is the part of the dataset name before the first "_".
        dataset_prefix = dataset_name.partition("_")[0]
        outcome = self._predict(
            position, dataset_prefix, dataset_name, eval_type='test')
        if not outcome:
            logger.info(f"Data not found for {dataset_name}.")
        else:
            logger.info(f"[new test scores saved for {dataset_name}.]")
def _load_foregrounds(self, foregrounds_dir):
    """Load every foreground image and index it by object class.

    Args:
        foregrounds_dir: Path to the directory containing foregrounds, laid
            out as ``<foregrounds_dir>/<object_class>/<file_name>``.

    Produces:
        self.fg_classes: sorted list of the distinct foreground class names,
            e.g. ['ambulance', 'bagel', ...].
        self.num_fgs_per_class: dict mapping a class name to the number of
            files globbed in that class directory.
        self.num_fgs_per_class_list: those counts, ordered like
            ``self.fg_classes``.
        self.fgs: list of the loaded foreground images.
        self.fgs_dict: dict mapping a class name to its loaded images.

    Raises:
        ValueError: If ``foregrounds_dir`` does not exist.
    """
    if not gfile.exists(foregrounds_dir):
        raise ValueError(
            f'Foregrounds directory {foregrounds_dir} does not exist.')

    image_paths = gfile.glob(path.join(foregrounds_dir, '*/*'))
    # The class label is the name of the parent directory, e.g. 'car', 'cow'.
    labels = [image_path.split('/')[-2] for image_path in image_paths]
    self.fg_classes = sorted(set(labels))

    self.num_fgs_per_class = {}
    for class_name in self.fg_classes:
        class_pattern = path.join(foregrounds_dir, class_name, '*')
        self.num_fgs_per_class[class_name] = len(gfile.glob(class_pattern))
    self.num_fgs_per_class_list = [
        self.num_fgs_per_class[class_name] for class_name in self.fg_classes
    ]

    self.fgs = self._thread_pool.map(load_image, image_paths)
    self.fgs_dict = {class_name: [] for class_name in self.fg_classes}
    for label, image in zip(labels, self.fgs):
        self.fgs_dict[label].append(image)
    print('Foregrounds loaded.')
def main(unused_argv):
    """Convert the language-model corpus into TFRecord files.

    In test mode (``FLAGS.per_host_test_bsz > 0``) only the test split is
    converted. Otherwise the train and valid splits are converted, skipping
    any split whose batch size is not positive.

    Args:
        unused_argv: Unused command-line arguments.
    """
    del unused_argv  # Unused

    corpus = get_lm_corpus(FLAGS.data_dir, FLAGS.dataset)

    save_dir = os.path.join(FLAGS.data_dir, "tfrecords")
    if not exists(save_dir):
        makedirs(save_dir)

    # test mode
    if FLAGS.per_host_test_bsz > 0:
        corpus.convert_to_tfrecords(
            "test", save_dir, FLAGS.per_host_test_bsz, FLAGS.tgt_len,
            FLAGS.num_core_per_host, FLAGS=FLAGS)
        return

    split_batch_sizes = (
        ("train", FLAGS.per_host_train_bsz),
        ("valid", FLAGS.per_host_valid_bsz),
    )
    for split, batch_size in split_batch_sizes:
        if batch_size > 0:
            print("Converting {} set...".format(split))
            corpus.convert_to_tfrecords(
                split, save_dir, batch_size, FLAGS.tgt_len,
                FLAGS.num_core_per_host, FLAGS=FLAGS)
def main(argv):
    """Create the model directory if needed and start training.

    Args:
        argv: Command-line arguments; more than one entry is an error.

    Raises:
        app.UsageError: If extra command-line arguments are present.
    """
    if len(argv) > 1:
        raise app.UsageError('Too many command-line arguments.')

    # TODO(mohitreddy): Change to flags.mark_flag_as_required('model_dir').
    assert FLAGS.model_dir is not None, 'Please provide model_dir.'
    if not gfile.exists(FLAGS.model_dir):
        gfile.makedirs(FLAGS.model_dir)

    # Every training option is forwarded from the flag of the same name.
    forwarded_flags = (
        'seed',
        'model_dir',
        'num_epochs',
        'batch_size',
        'embedding_size',
        'hidden_size',
        'min_freq',
        'max_seq_len',
        'dropout',
        'emb_dropout',
        'word_dropout_rate',
        'learning_rate',
        'checkpoints_to_keep',
        'l2_reg',
    )
    options = {name: getattr(FLAGS, name) for name in forwarded_flags}
    train_and_evaluate(**options)
def test_whole_pipeline(self):
    """Runs the full pipeline on the test data and checks its outputs.

    The pipeline is executed with flags pointing at the files in
    TESTDATA_PATH and a single output shard. The test then checks Beam
    counters, the existence of the per-stage error/mismatch files, and
    selected contents of the conflicts, stats, smiles-compare, bond topology
    summary and bond length CSVs as well as the standard and complete
    TFRecord outputs.
    """
    test_subdirectory = self.create_tempdir()
    output_stem = os.path.join(test_subdirectory, 'testout')
    input_stage1_dat_glob = os.path.join(TESTDATA_PATH, 'pipeline_input_stage1.dat')
    input_stage2_dat_glob = os.path.join(TESTDATA_PATH, 'pipeline_input_stage2.dat')
    input_equivalent_glob = os.path.join(TESTDATA_PATH, 'pipeline_equivalent.dat')
    input_bond_topology_csv = os.path.join(TESTDATA_PATH, 'pipeline_bond_topology.csv')
    with flagsaver.flagsaver(
        input_stage1_dat_glob=input_stage1_dat_glob, input_stage2_dat_glob=input_stage2_dat_glob, input_equivalent_glob=input_equivalent_glob, input_bond_topology_csv=input_bond_topology_csv, output_stem=output_stem, output_shards=1):
        # If you have custom beam options, add them here.
        beam_options = None
        with beam.Pipeline(beam_options) as root:
            pipeline.pipeline(root)

    # NOTE(review): the source this was recovered from had lost its
    # indentation; the metrics query is assumed to run after the pipeline
    # context has exited. Confirm against version control.
    metrics = root.result.metrics().query()
    counters_dict = { m.key.metric.name: m.committed for m in metrics['counters'] }
    self.assertEqual(counters_dict['attempted_topology_matches'], 3)
    # Conformer 620517 will not match because bond lengths are not extracted
    # from conformers with serious errors like this.
    self.assertEqual(counters_dict['no_topology_matches'], 1)
    self.assertNotIn('topology_match_smiles_failure', counters_dict)

    logging.info(
        'Files in output: %s', '\n'.join(gfile.glob(os.path.join(test_subdirectory, '*'))))

    # Each stage must have produced all four diagnostic files.
    for stage in ['stage1', 'stage2']:
        self.assertTrue(
            gfile.exists(output_stem + '_' + stage + '_original_known_error-00000-of-00001.dat'))
        self.assertTrue(
            gfile.exists(output_stem + '_' + stage + '_original_unknown_error-00000-of-00001.dat'))
        self.assertTrue(
            gfile.exists(output_stem + '_' + stage + '_mismatched_original-00000-of-00001.dat'))
        self.assertTrue(
            gfile.exists(output_stem + '_' + stage + '_mismatched_regen-00000-of-00001.dat'))

    # Check the merge conflicts file
    with gfile.GFile(output_stem + '_conflicts-00000-of-00001.csv') as f:
        conflicts_lines = f.readlines()
    self.assertIn('conformer_id,', conflicts_lines[0])
    self.assertEqual(
        conflicts_lines[1], '618451001,1,1,1,1,' '-406.51179,9.999999,-406.522079,9.999999,True,True,' '-406.51179,0.052254,-406.522079,2.5e-05,True,True\n')

    # Check a couple of the stats.
    with gfile.GFile(output_stem + '_stats-00000-of-00001.csv') as f:
        stats_lines = f.readlines()
    self.assertIn('errors.status,0,2\n', stats_lines)
    self.assertIn('errors.warn_t1,0,4\n', stats_lines)
    self.assertIn('fate,FATE_SUCCESS,2\n', stats_lines)
    self.assertIn('fate,FATE_DUPLICATE_DIFFERENT_TOPOLOGY,1\n', stats_lines)
    self.assertIn('num_initial_geometries,1,4\n', stats_lines)
    self.assertIn('num_duplicates,1,1\n', stats_lines)
    self.assertIn('zero_field,single_point_energy_pbe0d3_6_311gd,1\n', stats_lines)

    # Check the smiles comparison output
    with gfile.GFile(output_stem + '_smiles_compare-00000-of-00001.csv') as f:
        smiles_lines = f.readlines()
    self.assertIn(
        '620517002,MISMATCH,NotAValidSmilesString,' '[H]C1=C2OC2=C(F)O1,FC1=C2OC2=CO1\n', smiles_lines)
    # Make sure that a bond topology with a matching smiles doesn't show
    for line in smiles_lines:
        self.assertNotIn('618451001', line)

    # Check the bond topology summary
    with gfile.GFile(output_stem + '_bt_summary-00000-of-00001.csv') as f:
        bt_summary_lines = f.readlines()
    # Check part of the header line
    self.assertIn('bt_id', bt_summary_lines[0])
    self.assertIn('count_attempted_conformers', bt_summary_lines[0])
    # This is the bond topology that has no conformer
    self.assertIn('10,0,0,0,0,0,0,0,0,0,0,0,0,0\n', bt_summary_lines)
    # This is a bond topology with 1 conformer
    self.assertIn('620517,1,0,0,0,1,0,1,0,0,0,0,0,0\n', bt_summary_lines)
    # This is a bond topology with 2 conformers
    self.assertIn('618451,2,0,0,0,2,0,0,0,2,0,0,0,0\n', bt_summary_lines)

    # Check the bond lengths file
    with gfile.GFile(output_stem + '_bond_lengths.csv') as f:
        bond_length_lines = f.readlines()
    self.assertEqual(
        'atom_char_0,atom_char_1,bond_type,length_str,count\n', bond_length_lines[0])
    self.assertIn('c,c,2,1.336,1\n', bond_length_lines)
    self.assertIn('c,o,1,1.422,2\n', bond_length_lines)

    # For the gzip files below, we check >100 because even an empty gzip file
    # has non-zero length. 100 is kind of arbitrary to be bigger than the
    # expected header of 20.
    # NOTE(review): no gzip size check follows in this test as it stands;
    # the comment above looks stale. Confirm before relying on it.

    # Check that the generated TFRecord files contain some expected outputs
    standard_dataset = tf.data.TFRecordDataset(
        output_stem + '_standard_tfrecord-00000-of-00001')
    standard_output = [ dataset_pb2.Conformer.FromString(raw) for raw in standard_dataset.as_numpy_iterator() ]
    self.assertCountEqual([c.conformer_id for c in standard_output], [618451001, 618451123])
    # Check that fields are filtered the way we expect
    self.assertFalse(
        standard_output[0].properties.HasField('compute_cluster_info'))
    self.assertFalse(
        standard_output[0].properties.HasField('homo_pbe0_aug_pc_1'))
    self.assertTrue(
        standard_output[0].properties.HasField('rotational_constants'))

    complete_dataset = tf.data.TFRecordDataset(
        output_stem + '_complete_tfrecord-00000-of-00001')
    complete_output = [ dataset_pb2.Conformer.FromString(raw) for raw in complete_dataset.as_numpy_iterator() ]
    self.assertCountEqual([c.conformer_id for c in complete_output], [618451001, 618451123, 620517002, 79593005])
    # Check that fields are filtered the way we expect
    # The DirectRunner randomizes the order of output so we need to make sure
    # that we get a full record.
    complete_entry = [ c for c in complete_output if c.conformer_id == 618451001 ][0]
    self.assertFalse(
        complete_entry.properties.HasField('compute_cluster_info'))
    self.assertTrue(
        complete_entry.properties.HasField('homo_pbe0_aug_pc_1'))
    self.assertTrue(
        complete_entry.properties.HasField('rotational_constants'))

    # This conformer carries the deliberately invalid smiles string.
    complete_entry_for_smiles = [ c for c in complete_output if c.conformer_id == 620517002 ][0]
    self.assertEqual(complete_entry_for_smiles.properties.smiles_openbabel, 'NotAValidSmilesString')