Example #1
    def count_file(self, path, verbose=False, add_eos=False):
        if verbose: print('counting file {} ...'.format(path))
        assert exists(path)

        sents = []
        with open(path, 'r') as f:
            for idx, line in enumerate(f):
                if verbose and idx > 0 and idx % 500000 == 0:
                    print('  line {}'.format(idx))
                symbols = self.tokenize(line, add_eos=add_eos)
                self.counter.update(symbols)
                sents.append(symbols)

        return sents
Example #2
def create_vocabulary(vocabulary_path,
                      data_path,
                      max_vocabulary_size,
                      tokenizer=None,
                      normalize_digits=True):
    """Create vocabulary file (if it does not exist yet) from data file.

    Data file is assumed to contain one sentence per line. Each sentence is
    tokenized and digits are normalized (if normalize_digits is set).
    Vocabulary contains the most-frequent tokens up to max_vocabulary_size.
    We write it to vocabulary_path in a one-token-per-line format, so that later
    the token on the first line gets id=0, the token on the second line gets
    id=1, and so on.

    Args:
        vocabulary_path: path where the vocabulary will be created.
        data_path: data file that will be used to create vocabulary.
        max_vocabulary_size: limit on the size of the created vocabulary.
        tokenizer: a function to use to tokenize each data sentence;
            if None, basic_tokenizer will be used.
        normalize_digits: Boolean; if true, all digits are replaced by 0s.
    """
    if not gfile.exists(vocabulary_path):
        print("Creating vocabulary %s from data %s" %
              (vocabulary_path, data_path))
        vocab = {}
        with gfile.GFile(data_path, mode="rb") as f:
            counter = 0
            for line in f:
                counter += 1
                if counter % 100000 == 0:
                    print("  processing line %d" % counter)
                line = tf.compat.as_bytes(line)
                tokens = tokenizer(line) if tokenizer else basic_tokenizer(
                    line)
                for w in tokens:
                    word = _DIGIT_RE.sub(b"0", w) if normalize_digits else w
                    if word in vocab:
                        vocab[word] += 1
                    else:
                        vocab[word] = 1
            vocab_list = _START_VOCAB + sorted(
                vocab, key=vocab.get, reverse=True)
            if len(vocab_list) > max_vocabulary_size:
                vocab_list = vocab_list[:max_vocabulary_size]
            with gfile.GFile(vocabulary_path, mode="wb") as vocab_file:
                for w in vocab_list:
                    if not isinstance(w, str):
                        vocab_file.write(w + b"\n")
                    else:
                        vocab_file.write(w.encode('utf-8') + b"\n")
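The docstring's one-token-per-line convention also makes the vocabulary easy to read back. The sketch below shows that inverse step, assuming gfile is tensorflow.io.gfile; load_vocabulary is a hypothetical helper, not part of the example above.

# Sketch: read a one-token-per-line vocabulary into {token: id}, where the
# token on the first line gets id 0. Assumes tensorflow.io.gfile as gfile.
from tensorflow.io import gfile


def load_vocabulary(vocabulary_path):
    if not gfile.exists(vocabulary_path):
        raise ValueError("Vocabulary file %s not found." % vocabulary_path)
    with gfile.GFile(vocabulary_path, mode="rb") as f:
        tokens = [line.strip() for line in f]
    return {token: idx for idx, token in enumerate(tokens)}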
Example #3
    def append_json_object(self, json_obj):
        """Append a json serializable object to the json file."""

        if not self._json_path:
            raise ValueError('Attempting to write to a null json path')
        if gfile.exists(self._json_path):
            with gfile.GFile(self._json_path) as json_file:
                json_objs = json.loads(json_file.read())
            json_objs.append(json_obj)
        else:
            json_objs = [json_obj]
        # TODO(gdahl,gilmer): Should this be an atomic file?
        with gfile.GFile(self._json_path, 'w') as json_file:
            json_file.write(json.dumps(json_objs))
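The TODO above asks whether the write should be atomic. One common approach is to write a temporary file and rename it over the target; the sketch below shows that idea, assuming gfile is tensorflow.io.gfile (whose rename takes an overwrite flag). write_json_objs_atomically is a hypothetical helper, not part of the original class.

# Sketch: write the JSON to a temporary file, then rename it over the target
# so readers never observe a partially written file.
import json

from tensorflow.io import gfile


def write_json_objs_atomically(json_path, json_objs):
    tmp_path = json_path + '.tmp'
    with gfile.GFile(tmp_path, 'w') as f:
        f.write(json.dumps(json_objs))
    gfile.rename(tmp_path, json_path, overwrite=True)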
Example #4
def save_yaml(output_dir, config):
    """Save two yaml files.

    1. 'config.yaml' is a duplicate of the Python config file, saved as YAML.
    2. 'meta.yaml' is for the application; its keys are defined by `PARAMS_FOR_EXPORT`.
    """

    if not gfile.exists(output_dir):
        gfile.makedirs(output_dir)

    config_yaml_path = _save_config_yaml(output_dir, config)
    meta_yaml_path = _save_meta_yaml(output_dir, config)

    return config_yaml_path, meta_yaml_path
Example #5
def restore_state(output_dir):
  """Restore State."""
  params_file = os.path.join(output_dir, "model.pkl")
  if not gfile.exists(params_file):
    return State(step=None, opt_state=None, history=trax_history.History(),
                 model_state=None)

  pkl_module = utils.get_pickle_module()
  with gfile.GFile(params_file, "rb") as f:
    (opt_state, step, history, model_state) = pkl_module.load(f)
  log("Model loaded from %s at step %d" % (params_file, step))
  logging.debug("From loaded model : history = %s", history)
  return State(step=step, opt_state=OptState(*opt_state), history=history,
               model_state=model_state)
Example #6
def make_concepts_targets_and_randoms(source_dir, number_of_images_per_folder,
                                      number_of_random_folders):
    # Run script to download data to source_dir
    if not gfile.exists(source_dir):
        gfile.makedirs(source_dir)
    if not gfile.exists(os.path.join(
            source_dir, 'broden1_224/')) or not gfile.exists(
                os.path.join(source_dir, 'inception5h')):
        subprocess.call(['bash', 'FetchDataAndModels.sh', source_dir])

    # Determine classes that we will fetch
    imagenet_classes = ['zebra']
    broden_concepts = ['striped', 'dotted', 'zigzagged']

    # make targets from imagenet
    imagenet_dataframe = fetcher.make_imagenet_dataframe(
        "./imagenet_url_map.csv")
    for image in imagenet_classes:
        fetcher.fetch_imagenet_class(source_dir, image,
                                     number_of_images_per_folder,
                                     imagenet_dataframe)

    # Make concepts from broden
    for concept in broden_concepts:
        fetcher.download_texture_to_working_folder(
            broden_path=os.path.join(source_dir, 'broden1_224'),
            saving_path=source_dir,
            texture_name=concept,
            number_of_images=number_of_images_per_folder)

    # Make random folders. If we want to run N random experiments with tcav, we need N+1 folders.
    fetcher.generate_random_folders(
        working_directory=source_dir,
        random_folder_prefix="random500",
        number_of_random_folders=number_of_random_folders + 1,
        number_of_examples_per_folder=number_of_images_per_folder,
        imagenet_dataframe=imagenet_dataframe)
Example #7
  def add_from_files(self, filestem,
                     unbonded_right_tail_mass):
    """Adds distributions from a set of files.

    Files are expected to be named {filestem}.{atom_a}.{bond_type}.{atom_b}
    where
    * atom_a, atom_b: atomic numbers for H, C, N, O, F (smaller number first)
    * bond_type: {0, 1, 2, 3} for {unbonded, single, double, triple}

    Missing files are silently ignored.

    Contents are as expected by EmpiricalLengthDistribution.from_file

    Args:
      filestem: prefix of files to load
      unbonded_right_tail_mass: right_tail_mass (as described in
        EmpiricalLengthDistribution) for the unbonded cases.
    """
    atom_types = [
        dataset_pb2.BondTopology.ATOM_H,
        dataset_pb2.BondTopology.ATOM_C,
        dataset_pb2.BondTopology.ATOM_N,
        dataset_pb2.BondTopology.ATOM_O,
        dataset_pb2.BondTopology.ATOM_F,
    ]

    bond_types = [
        dataset_pb2.BondTopology.BOND_UNDEFINED,
        dataset_pb2.BondTopology.BOND_SINGLE,
        dataset_pb2.BondTopology.BOND_DOUBLE,
        dataset_pb2.BondTopology.BOND_TRIPLE,
    ]

    for (atom_a, atom_b), bond_type in itertools.product(
        itertools.combinations_with_replacement(atom_types, 2), bond_types):
      fname = '{}.{}.{}.{}'.format(
          filestem, smu_utils_lib.ATOM_TYPE_TO_ATOMIC_NUMBER[atom_a],
          int(bond_type), smu_utils_lib.ATOM_TYPE_TO_ATOMIC_NUMBER[atom_b])

      if not gfile.exists(fname):
        logging.info('Skipping non-existent file %s', fname)
        continue

      right_tail_mass = None
      if bond_type == dataset_pb2.BondTopology.BOND_UNDEFINED:
        right_tail_mass = unbonded_right_tail_mass

      self.add(atom_a, atom_b, bond_type,
               EmpiricalLengthDistribution.from_file(fname, right_tail_mass))
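The file-naming convention in the docstring can be illustrated without the SMU protos. The sketch below enumerates the expected file names for the H/C/N/O/F atomic numbers with the same itertools pattern; expected_filenames is a hypothetical helper.

# Sketch: yield the {filestem}.{atom_a}.{bond_type}.{atom_b} names described
# in the docstring, smaller atomic number first.
import itertools


def expected_filenames(filestem):
    atomic_numbers = [1, 6, 7, 8, 9]  # H, C, N, O, F
    bond_types = [0, 1, 2, 3]  # unbonded, single, double, triple
    for (atom_a, atom_b), bond_type in itertools.product(
            itertools.combinations_with_replacement(atomic_numbers, 2),
            bond_types):
        yield '{}.{}.{}.{}'.format(filestem, atom_a, bond_type, atom_b)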
Example #8
def dump_object(object_to_dump, output_path):
    """Pickle the object and save to the output_path.

    Args:
      object_to_dump: Python object to be pickled
      output_path: (string) output path which can be Google Cloud Storage

    Returns:
      None
    """
    path = f"gs://{output_path}"
    if not gfile.exists(path):
        gfile.makedirs(os.path.dirname(path))
    with gfile.GFile(path, "w") as wf:
        joblib.dump(object_to_dump, wf)
def main(argv):
  del argv
  base_dir = FLAGS.base_dir

  # Load gin.config settings stored in model directory. It might take some time
  # for the train script to start up and actually write out a gin config file.
  # Wait 10 minutes (periodically checking for file existence) before giving up.
  gin_config_path = os.path.join(base_dir, 'config.gin')
  if not gfile.exists(gin_config_path):
    raise ValueError('Could not find config.gin in "%s"' % base_dir)

  gin.parse_config_file(gin_config_path, skip_unknown=True)
  gin.finalize()

  reranking_eval(base_dir)
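The comment above mentions waiting up to 10 minutes for config.gin to appear, while the excerpt raises immediately. A polling loop built on gfile.exists could look like the sketch below; wait_for_file and its timeout values are assumptions, not part of the original script.

# Sketch: poll for a file with gfile.exists before giving up.
import time

from tensorflow.io import gfile


def wait_for_file(path, timeout_secs=600, poll_secs=10):
  deadline = time.time() + timeout_secs
  while not gfile.exists(path):
    if time.time() > deadline:
      raise ValueError('Could not find %s after %d seconds' %
                       (path, timeout_secs))
    time.sleep(poll_secs)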
Example #10
def eval_agent(env_loop,
               task,
               eval_seed,
               increment_eval_seed,
               num_eval_episodes,
               loaded_ckpt,
               collapse_in_eval=True,
               stop_if_stuck=False,
               num_trained_episodes=None,
               total_steps=None,
               logdir=None,
               summary_writer=None,
               eval_id=''):
    """Evaluate success rate of a trained agent in an environment loop."""
    if num_eval_episodes <= 0:
        return
    eval_path = None
    if logdir is not None or summary_writer is not None:
        eval_filename = format_eval_filename(task, eval_seed,
                                             num_eval_episodes, loaded_ckpt,
                                             collapse_in_eval, stop_if_stuck,
                                             eval_id)
    if logdir is not None:
        eval_path = os.path.join(logdir, 'eval', f'eval{eval_filename}')
        if gfile.exists(eval_path + '_success.txt'):
            print('Evaluation', eval_path, 'already exists; skipping')
            return
        print('Writing evaluation to', eval_path)
    finished_eval = False
    while not finished_eval:
        success_rate, finished_eval = env_loop.eval_policy(
            num_episodes=num_eval_episodes,
            collapse_policy=collapse_in_eval,
            eval_path=eval_path,
            num_videos_to_save=num_eval_episodes,
            seed=eval_seed,
            increment_seed=increment_eval_seed,
            stop_if_stuck=stop_if_stuck)
    if summary_writer is not None:
        with summary_writer.as_default():
            if num_trained_episodes is not None:
                tf.summary.scalar(f'{eval_id}_success_rate',
                                  success_rate,
                                  step=num_trained_episodes)
            if total_steps is not None:
                tf.summary.scalar(f'{eval_id}_success_rate_env_steps',
                                  success_rate,
                                  step=total_steps)
Example #11
  def _input_generator(self):
    """Yields lines from from input_file."""
    if not gfile.exists(self.input_file):
      raise FileNotFoundError

    if self.input_file.endswith('.bz2'):
      logging.info('Opening %s as bzip2 file', self.input_file)
      with gfile.GFile(self.input_file, 'rb') as compressed_f:
        with bz2.BZ2File(compressed_f, 'rb') as f:
          for bin_line in f:
            yield bin_line.decode('utf-8').rstrip('\n')
    else:
      logging.info('Opening %s via gfile', self.input_file)
      with gfile.GFile(self.input_file, 'r') as f:
        for line in f:
          yield line.rstrip('\n')
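The generator layers bz2.BZ2File on top of a gfile handle so compressed inputs work on any filesystem gfile supports. Below is a sketch of the matching write side, assuming gfile is tensorflow.io.gfile; write_bz2_lines is a hypothetical helper.

# Sketch: write lines into a .bz2 file through a gfile handle, producing
# input the generator above can read back.
import bz2

from tensorflow.io import gfile


def write_bz2_lines(output_file, lines):
  with gfile.GFile(output_file, 'wb') as raw_f:
    with bz2.BZ2File(raw_f, 'wb') as f:
      for line in lines:
        f.write((line + '\n').encode('utf-8'))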
Example #12
    def __init__(self,
                 forward_pass=None,
                 use_pmap=True,
                 save_every=1,
                 metrics_logger=None):
        """Used to inspect a models forward and backward pass.

    The following keys are required in config -
      use_pmap: Whether or not to call jax.pmap on the forward pass.

    Args:
      forward_pass: A function mapping batch to a dict of intermediate values.
      use_pmap: Boolean which determines whether or not computations are meant
        to be pmapped. If true, then full_eval will expect all pytrees to be
        replicated.
      save_every: Stored metrics will be saved to disk every time
        step % save_every == 0
      metrics_logger: utils.MetricsLogger object. If provided then all
        calculations will be saved to disk.
    """
        if metrics_logger and (metrics_logger._json_path is None):
            raise ValueError(
                'To use the ModelDebugger with a metrics_logger, a json'
                ' path must be specified when building metrics_logger')
        self._save_every = save_every
        self._metrics_logger = metrics_logger
        self._use_pmap = use_pmap
        self.forward_pass = None

        # In both the pmap case and non-pmap case, _tree_norm_fn_sql2 returns
        # unreplicated results on the host cpu.
        if use_pmap:
            self._tree_norm_fn_sql2 = pmap_then_unreplicate(tree_norm_sql2)
            if forward_pass is not None:
                self.forward_pass = pmap_then_unreplicate(forward_pass)
        else:
            self._tree_norm_fn_sql2 = tree_norm_sql2
            self.forward_pass = forward_pass

        self._stored_metrics = {}

        # In the case of preemption we want to restore prior metrics.
        if metrics_logger:
            metrics_file = os.path.join(metrics_logger._pytree_path,
                                        'training_metrics')
            if gfile.exists(metrics_file):
                self._stored_metrics = load_pytree(metrics_file)
Example #13
def set_eval_paths(ckpt_dir, ckpt, custom_eval_id):
    """Set paths for evaluation and TensorBoard summaries."""
    eval_path, summary_key, best_epoch = None, None, None
    if ckpt is not None:
        best_epoch_path = (ckpt.replace('ckpt_best_of_',
                                        'best_epoch_of_').replace(
                                            'ckpt', 'best_epoch'))
        best_epoch_path = os.path.join(ckpt_dir, best_epoch_path)
        if gfile.exists(best_epoch_path):
            with gfile.GFile(best_epoch_path) as f:
                best_epoch = int(f.read())
        else:
            best_epoch = int(ckpt.replace('ckpt_', ''))
        print('best epoch:', best_epoch)
        eval_path = set_eval_path(ckpt_dir, custom_eval_id, ckpt)
        summary_key = set_summary_key(custom_eval_id, ckpt)
    return eval_path, summary_key, best_epoch
Example #14
    def _get_tfrecords(self, name):
        paths = self.params.data_dir.split(':')
        data_dir = None
        for path in paths:
            if gfile.exists(join(path, name)):
                data_dir = path
                break
        assert data_dir is not None, "data_dir not found"
        paths = list(
            map(lambda x: join(data_dir, name, x),
                self.params.data_pattern.split(',')))
        files = gfile.glob(paths)
        if not files:
            raise IOError("Unable to find files. data_pattern='{}'.".format(
                self.params.data_pattern))
        logging.info("Number of TFRecord files: {}.".format(len(files)))
        return files
Example #15
def load_trainer_state(output_dir):
    """Returns a TrainerState instance loaded from the given `output_dir`."""
    params_file = os.path.join(output_dir, 'model.pkl')
    if not gfile.exists(params_file):
        return TrainerState(step=None,
                            opt_state=None,
                            history=trax_history.History(),
                            model_state=None)

    pkl_module = utils.get_pickle_module()
    with gfile.GFile(params_file, 'rb') as f:
        (opt_state, step, history, model_state) = pkl_module.load(f)
    log('Model loaded from %s at step %d' % (params_file, step))
    logging.debug('From loaded model : history = %s', history)
    return TrainerState(step=step,
                        opt_state=OptState(*opt_state),
                        history=history,
                        model_state=model_state)
Example #16
    def dump_trajectories(self, force=False):
        """Dumps trajectories in a new shard.

    Should be called at most once per epoch.

    Args:
      force: (bool) Whether to complete unfinished trajectories and create
        a new shard even if we have not reached the minimum size.
    """
        pkl_module = utils.get_pickle_module()
        if self.trajectory_dump_dir is None:
            return
        gfile.makedirs(self.trajectory_dump_dir)

        trajectories = self.train_env.trajectories
        if force:
            trajectories.complete_all_trajectories()

        # complete_all_trajectories() also adds trajectories that were just reset.
        # We don't want them since they have just the initial observation and no
        # actions, so we filter them out.
        def has_any_action(trajectory):
            return (trajectory.time_steps
                    and trajectory.time_steps[0].action is not None)

        self._trajectory_buffer.extend(
            filter(has_any_action, trajectories.completed_trajectories))

        trajectories.clear_completed_trajectories()
        ready = (len(self._trajectory_buffer) >=
                 self._trajectory_dump_min_count_per_shard)
        if ready or force:
            shard_path = os.path.join(self.trajectory_dump_dir,
                                      "{}.pkl".format(self.epoch))
            if gfile.exists(shard_path):
                # Since we do an extra dump at the end of the training loop, we
                # sometimes dump 2 times in the same epoch. When this happens, merge the
                # two sets of trajectories.
                with gfile.GFile(shard_path, "rb") as f:
                    self._trajectory_buffer = pkl_module.load(
                        f) + self._trajectory_buffer
            with gfile.GFile(shard_path, "wb") as f:
                pkl_module.dump(self._trajectory_buffer, f)
            self._trajectory_buffer = []
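The comment about dumping twice in the same epoch describes a general read-merge-rewrite idiom for pickle shards. A standalone sketch follows, using the standard pickle module and tensorflow.io.gfile in place of the class's pkl_module; dump_or_merge_pickle is a hypothetical helper.

# Sketch: if the shard already exists, prepend its contents before writing,
# mirroring the merge behavior in dump_trajectories above.
import pickle

from tensorflow.io import gfile


def dump_or_merge_pickle(shard_path, items):
    if gfile.exists(shard_path):
        with gfile.GFile(shard_path, "rb") as f:
            items = pickle.load(f) + items
    with gfile.GFile(shard_path, "wb") as f:
        pickle.dump(items, f)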
Example #17
def gen_csv_from_annotations(
    input_dir: str,
    output_file=constants.DEFAULT_CSV_FILENAME,
    out_path_prefix='',
    dataset_type=constants.DEFAULT_DATASET_TYPE):
  """Generates AutoML dataset CSV from annotation files.

  Args:
    input_dir: Directory of annotation files.
    output_file: Output CSV filename.
    out_path_prefix: Filepath prefix to prepend to the image files.
      e.g.
      src_image_filename = '/tmp/path/to/image.jpg'
      out_path_prefix = 'gs://bucket/images'
      output_image_filename = 'gs://bucket/images/image.jpg'
    dataset_type: Dataset type (TRAIN, VAL, TEST, UNSPECIFIED)
      to use for all the parsed images.
  """

  if not gfile.exists(input_dir):
    raise ValueError('Input directory not found.')

  with gfile.GFile(os.path.expanduser(output_file), 'w') as outf:
    writer = csv.writer(outf, delimiter=',')
    for filename in gfile.listdir(os.path.expanduser(input_dir)):
      filepath = os.path.join(input_dir, filename)
      image_filename, boxes = annotation.read(filepath)
      out_image_filename = os.path.join(out_path_prefix, image_filename)
      for b in boxes:
        row = [
            dataset_type,
            out_image_filename,
            b.label,
            b.xmin,
            b.ymin,
            '',
            '',
            b.xmax,
            b.ymax,
            '',
            '',
        ]
        writer.writerow(row)
Example #18
def write_eval_results(checkpoint_dir, all_gen_sentences, checkpoint_name,
                       mean_train_prob, mean_valid_prob, mean_gen_prob, fid):
    """Write evaluation results to disk."""
    to_write = ",".join(
        map(str, [
            checkpoint_name, mean_train_prob, mean_valid_prob, mean_gen_prob,
            fid
        ]))
    eval_filepath = os.path.join(checkpoint_dir, EVAL_FILENAME)
    previous_eval_content = ""
    if gfile.exists(eval_filepath):
        with gfile.GFile(eval_filepath, "r") as f:
            previous_eval_content = f.read()
    with gfile.GFile(eval_filepath, "w") as f:
        f.write(previous_eval_content + to_write + "\n")

    with gfile.GFile(
            os.path.join(checkpoint_dir, checkpoint_name + "_sentences.txt"),
            "w") as f:
        f.write("\n".join(all_gen_sentences))
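write_eval_results reads the whole results file back just to append one line. If the gfile implementation in use supports append mode (tensorflow.io.gfile's GFile accepts 'a'), the same effect is simpler; the sketch below is an alternative, not the original code.

# Sketch: append a line using GFile's append mode instead of reading and
# rewriting the file.
from tensorflow.io import gfile


def append_eval_line(eval_filepath, to_write):
    with gfile.GFile(eval_filepath, "a") as f:
        f.write(to_write + "\n")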
Example #19
def prepare_dirs(recreate=False):
    """Prepare config dirs

    When recreate is True, remove any previous execution results and recreate the dirs.
    When recreate is False, keep the previous execution results.
    """
    experiment_dir = environment.EXPERIMENT_DIR
    tensorboard_dir = environment.TENSORBOARD_DIR
    checkpoints_dir = environment.CHECKPOINTS_DIR

    if recreate:
        message = """
Delete and recreate these dirs:
experiment_dir: {experiment_dir}
tensorboard_dir: {tensorboard_dir}
checkpoints_dir: {checkpoints_dir}
        """.format(experiment_dir=experiment_dir,
                   tensorboard_dir=tensorboard_dir,
                   checkpoints_dir=checkpoints_dir)
    else:
        message = """
Create these dirs if they don't exist:
experiment_dir: {experiment_dir}
tensorboard_dir: {tensorboard_dir}
checkpoints_dir: {checkpoints_dir}
        """.format(experiment_dir=experiment_dir,
                   tensorboard_dir=tensorboard_dir,
                   checkpoints_dir=checkpoints_dir)

    print(message)

    if recreate:
        if gfile.exists(experiment_dir):
            gfile.rmtree(experiment_dir)

        if gfile.exists(tensorboard_dir):
            gfile.rmtree(tensorboard_dir)

        if gfile.exists(checkpoints_dir):
            gfile.rmtree(checkpoints_dir)

    if not gfile.exists(experiment_dir):
        gfile.makedirs(experiment_dir)

    if not gfile.exists(tensorboard_dir):
        gfile.makedirs(tensorboard_dir)

    if not gfile.exists(checkpoints_dir):
        gfile.makedirs(checkpoints_dir)
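The recreate-or-keep behavior can be captured per directory in a small helper. A minimal sketch, assuming gfile is tensorflow.io.gfile; prepare_dir is a hypothetical name.

# Sketch: remove a directory when recreating, then make sure it exists.
from tensorflow.io import gfile


def prepare_dir(path, recreate=False):
    if recreate and gfile.exists(path):
        gfile.rmtree(path)
    if not gfile.exists(path):
        gfile.makedirs(path)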
Example #20
def save_checkpoint(optimizer,
                    model_state,
                    directory,
                    epoch):
  """Saves a model and its state.

  Removes a checkpoint if it already exists for a given epoch.

  Args:
    optimizer: The optimizer containing the model that we are training.
    model_state: Current state associated with the model.
    directory: Directory where the checkpoints should be saved.
    epoch: Number of epochs the model has been trained for.
  """
  train_state = dict(optimizer=optimizer,
                     model_state=model_state,
                     epoch=epoch)
  if gfile.exists(os.path.join(directory, 'checkpoint_' + str(epoch))):
    gfile.remove(os.path.join(directory, 'checkpoint_' + str(epoch)))
  checkpoints.save_checkpoint(directory, train_state, epoch, keep=2)
Example #21
    def _load_backgrounds(self, backgrounds_dir):
        """Loads backgrounds from a directory.

    Args:
      backgrounds_dir: path to directory containing backgrounds.
        Dir of the form `backgrounds_dir`/$BACKGROUND_TYPE/$FILE_NAME.

    Produces:
      self.bgs: a list of the form [bg0, bg1, ...] where the backgrounds
        are `PIL.Image.Image`s.
      self.num_bgs: int, number of backgrounds.
    """
        if not gfile.exists(backgrounds_dir):
            raise ValueError(
                f'Backgrounds directory {backgrounds_dir} does not exist.')
        bg_fnames = gfile.glob(path.join(backgrounds_dir, '*'))
        self.bgs = self._thread_pool.map(load_image, bg_fnames)
        self.bgs = self._thread_pool.map(self._preprocess_background, self.bgs)
        self.num_bgs = len(self.bgs)

        print('Backgrounds loaded.')
Example #22
def _load_yaml(blueoil_config_filename):
    """load blueoil config yaml

    Args:
        blueoil_config_filename (str): File path of blueoil config yaml file.

    Returns:
        dict: blueoil config.

    """
    if not gfile.exists(blueoil_config_filename):
        FileNotFoundError("File not found: {}".format(blueoil_config_filename))

    with gfile.GFile(blueoil_config_filename, "r") as f:
        blueoil_config = yaml.load(f, Loader=yaml.SafeLoader)

    model_name, _ = os.path.splitext(os.path.basename(blueoil_config_filename))

    blueoil_config["model_name"] = model_name

    return blueoil_config
Example #23
def restore_checkpoint(ckpt_dir, target, step=None, prefix='checkpoint_'):
    """Restore last/best checkpoint from checkpoints in path.

  Sorts the checkpoint files naturally, returning the highest-valued
  file, e.g.:
    ckpt_1, ckpt_2, ckpt_3 --> ckpt_3
    ckpt_0.01, ckpt_0.1, ckpt_0.001 --> ckpt_0.1
    ckpt_-1.0, ckpt_1.0, ckpt_1e5 --> ckpt_1e5

  Args:
    ckpt_dir: str: directory of checkpoints to restore from.
    target: matching object to rebuild via deserialized state-dict. If None,
      the deserialized state-dict is returned as-is.
    step: int: step number to load or None to load latest.
    prefix: str: name prefix of checkpoint files.

  Returns:
    Restored `target` updated from checkpoint file, or if no step specified and
    no checkpoint files present, returns the passed-in `target` unchanged.
  """
    if step:
        ckpt_path = _checkpoint_path(ckpt_dir, step, prefix)
        if not gfile.exists(ckpt_path):
            raise ValueError(f'Matching checkpoint not found: {ckpt_path}')
    else:
        glob_path = os.path.join(ckpt_dir, f'{prefix}*')
        checkpoint_files = natural_sort(gfile.glob(glob_path))
        ckpt_tmp_path = _checkpoint_path(ckpt_dir, 'tmp', prefix)
        checkpoint_files = [f for f in checkpoint_files if f != ckpt_tmp_path]
        if not checkpoint_files:
            return target
        ckpt_path = checkpoint_files[-1]

    logging.info('Restoring checkpoint from %s', ckpt_path)
    with gfile.GFile(ckpt_path, 'rb') as fp:
        if target is None:
            return serialization.msgpack_restore(fp.read())
        else:
            return serialization.from_bytes(target, fp.read())
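restore_checkpoint relies on a natural_sort helper that orders file names by the numbers embedded in them, as the docstring's examples show. One possible implementation is sketched below; it is an assumption, not necessarily the helper the original library uses.

# Sketch: sort names so embedded numbers (signed, float, or scientific
# notation) compare numerically, e.g. ckpt_2 sorts before ckpt_10.
import re

_NUM_RE = re.compile(r'[-+]?\d*\.?\d+(?:[eE][-+]?\d+)?')


def natural_sort(names):
    def key(name):
        chunks = _NUM_RE.split(name)
        numbers = [float(n) for n in _NUM_RE.findall(name)]
        parts = []
        for i, chunk in enumerate(chunks):
            parts.append((0, chunk))
            if i < len(numbers):
                parts.append((1, numbers[i]))
        return parts
    return sorted(names, key=key)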
Example #24
    def encode_file(self,
                    path,
                    ordered=False,
                    verbose=False,
                    add_double_eos=False):
        if verbose: print('encoding file {} ...'.format(path))
        assert exists(path)
        encoded = []
        with open(path, 'r') as f:
            for idx, line in enumerate(f):
                if verbose and idx > 0 and idx % 500000 == 0:
                    print('  line {}'.format(idx))
                symbols = self.tokenize(line,
                                        add_eos=True,
                                        add_double_eos=add_double_eos)

                encoded.append(self.convert_to_nparray(symbols))

        if ordered:
            encoded = np.concatenate(encoded)

        return encoded
def get_lm_corpus(data_dir, dataset):
    fn = os.path.join(data_dir, "cache.pkl")

    if exists(fn):
        print("Loading cached dataset...")
        with open(fn, "rb") as fp:
            corpus = pickle.load(fp)
    else:
        print("Producing dataset...")
        kwargs = {}
        if dataset in ["wt103", "wt2"]:
            kwargs["special"] = ["<eos>"]
            kwargs["lower_case"] = False
        elif dataset == "ptb":
            kwargs["special"] = ["<eos>"]
            kwargs["lower_case"] = True
        elif dataset == "lm1b":
            kwargs["special"] = []
            kwargs["lower_case"] = False
            kwargs["vocab_file"] = os.path.join(data_dir, "1b_word_vocab.txt")
        elif dataset in ["enwik8", "text8"]:
            pass

        corpus = Corpus(data_dir, dataset, **kwargs)

        print("Saving dataset...")
        with open(fn, "wb") as fp:
            pickle.dump(corpus, fp, protocol=2)

        corpus_info = {
            "vocab_size": len(corpus.vocab),
            "cutoffs": corpus.cutoffs,
            "dataset": corpus.dataset
        }
        with open(os.path.join(data_dir, "corpus-info.json"), "w") as fp:
            json.dump(corpus_info, fp)

    return corpus
    def predict(self, trained_model_chckpt: str = None):
        """
        Run inference of the model on the test datasets.
        """
        # Load a trained checkpoint if a valid model checkpoint path is given.
        if trained_model_chckpt and gfile.exists(trained_model_chckpt):
            logger.info(
                f"Running predictions using: {trained_model_chckpt}. This may take 3 minutes."
            )
            self.load(trained_model_chckpt)
            logger.info("Checkpoint loaded.")

        self.config.batch_size_eval = 128
        self.config.use_glue_format = True

        # test eval
        for idx, dataset in enumerate(self.test_datasets_list):
            prefix = dataset.split("_")[0]
            results = self._predict(idx, prefix, dataset, eval_type='test')
            if results:
                logger.info(f"[new test scores saved for {dataset}.]")
            else:
                logger.info(f"Data not found for {dataset}.")
Example #27
    def _load_foregrounds(self, foregrounds_dir):
        """Loads foregrounds from a directory.

    Args:
      foregrounds_dir: path to directory containing foregrounds.
        Directory of the form `foregrounds_dir`/$OBJECT_CLASS/$FILE_NAME.

    Produces:
      self.fg_classes: a list of names of foreground object classes, e.g.
        ['ambulance', 'bagel', ...]
      self.num_fgs_per_class: a dict of the form {foreground_obj_class_name:
        num_fgs_in_that_class}
      self.fgs: a list of the form [fg0, fg1, ...] where the foregrounds are
        `PIL.PngImagePlugin.PngImageFile`s.
      self.fgs_dict: a dict of the form {fg_class_name: [img0, img1, ...]} where
        the images are `PIL.PngImagePlugin.PngImageFile`s.
    """
        if not gfile.exists(foregrounds_dir):
            raise ValueError(
                f'Foregrounds directory {foregrounds_dir} does not exist.')
        fg_fnames = gfile.glob(path.join(foregrounds_dir, '*/*'))
        fg_labels = [x.split('/')[-2] for x in fg_fnames]  # e.g. 'car', 'cow'
        self.fg_classes = sorted(list(set(fg_labels)))
        self.num_fgs_per_class = {
            fg_class: len(gfile.glob(path.join(foregrounds_dir, fg_class,
                                               '*')))
            for fg_class in self.fg_classes
        }
        self.num_fgs_per_class_list = [
            self.num_fgs_per_class[fg_class] for fg_class in self.fg_classes
        ]
        self.fgs = self._thread_pool.map(load_image, fg_fnames)
        self.fgs_dict = {fg_class: [] for fg_class in self.fg_classes}
        for i, label in enumerate(fg_labels):
            self.fgs_dict[label].append(self.fgs[i])

        print('Foregrounds loaded.')
def main(unused_argv):
    del unused_argv  # Unused

    corpus = get_lm_corpus(FLAGS.data_dir, FLAGS.dataset)

    save_dir = os.path.join(FLAGS.data_dir, "tfrecords")
    if not exists(save_dir):
        makedirs(save_dir)

    # test mode
    if FLAGS.per_host_test_bsz > 0:
        corpus.convert_to_tfrecords("test", save_dir, FLAGS.per_host_test_bsz,
                                    FLAGS.tgt_len, FLAGS.num_core_per_host,
                                    FLAGS=FLAGS)
        return

    for split, batch_size in zip(
            ["train", "valid"],
            [FLAGS.per_host_train_bsz, FLAGS.per_host_valid_bsz]):

        if batch_size <= 0: continue
        print("Converting {} set...".format(split))
        corpus.convert_to_tfrecords(split, save_dir, batch_size, FLAGS.tgt_len,
                                    FLAGS.num_core_per_host, FLAGS=FLAGS)
Example #29
def main(argv):
    """Main function."""
    if len(argv) > 1:
        raise app.UsageError('Too many command-line arguments.')

    # TODO(mohitreddy): Change to flags.mark_flag_as_required('model_dir').
    assert FLAGS.model_dir is not None, 'Please provide model_dir.'
    if not gfile.exists(FLAGS.model_dir):
        gfile.makedirs(FLAGS.model_dir)

    train_and_evaluate(seed=FLAGS.seed,
                       model_dir=FLAGS.model_dir,
                       num_epochs=FLAGS.num_epochs,
                       batch_size=FLAGS.batch_size,
                       embedding_size=FLAGS.embedding_size,
                       hidden_size=FLAGS.hidden_size,
                       min_freq=FLAGS.min_freq,
                       max_seq_len=FLAGS.max_seq_len,
                       dropout=FLAGS.dropout,
                       emb_dropout=FLAGS.emb_dropout,
                       word_dropout_rate=FLAGS.word_dropout_rate,
                       learning_rate=FLAGS.learning_rate,
                       checkpoints_to_keep=FLAGS.checkpoints_to_keep,
                       l2_reg=FLAGS.l2_reg)
Example #30
    def test_whole_pipeline(self):
        test_subdirectory = self.create_tempdir()
        output_stem = os.path.join(test_subdirectory, 'testout')
        input_stage1_dat_glob = os.path.join(TESTDATA_PATH,
                                             'pipeline_input_stage1.dat')
        input_stage2_dat_glob = os.path.join(TESTDATA_PATH,
                                             'pipeline_input_stage2.dat')
        input_equivalent_glob = os.path.join(TESTDATA_PATH,
                                             'pipeline_equivalent.dat')
        input_bond_topology_csv = os.path.join(TESTDATA_PATH,
                                               'pipeline_bond_topology.csv')
        with flagsaver.flagsaver(
                input_stage1_dat_glob=input_stage1_dat_glob,
                input_stage2_dat_glob=input_stage2_dat_glob,
                input_equivalent_glob=input_equivalent_glob,
                input_bond_topology_csv=input_bond_topology_csv,
                output_stem=output_stem,
                output_shards=1):
            # If you have custom beam options, add them here.
            beam_options = None
            with beam.Pipeline(beam_options) as root:
                pipeline.pipeline(root)

        metrics = root.result.metrics().query()
        counters_dict = {
            m.key.metric.name: m.committed
            for m in metrics['counters']
        }

        self.assertEqual(counters_dict['attempted_topology_matches'], 3)
        # Conformer 620517 will not match because bond lengths are not extracted
        # from conformers with serious errors like this.
        self.assertEqual(counters_dict['no_topology_matches'], 1)
        self.assertNotIn('topology_match_smiles_failure', counters_dict)

        logging.info(
            'Files in output: %s',
            '\n'.join(gfile.glob(os.path.join(test_subdirectory, '*'))))
        for stage in ['stage1', 'stage2']:
            self.assertTrue(
                gfile.exists(output_stem + '_' + stage +
                             '_original_known_error-00000-of-00001.dat'))
            self.assertTrue(
                gfile.exists(output_stem + '_' + stage +
                             '_original_unknown_error-00000-of-00001.dat'))
            self.assertTrue(
                gfile.exists(output_stem + '_' + stage +
                             '_mismatched_original-00000-of-00001.dat'))
            self.assertTrue(
                gfile.exists(output_stem + '_' + stage +
                             '_mismatched_regen-00000-of-00001.dat'))

        # Check the merge conflicts file
        with gfile.GFile(output_stem + '_conflicts-00000-of-00001.csv') as f:
            conflicts_lines = f.readlines()
            self.assertIn('conformer_id,', conflicts_lines[0])
            self.assertEqual(
                conflicts_lines[1], '618451001,1,1,1,1,'
                '-406.51179,9.999999,-406.522079,9.999999,True,True,'
                '-406.51179,0.052254,-406.522079,2.5e-05,True,True\n')

        # Check a couple of the stats.
        with gfile.GFile(output_stem + '_stats-00000-of-00001.csv') as f:
            stats_lines = f.readlines()
            self.assertIn('errors.status,0,2\n', stats_lines)
            self.assertIn('errors.warn_t1,0,4\n', stats_lines)
            self.assertIn('fate,FATE_SUCCESS,2\n', stats_lines)
            self.assertIn('fate,FATE_DUPLICATE_DIFFERENT_TOPOLOGY,1\n',
                          stats_lines)
            self.assertIn('num_initial_geometries,1,4\n', stats_lines)
            self.assertIn('num_duplicates,1,1\n', stats_lines)
            self.assertIn('zero_field,single_point_energy_pbe0d3_6_311gd,1\n',
                          stats_lines)

        # Check the smiles comparison output
        with gfile.GFile(output_stem +
                         '_smiles_compare-00000-of-00001.csv') as f:
            smiles_lines = f.readlines()
            self.assertIn(
                '620517002,MISMATCH,NotAValidSmilesString,'
                '[H]C1=C2OC2=C(F)O1,FC1=C2OC2=CO1\n', smiles_lines)
            # Make sure that a bond topology with a matching smiles doesn't show
            for line in smiles_lines:
                self.assertNotIn('618451001', line)

        # Check the bond topology summary
        with gfile.GFile(output_stem + '_bt_summary-00000-of-00001.csv') as f:
            bt_summary_lines = f.readlines()
            # Check part of the header line
            self.assertIn('bt_id', bt_summary_lines[0])
            self.assertIn('count_attempted_conformers', bt_summary_lines[0])
            # This is the bond topology that has no conformer
            self.assertIn('10,0,0,0,0,0,0,0,0,0,0,0,0,0\n', bt_summary_lines)
            # This is a bond topology with 1 conformer
            self.assertIn('620517,1,0,0,0,1,0,1,0,0,0,0,0,0\n',
                          bt_summary_lines)
            # This is a bond topology with 2 conformers
            self.assertIn('618451,2,0,0,0,2,0,0,0,2,0,0,0,0\n',
                          bt_summary_lines)

        # Check the bond lengths file
        with gfile.GFile(output_stem + '_bond_lengths.csv') as f:
            bond_length_lines = f.readlines()
            self.assertEqual(
                'atom_char_0,atom_char_1,bond_type,length_str,count\n',
                bond_length_lines[0])
            self.assertIn('c,c,2,1.336,1\n', bond_length_lines)
            self.assertIn('c,o,1,1.422,2\n', bond_length_lines)

        # For the gzip files below, we check >100 because even an empty gzip file
        # has non-zero length. 100 is kind of arbitrary to be bigger than the
        # expected header of 20.

        # Check that the generated TFRecord files contain some expected outputs
        standard_dataset = tf.data.TFRecordDataset(
            output_stem + '_standard_tfrecord-00000-of-00001')
        standard_output = [
            dataset_pb2.Conformer.FromString(raw)
            for raw in standard_dataset.as_numpy_iterator()
        ]
        self.assertCountEqual([c.conformer_id for c in standard_output],
                              [618451001, 618451123])
        # Check that fields are filtered the way we expect
        self.assertFalse(
            standard_output[0].properties.HasField('compute_cluster_info'))
        self.assertFalse(
            standard_output[0].properties.HasField('homo_pbe0_aug_pc_1'))
        self.assertTrue(
            standard_output[0].properties.HasField('rotational_constants'))

        complete_dataset = tf.data.TFRecordDataset(
            output_stem + '_complete_tfrecord-00000-of-00001')
        complete_output = [
            dataset_pb2.Conformer.FromString(raw)
            for raw in complete_dataset.as_numpy_iterator()
        ]
        self.assertCountEqual([c.conformer_id for c in complete_output],
                              [618451001, 618451123, 620517002, 79593005])
        # Check that fields are filtered the way we expect
        # The DirectRunner randomizes the order of output so we need to make sure
        # that we get a full record.
        complete_entry = [
            c for c in complete_output if c.conformer_id == 618451001
        ][0]
        self.assertFalse(
            complete_entry.properties.HasField('compute_cluster_info'))
        self.assertTrue(
            complete_entry.properties.HasField('homo_pbe0_aug_pc_1'))
        self.assertTrue(
            complete_entry.properties.HasField('rotational_constants'))

        complete_entry_for_smiles = [
            c for c in complete_output if c.conformer_id == 620517002
        ][0]
        self.assertEqual(complete_entry_for_smiles.properties.smiles_openbabel,
                         'NotAValidSmilesString')